<a href="https://colab.research.google.com/github/miketeu/Challenge1_responsive/blob/main/AI_Fundamentals.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Package Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import math
import re
from sklearn.cluster import KMeans

### TF-IDF Example

In [None]:
# Toy corpus: ten one-sentence film descriptions. Each string is treated as one
# "document" by the TF-IDF cells that follow.
documents = [
    "A clumsy British woman awkwardly navigates her career, family pressure, and embarrassing love life while keeping a brutally honest diary of wine soaked regrets.",
    "An old man reads emotional diary entries about young love, heartbreak, emotional growth, and two wildly different people who fall in and out of love.",
    "A rich orphan develops bat-themed PTSD and takes it out on a mentally ill clown, using gadgets, suits, and intense voice work.",
    "An old boat crashes and love dies because two people can’t share a wooden door, while class tension sinks along with the romance.",
    "A man wears a metal suit to cope with his trauma, invents dangerous tech, and builds laser weapons while dealing with his ego and wealth.",
    "Talking toys develop existential dread when their child loses interest in playing with them, questioning loyalty, friendship, and shelf life.",
    "A rat controls a French man by pulling his hair, cooks gourmet food, and challenges the kitchen hierarchy through teamwork and hygiene violations.",
    "An ice lady sings one song and causes a seasonal climate disaster, learns self-acceptance, and accidentally becomes queen of emotional repression.",
    "A short-sighted child defeats an evil snake man with the power of love, friendship, and conveniently-timed magical plot armour.",
    "A time-traveling teen nearly kisses his mother to save a rock concert, breaks the timeline, and invents 1980s pop culture by accident."
]

In [None]:
# Tokenise each description: split on whitespace, lowercase every word and
# strip punctuation from it (so "bat-themed" becomes "batthemed").
# A word made only of punctuation (e.g. a standalone "-" or "&") would turn
# into an empty string; such tokens are dropped so they do not inflate the
# document length used as the TF denominator.
punctuation = re.compile(r'[^\w\s]')  # compiled once, reused for every word

doc_words = []
for doc in documents:
    cleaned = (punctuation.sub('', word.lower()) for word in doc.split())
    doc_words.append([token for token in cleaned if token])

print(doc_words)
print(len(doc_words))

[['a', 'clumsy', 'british', 'woman', 'awkwardly', 'navigates', 'her', 'career', 'family', 'pressure', 'and', 'embarrassing', 'love', 'life', 'while', 'keeping', 'a', 'brutally', 'honest', 'diary', 'of', 'wine', 'soaked', 'regrets'], ['an', 'old', 'man', 'reads', 'emotional', 'diary', 'entries', 'about', 'young', 'love', 'heartbreak', 'emotional', 'growth', 'and', 'two', 'wildly', 'different', 'people', 'who', 'fall', 'in', 'and', 'out', 'of', 'love'], ['a', 'rich', 'orphan', 'develops', 'batthemed', 'ptsd', 'and', 'takes', 'it', 'out', 'on', 'a', 'mentally', 'ill', 'clown', 'using', 'gadgets', 'suits', 'and', 'intense', 'voice', 'work'], ['an', 'old', 'boat', 'crashes', 'and', 'love', 'dies', 'because', 'two', 'people', 'cant', 'share', 'a', 'wooden', 'door', 'while', 'class', 'tension', 'sinks', 'along', 'with', 'the', 'romance'], ['a', 'man', 'wears', 'a', 'metal', 'suit', 'to', 'cope', 'with', 'his', 'trauma', 'invents', 'dangerous', 'tech', 'and', 'builds', 'laser', 'weapons', 'whi

In [None]:
# How many times does the word 'diary' occur in each description?
# Try changing the word & see what happens!
for tokens in doc_words:
    print(tokens.count('diary'))

1
1
0
0
0
0
0
0
0
0


In [None]:
# Term frequency (TF): occurrences of 'diary' divided by the number of tokens
# in that particular film description
for tokens in doc_words:
    occurrences = tokens.count('diary')
    print(occurrences / len(tokens))

0.041666666666666664
0.04
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


In [None]:
# Document frequency (DF): the number of descriptions that contain 'diary'
# at least once (each True counts as 1 when summed)
df = sum('diary' in tokens for tokens in doc_words)
print(df)

2


In [None]:
# Inverse document frequency (IDF) of the specific word 'diary': log(N / DF)
# N is derived from the corpus rather than hard-coded, so it stays correct if
# descriptions are added or removed.
num_docs = len(documents)
# A word that appears in no description has DF == 0, which would divide by
# zero. Its TF is 0 in every document, so the TF-IDF is 0 whatever IDF we
# pick; use 0.0 to keep the following cells runnable.
idf = math.log(num_docs / df) if df else 0.0
print(idf)

1.6094379124341003


In [None]:
# TF-IDF of 'diary' in each document: that document's term frequency scaled
# by the corpus-wide IDF computed above
for tokens in doc_words:
    term_frequency = tokens.count('diary') / len(tokens)
    print(term_frequency * idf)

0.0670599130180875
0.064377516497364
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


### Let's put it all together with multiple words at a time

In [None]:
# Words of interest - edit this list to explore other terms
terms = ["and", "love", "diary"]

# Term Frequency (TF): for each term, the share of every document's tokens
# that are that term. An empty document contributes 0.0 instead of dividing
# by zero.
tf = {}
for term in terms:
    tf[term] = [
        words.count(term) / len(words) if words else 0.0
        for words in doc_words
    ]

# Document Frequency (DF): number of documents containing the term at least once
df = {term: sum(1 for words in doc_words if term in words) for term in terms}

# Inverse Document Frequency (IDF): log(N / DF).
# A term found in no document has DF == 0; its TF is 0 everywhere, so its
# TF-IDF is 0 regardless - use 0.0 rather than raising ZeroDivisionError.
num_docs = len(documents)
idf = {
    term: math.log(num_docs / df[term]) if df[term] else 0.0
    for term in terms
}

# TF-IDF: scale each per-document TF by the term's corpus-wide IDF
tfidf = {}
for term in terms:
    tfidf[term] = [tf_value * idf[term] for tf_value in tf[term]]

# Print TF-IDF values (one list per term, ordered like `documents`)
for term in terms:
    print(f"TF-IDF for '{term}': {tfidf[term]}")

TF-IDF for 'and': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
TF-IDF for 'love': [0.03817878049475646, 0.07330325854993242, 0.0, 0.039838727472789354, 0.0, 0.0, 0.0, 0.0, 0.04822582799337658, 0.0]
TF-IDF for 'diary': [0.0670599130180875, 0.064377516497364, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


### Cosine similarity for documents 1 and 2

In [None]:
# Transpose the per-term score lists into one vector per document:
# document_vectors[i] holds document i's TF-IDF score for every term,
# in the key order of `tfidf`
num_docs = len(next(iter(tfidf.values())))
document_vectors = [
    [scores[i] for scores in tfidf.values()]
    for i in range(num_docs)
]
print(document_vectors)

[[0.0, 0.03817878049475646, 0.0670599130180875], [0.0, 0.07330325854993242, 0.064377516497364], [0.0, 0.0, 0.0], [0.0, 0.039838727472789354, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.04822582799337658, 0.0], [0.0, 0.0, 0.0]]


In [None]:
# Cosine similarity between document 1 and document 2:
# dot(a, b) / (|a| * |b|)
doc1_vector = document_vectors[0]
doc2_vector = document_vectors[1]
print(doc1_vector)
print(doc2_vector)

dot_product = np.dot(doc1_vector, doc2_vector)
norm1 = np.linalg.norm(doc1_vector)
norm2 = np.linalg.norm(doc2_vector)
# A document containing none of the chosen terms is an all-zero vector, which
# would make this 0 / 0 (nan plus a RuntimeWarning). Treat such a pair as
# having similarity 0.0 instead.
norm_product = norm1 * norm2
cosine_similarity = dot_product / norm_product if norm_product else 0.0
print(cosine_similarity)

[0.0, 0.03817878049475646, 0.0670599130180875]
[0.0, 0.07330325854993242, 0.064377516497364]
0.9452034609448741


## Implementing K-Means

In [None]:
# How many clusters K-Means should look for
num_clusters = 3

# Display names, paired position-by-position with the rows of document_vectors
doc_names = [
    'Bridget Jones',
    'The Notebook',
    'The Dark Knight',
    'Titanic',
    'Iron Man',
    'Toy Story',
    'Ratatouille',
    'Frozen',
    'Harry Potter',
    'Back to the Future',
]

# Fit K-Means on the TF-IDF document vectors; random_state pins the seed
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(document_vectors)

# One cluster label per document, in input order
labels = kmeans.labels_

# Report which cluster each document was assigned to
for doc_name, label in zip(doc_names, labels):
    print(f"{doc_name} is in cluster {label}")

Bridget Jones is in cluster 2
The Notebook is in cluster 2
The Dark Knight is in cluster 1
Titanic is in cluster 0
Iron Man is in cluster 1
Toy Story is in cluster 1
Ratatouille is in cluster 1
Frozen is in cluster 1
Harry Potter is in cluster 0
Back to the Future is in cluster 1
