<h1>TF IDF + Preprocessing</h1>

In [31]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from collections import Counter
from gensim.models import Word2Vec # Needed for Word2Vec section later
import re # For punctuation removal
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [33]:
dataset = ["I love playing football on the weekends",
"I enjoy hiking and camping in the mountains",
"I like to read books and watch movies",
"I prefer playing video games over sports",
"I love listening to music and going to concerts"]

In [35]:
def preprocess_text(text):
    text = text.lower() # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
    tokens = word_tokenize(text) # Tokenize
    stop_words = set(stopwords.words('english')) # Get English stop words
    cleaned_tokens = [word for word in tokens if word not in stop_words and word.isalpha()] # Remove stop words and non-alphabetic tokens
    return ' '.join(cleaned_tokens) # Join tokens back into a string (required for TfidfVectorizer)

In [63]:
dataset = [preprocess_text(doc) for doc in dataset]
print("Preprocessed Dataset (TF-IDF):")
for doc in dataset: print(f"- {doc}")

Preprocessed Dataset (TF-IDF):
- love playing football weekends
- enjoy hiking camping mountains
- like read books watch movies
- prefer playing video games sports
- love listening music going concerts


In [39]:
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)
# Predict the clusters for each document
y_pred = km.predict(X)
# Display the document and its predicted cluster in a table
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

Document                               Predicted Cluster
-----------------------------------  -------------------
love playing football weekends                         1
enjoy hiking camping mountains                         0
like read books watch movies                           1
prefer playing video games sports                      1
love listening music going concerts                    1


In [41]:
# Print top terms per cluster
print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
    print("Cluster %d:" % i)
for ind in order_centroids[i, :10]:
    print(' %s' % terms[ind])
    print()


Top terms per cluster:
Cluster 0:
Cluster 1:
 love

 playing

 football

 weekends

 going

 sports

 music

 concerts

 video

 games



In [43]:
# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.8


<h1>Word2Vec + Preprocessing</h1>

In [45]:
from gensim.models import Word2Vec

In [47]:
dataset = ["I love playing football on the weekends",
"I enjoy hiking and camping in the mountains",
"I like to read books and watch movies",
"I prefer playing video games over sports",
"I love listening to music and going to concerts"]

In [49]:
def preprocess_text(text):
    text = text.lower() # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
    tokens = word_tokenize(text) # Tokenize
    stop_words = set(stopwords.words('english')) # Get English stop words
    cleaned_tokens = [word for word in tokens if word not in stop_words and word.isalpha()] # Remove stop words and non-alphabetic tokens
    return ' '.join(cleaned_tokens) # Join tokens back into a string (required for TfidfVectorizer)

In [67]:
dataset = [preprocess_text(doc) for doc in dataset]
print("Preprocessed Dataset (Word2Vec):")
for doc in dataset: print(f"- {doc}")

Preprocessed Dataset (Word2Vec):
- love playing football weekends
- enjoy hiking camping mountains
- like read books watch movies
- prefer playing video games sports
- love listening music going concerts


In [55]:
tokenized_dataset = [doc.split() for doc in dataset]
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100, window=5, min_count=1, workers=4)

In [57]:
X = np.array([np.mean([word2vec_model.wv[word] for word in doc.split() if word in word2vec_model.wv], axis=0) for doc in dataset])

In [59]:
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)
# Predict the clusters for each document
y_pred = km.predict(X)
# Tabulate the document and predicted cluster
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

Document                               Predicted Cluster
-----------------------------------  -------------------
love playing football weekends                         1
enjoy hiking camping mountains                         1
like read books watch movies                           1
prefer playing video games sports                      0
love listening music going concerts                    1




In [61]:
# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.8
