In [50]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from collections import Counter

#Exercise
import re
import string
from nltk.corpus import stopwords
import nltk

In [51]:
#Exercise
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [52]:
#Exercise
def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

In [53]:
dataset = ["I love playing football on the weekends",
    "I enjoy hiking and camping in the mountains",
    "I like to read books and watch movies",
    "I prefer playing video games over sports",
    "I love listening to music and going to concerts"]

In [54]:
#Exercise
preprocessed_dataset = [preprocess(doc) for doc in dataset]

In [55]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_dataset)

In [56]:
k = 2
km = KMeans(n_clusters=k)
km.fit(X)

y_pred = km.predict(X)

table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:,::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
    print("Cluster %d:" %i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
    print()

Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            0
I enjoy hiking and camping in the mountains                        1
I like to read books and watch movies                              0
I prefer playing video games over sports                           0
I love listening to music and going to concerts                    0

Top terms per cluster:
Cluster 0:
 love
 playing
 football
 weekends
 going
 sports
 music
 concerts
 video
 games

Cluster 1:
 camping
 enjoy
 hiking
 mountains
 weekends
 listening
 concerts
 football
 games
 going



In [57]:
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples

print("Purity:", purity)

Purity: 0.8


##### Do the Purity differ when applying text preprocessing before vectorization?
###### The purity is higher after applying preprocessing steps as it reduces noise and irrelevant features