In [1]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from collections import Counter

In [2]:
# Toy corpus: five short first-person sentences, one document per list entry.
dataset = [
    "I love playing football on the weekends",
    "I enjoy hiking and camping in the mountains",
    "I like to read books and watch movies",
    "I prefer playing video games over sports",
    "I love listening to music and going to concerts",
]

In [3]:
import re
import nltk
from nltk.corpus import stopwords

# Load the English stopword list. The corpus is fetched only when it is not
# already installed locally, so re-running the notebook does not hit the
# network (or print download logs) every time.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# Function to preprocess a sentence
def preprocess(text, stopword_set=None):
    """Normalize a sentence for vectorization.

    Steps: lowercase, drop stopwords, strip punctuation, and re-join the
    remaining tokens with single spaces.

    Stopwords are matched *before* inner punctuation is removed, so
    contractions that appear in the stopword list with their apostrophe
    (e.g. "don't") are filtered instead of surviving as "dont".

    Args:
        text: the raw sentence.
        stopword_set: optional collection of lowercase stopwords; when omitted,
            the module-level ``stop_words`` set is used.

    Returns:
        The cleaned sentence as a single space-separated string ('' when
        nothing remains).
    """
    active_stopwords = stop_words if stopword_set is None else stopword_set

    kept = []
    # Typographic apostrophes are normalized so "don’t" matches "don't".
    for token in text.lower().replace('\u2019', "'").split():
        # Trim punctuation at the token edges only, keeping inner apostrophes
        # intact for the stopword lookup.
        core = re.sub(r'^[^\w\s]+|[^\w\s]+$', '', token)
        if core in active_stopwords:
            continue
        # Remove any remaining punctuation inside the token.
        cleaned = re.sub(r'[^\w\s]', '', core)
        # Skip tokens that were pure punctuation or reduce to a stopword.
        if cleaned and cleaned not in active_stopwords:
            kept.append(cleaned)
    return ' '.join(kept)

# Clean every raw sentence with preprocess, keeping the original document order
preprocessed_dataset = list(map(preprocess, dataset))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jeyas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Learn the vocabulary and IDF weights from the cleaned sentences, then encode
# those same sentences as a TF-IDF document-term matrix.
vectorizer = TfidfVectorizer()
vectorizer.fit(preprocessed_dataset)
X = vectorizer.transform(preprocessed_dataset)

In [7]:
k = 2  # Define the number of clusters
# K-means starts from random centroids; a fixed seed and an explicit number of
# restarts make the cluster assignments reproducible across runs.
km = KMeans(n_clusters=k, n_init=10, random_state=42)
# Fit the model and get the cluster index of each document in one step
y_pred = km.fit_predict(X)
# Display the document and its predicted cluster in a table
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(preprocessed_dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

# Print top terms per cluster
top_n = 10  # how many of the highest-weighted terms to list for each cluster
print("\nTop terms per cluster:")
# For each centroid, feature indices sorted from largest to smallest weight
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :top_n]:
        print(' %s' % terms[ind])
    # One blank line between clusters (not after every term)
    print()

Document                               Predicted Cluster
-----------------------------------  -------------------
love playing football weekends                         0
enjoy hiking camping mountains                         1
like read books watch movies                           0
prefer playing video games sports                      0
love listening music going concerts                    0

Top terms per cluster:
Cluster 0:
 love

 playing

 football

 weekends

 going

 sports

 music

 concerts

 video

 games

Cluster 1:
 camping

 enjoy

 hiking

 mountains

 weekends

 listening

 concerts

 football

 games

 going



In [8]:
# Calculate purity
def _count_labels_per_cluster(labels_true, labels_pred):
    """Cross-tabulate reference labels by predicted cluster.

    Args:
        labels_true: sequence of reference class labels, one per sample.
        labels_pred: sequence of cluster ids aligned with ``labels_true``.

    Returns:
        A list with one Counter per cluster id (in first-seen order), mapping
        reference label -> number of samples with that label in the cluster.

    Raises:
        ValueError: if the two sequences differ in length.
    """
    if len(labels_true) != len(labels_pred):
        raise ValueError("labels_true and labels_pred must have the same length")
    per_cluster = {}
    for truth, cluster_id in zip(labels_true, labels_pred):
        per_cluster.setdefault(cluster_id, Counter())[truth] += 1
    return list(per_cluster.values())


# Purity compares clusters against reference classes: each cluster is credited
# with its most common reference label. Counting cluster ids alone (as
# Counter(y_pred) does) only yields the share of the largest cluster.
# NOTE(review): these reference topics were hand-assigned for the five demo
# sentences (football, hiking -> "active"; books/movies, video games, music ->
# "leisure") -- confirm or replace with the real ground-truth labels.
true_labels = ["active", "active", "leisure", "leisure", "leisure"]

total_samples = len(y_pred)
cluster_label_counts = _count_labels_per_cluster(true_labels, y_pred)
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.8
