### TEXT CLUSTERING USING TF-IDF VECTORIZER 

In [90]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

In [91]:
dataset = ["I love playing football on the weekends",
 "I enjoy hiking and camping in the mountains",
 "I like to read books and watch movies",
 "I prefer playing video games over sports",
 "I love listening to music and going to concerts"]

In [92]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Init tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Define preprocessing function
def preprocess_text(text):
    # Lowercase and remove punctuation/URLs
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()

    # Tokenize, remove stopwords, lemmatize
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(tokens)

# Preprocess
preprocessed_dataset = [preprocess_text(text) for text in dataset]

print(preprocessed_dataset)

['love playing football weekend', 'enjoy hiking camping mountain', 'like read book watch movie', 'prefer playing video game sport', 'love listening music going concert']


In [93]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_dataset)

In [94]:
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)

# Predict the clusters for each document
y_pred = km.predict(X)

# Display the document and its predicted cluster in a table
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(preprocessed_dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

# Print top terms per cluster
print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
 print("Cluster %d:" % i)
 for ind in order_centroids[i, :10]:
  print(' %s' % terms[ind])
  print()


Document                              Predicted Cluster
----------------------------------  -------------------
love playing football weekend                         1
enjoy hiking camping mountain                         0
like read book watch movie                            1
prefer playing video game sport                       1
love listening music going concert                    1

Top terms per cluster:
Cluster 0:
 camping

 enjoy

 hiking

 mountain

 weekend

 listening

 concert

 football

 game

 going

Cluster 1:
 love

 playing

 football

 weekend

 going

 sport

 music

 concert

 video

 game



In [96]:
# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.8


### TEXT CLUSTERING USING WORD2VEC VECTORIZER

In [97]:
import numpy as np
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from tabulate import tabulate
from collections import Counter

In [98]:
tokenized_dataset = [doc.split() for doc in preprocessed_dataset]
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100,
window=5, min_count=1, workers=4)

In [99]:
X = np.array([np.mean([word2vec_model.wv[word] for word in doc.split() if word in
word2vec_model.wv], axis=0) for doc in dataset])

In [100]:
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)

# Predict the clusters for each document
y_pred = km.predict(X)

# Tabulate the document and predicted cluster
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(preprocessed_dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

Document                              Predicted Cluster
----------------------------------  -------------------
love playing football weekend                         0
enjoy hiking camping mountain                         1
like read book watch movie                            1
prefer playing video game sport                       0
love listening music going concert                    1


In [101]:
# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.6
