In [2]:
import numpy as np 
import pandas as pd
from sklearn.cluster import KMeans 
from gensim.models import Word2Vec 
from tabulate import tabulate 
from collections import Counter

# Single seed shared by Word2Vec and KMeans so Restart & Run All reproduces the output.
SEED = 42

dataset = ["I love playing football on the weekends",
           "I enjoy hiking and camping in the mountains",
           "I like to read books and watch movies",
           "I prefer playing video games over sports",
           "I love listening to music and going to concerts"]

tokenized_dataset = [doc.split() for doc in dataset]

# workers=1: gensim only guarantees reproducible vectors with a single worker thread
# (across interpreter launches PYTHONHASHSEED must be fixed as well).
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100, window=5,
                          min_count=1, workers=1, seed=SEED)

# One row per document: the mean of its word embeddings.
# Reuses the tokens the model was trained on instead of splitting each document again.
X = np.array([
    np.mean([word2vec_model.wv[word] for word in tokens if word in word2vec_model.wv], axis=0)
    for tokens in tokenized_dataset
])

k = 2  # Define the number of clusters
# n_init is pinned because scikit-learn's default changed between releases;
# random_state fixes the centroid initialisation.
km = KMeans(n_clusters=k, n_init=10, random_state=SEED)

# Fit and assign each document to a cluster in one step
y_pred = km.fit_predict(X)

# Tabulate the document and predicted cluster
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

# NOTE: this is NOT clustering purity. Purity compares each cluster with ground-truth
# class labels (sum over clusters of the majority class count, divided by N), and this
# dataset has no labels. The value below is only the fraction of documents that fall
# in the biggest cluster, so it is labelled as such.
total_samples = len(y_pred)
cluster_sizes = Counter(y_pred.tolist())
largest_cluster_share = max(cluster_sizes.values()) / total_samples
print("Largest cluster share:", largest_cluster_share)

Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            1
I enjoy hiking and camping in the mountains                        1
I like to read books and watch movies                              0
I prefer playing video games over sports                           1
I love listening to music and going to concerts                    0
Purity: 0.6




In [6]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from tabulate import tabulate
from collections import Counter
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import wordpunct_tokenize  # More reliable than word_tokenize

# Fetch the NLTK corpora needed below: the stopword lists and WordNet (for lemmatizing)
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Toy corpus: five short sentences about hobbies
dataset = [
    "I love playing football on the weekends",
    "I enjoy hiking and camping in the mountains",
    "I like to read books and watch movies",
    "I prefer playing video games over sports",
    "I love listening to music and going to concerts",
]

# Preprocessing function
def preprocess(text):
    """Normalize one document into a list of lemmatized content tokens.

    Steps: lowercase, strip ASCII punctuation, tokenize with
    ``wordpunct_tokenize``, drop English stopwords, lemmatize each remaining
    token with ``WordNetLemmatizer``.

    Args:
        text: Raw document string.

    Returns:
        List of lemmatized tokens, in their original order. Empty if the text
        is empty or every token is a stopword.
    """
    # Build the stopword set once per call. The original called
    # stopwords.words('english') inside the comprehension, rebuilding the list
    # for every token and testing membership with a linear scan.
    english_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    tokens = wordpunct_tokenize(text)
    return [lemmatizer.lemmatize(word) for word in tokens if word not in english_stopwords]

# Preprocess dataset
preprocessed_dataset = [preprocess(doc) for doc in dataset]

# Train Word2Vec model
# workers=1: a fixed seed alone is not enough; gensim only guarantees reproducible
# vectors with a single worker thread, because multi-threaded training introduces
# ordering jitter (across interpreter launches PYTHONHASHSEED must be fixed as well).
word2vec_model = Word2Vec(sentences=preprocessed_dataset, vector_size=100, window=5, min_count=1, workers=1, seed=42)

# Convert documents to vectors by averaging word embeddings
def document_vector(doc):
    """Return the mean embedding of the tokens in ``doc``.

    Tokens missing from ``word2vec_model.wv`` are skipped. If no token is
    known, a zero vector of length ``word2vec_model.vector_size`` is returned.

    Args:
        doc: Iterable of token strings.

    Returns:
        1-D numpy array holding the element-wise mean of the known token vectors.
    """
    known_vectors = []
    for token in doc:
        if token in word2vec_model.wv:
            known_vectors.append(word2vec_model.wv[token])

    # Nothing to average: fall back to the zero vector
    if not known_vectors:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(known_vectors, axis=0)

# One row per document: its averaged word embedding
X = np.array([document_vector(doc) for doc in preprocessed_dataset])

# Clustering
# n_init is pinned because scikit-learn's default changed between releases;
# together with random_state this keeps the assignment stable across versions.
k = 2
km = KMeans(n_clusters=k, n_init=10, random_state=42)

# Fit and assign each document to a cluster in one step
y_pred = km.fit_predict(X)

# Display results
table_data = [["Original Document", "Preprocessed", "Predicted Cluster"]]
table_data.extend([[orig, ' '.join(prep), cluster] for orig, prep, cluster in zip(dataset, preprocessed_dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

# NOTE: this is NOT clustering purity. Purity compares each cluster with ground-truth
# class labels (sum over clusters of the majority class count, divided by N), and this
# dataset has no labels. The value below is only the fraction of documents that fall
# in the biggest cluster, so it is labelled as such.
total_samples = len(y_pred)
cluster_sizes = Counter(y_pred.tolist())
largest_cluster_share = max(cluster_sizes.values()) / total_samples
print("Largest cluster share:", largest_cluster_share)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\naufa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\naufa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Original Document                                Preprocessed                          Predicted Cluster
-----------------------------------------------  ----------------------------------  -------------------
I love playing football on the weekends          love playing football weekend                         1
I enjoy hiking and camping in the mountains      enjoy hiking camping mountain                         0
I like to read books and watch movies            like read book watch movie                            1
I prefer playing video games over sports         prefer playing video game sport                       1
I love listening to music and going to concerts  love listening music going concert                    1
Purity (estimated): 0.8


