In [6]:
import numpy as np 
from sklearn.cluster import KMeans 
from sklearn.feature_extraction.text import TfidfVectorizer 
from tabulate import tabulate 
from collections import Counter

# Sample dataset
dataset = [
    "I love playing football on the weekends",
    "I enjoy hiking and camping in the mountains",
    "I like to read books and watch movies",
    "I prefer playing video games over sports",
    "I love listening to music and going to concerts"
]

# Vectorize text data
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset)

# Clustering
k = 2
km = KMeans(n_clusters=k, random_state=42)
km.fit(X)

# Predict clusters
y_pred = km.predict(X)

# Display the document and its predicted cluster in a table
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

# Optional: calculate purity (if you had ground-truth labels)
# Since we don't have true labels, this section is not applicable here

# Print top terms per cluster
print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()

for i in range(k):
    print(f"Cluster {i}:")
    for ind in order_centroids[i, :10]:
        print(f" {terms[ind]}")
    print()

# Calculate purity 
total_samples = len(y_pred) 
cluster_label_counts = [Counter(y_pred)] 
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples 
print("Purity:", purity) 

Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            0
I enjoy hiking and camping in the mountains                        0
I like to read books and watch movies                              1
I prefer playing video games over sports                           0
I love listening to music and going to concerts                    1

Top terms per cluster:
Cluster 0:
 playing
 the
 weekends
 on
 football
 video
 sports
 prefer
 over
 games

Cluster 1:
 to
 and
 read
 watch
 movies
 like
 books
 concerts
 going
 music

Purity: 0.6


In [10]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from collections import Counter
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import wordpunct_tokenize

# Download required NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Sample dataset
dataset = [
    "I love playing football on the weekends",
    "I enjoy hiking and camping in the mountains",
    "I like to read books and watch movies",
    "I prefer playing video games over sports",
    "I love listening to music and going to concerts"
]

# Preprocessing function
def preprocess(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Tokenize using wordpunct_tokenize (avoids punkt_tab error)
    tokens = wordpunct_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

# Apply preprocessing
preprocessed_dataset = [preprocess(doc) for doc in dataset]

# Vectorize text data
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_dataset)

# Clustering
k = 2
km = KMeans(n_clusters=k, random_state=42)
km.fit(X)

# Predict clusters
y_pred = km.predict(X)

# Display the document and its predicted cluster in a table
table_data = [["Original Document", "Preprocessed", "Predicted Cluster"]]
table_data.extend([[orig, prep, cluster] for orig, prep, cluster in zip(dataset, preprocessed_dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

# Print top terms per cluster
print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()

for i in range(k):
    print(f"Cluster {i}:")
    for ind in order_centroids[i, :10]:
        print(f" {terms[ind]}")
    print()

# Estimate purity (not real without true labels, just for structure)
total_samples = len(y_pred)
cluster_counts = Counter(y_pred)
purity = sum(cluster_counts.values()) / total_samples  # Always 1.0 without real labels
print("Purity (estimated):", purity)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\naufa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\naufa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Original Document                                Preprocessed                          Predicted Cluster
-----------------------------------------------  ----------------------------------  -------------------
I love playing football on the weekends          love playing football weekend                         1
I enjoy hiking and camping in the mountains      enjoy hiking camping mountain                         0
I like to read books and watch movies            like read book watch movie                            1
I prefer playing video games over sports         prefer playing video game sport                       1
I love listening to music and going to concerts  love listening music going concert                    1

Top terms per cluster:
Cluster 0:
 camping
 enjoy
 hiking
 mountain
 weekend
 listening
 concert
 football
 game
 going

Cluster 1:
 love
 playing
 football
 weekend
 going
 sport
 music
 concert
 video
 game

Purity (estimated): 1.0
