In [1]:
!pip install pyLDAvis
import re
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# NLTK resources
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")



[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
DATA_PATH = "/kaggle/input/unsupervised-model/"

train = pd.read_csv(
    DATA_PATH + "twitter_training.csv",
    header=None,
    names=["id", "entity", "sentiment", "tweet"]
)

valid = pd.read_csv(
    DATA_PATH + "twitter_validation.csv",
    header=None,
    names=["id", "entity", "sentiment", "tweet"]
)

print("Training shape:", train.shape)
print("Validation shape:", valid.shape)
print(train.head())
print(train["sentiment"].value_counts())


Training shape: (74682, 4)
Validation shape: (1000, 4)
     id       entity sentiment  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   

                                               tweet  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  
sentiment
Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64


In [3]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess(text: str) -> str:
    text = str(text)
    text = re.sub(r"http\S+", " ", text)          # URLs
    text = re.sub(r"@[A-Za-z0-9_]+", " ", text)   # mentions
    text = re.sub(r"[^a-zA-Z ]", " ", text)       # non letters
    text = text.lower()
    tokens = text.split()
    tokens = [
        lemmatizer.lemmatize(tok)
        for tok in tokens
        if tok not in stop_words and len(tok) > 2
    ]
    return " ".join(tokens)

train["clean"] = train["tweet"].apply(preprocess)
valid["clean"] = valid["tweet"].apply(preprocess)
train[["tweet", "clean"]].head()

Unnamed: 0,tweet,clean
0,im getting on borderlands and i will murder yo...,getting borderland murder
1,I am coming to the borders and I will kill you...,coming border kill
2,im getting on borderlands and i will kill you ...,getting borderland kill
3,im coming on borderlands and i will murder you...,coming borderland murder
4,im getting on borderlands 2 and i will murder ...,getting borderland murder


In [4]:
train_filtered = train[train["sentiment"].isin(["Positive", "Negative"])].copy()
print("Filtered training size:", train_filtered.shape)

Filtered training size: (43374, 5)


In [5]:
# TF-IDF on cleaned text
vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=3,
    max_features=15000,
    ngram_range=(1, 3),
    sublinear_tf=True,
    stop_words="english"
)
X_tfidf = vectorizer.fit_transform(train_filtered["clean"])

print("TF-IDF shape:", X_tfidf.shape)

# Reduce to dense 100D space for stronger KMeans
svd = TruncatedSVD(n_components=100, random_state=42)
X_reduced = svd.fit_transform(X_tfidf)
print("Reduced shape:", X_reduced.shape)

TF-IDF shape: (43374, 15000)
Reduced shape: (43374, 100)


In [6]:
# Final KMeans exactly as required by the project: k = 10

k_required = 10

kmeans_required = KMeans(
    n_clusters=k_required,
    random_state=42,
    n_init=30,
    max_iter=500
)

clusters_k10 = kmeans_required.fit_predict(X_reduced)

sil_k10 = silhouette_score(X_reduced, clusters_k10)
print(f"Silhouette score for k = {k_required}: {sil_k10:.4f}")

train_filtered["cluster_k10"] = clusters_k10
train_filtered[["clean", "sentiment", "cluster_k10"]].head()


Silhouette score for k = 10: 0.1801


Unnamed: 0,clean,sentiment,cluster_k10
0,getting borderland murder,Positive,1
1,coming border kill,Positive,1
2,getting borderland kill,Positive,1
3,coming borderland murder,Positive,1
4,getting borderland murder,Positive,1


In [7]:
from sklearn.decomposition import NMF

# Number of topics (requirement: 10)
n_topics_nmf = 10

nmf_model = NMF(
    n_components=n_topics_nmf,
    random_state=42,
    init="nndsvd",
    max_iter=400
)

W_nmf = nmf_model.fit_transform(X_tfidf)         # document-topic distribution
H_nmf = nmf_model.components_                    # topic-word distribution

print("NMF W shape:", W_nmf.shape)
print("NMF H shape:", H_nmf.shape)


NMF W shape: (43374, 10)
NMF H shape: (10, 15000)


In [8]:
feature_names = np.array(vectorizer.get_feature_names_out())
top_n = 15

for topic_idx, topic in enumerate(H_nmf):
    top_indices = topic.argsort()[::-1][:top_n]
    top_words = feature_names[top_indices]
    print(f"\nNMF Topic {topic_idx}:")
    print(", ".join(top_words))



NMF Topic 0:
twitter com, twitter, pic, pic twitter, pic twitter com, com, rhandlerr, rhandlerr rhandlerr, rhandlerr pic twitter, rhandlerr pic, rhandlerr rhandlerr rhandlerr, facebook, thank, game pic, game pic twitter

NMF Topic 1:
nan, black, cold, war, ops, cold war, black ops, fucking, ops cold, black ops cold, ops cold war, duty, cod, duty black, duty black ops

NMF Topic 2:
game, good, like, play, wait, really, time, new, best, playing, fun, look, fix, year, fucking

NMF Topic 3:
unk, unk unk, real, depot, home, thank, smh, happy, unk love, game unk, fucking, thank unk, fuck unk, youtu, really

NMF Topic 4:
fuck, fuck game, fuck fuck, fucking, holy fuck, say, yes, bitch, holy, fuck overwatch, nvidia, want, overwatch, depot, home

NMF Topic 5:
wow, incredible, know, dark, good, played, thought, look, wow good, wow really, gotta, really, salute, luck, thought fifa

NMF Topic 6:
love, love game, love new, new, warcraft, love guy, game love, amazon, love playing, guy, borderland, l

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

count_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=5,
    max_features=15000,
    ngram_range=(1, 2),
    stop_words='english'
)

X_count = count_vectorizer.fit_transform(train_filtered["clean"])
print("Count matrix shape:", X_count.shape)


Count matrix shape: (43374, 15000)


In [10]:
n_topics_lda = 10  # or 15 if your instructor required
lda_model = LatentDirichletAllocation(
    n_components=n_topics_lda,
    random_state=42,
    learning_method="batch",
    max_iter=50
)
lda_model.fit(X_count)

KeyboardInterrupt: 

In [None]:
terms = np.array(count_vectorizer.get_feature_names_out())
top_n = 15
for topic_idx, topic in enumerate(lda_model.components_):
    top_indices = topic.argsort()[::-1][:top_n]
    top_words = terms[top_indices]
    print(f"\nLDA Topic {topic_idx}:")
    print(", ".join(top_words))

In [None]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
lda_vis = pyLDAvis.sklearn.prepare(
    lda_model,
    X_count,
    count_vectorizer
)
lda_vis

In [None]:
def evaluate_kmeans(X, k_values):
    results = []
    for k in k_values:
        kmeans_temp = KMeans(
            n_clusters=k,
            random_state=42,
            n_init=20,
            max_iter=400
        )
        labels_temp = kmeans_temp.fit_predict(X)
        score = silhouette_score(X, labels_temp)
        results.append((k, score))
        print(f"k = {k}, silhouette score = {score:.4f}")
    return pd.DataFrame(results, columns=["k", "silhouette"])

k_values = range(2, 11)
sil_df = evaluate_kmeans(X_reduced, k_values)
sil_df


In [None]:
best_k = int(sil_df.loc[sil_df["silhouette"].idxmax(), "k"])
print("Best k from search =", best_k)

In [None]:
kmeans_final = KMeans(
    n_clusters=best_k,
    random_state=42,
    n_init=30,
    max_iter=500
)

clusters_train = kmeans_final.fit_predict(X_reduced)
sil_final = silhouette_score(X_reduced, clusters_train)

print(f"Final silhouette score (k={best_k}) = {sil_final:.4f}")

train_filtered["cluster"] = clusters_train
train_filtered[["clean", "sentiment", "cluster"]].head()


In [None]:
crosstab = pd.crosstab(train_filtered["cluster"], train_filtered["sentiment"])
print(crosstab)


In [None]:
terms = np.array(vectorizer.get_feature_names_out())
centers_reduced = kmeans_final.cluster_centers_        
centers_tfidf = centers_reduced @ svd.components_      
top_n = 20
for c in range(best_k):
    print(f"\nCluster {c}")
    idx = np.argsort(centers_tfidf[c])[::-1][:top_n]
    print(", ".join(terms[idx]))

In [None]:
sample_size = min(4000, X_reduced.shape[0])
idx_sample = np.random.choice(X_reduced.shape[0], sample_size, replace=False)

X_sample = X_reduced[idx_sample]
labels_sample = clusters_train[idx_sample]

pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_sample)

plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels_sample, s=8)
plt.title(f"Tweet Clusters (K-Means, k={best_k})")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.grid(True, linewidth=0.3)
plt.show()


In [None]:
X_valid_tfidf = vectorizer.transform(valid["clean"])
X_valid_reduced = svd.transform(X_valid_tfidf)

valid_clusters = kmeans_final.predict(X_valid_reduced)
valid["cluster"] = valid_clusters

print(valid["cluster"].value_counts().sort_index())
pd.crosstab(valid["cluster"], valid["sentiment"])


In [None]:
plt.figure(figsize=(7, 5))
plt.plot(sil_df["k"], sil_df["silhouette"], marker="o")
plt.xticks(sil_df["k"])
plt.xlabel("Number of clusters (k)")
plt.ylabel("Silhouette score")
plt.title("Silhouette score as a function of k")
plt.grid(True, linewidth=0.3)

# Highlight the best k
best_row = sil_df.loc[sil_df["k"] == best_k].iloc[0]
plt.scatter(best_row["k"], best_row["silhouette"], s=80)

# Better spacing and placement
plt.annotate(
    f"best k = {best_k}\nscore = {best_row['silhouette']:.3f}",
    xy=(best_row["k"], best_row["silhouette"]),
    xytext=(best_row["k"] + 0.5, best_row["silhouette"] + 0.05),
    arrowprops=dict(arrowstyle="->", linewidth=0.8),
    fontsize=10,
    bbox=dict(boxstyle="round,pad=0.2", fc="white", ec="black", lw=0.5)
)

plt.tight_layout()
plt.show()



In [None]:
from sklearn.decomposition import PCA

# Sample for readability
sample_size = min(4000, X_reduced.shape[0])
idx_sample = np.random.choice(X_reduced.shape[0], sample_size, replace=False)

X_sample = X_reduced[idx_sample]
labels_sample = clusters_train[idx_sample]

pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_sample)

plt.figure(figsize=(8, 6))

for c in range(best_k):
    mask = labels_sample == c
    plt.scatter(
        X_pca[mask, 0],
        X_pca[mask, 1],
        s=12,
        alpha=0.7,
        label=f"Cluster {c}"
    )

plt.title(f"Tweet clusters in 2D PCA space (k = {best_k})")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend(frameon=True)
plt.grid(True, linewidth=0.3)
plt.show()


In [None]:
cluster_counts = train_filtered["cluster"].value_counts().sort_index()

plt.figure(figsize=(6, 4))
plt.bar(cluster_counts.index, cluster_counts.values)
plt.xlabel("Cluster")
plt.ylabel("Number of tweets")
plt.title("Cluster size distribution (training set)")
for idx, val in enumerate(cluster_counts.values):
    plt.text(cluster_counts.index[idx], val, str(val), ha="center", va="bottom", fontsize=8)
plt.show()


In [None]:
# Contingency table
ct = pd.crosstab(train_filtered["cluster"], train_filtered["sentiment"])
print(ct)  # good to show as a table too

values = ct.values
rows = ct.index.astype(str).tolist()
cols = ct.columns.astype(str).tolist()

plt.figure(figsize=(6, 4))
plt.imshow(values, aspect="auto")
plt.colorbar(label="Count")

plt.xticks(range(len(cols)), cols, rotation=45)
plt.yticks(range(len(rows)), [f"Cluster {r}" for r in rows])
plt.title("Cluster vs. sentiment (training set)")

# annotate each cell with the value
for i in range(values.shape[0]):
    for j in range(values.shape[1]):
        plt.text(j, i, int(values[i, j]), ha="center", va="center", fontsize=8)

plt.tight_layout()
plt.show()


In [None]:
train_filtered["tweet_len"] = train_filtered["clean"].str.split().apply(len)

data = [train_filtered[train_filtered["cluster"] == c]["tweet_len"]
        for c in sorted(train_filtered["cluster"].unique())]

plt.figure(figsize=(7, 5))
plt.boxplot(data, labels=[f"C{c}" for c in sorted(train_filtered["cluster"].unique())])
plt.xlabel("Cluster")
plt.ylabel("Tweet length (tokens)")
plt.title("Distribution of tweet lengths by cluster")
plt.grid(axis="y", linewidth=0.3)
plt.show()


In [None]:
from sklearn.cluster import KMeans

inertia_scores = []

K_range = range(2, 11)

for k in K_range:
    kmeans = KMeans(
        n_clusters=k,
        random_state=42,
        n_init=20,
        max_iter=400
    )
    kmeans.fit(X_reduced)
    inertia_scores.append(kmeans.inertia_)
    
plt.figure(figsize=(7,5))
plt.plot(K_range, inertia_scores, marker='o')
plt.xticks(K_range)
plt.title("Elbow Method: K-Means Clustering")
plt.xlabel("Number of clusters (k)")
plt.ylabel("Inertia (Within-Cluster SSE)")
plt.grid(True, linewidth=0.3)

plt.show()


In [None]:
explained = svd.explained_variance_ratio_
cum_explained = explained.cumsum()
components = range(1, len(cum_explained) + 1)

plt.figure(figsize=(7, 5))
plt.plot(components, cum_explained, marker="o")
plt.xlabel("Number of components")
plt.ylabel("Cumulative explained variance")
plt.title("Cumulative explained variance of SVD components")
plt.grid(True, linewidth=0.3)
plt.show()


In [None]:
from sklearn.metrics import silhouette_samples

X = X_reduced
labels = clusters_train
n_clusters = best_k

silhouette_vals = silhouette_samples(X, labels)

plt.figure(figsize=(7, 5))

y_lower = 10
for c in range(n_clusters):
    cluster_sil_vals = silhouette_vals[labels == c]
    cluster_sil_vals.sort()
    size = cluster_sil_vals.shape[0]
    y_upper = y_lower + size

    plt.fill_betweenx(
        np.arange(y_lower, y_upper),
        0,
        cluster_sil_vals
    )
    plt.text(-0.02, y_lower + 0.5 * size, str(c))
    y_lower = y_upper + 10

plt.axvline(silhouette_vals.mean(), linestyle="--")
plt.xlabel("Silhouette coefficient")
plt.ylabel("Cluster")
plt.title(f"Silhouette plot for k = {n_clusters}")
plt.grid(True, linewidth=0.3)
plt.show()


In [None]:
from sklearn.decomposition import PCA

sample_size = min(4000, X_reduced.shape[0])
idx_sample = np.random.choice(X_reduced.shape[0], sample_size, replace=False)

X_sample = X_reduced[idx_sample]
labels_sample = clusters_train[idx_sample]

pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_sample)

plt.figure(figsize=(8, 6))
for c in range(best_k):
    mask = labels_sample == c
    plt.scatter(
        X_pca[mask, 0],
        X_pca[mask, 1],
        s=10,
        alpha=0.7,
        label=f"Cluster {c}"
    )

plt.title(f"Tweet clusters in 2D PCA space (k = {best_k})")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.legend(frameon=True)
plt.grid(True, linewidth=0.3)
plt.show()


In [None]:
train_filtered["tweet_len"] = train_filtered["clean"].str.split().apply(len)

plt.figure(figsize=(7, 5))
plt.hist(train_filtered["tweet_len"], bins=40)
plt.xlabel("Tweet length (tokens)")
plt.ylabel("Frequency")
plt.title("Distribution of tweet length in filtered training set")
plt.grid(True, linewidth=0.3)
plt.show()


In [None]:
data = [
    train_filtered[train_filtered["cluster"] == c]["tweet_len"]
    for c in sorted(train_filtered["cluster"].unique())
]

plt.figure(figsize=(7, 5))
plt.boxplot(data, labels=[f"C{c}" for c in sorted(train_filtered["cluster"].unique())])
plt.xlabel("Cluster")
plt.ylabel("Tweet length (tokens)")
plt.title("Tweet length distribution per cluster")
plt.grid(axis="y", linewidth=0.3)
plt.show()


In [None]:
cluster_counts = train_filtered["cluster"].value_counts().sort_index()

plt.figure(figsize=(6, 4))
plt.bar(cluster_counts.index, cluster_counts.values)
plt.xlabel("Cluster")
plt.ylabel("Number of tweets")
plt.title("Cluster size distribution (training set)")
for idx, val in enumerate(cluster_counts.values):
    plt.text(cluster_counts.index[idx], val, str(val),
             ha="center", va="bottom", fontsize=8)
plt.grid(axis="y", linewidth=0.3)
plt.show()


In [None]:
plt.figure(figsize=(6, 6))
plt.pie(
    cluster_counts.values,
    labels=[f"C{c}" for c in cluster_counts.index],
    autopct="%1.1f%%"
)
plt.title("Proportion of tweets in each cluster")
plt.show()


In [None]:
terms = np.array(vectorizer.get_feature_names_out())
centers_reduced = kmeans_final.cluster_centers_
centers_tfidf = centers_reduced @ svd.components_

top_n = 10  
for c in range(best_k):
    idx = np.argsort(centers_tfidf[c])[::-1][:top_n]
    top_terms = terms[idx]
    top_vals = centers_tfidf[c, idx]

    plt.figure(figsize=(7, 4))
    y_pos = np.arange(len(top_terms))
    plt.barh(y_pos, top_vals)
    plt.yticks(y_pos, top_terms)
    plt.xlabel("Approx. TF-IDF weight")
    plt.title(f"Top {top_n} terms in cluster {c}")
    plt.gca().invert_yaxis() 
    plt.grid(axis="x", linewidth=0.3)
    plt.tight_layout()
    plt.show()


In [14]:
# ================================
# 1. Imports
# ================================
import re
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# ================================
# 2. NLTK resources
# ================================
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

# ================================
# 3. Load dataset
# ================================
DATA_PATH = "/kaggle/input/unsupervised-model/"

train = pd.read_csv(
    DATA_PATH + "twitter_training.csv",
    header=None,
    names=["id", "entity", "sentiment", "tweet"]
)

valid = pd.read_csv(
    DATA_PATH + "twitter_validation.csv",
    header=None,
    names=["id", "entity", "sentiment", "tweet"]
)

print("Training shape:", train.shape)
print("Validation shape:", valid.shape)
print(train.head())
print(train["sentiment"].value_counts())


# ================================
# 4. Text preprocessing
# ================================
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess(text: str) -> str:
    """Basic cleaning + tokenization + stopword removal + lemmatization."""
    text = str(text)
    text = re.sub(r"http\S+", " ", text)          # URLs
    text = re.sub(r"@[A-Za-z0-9_]+", " ", text)   # mentions
    text = re.sub(r"[^a-zA-Z ]", " ", text)       # non letters
    text = text.lower()
    tokens = text.split()
    tokens = [
        lemmatizer.lemmatize(tok)
        for tok in tokens
        if tok not in stop_words and len(tok) > 2
    ]
    return " ".join(tokens)

train["clean"] = train["tweet"].apply(preprocess)
valid["clean"] = valid["tweet"].apply(preprocess)

print(train[["tweet", "clean"]].head())


# ================================
# 5. Optional: filter to clear sentiment (Positive / Negative)
#    (still unsupervised; labels only used to choose subset)
# ================================
train_filtered = train[train["sentiment"].isin(["Positive", "Negative"])].copy()
print("Filtered training size:", train_filtered.shape)


# ================================
# 6. TF-IDF representation
# ================================
vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=3,
    max_features=15000,
    ngram_range=(1, 3),
    sublinear_tf=True,
    stop_words="english"
)

X_tfidf = vectorizer.fit_transform(train_filtered["clean"])
print("TF-IDF shape:", X_tfidf.shape)


# ================================
# 7. Dimensionality reduction (SVD / LSA)
# ================================
svd = TruncatedSVD(n_components=100, random_state=42)
X_reduced = svd.fit_transform(X_tfidf)
print("Reduced shape (SVD):", X_reduced.shape)


# ================================
# 8. K-Means: search for best k (silhouette)
# ================================
def evaluate_kmeans(X, k_values):
    results = []
    for k in k_values:
        kmeans_temp = KMeans(
            n_clusters=k,
            random_state=42,
            n_init=20,
            max_iter=400
        )
        labels_temp = kmeans_temp.fit_predict(X)
        score = silhouette_score(X, labels_temp)
        results.append((k, score))
        print(f"k = {k}, silhouette score = {score:.4f}")
    return pd.DataFrame(results, columns=["k", "silhouette"])

k_values = range(2, 11)
sil_df = evaluate_kmeans(X_reduced, k_values)
print("\nSilhouette results:")
print(sil_df)

# choose k with highest silhouette
best_k = int(sil_df.loc[sil_df["silhouette"].idxmax(), "k"])
print(f"\nBest k from silhouette search = {best_k}")


# ================================
# 9. Final K-Means with best_k
# ================================
kmeans_final = KMeans(
    n_clusters=best_k,
    random_state=42,
    n_init=30,
    max_iter=500
)

clusters_best = kmeans_final.fit_predict(X_reduced)
sil_final = silhouette_score(X_reduced, clusters_best)

print(f"Final silhouette score (k={best_k}) = {sil_final:.4f}")

train_filtered["cluster_best_k"] = clusters_best
print(train_filtered[["clean", "sentiment", "cluster_best_k"]].head())


# ================================
# 10. K-Means with k=10 (as required in project)
# ================================
k_required = 10

kmeans_k10 = KMeans(
    n_clusters=k_required,
    random_state=42,
    n_init=30,
    max_iter=500
)

clusters_k10 = kmeans_k10.fit_predict(X_reduced)
sil_k10 = silhouette_score(X_reduced, clusters_k10)

print(f"\nSilhouette score for k = {k_required}: {sil_k10:.4f}")

train_filtered["cluster_k10"] = clusters_k10
print(train_filtered[["clean", "sentiment", "cluster_k10"]].head())


# ===== pyLDAvis using gensim LDA (no sklearn import) =====

import pyLDAvis
import pyLDAvis.gensim_models

import gensim
from gensim.corpora import Dictionary

pyLDAvis.enable_notebook()

# Tokenize cleaned tweets
tokenized_docs = [doc.split() for doc in train_filtered["clean"]]

# Build dictionary and corpus
dictionary = Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(text) for text in tokenized_docs]

print("Number of documents:", len(corpus))
print("Vocabulary size:", len(dictionary))

# Train a small gensim LDA model for visualization
num_topics_vis = 10  # you can match n_topics_lda

gensim_lda = gensim.models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics_vis,
    random_state=42,
    passes=5,
    alpha='auto'
)

# Prepare pyLDAvis visualization
lda_vis = pyLDAvis.gensim_models.prepare(
    gensim_lda,
    corpus,
    dictionary
)

lda_vis


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Training shape: (74682, 4)
Validation shape: (1000, 4)
     id       entity sentiment  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   

                                               tweet  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  
sentiment
Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64
                                               tweet  \
0  im getting on borderlands and i will murder yo...   
1  I am coming to the borders and I will kill you...   
2  im getting on borderlands and i will kill you ...   
3  im coming on borderlands and i will murder you...   
4  im getting 

In [11]:
count_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=5,
    max_features=15000,
    ngram_range=(1, 2),
    stop_words="english"
)

X_count = count_vectorizer.fit_transform(train_filtered["clean"])
print("Count matrix shape:", X_count.shape)

n_topics_lda = 10  # or 15/20 if instructor specified

lda_model = LatentDirichletAllocation(
    n_components=n_topics_lda,
    random_state=42,
    learning_method="batch",
    max_iter=50
)

lda_model.fit(X_count)

terms = np.array(count_vectorizer.get_feature_names_out())
top_n = 15

for topic_idx, topic in enumerate(lda_model.components_):
    top_indices = topic.argsort()[::-1][:top_n]
    top_words = terms[top_indices]
    print(f"\nLDA Topic {topic_idx}:")
    print(", ".join(top_words))


Count matrix shape: (43374, 15000)

LDA Topic 0:
really, like, game, play, feel, gta, new, good, playing, world, fun, lol, unk, people, want

LDA Topic 1:
love, thank, stream, great, com, twitch, really, today, watch, new, awesome, amazing, thanks, excited, happy

LDA Topic 2:
game, fix, server, fuck, got, rainbow, update, help, verizon, rainbow game, problem, time, access, ghostrecon, health

LDA Topic 3:
fifa, war, wtf, black, com, cold, unk, ops, nan, black ops, facebook, cold war, playstation, money, time

LDA Topic 4:
twitter, com, pic, twitter com, pic twitter, rhandlerr, game, assassin, creed, assassin creed, xbox, rhandlerr rhandlerr, series, look, like

LDA Topic 5:
good, look, game, like, year, right, johnson, damn, actually, thing, think, look like, time, perfect, pretty

LDA Topic 6:
league, microsoft, legend, wait, wow, league legend, nvidia, game, time, xbox, dota, news, like, hate, pubg

LDA Topic 7:
game, shit, play, day, year, time, fifa, playing, make, played, nice, t

In [18]:
# ===== pyLDAvis using gensim LDA (no sklearn import) =====

import pyLDAvis
import pyLDAvis.gensim_models

import gensim
from gensim.corpora import Dictionary

pyLDAvis.enable_notebook()

# Tokenize cleaned tweets
tokenized_docs = [doc.split() for doc in train_filtered["clean"]]

# Build dictionary and corpus
dictionary = Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(text) for text in tokenized_docs]

print("Number of documents:", len(corpus))
print("Vocabulary size:", len(dictionary))

# Train a small gensim LDA model for visualization
num_topics_vis = 10  # you can match n_topics_lda

gensim_lda = gensim.models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics_vis,
    random_state=42,
    passes=5,
    alpha='auto'
)

# Prepare pyLDAvis visualization
lda_vis = pyLDAvis.gensim_models.prepare(
    gensim_lda,
    corpus,
    dictionary
)

lda_vis


Number of documents: 43374
Vocabulary size: 16248
