In [None]:
# ==========================================
# Probabilistic Latent Semantic Indexing (PLSI)
# ==========================================

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
import kagglehub


# --- Data: IMDB 50k movie reviews, fetched through kagglehub ---
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

# Work on a fixed random subset of 2000 reviews so the demo stays small.
data = pd.read_csv(f"{path}/IMDB Dataset.csv").sample(2000, random_state=42)
texts = data["review"].tolist()

# Bag-of-words counts with English stop words removed and the vocabulary
# capped at 2000 terms; the sparse result is converted to a dense array.
vectorizer = CountVectorizer(max_features=2000, stop_words="english")
X = vectorizer.fit_transform(texts).toarray()

# X is a (documents x vocabulary terms) count matrix.
n_docs = X.shape[0]
n_words = X.shape[1]
n_topics = 10


# Reproducible random start. Every distribution is drawn from a flat
# Dirichlet (all concentration parameters equal to 1), so each row sums to 1.
# The draw order is significant: it fixes the values produced by the seed.
np.random.seed(0)
P_z = np.random.dirichlet(np.ones(n_topics))                   # P(z):   (n_topics,)
P_w_z = np.random.dirichlet(np.ones(n_words), size=n_topics)   # P(w|z): (n_topics, n_words)
P_z_d = np.random.dirichlet(np.ones(n_topics), size=n_docs)    # P(z|d): (n_docs, n_topics)


# ------------------------------------------------------------------
# EM training for PLSI.
#
#   E-step:  P(z|d,w) = P(z|d) P(w|z) / sum_z' P(z'|d) P(w|z')
#   M-step:  P(w|z)  proportional to  sum_d n(d,w) P(z|d,w)
#            P(z|d)  proportional to  sum_w n(d,w) P(z|d,w)
#
# The posterior P(z|d,w) is never materialised. Substituting the E-step
# into the M-step gives
#
#   sum_d n(d,w) P(z|d,w) = P(w|z) * sum_d P(z|d) * n(d,w) / mix(d,w)
#   sum_w n(d,w) P(z|d,w) = P(z|d) * sum_w P(w|z) * n(d,w) / mix(d,w)
#
# with mix(d,w) = sum_z P(z|d) P(w|z). This needs only (n_docs, n_words)
# temporaries instead of an (n_docs, n_words, n_topics) tensor, and turns
# the per-topic / per-document Python loops into two matrix products.
# ------------------------------------------------------------------
max_iter = 30
for iteration in tqdm(range(max_iter), desc="Training PLSI"):

    # mix[d, w] = sum_z P(z|d) P(w|z); the epsilon guards against 0/0 for
    # (d, w) cells that no topic explains.
    mix = P_z_d @ P_w_z + 1e-12
    scaled_counts = X / mix  # n(d,w) / mix(d,w), shape (n_docs, n_words)

    # Both updates must be computed from the *old* parameters (they share
    # one E-step), so build them before overwriting either matrix.
    new_P_w_z = P_w_z * (P_z_d.T @ scaled_counts)   # (n_topics, n_words)
    new_P_z_d = P_z_d * (scaled_counts @ P_w_z.T)   # (n_docs, n_topics)

    # Renormalise each row to a probability distribution; the epsilon keeps
    # an all-zero row (e.g. a document with no in-vocabulary words) finite.
    P_w_z = new_P_w_z / (new_P_w_z.sum(axis=1, keepdims=True) + 1e-12)
    P_z_d = new_P_z_d / (new_P_z_d.sum(axis=1, keepdims=True) + 1e-12)

    # Topic marginal: P(z|d) summed over documents, then normalised (each
    # document counts equally). It is not used by the updates above.
    P_z = np.sum(P_z_d, axis=0)
    P_z /= np.sum(P_z)

print("✅ Training completed!")


# Show the ten highest-probability vocabulary terms of every topic,
# most probable first.
vocab = np.array(vectorizer.get_feature_names_out())
for topic_idx in range(n_topics):
    ranked = np.argsort(P_w_z[topic_idx])[::-1]  # word indices, descending P(w|z)
    top_words = vocab[ranked[:10]]
    print(f"\nTopic {topic_idx + 1}: {', '.join(top_words)}")


def compute_perplexity(X, P_w_z, P_z_d):
    """Return the perplexity of a PLSI model on a document-term count matrix.

    The per-cell word probability is ``P(w|d) = sum_z P(w|z) * P(z|d)`` and
    the perplexity is ``exp(-sum_{d,w} n(d,w) * log P(w|d) / sum_{d,w} n(d,w))``.
    Only cells with a positive count contribute to the log-likelihood; a
    small epsilon (1e-12) is added inside the logarithm to avoid ``log(0)``.

    Args:
        X: Count matrix of shape (n_docs, n_words). A dense array-like, or
            any object exposing ``toarray()`` (e.g. a SciPy sparse matrix).
        P_w_z: Array of shape (n_topics, n_words) holding P(w|z).
        P_z_d: Array of shape (n_docs, n_topics) holding P(z|d).

    Returns:
        The perplexity as a float, or NaN when ``X`` contains no words at
        all (the quantity is undefined for an empty corpus).
    """
    counts = X.toarray() if hasattr(X, "toarray") else np.asarray(X)
    total_words = counts.sum()
    if total_words == 0:
        # Undefined for an empty corpus; return NaN explicitly rather than
        # relying on a 0/0 division and its RuntimeWarning.
        return float("nan")

    # One matrix product gives P(w|d) for every (d, w) cell at once.
    p_w_d = np.asarray(P_z_d) @ np.asarray(P_w_z)

    # Restrict to observed cells so zero-probability words that never occur
    # in a document do not enter the sum.
    observed = counts > 0
    log_likelihood = np.sum(counts[observed] * np.log(p_w_d[observed] + 1e-12))

    return float(np.exp(-log_likelihood / total_words))

# Evaluate the fitted model on the training corpus and report the result
# to two decimal places.
perplexity = compute_perplexity(X, P_w_z, P_z_d)
print(f"\n📉 Model Perplexity: {perplexity:.2f}")


Training PLSI: 100%|██████████| 30/30 [00:45<00:00,  1.51s/it]


✅ Training completed!

Topic 1: like, movie, people, funny, think, say, just, don, watch, did

Topic 2: film, movie, character, man, films, plot, story, like, time, don

Topic 3: br, film, time, real, life, old, man, way, new, true

Topic 4: film, horror, just, story, good, scene, killer, scenes, way, movie

Topic 5: just, time, br, film, like, episode, things, work, people, really

Topic 6: seen, time, just, great, funny, ve, performance, like, dvd, good

Topic 7: br, movie, good, great, just, 10, acting, story, bad, plot

Topic 8: movie, bad, film, really, like, just, don, movies, acting, seen

Topic 9: film, br, story, great, best, good, role, book, love, wonderful

Topic 10: br, young, love, people, like, new, world, girl, time, life

📉 Model Perplexity: 706.75
