In [11]:
import numpy as np
import pandas as pd
import random
from collections import defaultdict
import pickle

In [12]:
from datasets import load_dataset
import pandas as pd

# Load a small subset (Books category, English reviews)
dataset = load_dataset("amazon_polarity", split="train[:5%]")  # only 5% for speed

# Convert to Pandas DataFrame
df = pd.DataFrame(dataset)

# Rename columns to expected names
df = df.rename(columns={"title": "asin", "content": "reviewText"})
df["reviewerID"] = ["user_" + str(i % 5000) for i in range(len(df))]  # simulate ~5k users

# Keep only required columns
df = df[["reviewerID", "asin", "reviewText"]].dropna()

print("✅ Loaded dataset automatically")
print("Number of reviews:", len(df))
print(df.head())



✅ Loaded dataset automatically
Number of reviews: 180000
  reviewerID                                               asin  \
0     user_0                     Stuning even for the non-gamer   
1     user_1              The best soundtrack ever to anything.   
2     user_2                                           Amazing!   
3     user_3                               Excellent Soundtrack   
4     user_4  Remember, Pull Your Jaw Off The Floor After He...   

                                          reviewText  
0  This sound track was beautiful! It paints the ...  
1  I'm reading a lot of reviews saying that this ...  
2  This soundtrack is my favorite music of all ti...  
3  I truly like this soundtrack and I enjoy video...  
4  If you've played the game, you know how divine...  


In [13]:
# Keep users and documents with minimum activity
min_reviews_user = 2
min_reviews_doc = 2

# Count user & doc frequencies
user_counts = df['reviewerID'].value_counts()
doc_counts = df['asin'].value_counts()

users_keep = user_counts[user_counts>=min_reviews_user].index
docs_keep  = doc_counts[doc_counts>=min_reviews_doc].index

df2 = df[df['reviewerID'].isin(users_keep) & df['asin'].isin(docs_keep)].copy()
print("Filtered reviews:", len(df2))

# Relabel user_ids and doc_ids for indices
user2idx = {u:i for i,u in enumerate(df2['reviewerID'].unique())}
doc2idx  = {d:i for i,d in enumerate(df2['asin'].unique())}

df2['u_idx'] = df2['reviewerID'].map(user2idx)
df2['d_idx'] = df2['asin'].map(doc2idx)

U = len(user2idx)
D = len(doc2idx)
print("Num users:", U, "Num documents:", D)


Filtered reviews: 34846
Num users: 4997 Num documents: 7336


In [14]:
import re
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already present
try:
    stopwords = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords = stopwords.words('english')


# Basic tokenization and cleaning with stop word removal
def preprocess(text):
    text = text.lower()
    # Remove non-alphanumeric characters and punctuation
    text = re.sub(r'[^a-z0-9\s]', '', text)
    tokens = text.split()
    # Remove stop words
    tokens = [word for word in tokens if word not in stopwords]
    return tokens

# Build vocabulary from reviewText of df2
all_texts = df2['reviewText'].tolist()
vectorizer = CountVectorizer(max_features=2000, tokenizer=preprocess)
X_counts = vectorizer.fit_transform(all_texts)
vocab = vectorizer.get_feature_names_out()
W = len(vocab)
print("Vocabulary size:", W)

word2idx = {w:i for i,w in enumerate(vocab)}

# Build triplets (u, d, w, count)
triplet_counts = defaultdict(int)
for idx, row in df2.iterrows():
    u = row['u_idx']
    d = row['d_idx']
    text = row['reviewText']
    tokens = preprocess(text)
    for t in tokens:
        if t in word2idx:
            w = word2idx[t]
            triplet_counts[(u,d,w)] += 1

triplets = [(u, d, w, c) for (u,d,w),c in triplet_counts.items() if c>0]
print("Num non-zero triplets:", len(triplets))



Vocabulary size: 2000
Num non-zero triplets: 737937


In [15]:
# Shuffle and split
random.shuffle(triplets)
split = int(0.9 * len(triplets))
train_triplets = triplets[:split]
test_triplets  = triplets[split:]
print("Train size:", len(train_triplets), "Test size:", len(test_triplets))


Train size: 664143 Test size: 73794


In [16]:
def train_triadic_plsi(train_triplets, U, D, W, K=10, max_iter=30, eps=1e-12):
    # Initialize
    Pz_u = np.random.rand(U, K)
    Pz_u /= Pz_u.sum(axis=1, keepdims=True)

    Pd_z = np.random.rand(K, D)
    Pd_z /= Pd_z.sum(axis=1, keepdims=True)

    Pw_z = np.random.rand(K, W)
    Pw_z /= Pw_z.sum(axis=1, keepdims=True)

    for it in range(max_iter):
        # E-step: compute q(z|u,d,w) for each triplet
        Q = []
        for (u,d,w,c) in train_triplets:
            val = Pz_u[u, :] * Pd_z[:, d] * Pw_z[:, w]
            s   = val.sum()
            if s < eps:
                q = np.ones(K) / K
            else:
                q = val / s
            Q.append((u,d,w,c,q))

        # M-step: update counts
        Pz_u_new = np.zeros_like(Pz_u)
        Pd_z_new = np.zeros_like(Pd_z)
        Pw_z_new = np.zeros_like(Pw_z)

        for (u,d,w,c,q) in Q:
            Pz_u_new[u, :] += c * q
            Pd_z_new[:, d] += c * q
            Pw_z_new[:, w] += c * q

        # Normalize
        Pz_u = (Pz_u_new.T / Pz_u_new.sum(axis=1)).T
        Pd_z = (Pd_z_new.T / Pd_z_new.sum(axis=1)).T
        Pw_z = (Pw_z_new.T / Pw_z_new.sum(axis=1)).T

        # Compute log-likelihood on train
        ll = 0.0
        for (u,d,w,c) in train_triplets:
            prob = np.sum(Pz_u[u, :] * Pd_z[:, d] * Pw_z[:, w])
            ll += c * np.log(prob + eps)
        print(f"Iter {it+1}/{max_iter}, train log-likelihood = {ll:.2f}")

    return Pz_u, Pd_z, Pw_z


In [17]:
K = 7
Pz_u, Pd_z, Pw_z = train_triadic_plsi(train_triplets, U, D, W, K=K, max_iter=50) # Increased max_iter to 50

Iter 1/50, train log-likelihood = -11189043.50
Iter 2/50, train log-likelihood = -11116381.96
Iter 3/50, train log-likelihood = -11027043.86
Iter 4/50, train log-likelihood = -10926509.89
Iter 5/50, train log-likelihood = -10830524.61
Iter 6/50, train log-likelihood = -10752825.55
Iter 7/50, train log-likelihood = -10697832.47
Iter 8/50, train log-likelihood = -10661004.68
Iter 9/50, train log-likelihood = -10635685.81
Iter 10/50, train log-likelihood = -10617497.43
Iter 11/50, train log-likelihood = -10604014.40
Iter 12/50, train log-likelihood = -10593884.99
Iter 13/50, train log-likelihood = -10586065.35
Iter 14/50, train log-likelihood = -10579673.85
Iter 15/50, train log-likelihood = -10574304.94
Iter 16/50, train log-likelihood = -10569813.57
Iter 17/50, train log-likelihood = -10566008.67
Iter 18/50, train log-likelihood = -10562710.11
Iter 19/50, train log-likelihood = -10559813.63
Iter 20/50, train log-likelihood = -10557223.05
Iter 21/50, train log-likelihood = -10554904.82
I

In [18]:
def compute_perplexity(triplets, Pz_u, Pd_z, Pw_z, eps=1e-12):
    N = len(triplets)
    ll = 0.0
    for (u,d,w,c) in triplets:
        prob = np.sum(Pz_u[u, :] * Pd_z[:, d] * Pw_z[:, w])
        ll   += c * np.log(prob + eps)
    perp = np.exp(-ll / N)
    return ll, perp

ll_test, perp_test = compute_perplexity(test_triplets, Pz_u, Pd_z, Pw_z)
print("Test log-likelihood = %.2f, perplexity = %.2f" % (ll_test, perp_test))


Test log-likelihood = -1174206.73, perplexity = 8137178.03


In [19]:
# Top unique words per topic
top_n = 10
for z in range(K):
    # Calculate the probability of a word given a topic divided by the sum of probabilities of that word across all topics
    # This highlights words that are more unique to a topic
    word_uniqueness = Pw_z[z, :] / np.sum(Pw_z, axis=0)
    top_w_idx = np.argsort(word_uniqueness)[-top_n:]
    top_words   = [vocab[w] for w in top_w_idx]
    print(f"Topic {z}: {top_words}")

# Sample a user and inspect user’s topic distribution
u_sample = 0
print("User", u_sample, "topic distribution:", Pz_u[u_sample,:].round(3))

Topic 0: ['ipod', 'card', 'wire', 'wood', 'diaper', 'cutting', 'claims', 'oil', 'images', 'footage']
Topic 1: ['practical', 'ice', 'pc', 'elizabeth', 'luck', 'johnny', 'tiny', 'mirror', 'ii', 'letter']
Topic 2: ['potter', 'rack', 'rice', 'teaching', 'oxo', 'rush', 'la', 'r', 'de', 'hobbit']
Topic 3: ['3d', 'classics', 'reach', 'discovered', 'ahead', 'disney', 'nicely', 'usb', 'remove', 'explanation']
Topic 4: ['brush', 'shut', 'steps', 'ice', 'situations', 'publisher', 'killing', 'network', 'terrific', 'johnny']
Topic 5: ['roll', 'wind', 'brian', 'workout', 'plane', 'receiver', 'training', 'community', 'giver', 'jonas']
Topic 6: ['anne', 'everyday', 'frankly', 'flow', 'comic', 'concerned', 'exam', 'mirror', 'avid', 'dust']
User 0 topic distribution: [0.623 0.    0.377 0.    0.    0.    0.   ]


In [20]:
# Build doc-word counts
dw_counts = defaultdict(int)
for (u,d,w,c) in train_triplets:
    dw_counts[(d,w)] += c
dw_triplets = [(d,w,c) for (d,w),c in dw_counts.items()]

def train_plsi_doc_word(dw_triplets, D, W, K=20, max_iter=20, eps=1e-12):
    Pz_d = np.random.rand(D, K)
    Pz_d /= Pz_d.sum(axis=1, keepdims=True)
    Pw_z  = np.random.rand(K, W)
    Pw_z  /= Pw_z.sum(axis=1, keepdims=True)

    for it in range(max_iter):
        Q = []
        for (d,w,c) in dw_triplets:
            val = Pz_d[d, :] * Pw_z[:, w]
            s   = val.sum()
            if s < eps:
                q = np.ones(K) / K
            else:
                q = val / s
            Q.append((d,w,c,q))

        Pz_d_new = np.zeros_like(Pz_d)
        Pw_z_new = np.zeros_like(Pw_z)

        for (d,w,c,q) in Q:
            Pz_d_new[d, :] += c * q
            Pw_z_new[:, w]  += c * q

        Pz_d = (Pz_d_new.T / (Pz_d_new.sum(axis=1) + eps)).T # Added epsilon here
        Pw_z  = (Pw_z_new.T  / (Pw_z_new.sum(axis=1) + eps)).T # Added epsilon here

        # log-likelihood
        ll = 0.0
        for (d,w,c) in dw_triplets:
            prob = np.sum(Pz_d[d, :] * Pw_z[:, w])
            ll += c * np.log(prob + eps)
        print(f"Vanilla PLSI iter {it+1}/{max_iter}, ll = {ll:.2f}")
    return Pz_d, Pw_z

Pz_d_bw, Pw_z_bw = train_plsi_doc_word(dw_triplets, D, W, K=K, max_iter=20)
# Evaluate baseline perplexity
# build test doc-word triplets similarly:
dw_test_counts = defaultdict(int)
for (u,d,w,c) in test_triplets:
    dw_test_counts[(d,w)] += c
dw_test_triplets = [(d,w,c) for (d,w),c in dw_test_counts.items()]
# compute perplexity for baseline
ll_bw, perp_bw = None, None
# compute
ll_bw = 0.0
for (d,w,c) in dw_test_triplets:
    prob = np.sum(Pz_d_bw[d, :] * Pw_z_bw[:, w])
    ll_bw   += c * np.log(prob + 1e-12)
perp_bw = np.exp(- ll_bw / len(dw_test_triplets))
print("Baseline (vanilla PLSI) test perplexity = %.2f" % perp_bw)

Vanilla PLSI iter 1/20, ll = -5127030.75
Vanilla PLSI iter 2/20, ll = -5123302.43
Vanilla PLSI iter 3/20, ll = -5120115.96
Vanilla PLSI iter 4/20, ll = -5117117.70
Vanilla PLSI iter 5/20, ll = -5114062.91
Vanilla PLSI iter 6/20, ll = -5110741.12
Vanilla PLSI iter 7/20, ll = -5106934.30
Vanilla PLSI iter 8/20, ll = -5102393.54
Vanilla PLSI iter 9/20, ll = -5096840.01
Vanilla PLSI iter 10/20, ll = -5090004.55
Vanilla PLSI iter 11/20, ll = -5081724.31
Vanilla PLSI iter 12/20, ll = -5072087.46
Vanilla PLSI iter 13/20, ll = -5061535.24
Vanilla PLSI iter 14/20, ll = -5050777.45
Vanilla PLSI iter 15/20, ll = -5040521.68
Vanilla PLSI iter 16/20, ll = -5031232.41
Vanilla PLSI iter 17/20, ll = -5023080.93
Vanilla PLSI iter 18/20, ll = -5016038.86
Vanilla PLSI iter 19/20, ll = -5009994.90
Vanilla PLSI iter 20/20, ll = -5004823.06
Baseline (vanilla PLSI) test perplexity = 3138.24
