In [1]:
# ─── Cell 1: Reproducibility ────────────────────────────────────────────────────
import random
import numpy as np

my_seed = 0
random.seed(my_seed)
np.random.seed(my_seed)


In [3]:
# ─── Cell 2: Imports & Load Data + Derive Extra User Features ─────────────────
import os
import pandas as pd

# 2.1 Paths
dataset_folder = "/Users/hamidahmad/Desktop/DataSet"

# 2.2 Products
products = pd.read_csv(
    os.path.join(dataset_folder, 'product_info.csv'),
    low_memory=False
).rename(columns={'product_id':'itemID'}).dropna(subset=['itemID'])
products['itemID'] = products['itemID'].astype(str)

# 2.3 Reviews
review_files = [f for f in os.listdir(dataset_folder) if f.startswith('reviews')]
print("Found review files:", review_files)   # sanity check

reviews = pd.concat(
    [
      pd.read_csv(os.path.join(dataset_folder, f), low_memory=False)
      for f in review_files
    ],
    ignore_index=True
).rename(columns={
    'author_id':   'userID',
    'product_id':  'itemID',
    'rating':      'rating',
    'review_text': 'review_text'
})

# clean & cast
reviews['userID']      = reviews['userID'].astype(str)
reviews['itemID']      = reviews['itemID'].astype(str)
reviews['rating']      = pd.to_numeric(reviews['rating'], errors='coerce')
reviews = reviews.dropna(subset=['userID','itemID','rating'])
reviews['rating']      = reviews['rating'].astype(float)
reviews['review_text'] = reviews.get('review_text', '').fillna('').astype(str)

# 2.4 Derive extra per-user features from reviews
user_extra = (
    reviews
      .groupby('userID')
      .agg(
         recommend_ratio           = ('is_recommended',       'mean'),
         helpfulness_avg           = ('helpfulness',          'mean'),
         total_feedback_count      = ('total_feedback_count', 'mean'),
         total_pos_feedback_count  = ('total_pos_feedback_count','mean'),
         total_neg_feedback_count  = ('total_neg_feedback_count','mean'),
         review_count              = ('itemID',               'count')
      )
      .reset_index()
)

print("Products:", products.shape, "Reviews:", reviews.shape, "User extra:", user_extra.shape)
display(reviews.head())


Found review files: ['reviews_0-250.csv', 'reviews_1250-end.csv', 'reviews_750-1250.csv', 'reviews_250-500.csv', 'reviews_500-750.csv']
Products: (8494, 27) Reviews: (1094411, 19) User extra: (503216, 7)


Unnamed: 0.1,Unnamed: 0,userID,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,submission_time,review_text,review_title,skin_tone,eye_color,skin_type,hair_color,itemID,product_name,brand_name,price_usd
0,0,1741593524,5.0,1.0,1.0,2,0,2,2023-02-01,I use this with the Nudestix “Citrus Clean Bal...,Taught me how to double cleanse!,,brown,dry,black,P504322,Gentle Hydra-Gel Face Cleanser,NUDESTIX,19.0
1,1,31423088263,1.0,0.0,,0,0,0,2023-03-21,I bought this lip mask after reading the revie...,Disappointed,,,,,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0
2,2,5061282401,5.0,1.0,,0,0,0,2023-03-21,My review title says it all! I get so excited ...,New Favorite Routine,light,brown,dry,blonde,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0
3,3,6083038851,5.0,1.0,,0,0,0,2023-03-20,I’ve always loved this formula for a long time...,Can't go wrong with any of them,,brown,combination,black,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0
4,4,47056667835,5.0,1.0,,0,0,0,2023-03-20,"If you have dry cracked lips, this is a must h...",A must have !!!,light,hazel,combination,,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0


In [5]:
from surprise import Dataset, Reader, KNNBasic, SVD


In [11]:
# ─── Cell 3: Filter to Top-N Users, Split, and Run KNNBasic ────────────────────
import numpy as np
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split

# How many users to keep
MAX_USERS = 200

# 1) Pick the N users with the most ratings
user_counts = reviews['userID'].value_counts()
top_users   = user_counts.nlargest(MAX_USERS).index

# 2) Subset reviews to only those users
reviews_sub = reviews[reviews['userID'].isin(top_users)].copy()
print("After limiting to top users:", reviews_sub.shape)

# 3) Build & split Surprise dataset
reader    = Reader(rating_scale=(1,5))
data_sub  = Dataset.load_from_df(
    reviews_sub[['userID','itemID','rating']],
    reader
)
trainset, testset = train_test_split(
    data_sub,
    test_size=0.25,
    random_state=my_seed
)
print("Trainset users/items:", trainset.n_users, trainset.n_items)

# 4) Fit user-based KNNBasic on the smaller trainset
sim_options = {'name':'pearson','user_based':True,'min_support':1}
algo_knn   = KNNBasic(sim_options=sim_options)
algo_knn.fit(trainset)
predictions_knn = algo_knn.test(testset)
print(f"Generated {len(predictions_knn)} predictions")


After limiting to top users: (17236, 19)
Trainset users/items: 200 1313
Computing the pearson similarity matrix...
Done computing similarity matrix.
Generated 4309 predictions


In [13]:
from surprise import KNNBasic

sim_options = {'name':'pearson','user_based':True,'min_support':1}
algo_knn   = KNNBasic(sim_options=sim_options)

algo_knn.fit(trainset)
predictions_knn = algo_knn.test(testset)
print(f"Generated {len(predictions_knn)} predictions.")


Computing the pearson similarity matrix...
Done computing similarity matrix.
Generated 4309 predictions.


In [17]:
# ─── Cell 5: MSE, RMSE & MAE for KNNBasic ───────────────────────────────────────
from surprise.accuracy import mse, rmse, mae   # ensure these are imported

print("KNNBasic MSE:", mse(predictions_knn))
print("KNNBasic RMSE:", rmse(predictions_knn))
print("KNNBasic MAE:",  mae(predictions_knn))


MSE: 0.6469
KNNBasic MSE: 0.6468565868272248
RMSE: 0.8043
KNNBasic RMSE: 0.8042739501110456
MAE:  0.5324
KNNBasic MAE: 0.5323723337262427


In [19]:
# ─── Cell 6: Inspect First Predictions ─────────────────────────────────────────
predictions_knn[:2]


[Prediction(uid='25244252137', iid='P479352', r_ui=5.0, est=4.65668755318326, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid='27829619285', iid='P483643', r_ui=5.0, est=4.0, details={'actual_k': 1, 'was_impossible': False})]

In [25]:
# ─── Cell 7: Precision, Recall, F1, Accuracy per User ──────────────────────────
from collections import defaultdict
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def precision_recall_f1(preds, threshold=4.0):
    user_r = defaultdict(list)
    for uid, iid, true_r, est, _ in preds:
        user_r[uid].append((est, true_r))
    precisions, recalls, f1s, accs = [], [], [], []
    for uid, ur in user_r.items():
        y_true = [1 if t>=threshold else 0 for _,t in ur]
        y_pred = [1 if e>=threshold else 0 for e,_ in ur]
        precisions.append( precision_score(y_true, y_pred, zero_division=0) )
        recalls.append(    recall_score   (y_true, y_pred, zero_division=0) )
        f1s.append(        f1_score       (y_true, y_pred, zero_division=0) )
        accs.append(       accuracy_score (y_true, y_pred) )
    return np.mean(precisions), np.mean(recalls), np.mean(f1s), np.mean(accs)


In [27]:
# ─── Cell 8: Print Classification Metrics ───────────────────────────────────────
p, r, f1, acc = precision_recall_f1(predictions_knn, threshold=4.0)
print(f"Avg. Precision: {p:.3f}")
print(f"Avg. Recall:    {r:.3f}")
print(f"Avg. F1:        {f1:.3f}")
print(f"Avg. Accuracy:  {acc:.3f}")


Avg. Precision: 0.937
Avg. Recall:    0.937
Avg. F1:        0.934
Avg. Accuracy:  0.899


In [29]:
# ─── Cell 9: Precision@K & nDCG@K ───────────────────────────────────────────────
from sklearn.metrics import average_precision_score, ndcg_score

def measures_at_k(preds, k=5, threshold=4.0):
    user_r = defaultdict(list)
    for uid, iid, true_r, est, _ in preds:
        user_r[uid].append((est, true_r))
    avg_precisions, precs_at_k, ndcgs = [], [], []
    for uid, ur in user_r.items():
        ur.sort(key=lambda x: x[0], reverse=True)
        y_true = [1 if t>=threshold else 0 for _,t in ur]
        y_pred = [1 if e>=threshold else 0 for e,_ in ur]
        if sum(y_true)>0:
            avg_precisions.append( average_precision_score(y_true, y_pred) )
        else:
            avg_precisions.append(0)
        precs_at_k.append( precision_score(y_true[:k], y_pred[:k], zero_division=0) )
        ndcgs.append( ndcg_score([y_true], [y_pred], k=k) )
    return np.mean(avg_precisions), np.mean(precs_at_k), np.mean(ndcgs)


In [31]:
# ─── Cell 10: Print @5 Metrics ─────────────────────────────────────────────────
ap, p5, n5 = measures_at_k(predictions_knn, k=5, threshold=4.0)
print(f"Avg. AvgPrecision: {ap:.3f}")
print(f"Avg. Precision@5:  {p5:.3f}")
print(f"Avg. nDCG@5:        {n5:.3f}")


Avg. AvgPrecision: 0.938
Avg. Precision@5:  0.956
Avg. nDCG@5:        0.938


In [33]:
# ─── Cell 11: SVD Baseline ─────────────────────────────────────────────────────
algo_svd = SVD(n_epochs=10, lr_all=0.005, reg_all=0.4)
algo_svd.fit(trainset)
predictions_svd = algo_svd.test(testset)

print("SVD MSE:", mse(predictions_svd))
print("SVD RMSE:", rmse(predictions_svd))
print("SVD MAE:", mae(predictions_svd))


MSE: 0.3454
SVD MSE: 0.3454002194903286
RMSE: 0.5877
SVD RMSE: 0.5877075969309301
MAE:  0.3668
SVD MAE: 0.3667851541594751


In [35]:
# ─── Cell 12: Prepare Item “Content” ────────────────────────────────────────────
# Concatenate product_name, brand_name, primary_category
products['content'] = (
    products['product_name'].fillna('') + ' '
    + products['brand_name'].fillna('') + ' '
    + products['primary_category'].fillna('')
)


In [37]:
# ─── Cell 13: Build Content Dict ───────────────────────────────────────────────
item_content_dict = dict(
    zip(products['itemID'], products['content'])
)


In [39]:
# ─── Cell 14: Attach content to Surprise trainset ──────────────────────────────
# Map rawID → innerID and innerID → rawID
raw2inner_i = trainset._raw2inner_id_items
inner2raw_i = {inner: raw for raw, inner in raw2inner_i.items()}

# trainset.n_items gives number of inner items
trainset.content = [
    item_content_dict.get(inner2raw_i[i], '')
    for i in range(trainset.n_items)
]


In [45]:
# ─── Cell 15: Define ContentKNN (fixed) ─────────────────────────────────────────
from surprise import AlgoBase, PredictionImpossible

class ContentKNN(AlgoBase):
    def __init__(self, k=5):
        super().__init__()
        self.k = k

    def fit(self, trainset):
        super().fit(trainset)
        # Build TF-IDF index as before
        tokenized = [list(tokenize(doc)) for doc in trainset.content]
        dct       = Dictionary(tokenized)
        corpus    = [dct.doc2bow(doc) for doc in tokenized]
        model     = TfidfModel(corpus)
        self.index   = SparseMatrixSimilarity(model[corpus], num_features=len(dct))
        self.corpus  = corpus
        self.dct     = dct
        return self

    def estimate(self, u, i):
        # 1) Ensure user is known
        if not self.trainset.knows_user(u):
            raise PredictionImpossible("Unknown user.")

        # 2) Convert raw itemID -> innerID
        try:
            i_inner = self.trainset.to_inner_iid(i)
        except ValueError:
            raise PredictionImpossible("Unknown item.")

        # 3) Compute similarities vector for this item
        sims = self.index[self.corpus[i_inner]]

        # 4) Gather all items this user has rated (inner IDs & ratings)
        user_ratings = self.trainset.ur[u]  # list of (inner_iid, rating)

        # 5) Build a dict of sim scores for only those items
        sim_dict = {iid: sims[iid] for (iid, _) in user_ratings}

        # 6) Select top-k most similar
        top_k = sorted(sim_dict.items(), key=lambda x: x[1], reverse=True)[: self.k]
        # Filter out non-positive similarities
        top_k = [(iid, sim) for iid, sim in top_k if sim > 0]

        if not top_k:
            raise PredictionImpossible("No similar items found.")

        # 7) Compute weighted average
        num = 0.0
        den = 0.0
        for iid, sim in top_k:
            # find the true rating for this iid
            rating = next(r for (inner_id, r) in user_ratings if inner_id == iid)
            num += sim * rating
            den += sim

        if den == 0:
            raise PredictionImpossible("All similarities are zero.")

        est = num / den
        # 8) Clip to valid rating range
        low, high = self.trainset.rating_scale
        return min(high, max(low, est))


In [49]:
# ─── Cell 16: Train & Eval ContentKNN ──────────────────────────────────────────
algo_ct = ContentKNN(k=5)
algo_ct.fit(trainset)
predictions_ct = algo_ct.test(testset)

print("ContentKNN MSE:", mse(predictions_ct))
print("ContentKNN RMSE:", rmse(predictions_ct))
print("ContentKNN MAE:", mae(predictions_ct))


MSE: 0.5280
ContentKNN MSE: 0.5279998999924175
RMSE: 0.7266
ContentKNN RMSE: 0.7266360161679418
MAE:  0.5159
ContentKNN MAE: 0.5158751295701698


In [51]:
# ─── Cell 17: Inspect ContentKNN Predictions ───────────────────────────────────
predictions_ct[:10]


[Prediction(uid='25244252137', iid='P479352', r_ui=5.0, est=4.65668755318326, details={'was_impossible': True, 'reason': 'Unknown item.'}),
 Prediction(uid='27829619285', iid='P483643', r_ui=5.0, est=4.65668755318326, details={'was_impossible': True, 'reason': 'Unknown item.'}),
 Prediction(uid='11950294958', iid='P475951', r_ui=4.0, est=4.65668755318326, details={'was_impossible': True, 'reason': 'Unknown item.'}),
 Prediction(uid='2776710623', iid='P415667', r_ui=5.0, est=4.65668755318326, details={'was_impossible': True, 'reason': 'Unknown item.'}),
 Prediction(uid='2661325544', iid='P474832', r_ui=5.0, est=4.65668755318326, details={'was_impossible': True, 'reason': 'Unknown item.'}),
 Prediction(uid='6333148846', iid='P500288', r_ui=4.0, est=4.65668755318326, details={'was_impossible': True, 'reason': 'Unknown item.'}),
 Prediction(uid='12640060683', iid='P461935', r_ui=5.0, est=4.65668755318326, details={'was_impossible': True, 'reason': 'Unknown item.'}),
 Prediction(uid='251079

In [53]:
# ─── Cell 18: Cold-Start Demo ──────────────────────────────────────────────────
uid_demo = reviews['userID'].iloc[0]
iid_demo = str(products['itemID'].iloc[-1])  # likely unseen by this user

print("ContentKNN cold-start:", algo_ct.predict(uid_demo, iid_demo, r_ui=3.0, verbose=True))
print("KNNBasic   cold-start:", algo_knn.predict(uid_demo, iid_demo, r_ui=3.0, verbose=True))


user: 1741593524 item: P505461    r_ui = 3.00   est = 4.66   {'was_impossible': True, 'reason': 'Unknown user.'}
ContentKNN cold-start: user: 1741593524 item: P505461    r_ui = 3.00   est = 4.66   {'was_impossible': True, 'reason': 'Unknown user.'}
user: 1741593524 item: P505461    r_ui = 3.00   est = 4.66   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
KNNBasic   cold-start: user: 1741593524 item: P505461    r_ui = 3.00   est = 4.66   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}


In [55]:
# ─── Cell 19: Define Hybrid (Content + KNNBasic) ───────────────────────────────
class ContentKNNBasicHybrid(AlgoBase):
    def __init__(self, k=5, sim_opts=None):
        super().__init__()
        self.k = k
        self.sim_opts = sim_opts or {'name':'pearson','user_based':True,'min_support':1}

    def fit(self, trainset):
        super().fit(trainset)
        self.content = ContentKNN(self.k).fit(trainset)
        self.knn     = KNNBasic(sim_options=self.sim_opts).fit(trainset)
        return self

    def estimate(self, u, i):
        if not self.trainset.knows_user(u):
            raise PredictionImpossible("Unknown user.")
        r1 = self.content.estimate(u, i)
        r2 = self.knn.estimate(u, i)
        return (r1 + r2) / 2


In [57]:
# ─── Cell 20: Train & Eval Hybrid ──────────────────────────────────────────────
hyb = ContentKNNBasicHybrid(k=5, sim_opts=sim_options)
hyb.fit(trainset)
predictions_hyb = hyb.test(testset)

print("Hybrid MSE:", mse(predictions_hyb))
print("Hybrid RMSE:", rmse(predictions_hyb))
print("Hybrid MAE:", mae(predictions_hyb))


Computing the pearson similarity matrix...
Done computing similarity matrix.
MSE: 0.5280
Hybrid MSE: 0.5279998999924175
RMSE: 0.7266
Hybrid RMSE: 0.7266360161679418
MAE:  0.5159
Hybrid MAE: 0.5158751295701698


In [59]:
# ─── Cell 21: Top-N Utility ────────────────────────────────────────────────────
def get_top_n(preds, n=10):
    top_n = defaultdict(list)
    for uid,iid,true,est,_ in preds:
        top_n[uid].append((iid,est))
    for uid,ratings in top_n.items():
        ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = ratings[:n]
    return top_n


In [61]:
# ─── Cell 22: Show Top-5 for a Sample User ─────────────────────────────────────
top_n = get_top_n(predictions_hyb, n=5)
sample_uid = reviews['userID'].iloc[0]
print(f"Top-5 for user {sample_uid}:")
for i,(iid,score) in enumerate(top_n[sample_uid],1):
    name = products.loc[products['itemID']==iid,'product_name'].iloc[0]
    print(f"{i}. {name} (score {score:.3f})")


Top-5 for user 1741593524:


In [65]:
# ─── Cell 23: Full-Data GridSearchCV for SVD ────────────────────────────────────
import random
from surprise import Dataset, Reader, SVD
from surprise.model_selection import GridSearchCV

# Rebuild the full Dataset object
reader = Reader(rating_scale=(1,5))
data   = Dataset.load_from_df(reviews[['userID','itemID','rating']], reader)

# 1) Shuffle & split raw_ratings into A (80%) and B (20%)
raw_ratings = data.raw_ratings.copy()
random.shuffle(raw_ratings)
split = int(0.8 * len(raw_ratings))
A_raw = raw_ratings[:split]
B_raw = raw_ratings[split:]

data.raw_ratings = A_raw  # now data contains only A

# 2) GridSearchCV on A
param_grid = {
    'n_epochs': [5, 10],
    'lr_all'  : [0.002, 0.005],
    'reg_all' : [0.4, 0.6]
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3, n_jobs=-1)
gs.fit(data)

print("Best validation RMSE:", gs.best_score['rmse'])
print("Best params:", gs.best_params['rmse'])

# 3) Retrain on full A and evaluate biased/unbiased
best_algo = gs.best_estimator['rmse']
full_train = data.build_full_trainset()
best_algo.fit(full_train)

from surprise.accuracy import rmse

# Biased accuracy on A
pred_A = best_algo.test(full_train.build_testset())
print("Biased RMSE on A:", rmse(pred_A))

# Unbiased accuracy on B
pred_B = best_algo.test(data.construct_testset(B_raw))
print("Unbiased RMSE on B:", rmse(pred_B))


Best validation RMSE: 1.0979544588370889
Best params: {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
RMSE: 1.0334
Biased RMSE on A: 1.0333957182336815
RMSE: 1.0951
Unbiased RMSE on B: 1.0951016979093584


In [71]:
# ─── Cell 24: Show Held-Out Positives for sample_uid in B_raw ─────────────────
uid = sample_uid  # as defined in Cell 22

# B_raw contains tuples like (user, item, rating[, ...])
# We only care about positions 0 (user) and 1 (item)
relevant = [entry[1] for entry in B_raw if entry[0] == uid]

print("Held-out positives for user:", uid)
for iid in relevant:
    name = products.loc[products['itemID'] == iid, 'product_name'].iloc[0]
    print(f" • {iid} → {name}")


Held-out positives for user: 1741593524
