In [1]:
# ─── Cell 1: Reproducibility ────────────────────────────────────────────────────
import random
import numpy as np

my_seed = 0
random.seed(my_seed)
np.random.seed(my_seed)


In [3]:
# ─── Cell 2: Imports, Load & Clean Data, Build & Clean User Features ───────────
import os
import pandas as pd

# 2.1 Paths
dataset_folder = "/Users/hamidahmad/Desktop/DataSet"

# 2.2 Products
products = pd.read_csv(
    os.path.join(dataset_folder, 'product_info.csv'),
    low_memory=False
)
# Drop any stray “Unnamed” index columns
products = products.loc[:, ~products.columns.str.startswith('Unnamed')]
products = (
    products
      .rename(columns={'product_id':'itemID'})
      .dropna(subset=['itemID'])
)
products['itemID'] = products['itemID'].astype(str)

# 2.3 Reviews
review_files = [f for f in os.listdir(dataset_folder) if f.startswith('reviews')]
print("Found review files:", review_files)

dfs = []
for f in review_files:
    df = pd.read_csv(os.path.join(dataset_folder, f), low_memory=False)
    df = df.loc[:, ~df.columns.str.startswith('Unnamed')]
    dfs.append(df)
reviews = pd.concat(dfs, ignore_index=True)

reviews = reviews.rename(columns={
    'author_id':   'userID',
    'product_id':  'itemID',
    'rating':      'rating',
    'review_text': 'review_text'
})
reviews['userID'] = reviews['userID'].astype(str)
reviews['itemID'] = reviews['itemID'].astype(str)
reviews['rating'] = pd.to_numeric(reviews['rating'], errors='coerce')
reviews = reviews.dropna(subset=['rating'])
reviews['rating'] = reviews['rating'].astype(float)
reviews['review_text'] = reviews.get('review_text','').fillna('').astype(str)

# 2.4 Build full user_features as per teammate’s selection

user_stats = (
    reviews
      .groupby('userID')
      .agg(
         rating_avg      = ('rating',         'mean'),
         rating_count    = ('rating',         'count'),
         recommend_ratio = ('is_recommended', 'mean'),
         helpfulness_avg = ('helpfulness',    'mean')
      )
)

user_stats['recommend_missing'] = user_stats['recommend_ratio'].isna().astype(int)
global_recommend = reviews['is_recommended'].mean()
user_stats['recommend_ratio'] = user_stats['recommend_ratio'].fillna(global_recommend)

profile = (
    reviews
      .sort_values('submission_time')
      .dropna(subset=['skin_tone','skin_type','eye_color','hair_color'])
      .groupby('userID')
      .first()[['skin_tone','skin_type','eye_color','hair_color']]
)

tone_dummies = pd.get_dummies(profile['skin_tone'], prefix='tone').astype(int)
type_dummies = pd.get_dummies(profile['skin_type'], prefix='type').astype(int)
eye_dummies  = pd.get_dummies(profile['eye_color'], prefix='eye').astype(int)
hair_dummies = pd.get_dummies(profile['hair_color'], prefix='hair').astype(int)

user_profile = pd.concat(
    [profile.index.to_series(), tone_dummies, type_dummies, eye_dummies, hair_dummies],
    axis=1
).set_index('userID')

user_features = user_stats.join(user_profile, how='left').reset_index()

# ─── Fill any remaining NaNs in user_features ──────────────────────────────────
# 1) Fill helpfulness_avg with global mean
global_help = reviews['helpfulness'].mean()
user_features['helpfulness_avg'] = user_features['helpfulness_avg'].fillna(global_help)

# 2) Fill numeric columns with 0
num_cols   = ['rating_avg','rating_count','recommend_ratio','helpfulness_avg','recommend_missing']
user_features[num_cols] = user_features[num_cols].fillna(0)

# 3) Fill all dummies with 0
dummy_cols = [c for c in user_features.columns if c.startswith(('tone_','type_','eye_','hair_'))]
user_features[dummy_cols] = user_features[dummy_cols].fillna(0).astype(int)

# Outputs
print("Products shape:     ", products.shape)
print("Reviews shape:      ", reviews.shape)
print("User_features shape:", user_features.shape)
display(user_features.head(), reviews.head())


Found review files: ['reviews_0-250.csv', 'reviews_1250-end.csv', 'reviews_750-1250.csv', 'reviews_250-500.csv', 'reviews_500-750.csv']
Products shape:      (8494, 27)
Reviews shape:       (1094411, 18)
User_features shape: (503216, 37)


Unnamed: 0,userID,rating_avg,rating_count,recommend_ratio,helpfulness_avg,recommend_missing,tone_dark,tone_deep,tone_ebony,tone_fair,...,eye_gray,eye_green,eye_hazel,hair_auburn,hair_black,hair_blonde,hair_brown,hair_brunette,hair_gray,hair_red
0,10000015049,5.0,1,1.0,0.767782,0,0,0,0,1,...,0,1,0,0,0,0,1,0,0,0
1,1000001620,4.0,1,1.0,0.625,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,10000049623,5.0,1,1.0,0.767782,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,10000060335,5.0,1,1.0,0.767782,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,10000098796,5.0,2,0.839962,0.767782,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,userID,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,submission_time,review_text,review_title,skin_tone,eye_color,skin_type,hair_color,itemID,product_name,brand_name,price_usd
0,1741593524,5.0,1.0,1.0,2,0,2,2023-02-01,I use this with the Nudestix “Citrus Clean Bal...,Taught me how to double cleanse!,,brown,dry,black,P504322,Gentle Hydra-Gel Face Cleanser,NUDESTIX,19.0
1,31423088263,1.0,0.0,,0,0,0,2023-03-21,I bought this lip mask after reading the revie...,Disappointed,,,,,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0
2,5061282401,5.0,1.0,,0,0,0,2023-03-21,My review title says it all! I get so excited ...,New Favorite Routine,light,brown,dry,blonde,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0
3,6083038851,5.0,1.0,,0,0,0,2023-03-20,I’ve always loved this formula for a long time...,Can't go wrong with any of them,,brown,combination,black,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0
4,47056667835,5.0,1.0,,0,0,0,2023-03-20,"If you have dry cracked lips, this is a must h...",A must have !!!,light,hazel,combination,,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0


In [5]:
from surprise import Dataset, Reader, KNNBasic, SVD


In [61]:
# ─── Cell 3: Filter by Item Popularity & User Activity, then Build & Split ────
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split

# Parameters
MAX_USERS         = 2000   # keep this many top users
MIN_ITEM_REVIEWS  = 20     # only keep items with at least this many reviews

# 1) Filter to popular items
item_counts   = reviews['itemID'].value_counts()
popular_items = item_counts[item_counts >= MIN_ITEM_REVIEWS].index
reviews_pop   = reviews[reviews['itemID'].isin(popular_items)].copy()
print("After filtering popular items:", reviews_pop.shape)

# 2) From that, pick the top-N most active users
user_counts = reviews_pop['userID'].value_counts()
top_users   = user_counts.nlargest(MAX_USERS).index
reviews_sub = reviews_pop[reviews_pop['userID'].isin(top_users)].copy()
print("After filtering top users:     ", reviews_sub.shape)

# 3) Build Surprise Dataset and split
reader   = Reader(rating_scale=(1,5))
data_sub = Dataset.load_from_df(
    reviews_sub[['userID','itemID','rating']],
    reader
)
trainset, testset = train_test_split(
    data_sub,
    test_size=0.25,
    random_state=my_seed
)
print("Final trainset (users, items):", trainset.n_users, trainset.n_items)
print("Final testset length:         ", len(testset))


After filtering popular items: (1090700, 18)
After filtering top users:      (81555, 18)
Final trainset (users, items): 2000 1752
Final testset length:          20389


In [63]:
# ─── Sanity-Check Preprocessed Data ─────────────────────────────────────────────
print("=== Reviews (first 5 rows) ===")
display(reviews.head())

print(f"Total reviews: {len(reviews)}")
print(f"Unique users: {reviews['userID'].nunique()}")
print(f"Unique items: {reviews['itemID'].nunique()}")

print("\n=== User-level features (first rows) ===")
display(user_features.head())

print("\n=== Products (+ content if present) ===")
cols = ['itemID', 'product_name', 'brand_name', 'primary_category']
if 'content' in products.columns:
    cols.append('content')
display(products[cols].head())

print("\n=== Train/Test split sizes ===")
print(" Trainset size (users, items):", trainset.n_users, trainset.n_items)
print(" Testset length:", len(testset))


=== Reviews (first 5 rows) ===


Unnamed: 0,userID,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,submission_time,review_text,review_title,skin_tone,eye_color,skin_type,hair_color,itemID,product_name,brand_name,price_usd
0,1741593524,5.0,1.0,1.0,2,0,2,2023-02-01,I use this with the Nudestix “Citrus Clean Bal...,Taught me how to double cleanse!,,brown,dry,black,P504322,Gentle Hydra-Gel Face Cleanser,NUDESTIX,19.0
1,31423088263,1.0,0.0,,0,0,0,2023-03-21,I bought this lip mask after reading the revie...,Disappointed,,,,,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0
2,5061282401,5.0,1.0,,0,0,0,2023-03-21,My review title says it all! I get so excited ...,New Favorite Routine,light,brown,dry,blonde,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0
3,6083038851,5.0,1.0,,0,0,0,2023-03-20,I’ve always loved this formula for a long time...,Can't go wrong with any of them,,brown,combination,black,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0
4,47056667835,5.0,1.0,,0,0,0,2023-03-20,"If you have dry cracked lips, this is a must h...",A must have !!!,light,hazel,combination,,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0


Total reviews: 1094411
Unique users: 503216
Unique items: 2351

=== User-level features (first rows) ===


Unnamed: 0,userID,rating_avg,rating_count,recommend_ratio,helpfulness_avg,recommend_missing,tone_dark,tone_deep,tone_ebony,tone_fair,...,eye_gray,eye_green,eye_hazel,hair_auburn,hair_black,hair_blonde,hair_brown,hair_brunette,hair_gray,hair_red
0,10000015049,5.0,1,1.0,0.767782,0,0,0,0,1,...,0,1,0,0,0,0,1,0,0,0
1,1000001620,4.0,1,1.0,0.625,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,10000049623,5.0,1,1.0,0.767782,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,10000060335,5.0,1,1.0,0.767782,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,10000098796,5.0,2,0.839962,0.767782,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0



=== Products (+ content if present) ===


Unnamed: 0,itemID,product_name,brand_name,primary_category,content
0,P473671,Fragrance Discovery Set,19-69,Fragrance,Fragrance Discovery Set 19-69 Fragrance
1,P473668,La Habana Eau de Parfum,19-69,Fragrance,La Habana Eau de Parfum 19-69 Fragrance
2,P473662,Rainbow Bar Eau de Parfum,19-69,Fragrance,Rainbow Bar Eau de Parfum 19-69 Fragrance
3,P473660,Kasbah Eau de Parfum,19-69,Fragrance,Kasbah Eau de Parfum 19-69 Fragrance
4,P473658,Purple Haze Eau de Parfum,19-69,Fragrance,Purple Haze Eau de Parfum 19-69 Fragrance



=== Train/Test split sizes ===
 Trainset size (users, items): 2000 1752
 Testset length: 20389


In [65]:
# ─── Cell 4: KNNBasic Baseline (filter impossible preds) ───────────────────────
from surprise import KNNBasic

sim_options = {'name':'pearson', 'user_based':True, 'min_support':1}
algo_knn   = KNNBasic(sim_options=sim_options)

algo_knn.fit(trainset)
predictions_knn = algo_knn.test(testset)

# Filter out any was_impossible=True via the details dict
clean_knn = [
    p for p in predictions_knn
    if not p.details.get('was_impossible', False)
]

print(f"Generated {len(clean_knn)}/{len(predictions_knn)} valid predictions.")


Computing the pearson similarity matrix...
Done computing similarity matrix.
Generated 13648/20389 valid predictions.


In [67]:
# ─── Cell 5: MSE, RMSE & MAE for KNNBasic (on valid preds) ────────────────────
from surprise.accuracy import mse, rmse, mae

print("KNNBasic MSE:", mse(clean_knn))
print("KNNBasic RMSE:", rmse(clean_knn))
print("KNNBasic MAE:",  mae(clean_knn))


MSE: 0.7968
KNNBasic MSE: 0.7968202756535013
RMSE: 0.8926
KNNBasic RMSE: 0.8926479012765903
MAE:  0.6081
KNNBasic MAE: 0.6081420544887881


In [69]:
# ─── Cell 6: Inspect First Predictions ─────────────────────────────────────────
predictions_knn[:2]


[Prediction(uid='2384176250', iid='P482551', r_ui=3.0, est=4.529948245399124, details={'actual_k': 37, 'was_impossible': False}),
 Prediction(uid='1282164317', iid='P443700', r_ui=5.0, est=4.571272428388722, details={'actual_k': 10, 'was_impossible': False})]

In [71]:
# ─── Cell 7: Precision, Recall, F1, Accuracy per User ──────────────────────────
from collections import defaultdict
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np

def precision_recall_f1(preds, threshold=4.0):
    # 1) filter out any impossible predictions
    valid = [p for p in preds if not getattr(p, 'was_impossible', False)]
    
    # 2) group by user
    user_r = defaultdict(list)
    for uid, iid, true_r, est, _ in valid:
        user_r[uid].append((est, true_r))
    
    # 3) compute per-user metrics
    precisions, recalls, f1s, accs = [], [], [], []
    for uid, ur in user_r.items():
        y_true = [1 if t >= threshold else 0 for _, t in ur]
        y_pred = [1 if e >= threshold else 0 for e, _ in ur]
        precisions.append( precision_score(y_true, y_pred, zero_division=0) )
        recalls.append(    recall_score   (y_true, y_pred, zero_division=0) )
        f1s.append(        f1_score       (y_true, y_pred, zero_division=0) )
        accs.append(       accuracy_score (y_true, y_pred)           )
    
    return np.mean(precisions), np.mean(recalls), np.mean(f1s), np.mean(accs)


In [73]:
# ─── Cell 8: Print Classification Metrics for KNNBasic ─────────────────────────
# filter & compute
p, r, f1, acc = precision_recall_f1(predictions_knn, threshold=4.0)

print(f"Avg. Precision: {p:.3f}")
print(f"Avg. Recall:    {r:.3f}")
print(f"Avg. F1:        {f1:.3f}")
print(f"Avg. Accuracy:  {acc:.3f}")


Avg. Precision: 0.915
Avg. Recall:    0.913
Avg. F1:        0.905
Avg. Accuracy:  0.867


In [75]:
# ─── Cell 9 (updated): measures_at_k with safe nDCG handling ──────────────────
from collections import defaultdict
from sklearn.metrics import average_precision_score, precision_score, ndcg_score
import numpy as np

def measures_at_k(predictions, k=10, threshold=3.5):
    """Return (avg AP, avg Precision@k, avg nDCG@k) across users, 
       skipping nDCG when there's only one item."""
    # 1) Group predictions by user
    user_ratings = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        user_ratings[uid].append((est, true_r))
    
    ap_list, p_at_k, ndcg_at_k = [], [], []
    
    for uid, ur in user_ratings.items():
        # sort by estimated score descending
        ur.sort(key=lambda x: x[0], reverse=True)
        y_score = [est for est, _ in ur]
        y_true  = [1 if true_r >= threshold else 0 for _, true_r in ur]
        
        # Average Precision (AP)
        if sum(y_true) > 0:
            ap_list.append(average_precision_score(y_true, y_score))
        else:
            ap_list.append(0.0)
        
        # Precision@k
        y_true_k = y_true[:k]
        y_pred_k = [1 if s >= threshold else 0 for s in y_score[:k]]
        p_at_k.append(precision_score(y_true_k, y_pred_k, zero_division=0))
        
        # nDCG@k (only if more than one document)
        if len(y_true) > 1:
            ndcg_at_k.append(ndcg_score([y_true], [y_score], k=k))
        else:
            ndcg_at_k.append(0.0)
    
    # 4) Return averages
    return (
        np.mean(ap_list),
        np.mean(p_at_k),
        np.mean(ndcg_at_k)
    )


In [77]:
# ─── Cell 10: Print @5 Metrics ─────────────────────────────────────────────────
ap, p5, n5 = measures_at_k(predictions_knn, k=5, threshold=4.0)
print(f"Avg. AvgPrecision: {ap:.3f}")
print(f"Avg. Precision@5:  {p5:.3f}")
print(f"Avg. nDCG@5:        {n5:.3f}")


Avg. AvgPrecision: 0.939
Avg. Precision@5:  0.923
Avg. nDCG@5:        0.936


In [79]:
# ─── Cell 11: SVD Baseline ─────────────────────────────────────────────────────
from surprise import SVD       # ensure SVD is imported
from surprise.accuracy import mse, rmse, mae  # if you need these here

# instantiate & train
algo_svd = SVD(n_epochs=10, lr_all=0.005, reg_all=0.4)
algo_svd.fit(trainset)

# predict & evaluate
predictions_svd = algo_svd.test(testset)
print("SVD MSE:", mse(predictions_svd))
print("SVD RMSE:", rmse(predictions_svd))
print("SVD MAE:", mae(predictions_svd))


MSE: 0.4819
SVD MSE: 0.4818860685196562
RMSE: 0.6942
SVD RMSE: 0.6941801412599299
MAE:  0.4473
SVD MAE: 0.4472717345441487


In [81]:
# ─── Cell 12: Prepare Item “Content” ────────────────────────────────────────────
# Concatenate product_name, brand_name, primary_category
products['content'] = (
    products['product_name'].fillna('') + ' '
    + products['brand_name'].fillna('') + ' '
    + products['primary_category'].fillna('')
)


In [83]:
# ─── Cell 13: Build Content Dict ───────────────────────────────────────────────
item_content_dict = dict(
    zip(products['itemID'], products['content'])
)


In [85]:
# ─── Cell 14: Attach content to Surprise trainset ──────────────────────────────
# Map rawID → innerID and innerID → rawID
raw2inner_i = trainset._raw2inner_id_items
inner2raw_i = {inner: raw for raw, inner in raw2inner_i.items()}

# trainset.n_items gives number of inner items
trainset.content = [
    item_content_dict.get(inner2raw_i[i], '')
    for i in range(trainset.n_items)
]


In [87]:
# ─── Cell 15: Define ContentKNN with Imports ───────────────────────────────────
from surprise import AlgoBase, PredictionImpossible
from gensim.utils import tokenize
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseMatrixSimilarity
import numpy as np

class ContentKNN(AlgoBase):
    def __init__(self, k=5):
        super().__init__()
        self.k = k

    def fit(self, trainset):
        super().fit(trainset)
        # Tokenize each item’s content string
        tokenized = [list(tokenize(doc)) for doc in trainset.content]
        dct       = Dictionary(tokenized)
        corpus    = [dct.doc2bow(doc) for doc in tokenized]
        model     = TfidfModel(corpus)
        self.index  = SparseMatrixSimilarity(model[corpus], num_features=len(dct))
        self.corpus = corpus
        self.dct    = dct
        return self

    def estimate(self, u, i):
        # Fallback for unknown user
        if not self.trainset.knows_user(u):
            return self.trainset.global_mean

        # Convert raw iid to inner or fallback
        try:
            i_inner = self.trainset.to_inner_iid(i)
        except ValueError:
            return self.trainset.global_mean

        sims         = self.index[self.corpus[i_inner]]
        user_ratings = self.trainset.ur[u]

        # Build similarity dict for items the user rated
        sim_dict = {iid: sims[iid] for iid, _ in user_ratings}

        # Top-k positive similarities
        top_k = sorted(sim_dict.items(), key=lambda x: x[1], reverse=True)[:self.k]
        top_k = [(iid, sim) for iid, sim in top_k if sim > 0]

        # Fallback if no neighbors
        if not top_k:
            ratings = [r for (_, r) in user_ratings]
            return np.mean(ratings) if ratings else self.trainset.global_mean

        # Weighted average
        num = den = 0.0
        for iid, sim in top_k:
            rating = next(r for (inner_id, r) in user_ratings if inner_id == iid)
            num += sim * rating
            den += sim

        est = num/den if den > 0 else self.trainset.global_mean
        low, high = self.trainset.rating_scale
        return min(high, max(low, est))


In [89]:
# ─── Cell 16: Train & Eval ContentKNN (with empty‐list guard) ────────────────
from surprise.accuracy import mse, rmse, mae

algo_ct = ContentKNN(k=5)
algo_ct.fit(trainset)
predictions_ct = algo_ct.test(testset)

# Filter out impossible predictions via details
clean_ct = [p for p in predictions_ct 
            if not p.details.get('was_impossible', False)]
print(f"Valid ContentKNN predictions: {len(clean_ct)}/{len(predictions_ct)}")

if clean_ct:
    print("ContentKNN MSE: ", mse(clean_ct))
    print("ContentKNN RMSE:", rmse(clean_ct))
    print("ContentKNN MAE: ", mae(clean_ct))
else:
    print("No valid ContentKNN predictions to evaluate.")


Valid ContentKNN predictions: 20389/20389
MSE: 0.6613
ContentKNN MSE:  0.6612627904813975
RMSE: 0.8132
ContentKNN RMSE: 0.8131806628796564
MAE:  0.5845
ContentKNN MAE:  0.5844555301103637


In [91]:
# ─── Cell 17: Inspect ContentKNN Predictions ───────────────────────────────────
predictions_ct[:10]


[Prediction(uid='2384176250', iid='P482551', r_ui=3.0, est=4.612742373213877, details={'was_impossible': False}),
 Prediction(uid='1282164317', iid='P443700', r_ui=5.0, est=4.612742373213877, details={'was_impossible': False}),
 Prediction(uid='5174374359', iid='P455936', r_ui=5.0, est=4.612742373213877, details={'was_impossible': False}),
 Prediction(uid='23374285741', iid='P474832', r_ui=5.0, est=4.612742373213877, details={'was_impossible': False}),
 Prediction(uid='9473501380', iid='P479841', r_ui=5.0, est=4.612742373213877, details={'was_impossible': False}),
 Prediction(uid='32113426444', iid='P269122', r_ui=5.0, est=4.612742373213877, details={'was_impossible': False}),
 Prediction(uid='12404252268', iid='P469503', r_ui=5.0, est=4.612742373213877, details={'was_impossible': False}),
 Prediction(uid='23207278907', iid='P471546', r_ui=1.0, est=4.612742373213877, details={'was_impossible': False}),
 Prediction(uid='1976782934', iid='P443352', r_ui=5.0, est=4.612742373213877, detail

In [93]:
# ─── Cell 18: Cold-Start Demo ──────────────────────────────────────────────────
uid_demo = reviews['userID'].iloc[0]
iid_demo = str(products['itemID'].iloc[-1])  # likely unseen by this user

print("ContentKNN cold-start:", algo_ct.predict(uid_demo, iid_demo, r_ui=3.0, verbose=True))
print("KNNBasic   cold-start:", algo_knn.predict(uid_demo, iid_demo, r_ui=3.0, verbose=True))


user: 1741593524 item: P505461    r_ui = 3.00   est = 4.61   {'was_impossible': False}
ContentKNN cold-start: user: 1741593524 item: P505461    r_ui = 3.00   est = 4.61   {'was_impossible': False}
user: 1741593524 item: P505461    r_ui = 3.00   est = 4.61   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
KNNBasic   cold-start: user: 1741593524 item: P505461    r_ui = 3.00   est = 4.61   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}


In [95]:
# ─── Cell 19: Define Hybrid via .predict().est ─────────────────────────────────
from surprise import AlgoBase
import numpy as np

class ContentKNNBasicHybrid(AlgoBase):
    def __init__(self, k=5, sim_opts=None):
        super().__init__()
        self.k        = k
        self.sim_opts = sim_opts or {
            'name':'pearson',
            'user_based':True,
            'min_support':1
        }

    def fit(self, trainset):
        super().fit(trainset)
        # content‐based
        self.content = ContentKNN(self.k).fit(trainset)
        # collaborative
        from surprise import KNNBasic
        self.knn = KNNBasic(sim_options=self.sim_opts).fit(trainset)
        return self

    def estimate(self, u_inner, i_inner):
        # convert to raw IDs for .predict
        try:
            raw_u = self.trainset.to_raw_uid(u_inner)
            raw_i = self.trainset.to_raw_iid(i_inner)
        except Exception:
            # unseen user or item
            return self.trainset.global_mean

        # 1) content pred
        try:
            p1 = self.content.predict(raw_u, raw_i, r_ui=None, verbose=False)
            r1 = p1.est
        except Exception:
            r1 = self.trainset.global_mean

        # 2) knn pred
        try:
            p2 = self.knn.predict(raw_u, raw_i, r_ui=None, verbose=False)
            r2 = p2.est
        except Exception:
            r2 = self.trainset.global_mean

        # 3) average
        return (r1 + r2) / 2


In [97]:
# ─── Cell 20: Train & Eval Hybrid ──────────────────────────────────────────────
from surprise.accuracy import mse, rmse, mae

hyb = ContentKNNBasicHybrid(k=5, sim_opts=sim_options)
hyb.fit(trainset)
predictions_hyb = hyb.test(testset)

# Filter out any impossible
clean_hyb = [p for p in predictions_hyb if not p.details.get('was_impossible', False)]
print(f"Hybrid: {len(clean_hyb)}/{len(predictions_hyb)} valid preds")
print("Hybrid MSE: ", mse(clean_hyb))
print("Hybrid RMSE:", rmse(clean_hyb))
print("Hybrid MAE: ", mae(clean_hyb))


Computing the pearson similarity matrix...
Done computing similarity matrix.
Hybrid: 20389/20389 valid preds
MSE: 0.6102
Hybrid MSE:  0.6102481567209332
RMSE: 0.7812
Hybrid RMSE: 0.7811838174981182
MAE:  0.5657
Hybrid MAE:  0.5656876980093128


In [99]:
# ─── Cell 21: Top-N Utility ────────────────────────────────────────────────────
def get_top_n(preds, n=10):
    top_n = defaultdict(list)
    for uid,iid,true,est,_ in preds:
        top_n[uid].append((iid,est))
    for uid,ratings in top_n.items():
        ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = ratings[:n]
    return top_n


In [101]:
# ─── Cell 22: Show Top-5 for a Sample User ─────────────────────────────────────
top_n = get_top_n(predictions_hyb, n=5)
sample_uid = reviews['userID'].iloc[0]
print(f"Top-5 for user {sample_uid}:")
for i,(iid,score) in enumerate(top_n[sample_uid],1):
    name = products.loc[products['itemID']==iid,'product_name'].iloc[0]
    print(f"{i}. {name} (score {score:.3f})")


Top-5 for user 1741593524:


In [103]:
# ─── Cell 23: Full-Data GridSearchCV for SVD ────────────────────────────────────
import random
from surprise import Dataset, Reader, SVD
from surprise.model_selection import GridSearchCV

# Rebuild the full Dataset object
reader = Reader(rating_scale=(1,5))
data   = Dataset.load_from_df(reviews[['userID','itemID','rating']], reader)

# 1) Shuffle & split raw_ratings into A (80%) and B (20%)
raw_ratings = data.raw_ratings.copy()
random.shuffle(raw_ratings)
split = int(0.8 * len(raw_ratings))
A_raw = raw_ratings[:split]
B_raw = raw_ratings[split:]

data.raw_ratings = A_raw  # now data contains only A

# 2) GridSearchCV on A
param_grid = {
    'n_epochs': [5, 10],
    'lr_all'  : [0.002, 0.005],
    'reg_all' : [0.4, 0.6]
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3, n_jobs=-1)
gs.fit(data)

print("Best validation RMSE:", gs.best_score['rmse'])
print("Best params:", gs.best_params['rmse'])

# 3) Retrain on full A and evaluate biased/unbiased
best_algo = gs.best_estimator['rmse']
full_train = data.build_full_trainset()
best_algo.fit(full_train)

from surprise.accuracy import rmse

# Biased accuracy on A
pred_A = best_algo.test(full_train.build_testset())
print("Biased RMSE on A:", rmse(pred_A))

# Unbiased accuracy on B
pred_B = best_algo.test(data.construct_testset(B_raw))
print("Unbiased RMSE on B:", rmse(pred_B))


Best validation RMSE: 1.0991484105869183
Best params: {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
RMSE: 1.0345
Biased RMSE on A: 1.034491358698007
RMSE: 1.0905
Unbiased RMSE on B: 1.0904865512854083


In [105]:
# ─── Cell 24: Show Held-Out Positives for sample_uid in B_raw ─────────────────
uid = sample_uid  # as defined in Cell 22

# B_raw contains tuples like (user, item, rating[, ...])
# We only care about positions 0 (user) and 1 (item)
relevant = [entry[1] for entry in B_raw if entry[0] == uid]

print("Held-out positives for user:", uid)
for iid in relevant:
    name = products.loc[products['itemID'] == iid, 'product_name'].iloc[0]
    print(f" • {iid} → {name}")


Held-out positives for user: 1741593524
 • P476894 → NUDESKIN Lemon-Aid Detox & Glow Micro-Peel
