In [20]:
from surprise import Dataset, Reader, KNNBasic
from surprise import KNNWithMeans
from surprise import SVD
from surprise.model_selection import train_test_split
from pdb import set_trace

import time

from collections import defaultdict

from surprise.accuracy import mse, rmse, mae
from collections import defaultdict
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

import random
import numpy as np
import os
import pandas as pd

my_seed = 0
random.seed(my_seed)
np.random.seed(my_seed)

# ===== Pandas Display & Warnings =====
import warnings
from pandas.errors import DtypeWarning
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore', category=DtypeWarning)

### Part 1: Paths, and Loading Product Data

In [2]:
pro_inf = pd.read_csv('..\\sephora_dataset\\product_info.csv')
pro_inf_original = pro_inf.copy()
rew_1 = pd.read_csv('..\\sephora_dataset\\reviews_0-250.csv')
rew_2 = pd.read_csv('..\\sephora_dataset\\reviews_250-500.csv')
rew_3 = pd.read_csv('..\\sephora_dataset\\reviews_500-750.csv')
rew_4 = pd.read_csv('..\\sephora_dataset\\reviews_750-1250.csv')
rew_5 = pd.read_csv('..\\sephora_dataset\\reviews_1250-end.csv')

files = [
    "reviews_0-250.csv",
    "reviews_250-500.csv",
    "reviews_500-750.csv",
    "reviews_750-1250.csv",
    "reviews_1250-end.csv"
]

df_reviews = pd.concat([rew_1, rew_2, rew_3, rew_4, rew_5], ignore_index=True, axis=0)
# Remove any "Unnamed" columns
df_reviews = df_reviews.loc[:, ~df_reviews.columns.str.startswith('Unnamed')]
print("Combined shape:", df_reviews.shape)

Combined shape: (1094411, 18)


In [3]:
pro_inf.head(2)

Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,rating,reviews,size,variation_type,variation_value,variation_desc,ingredients,price_usd,value_price_usd,sale_price_usd,limited_edition,new,online_only,out_of_stock,sephora_exclusive,highlights,primary_category,secondary_category,tertiary_category,child_count,child_max_price,child_min_price
0,P473671,Fragrance Discovery Set,6342,19-69,6320,3.6364,11.0,,,,,"['Capri Eau de Parfum:', 'Alcohol Denat. (SD A...",35.0,,,0,0,1,0,0,"['Unisex/ Genderless Scent', 'Warm &Spicy Scen...",Fragrance,Value & Gift Sets,Perfume Gift Sets,0,,
1,P473668,La Habana Eau de Parfum,6342,19-69,3827,4.1538,13.0,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL,,"['Alcohol Denat. (SD Alcohol 39C), Parfum (Fra...",195.0,,,0,0,1,0,0,"['Unisex/ Genderless Scent', 'Layerable Scent'...",Fragrance,Women,Perfume,2,85.0,30.0


In [4]:
df_reviews.head(2)

Unnamed: 0,author_id,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,submission_time,review_text,review_title,skin_tone,eye_color,skin_type,hair_color,product_id,product_name,brand_name,price_usd
0,1741593524,5,1.0,1.0,2,0,2,2023-02-01,I use this with the Nudestix “Citrus Clean Bal...,Taught me how to double cleanse!,,brown,dry,black,P504322,Gentle Hydra-Gel Face Cleanser,NUDESTIX,19.0
1,31423088263,1,0.0,,0,0,0,2023-03-21,I bought this lip mask after reading the revie...,Disappointed,,,,,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0


### Get unique counts for the columns

In [5]:
# Get unique counts for the columns
unique_stats = {
    "Unique product_id in df_reviews": df_reviews["product_id"].nunique(),
    "Unique product_id in pro_inf": pro_inf["product_id"].nunique(),
    "Unique author_id in df_reviews": df_reviews["author_id"].nunique()
}

for key, value in unique_stats.items():
    print(f"{key}: {value}")


Unique product_id in df_reviews: 2351
Unique product_id in pro_inf: 8494
Unique author_id in df_reviews: 578653


#### The difference in product_id is because df_reviews focuses on Skincare products that have user reviews, whereas pro_inf includes all products in the Sephora store.

### Check whether each author rated each product only once, cause it's necessary for KNN

In [6]:
# Check for duplicate author_id and product_id pairs
duplicates = df_reviews.duplicated(subset=['author_id', 'product_id'], keep=False)

# Show duplicates
duplicate_rows = df_reviews[duplicates]

# Count of duplicate (author_id, product_id) pairs
num_duplicates = duplicate_rows.shape[0]

if num_duplicates > 0:
    print(f"There are {num_duplicates} duplicate (author_id, product_id) entries.")
else:
    print("Each author rated each product only once.")

There are 10471 duplicate (author_id, product_id) entries.


### Check for Missing Values

In [7]:
missing_values = df_reviews[['author_id', 'product_id', 'rating']].isnull().sum()

print(f"Missing values in author_id: {missing_values['author_id']}")
print(f"Missing values in product_id: {missing_values['product_id']}")
print(f"Missing values in rating: {missing_values['rating']}")

Missing values in author_id: 0
Missing values in product_id: 0
Missing values in rating: 0


### Aggregating Ratings by author_id and product_id. We'll group by author_id and product_id and compute the mean rating for each pair.

In [8]:
# Group by author_id and product_id, and aggregate ratings by mean
df_aggregated = df_reviews.groupby(['author_id', 'product_id'], as_index=False)['rating'].mean()

# Check how many NaN values exist in the aggregated ratings
missing_values_after_aggregation = df_aggregated['rating'].isnull().sum()
print(f"Missing values in rating after aggregation: {missing_values_after_aggregation}")


Missing values in rating after aggregation: 0


In [9]:
df_aggregated.head()

Unnamed: 0,author_id,product_id,rating
0,538863,P420652,1.0
1,549704,P218700,5.0
2,557770,P232903,5.0
3,561736,P421998,5.0
4,561736,P445951,5.0


In [10]:
# Check for duplicate author_id and product_id pairs
duplicates = df_aggregated.duplicated(subset=['author_id', 'product_id'], keep=False)

# Show duplicates
duplicate_rows = df_aggregated[duplicates]

# Count of duplicate (author_id, product_id) pairs
num_duplicates = duplicate_rows.shape[0]

if num_duplicates > 0:
    print(f"There are {num_duplicates} duplicate (author_id, product_id) entries.")
else:
    print("Each author rated each product only once.")

Each author rated each product only once.


In [11]:
# Load data into dataset
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_aggregated[['author_id', 'product_id', 'rating']], reader)

### KNNBasic

In [53]:
# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data, test_size=0.25, random_state=my_seed)

# Define a custom similarity function for the KNNBasic algorithm
sim_options = {'name': 'pearson', 'user_based': False, 'min_support': 1}

algo_knn = KNNBasic(sim_options=sim_options)

# Train the algorithm on the trainset, and predict ratings for the testset
algo_knn.fit(trainset)
predictions = algo_knn.test(testset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


#### Initially, we attempted to use a user-based collaborative filtering model (user_based=True) with the KNNBasic algorithm. However, this approach failed due to the extremely large number of unique users in the dataset (more than 450,000). The algorithm tries to compute a full user-user similarity matrix, which requires over 850 GB of memory — far exceeding the capacity of a standard system. This resulted in a MemoryError. To resolve this, we switched to item-based collaborative filtering (user_based=False), which computes a significantly smaller item-item similarity matrix. Since the number of unique items is much lower than the number of users, this approach is computationally feasible and successfully completed training and prediction.

### Compute MSE, RMSE and MAE on the test set predictions

In [54]:
mse(predictions)
rmse(predictions)
mae(predictions);

MSE: 1.1959
RMSE: 1.0936
MAE:  0.7193


#### The model achieved an RMSE of 1.0936 and an MAE of 0.7193 on a 1–5 rating scale. This indicates that, on average, predicted ratings differ from actual ratings by about 1 point (RMSE) or 0.72 points (MAE). While not highly precise, these values are typical for baseline models like KNN and provide a solid starting point for further improvement.

### Categorical Evaluation. Precision, Recall and F1

In [55]:
def precision_recall_f1(predictions, threshold=3.5):
    """Return precision, recall, f1, and accuracy averaged across all users."""
    
    # Group predictions by user, storing 'est' and 'true_r' in separate lists
    user_data = defaultdict(lambda: {'est': [], 'true_r': []})
    for uid, _, true_r, est, _ in predictions:
        user_data[uid]['est'].append(est)
        user_data[uid]['true_r'].append(true_r)
    
    # Lists to hold metrics for each user
    precisions = []
    recalls = []
    f1s = []
    accuracies = []
    
    for uid, data in user_data.items():
        est = np.array(data['est'])
        true_r = np.array(data['true_r'])
        
        # Vectorized computation of binary labels
        y_true = (true_r >= threshold).astype(int)
        y_pred = (est >= threshold).astype(int)
        
        # Compute confusion matrix components
        sum_true = y_true.sum()
        sum_pred = y_pred.sum()
        TP = np.dot(y_true, y_pred)
        FP = sum_pred - TP
        FN = sum_true - TP
        total = len(y_true)
        TN = total - (TP + FP + FN)
        
        # Calculate metrics
        precision = TP / (TP + FP) if (TP + FP) != 0 else 0.0
        recall = TP / (TP + FN) if (TP + FN) != 0 else 0.0
        if (precision + recall) == 0:
            f1 = 0.0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        accuracy = (TP + TN) / total if total != 0 else 0.0
        
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
        accuracies.append(accuracy)
    
    # Compute averages
    avg_precision = np.mean(precisions)
    avg_recall = np.mean(recalls)
    avg_f1 = np.mean(f1s)
    avg_accuracy = np.mean(accuracies)
    
    return avg_precision, avg_recall, avg_f1, avg_accuracy

In [56]:
# Start timer
start_time = time.time()

# get average evaluation measures and print them
avg_precision, avg_recall, avg_f1, avg_accuracy = precision_recall_f1(predictions, threshold=3.5)

# End timer
end_time = time.time()

# Calculate time in minutes
time_minutes = (end_time - start_time) / 60

print(f'Avg. Precision: {avg_precision}')
print(f'Avg. Recall: {avg_recall}')
print(f'Avg. F1: {avg_f1}')
print(f'Avg. Accuracy: {avg_accuracy}')
print(f"Time taken: {time_minutes:.2f} minutes")

Avg. Precision: 0.79512939131075
Avg. Recall: 0.8051404845918121
Avg. F1: 0.7974329507334499
Avg. Accuracy: 0.8305424252034144
Time taken: 0.06 minutes


### Rank-based Evaluation. Average Precision (AP), Precision@k, R-Precision and nDCG

In [57]:
from sklearn.metrics import average_precision_score, ndcg_score

def measures_at_k(predictions, k=10, threshold=3.5):
    """Optimized version with vectorized operations and precomputed metrics"""
    
    # Precompute discount factors for NDCG
    discounts = 1 / np.log2(np.arange(2, k + 2))  # Precompute for vectorized NDCG
    
    # Group data more efficiently using numpy arrays
    user_data = defaultdict(lambda: {'est': [], 'true_r': []})
    for uid, _, true_r, est, _ in predictions:
        user_data[uid]['est'].append(est)
        user_data[uid]['true_r'].append(true_r)
    
    # Initialize metric accumulators
    ap_sum = 0.0
    prec_sum = 0.0
    ndcg_sum = 0.0
    user_count = 0
    
    for uid, data in user_data.items():
        est = np.array(data['est'])
        true_r = np.array(data['true_r'])
        n = len(est)
        if n == 0:
            continue
        
        # Vectorized sorting and threshold checks
        sort_idx = np.argsort(-est)  # Descending sort
        true_sorted = true_r[sort_idx]
        y_true = (true_sorted >= threshold).astype(int)
        y_pred = (est[sort_idx] >= threshold).astype(int)
        
        # Precision@k calculation
        tp = np.sum(y_true[:k] & y_pred[:k])
        prec_sum += tp / np.sum(y_pred[:k]) if np.any(y_pred[:k]) else 0.0
        
        # Average precision calculation
        if np.sum(y_true) > 0:
            ap_sum += average_precision_score(y_true, y_pred)
        else:
            ap_sum += 0.0
        
        # Vectorized NDCG calculation
        if n > 1:
            ideal_sorted = np.sort(true_r)[::-1]
            dcg = np.sum((2 ** true_sorted[:k] - 1) * discounts[:min(k, n)])
            idcg = np.sum((2 ** ideal_sorted[:k] - 1) * discounts[:min(k, n)])
            ndcg_sum += dcg / idcg if idcg > 0 else 0.0
        else:
            ndcg_sum += 1.0 if (est[0] >= threshold and true_r[0] >= threshold) else 0.0
        
        user_count += 1

    # Final averages
    avg_ap = ap_sum / user_count if user_count else 0.0
    avg_prec = prec_sum / user_count if user_count else 0.0
    avg_ndcg = ndcg_sum / user_count if user_count else 0.0
    
    return avg_ap, avg_prec, avg_ndcg

In [31]:
# Start timer
start_time = time.time()

# get average evaluation measures and print them
avg_average_precisions, avg_precisions, avg_ndcgs = measures_at_k(predictions, k=5, threshold=3.5)

# End timer
end_time = time.time()

# Calculate time in minutes
time_minutes = (end_time - start_time) / 60

print(f'Avg. AveragePrecision: {avg_average_precisions}')
print(f'Avg. Precision@5: {avg_precisions}')
print(f'Avg. nDCG@5: {avg_ndcgs}')

print(f"Time taken: {time_minutes:.2f} minutes")

Avg. AveragePrecision: 0.8145375300410189
Avg. Precision@5: 0.7947792811920974
Avg. nDCG@5: 0.8197646839367108
Time taken: 1.25 minutes


#### The model is performing well. The average precision of 0.81 indicates that relevant items are generally ranked highly. With a precision@5 of 0.79, nearly 80% of the top-5 recommendations are relevant, which is a strong result. The nDCG@5 score of 0.82 suggests that relevant items are not only included in the top-5 but are also ranked well. Overall, these metrics show that recommender system is providing highly relevant and well-ordered suggestions.

### KNNWithMeans

In [58]:
# Switch to item-based similarity (much smaller matrix)
sim_options = {'name': 'pearson', 'user_based': False, 'min_support': 1}
algo_knn_means = KNNWithMeans(sim_options=sim_options)
algo_knn_means.fit(trainset)
predictions = algo_knn_means.test(testset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


In [59]:
mse(predictions)
rmse(predictions)
mae(predictions);

MSE: 1.1596
RMSE: 1.0769
MAE:  0.7218


#### Slight improvement of RMSE and MSE, suggests that KNNWithMeans is doing a better job than KNNBasic by correcting for user/item bias — even though it's a relatively small gain, it's a consistent improvement across the board.

In [34]:
# get average evaluation measures and print them
avg_precision, avg_recall, avg_f1, avg_accuracy = precision_recall_f1(predictions, threshold=3.5)
print(f'Avg. Precision: {avg_precision}')
print(f'Avg. Recall: {avg_recall}')
print(f'Avg. F1: {avg_f1}')
print(f'Avg. Accuracy: {avg_accuracy}')

Avg. Precision: 0.7947115799494122
Avg. Recall: 0.8037613639530574
Avg. F1: 0.7965524209479043
Avg. Accuracy: 0.8315693146646884


In [35]:
# get average evaluation measures and print them
avg_average_precisions, avg_precisions, avg_ndcgs = measures_at_k(predictions, k=5, threshold=3.5)
print(f'Avg. AveragePrecision: {avg_average_precisions}')
print(f'Avg. Precision@5: {avg_precisions}')
print(f'Avg. nDCG@5: {avg_ndcgs}')

Avg. AveragePrecision: 0.8145375300410189
Avg. Precision@5: 0.7947792811920974
Avg. nDCG@5: 0.8197646839367108


### Model-based Recommender Systems. SVD

In [36]:
algo = SVD(n_epochs=10, lr_all=0.005, reg_all=0.4)
algo.fit(trainset)
predictions = algo.test(testset)

mse(predictions)
rmse(predictions)
mae(predictions);

MSE: 1.2051
RMSE: 1.0978
MAE:  0.8432


#### The SVD model produced an RMSE of 1.0978 and an MAE of 0.8432, underperforming compared to both KNNBasic and KNNWithMeans. This suggests that, under the current settings, matrix factorization did not model user-item interactions as effectively as memory-based methods. Further tuning of SVD hyperparameters may improve performance. Due to lack of computing power and the size of the dataset, we decided not to experiment on it.

### Content-based Prediction

In [63]:
pro_inf['content'] = (
    pro_inf['product_name'].fillna('') + ' '
    + pro_inf['brand_name'].fillna('') + ' '
    + pro_inf['primary_category'].fillna('')
)

In [64]:
# Build Content Dict 
item_content_dict = dict(
    zip(pro_inf['product_id'], pro_inf['content'])
)

In [65]:
pro_inf.head(2)

Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,rating,reviews,size,variation_type,variation_value,variation_desc,ingredients,price_usd,value_price_usd,sale_price_usd,limited_edition,new,online_only,out_of_stock,sephora_exclusive,highlights,primary_category,secondary_category,tertiary_category,child_count,child_max_price,child_min_price,content
0,P473671,Fragrance Discovery Set,6342,19-69,6320,3.6364,11.0,,,,,"['Capri Eau de Parfum:', 'Alcohol Denat. (SD A...",35.0,,,0,0,1,0,0,"['Unisex/ Genderless Scent', 'Warm &Spicy Scen...",Fragrance,Value & Gift Sets,Perfume Gift Sets,0,,,Fragrance Discovery Set 19-69 Fragrance
1,P473668,La Habana Eau de Parfum,6342,19-69,3827,4.1538,13.0,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL,,"['Alcohol Denat. (SD Alcohol 39C), Parfum (Fra...",195.0,,,0,0,1,0,0,"['Unisex/ Genderless Scent', 'Layerable Scent'...",Fragrance,Women,Perfume,2,85.0,30.0,La Habana Eau de Parfum 19-69 Fragrance


In [66]:
# Attach content to Surprise trainset
# Map rawID → innerID and innerID → rawID
raw2inner_i = trainset._raw2inner_id_items
inner2raw_i = {inner: raw for raw, inner in raw2inner_i.items()}

# trainset.n_items gives number of inner items
trainset.content = [
    item_content_dict.get(inner2raw_i[i], '')
    for i in range(trainset.n_items)
]

In [48]:
# Define ContentKNN with Imports 
from surprise import AlgoBase, PredictionImpossible
from gensim.utils import tokenize
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseMatrixSimilarity
import numpy as np

class ContentKNN(AlgoBase):
    def __init__(self, k=5):
        super().__init__()
        self.k = k

    def fit(self, trainset):
        super().fit(trainset)
        # Tokenize each item’s content string
        tokenized = [list(tokenize(doc)) for doc in trainset.content]
        dct       = Dictionary(tokenized)
        corpus    = [dct.doc2bow(doc) for doc in tokenized]
        model     = TfidfModel(corpus)
        self.index  = SparseMatrixSimilarity(model[corpus], num_features=len(dct))
        self.corpus = corpus
        self.dct    = dct
        return self

    def estimate(self, u, i):
        # Fallback for unknown user
        if not self.trainset.knows_user(u):
            return self.trainset.global_mean

        # Convert raw iid to inner or fallback
        try:
            i_inner = self.trainset.to_inner_iid(i)
        except ValueError:
            return self.trainset.global_mean

        sims         = self.index[self.corpus[i_inner]]
        user_ratings = self.trainset.ur[u]

        # Build similarity dict for items the user rated
        sim_dict = {iid: sims[iid] for iid, _ in user_ratings}

        # Top-k positive similarities
        top_k = sorted(sim_dict.items(), key=lambda x: x[1], reverse=True)[:self.k]
        top_k = [(iid, sim) for iid, sim in top_k if sim > 0]

        # Fallback if no neighbors
        if not top_k:
            ratings = [r for (_, r) in user_ratings]
            return np.mean(ratings) if ratings else self.trainset.global_mean

        # Weighted average
        num = den = 0.0
        for iid, sim in top_k:
            rating = next(r for (inner_id, r) in user_ratings if inner_id == iid)
            num += sim * rating
            den += sim

        est = num/den if den > 0 else self.trainset.global_mean
        low, high = self.trainset.rating_scale
        return min(high, max(low, est))


In [49]:
algo_ct = ContentKNN(k=5)
algo_ct.fit(trainset)
predictions_ct = algo_ct.test(testset)


mse(predictions_ct)
rmse(predictions_ct)
mae(predictions_ct)

MSE: 1.3229
RMSE: 1.1502
MAE:  0.8957


0.8956989363101945

In [50]:
predictions_ct[:10]

[Prediction(uid='5501523111', iid='P474326', r_ui=4.0, est=4.2984068335048775, details={'was_impossible': False}),
 Prediction(uid='6753667067', iid='P443843', r_ui=5.0, est=4.2984068335048775, details={'was_impossible': False}),
 Prediction(uid=2437711485, iid='P421243', r_ui=4.0, est=4.2984068335048775, details={'was_impossible': False}),
 Prediction(uid='7099563866', iid='P12045', r_ui=5.0, est=4.2984068335048775, details={'was_impossible': False}),
 Prediction(uid='1785061628', iid='P440489', r_ui=5.0, est=4.2984068335048775, details={'was_impossible': False}),
 Prediction(uid='1198233191', iid='P433444', r_ui=5.0, est=4.2984068335048775, details={'was_impossible': False}),
 Prediction(uid='1367652116', iid='P479340', r_ui=5.0, est=4.2984068335048775, details={'was_impossible': False}),
 Prediction(uid='930452003', iid='P474953', r_ui=3.0, est=4.2984068335048775, details={'was_impossible': False}),
 Prediction(uid=1834850464, iid='P467750', r_ui=5.0, est=4.2984068335048775, details

In [67]:
# Cold-Start Demo
uid_demo = df_reviews['author_id'].iloc[0]
iid_demo = str(pro_inf['product_id'].iloc[-1])  # likely unseen by this user

print("ContentKNN cold-start:", algo_ct.predict(uid_demo, iid_demo, r_ui=3.0, verbose=True))
print("KNNBasic   cold-start:", algo_knn.predict(uid_demo, iid_demo, r_ui=3.0, verbose=True))

user: 1741593524 item: P505461    r_ui = 3.00   est = 4.30   {'was_impossible': False}
ContentKNN cold-start: user: 1741593524 item: P505461    r_ui = 3.00   est = 4.30   {'was_impossible': False}
user: 1741593524 item: P505461    r_ui = 3.00   est = 4.30   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
KNNBasic   cold-start: user: 1741593524 item: P505461    r_ui = 3.00   est = 4.30   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}


In [68]:
# Define Hybrid via .predict().est
from surprise import AlgoBase
import numpy as np

class ContentKNNBasicHybrid(AlgoBase):
    def __init__(self, k=5, sim_opts=None):
        super().__init__()
        self.k        = k
        self.sim_opts = sim_opts or {
            'name':'pearson',
            'user_based':True,
            'min_support':1
        }

    def fit(self, trainset):
        super().fit(trainset)
        # content‐based
        self.content = ContentKNN(self.k).fit(trainset)
        # collaborative
        from surprise import KNNBasic
        self.knn = KNNBasic(sim_options=self.sim_opts).fit(trainset)
        return self

    def estimate(self, u_inner, i_inner):
        # convert to raw IDs for .predict
        try:
            raw_u = self.trainset.to_raw_uid(u_inner)
            raw_i = self.trainset.to_raw_iid(i_inner)
        except Exception:
            # unseen user or item
            return self.trainset.global_mean

        # 1) content pred
        try:
            p1 = self.content.predict(raw_u, raw_i, r_ui=None, verbose=False)
            r1 = p1.est
        except Exception:
            r1 = self.trainset.global_mean

        # 2) knn pred
        try:
            p2 = self.knn.predict(raw_u, raw_i, r_ui=None, verbose=False)
            r2 = p2.est
        except Exception:
            r2 = self.trainset.global_mean

        # 3) average
        return (r1 + r2) / 2

In [71]:
hyb = ContentKNNBasicHybrid(k=5, sim_opts=sim_options)
hyb.fit(trainset)
predictions_hyb = hyb.test(testset)

mse(clean_hyb)
rmse(clean_hyb)
mae(clean_hyb);

Computing the pearson similarity matrix...
Done computing similarity matrix.
MSE: 1.1487
RMSE: 1.0718
MAE:  0.8002
