## Hybrid Recommender

A simple hybrid recommender that combines content-based recommenders that use embeddings from TF-IDF and a SVD-based collaborative-filtering algorithm. 

In [1]:
from collections import defaultdict

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, roc_auc_score
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split


reviews = pd.read_json("data\\reviews_Musical_Instruments_5.json.zip",
                          lines=True)
ratings = pd.read_csv("data\\ratings_Musical_Instruments.zip", 
                      compression="zip")

reviews = reviews[["reviewerID", "asin", "reviewText"]]
reviews["reviewText"] = reviews["reviewText"].str.lower()

data = pd.merge(reviews,
                ratings,
                left_on = ["reviewerID", "asin"],
                right_on = ["userId", "asin"],
                how="left")

mean_ratings = data.loc[:, ["userId", "rating"]].groupby("userId").mean()

### Create TF-IDF Embeddings for Content-Based Recommender

I tried two different ways of creating the user profile: one way that took a sum of the embeddings and one that a mean-adjusted for each component of the feature vector.  The latter approach has better theoretical properties and performed better.

In [2]:
tfidf = TfidfVectorizer(sublinear_tf = True, # tf is replaced with 1 + log(tf))
                        stop_words = "english",
                        max_features = 5000,
                        max_df = 0.5,
                        min_df = 5,
                        token_pattern = u'(?ui)\\b\\w*[a-z]+\\w*\\b')

tfidf_embeddings = tfidf.fit_transform(data["reviewText"])

item_profiles = {}
for asin in data["asin"].unique():
    indices = data[data["asin"] == asin].index
    if len(indices) > 0:
        item_profiles[asin] = tfidf_embeddings[indices].mean(axis=0)

# User profile is simple sum of items above a threshold
def get_user_profile(user_id, threshold = 3.5):
    user_data = data[(data["reviewerID"] == user_id) & (data["rating"] >= threshold)]
    user_profile = np.zeros((1, tfidf_embeddings.shape[1]))
    for asin in user_data["asin"]:
        if asin in item_profiles:
            user_profile += item_profiles[asin]
    return user_profile

# User profile is weighted averaged based on rating of items
def get_user_profile_2(user, data, embeddings):
    num_features = tfidf_embeddings.shape[1]
    user_ratings = data[(data["reviewerID"] == user_id)]

    num_ratings = np.zeros(num_features)
    csum_ratings = np.zeros(num_features)
    
    for i in range(len(user_ratings)):
        userId = user_ratings.iloc[i,0]
        asin = user_ratings.iloc[i,1]
        embedding = item_profiles[asin]
        
        if embedding.shape[0] > 0:
            mean_user_rating = mean_ratings[mean_ratings.index == userId].iloc[0,0]
            rating = user_ratings.iloc[i, 4]
            for j in embedding.nonzero()[1]:
                num_ratings[j] += 1
                csum_ratings[j] += mean_user_rating - rating
   
    # Calculate avg number of
    user_profile = np.zeros(num_features)
    for i in range(num_features):
        if num_ratings[i] > 0:
            user_profile[i] = csum_ratings[i] / num_ratings[i]
    return user_profile.reshape(1, -1)

### Create SVD Collaborative-Filtering

In [3]:
reader = Reader(rating_scale=(1, 5))
svd_data = Dataset.load_from_df(data[["reviewerID", "asin", "rating"]],
                                     reader)

TEST_SIZE = 0.2
train_data, test_data = train_test_split(svd_data,
                                     test_size = TEST_SIZE,
                                     random_state = 1)
svd = SVD(verbose = False)
svd.fit(train_data)
svd_preds = svd.test(test_data)

svd_df = pd.DataFrame([(pred.uid,
                        pred.iid,
                        pred.est,
                        pred.r_ui) for pred in svd_preds],
                      columns = ["user_id", "item_id", "svd_score", "true_rating"])

### Get Content-Based Recommedations

In [4]:
content_preds = []
for user_id, item_id, _ in test_data:
    if item_id in item_profiles:
        user_profile = get_user_profile_2(user_id,
                                          data,
                                          item_profiles)
        #user_profile = get_user_profile(user_id)
        score = cosine_similarity(np.asarray(user_profile),
                                  np.asarray(item_profiles[item_id])).flatten()[0]
    else:
        score = 0
        
    pred = (user_id, item_id, score)
    content_preds.append(pred)

# Take cosine-based recommnedations and convert to a ratings scale
content = pd.DataFrame(content_preds,
                          columns = ["user_id", "item_id", "content_score"])
content["content_score"] = MinMaxScaler(feature_range = (1, 5)).fit_transform(content[["content_score"]])

### Define helper functions
#### Combine SVD and content-based recommendations using weighted sum

In [5]:
def get_hybrid_recommendations(alpha):
    hybrid = pd.merge(content,
                        svd_df,
                        on = ["user_id", "item_id"])
    hybrid["hybrid_score"] = (alpha * hybrid["svd_score"] + 
                                (1 - alpha) * hybrid["content_score"])
    return hybrid

#### Metrics
Initial versions of these functions were generated from ChatGPT [OpenAI. 2025. ChatGPT (May 12 version) https://chat.openai.com/.]  I reviewed, debugged, and checked them for accuracy.

In [6]:
def precision_at_k(ranked, relevant, k):
    return len([i for i in ranked[:k] if i in relevant]) / k

def recall_at_k(ranked, relevant, k):
    if not relevant:
        return 0
    return len([i for i in ranked[:k] if i in relevant]) / len(relevant)

def dcg_at_k(ranked, relevant, k):
    n = np.min([k, len(ranked)])
    return sum((1 if ranked[i] in relevant else 0) / np.log2(i + 2) for i in range(n))

def idcg_at_k(relevant, k):
    return sum(1 / np.log2(i + 2) for i in range(min(len(relevant), k)))

def ndcg_at_k(ranked, relevant, k):
    dcg = dcg_at_k(ranked, relevant, k)
    idcg = idcg_at_k(relevant, k)
    return dcg / idcg if idcg > 0 else 0

def average_precision(ranked, relevant, k):
    hits = 0
    sum_precisions = 0
    for i in range(min(len(ranked), k)):
        if ranked[i] in relevant:
            hits += 1
            sum_precisions += hits / (i + 1)
    return sum_precisions / min(len(relevant), k) if relevant else 0

### Find hybrid recommendations and associated metrics
#### Equal weightings for SVD-based collaborative filtering and TF-IDF content-based

In [7]:
K = 10
THRESHOLD = 3.5
recs_by_user = defaultdict(list)
relevant_by_user = defaultdict(set)

ALPHA = 0.5
hybrid = get_hybrid_recommendations(ALPHA)

for _, row in hybrid.iterrows():
    user_id = row["user_id"]
    recs_by_user[user_id].append((row["item_id"], row["hybrid_score"]))
    if row["true_rating"] >= THRESHOLD:
        relevant_by_user[user_id].add(row["item_id"])

rmse = mean_squared_error(hybrid["true_rating"],
                          hybrid["hybrid_score"])

binary_labels = (hybrid["true_rating"] >= THRESHOLD).astype(int)

auc = roc_auc_score(binary_labels,
                    hybrid["hybrid_score"])

precisions, recalls, ndcgs, MAPs = [], [], [], []

for user_id, recs in recs_by_user.items():
    ranked = [item_id for item_id, _ in sorted(recs,
                                               key = lambda x: x[1],
                                               reverse = True)]
    relevant = relevant_by_user[user_id]
    precisions.append(precision_at_k(ranked, relevant, K))
    recalls.append(recall_at_k(ranked, relevant, K))
    ndcgs.append(ndcg_at_k(ranked, relevant, K))
    MAPs.append(average_precision(ranked, relevant, K))

print(f"Rankings with {ALPHA * 100}% weighting for SVD-based\ncollaborative-filtering recommendations:\n")
print(f"RMSE: {rmse:.4f}")
print(f"AUC: {auc:.4f}")
print(f"Precision@{K}: {np.mean(precisions):.4f}")
print(f"Recall@{K}: {np.mean(recalls):.4f}")
print(f"nDCG@{K}: {np.mean(ndcgs):.4f}")
print(f"MAP@{K}: {np.mean(MAPs):.4f}")

Rankings with 50.0% weighting for SVD-based
collaborative-filtering recommendations:

RMSE: 2.0868
AUC: 0.1252
Precision@10: 0.1630
Recall@10: 0.9343
nDCG@10: 0.8847
MAP@10: 0.8671


#### 80/20 weighting for SVD-based collaborative filtering (80%) and TF-IDF content-based (20%)

In [8]:
K = 10
THRESHOLD = 3.5
recs_by_user = defaultdict(list)
relevant_by_user = defaultdict(set)

ALPHA = 0.8
hybrid = get_hybrid_recommendations(ALPHA)

for _, row in hybrid.iterrows():
    user_id = row["user_id"]
    recs_by_user[user_id].append((row["item_id"], row["hybrid_score"]))
    if row["true_rating"] >= THRESHOLD:
        relevant_by_user[user_id].add(row["item_id"])

rmse = mean_squared_error(hybrid["true_rating"],
                          hybrid["hybrid_score"])

binary_labels = (hybrid["true_rating"] >= THRESHOLD).astype(int)

auc = roc_auc_score(binary_labels,
                    hybrid["hybrid_score"])

precisions, recalls, ndcgs, MAPs = [], [], [], []

for user_id, recs in recs_by_user.items():
    ranked = [item_id for item_id, _ in sorted(recs,
                                               key = lambda x: x[1],
                                               reverse = True)]
    relevant = relevant_by_user[user_id]
    precisions.append(precision_at_k(ranked, relevant, K))
    recalls.append(recall_at_k(ranked, relevant, K))
    ndcgs.append(ndcg_at_k(ranked, relevant, K))
    MAPs.append(average_precision(ranked, relevant, K))

print(f"Rankings with {ALPHA * 100}% weighting for SVD-based\ncollaborative-filtering recommendations:\n")
print(f"RMSE: {rmse:.4f}")
print(f"AUC: {auc:.4f}")
print(f"Precision@{K}: {np.mean(precisions):.4f}")
print(f"Recall@{K}: {np.mean(recalls):.4f}")
print(f"nDCG@{K}: {np.mean(ndcgs):.4f}")
print(f"MAP@{K}: {np.mean(MAPs):.4f}")

Rankings with 80.0% weighting for SVD-based
collaborative-filtering recommendations:

RMSE: 1.1025
AUC: 0.3862
Precision@10: 0.1630
Recall@10: 0.9343
nDCG@10: 0.8944
MAP@10: 0.8794
