In [6]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
# import os
from surprise import BaselineOnly, Dataset, Reader, SVD, NMF, SVDpp, accuracy, PredictionImpossible, KNNWithMeans, KNNBasic
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV, PredefinedKFold
from surprise.model_selection.split import LeaveOneOut
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error
from collections import defaultdict
import pandas as pd
import tempfile

In [7]:

np.random.seed(42)

genre_cols = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 
              'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 
              'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

column_names = ["item","title","genres","movie_name","movie_year","(no genres listed)","Action","Adventure","Animation","Children","Comedy","Crime","Documentary","Drama","Fantasy","Film-Noir","Horror","IMAX","Musical","Mystery","Romance","Sci-Fi","Thriller","War","Western","user","rating","rating_timestamp","rating_year","rating_month","rating_season,tag","tag_timestamp","cleaned_tag","tag_length","tag_year"]

data = pd.read_csv('../samples/combined_movies_ratings_tags.csv', names=column_names, index_col=False, skiprows=0)

reader = Reader(rating_scale=(1, 5))

ratings = data[['user', 'item', 'rating']]
ratings = ratings.iloc[1:]
ratings["rating"] = ratings["rating"].astype(float)

custom_data = Dataset.load_from_df(ratings, reader)

print(custom_data)

  data = pd.read_csv('../samples/combined_movies_ratings_tags.csv', names=column_names, index_col=False, skiprows=0)
  data = pd.read_csv('../samples/combined_movies_ratings_tags.csv', names=column_names, index_col=False, skiprows=0)


<surprise.dataset.DatasetAutoFolds object at 0x31d63f310>


In [8]:
# Train-test split
trainset, testset = train_test_split(custom_data, test_size=0.2)

# Convert trainset to dataframe (for content-based model)
train_df = pd.DataFrame(trainset.build_testset(), columns=['user', 'item', 'rating'])



Grid Search CV with SVD

In [9]:
svd_param_grid = {
    "n_factors":[50, 100],
    "lr_all":[0.002, 0.005],
    "reg_all": [0.02, 0.1]
}

svd_grid = GridSearchCV(SVD, svd_param_grid, measures=["rmse", "mae"], cv=5)
svd_grid.fit(custom_data)

In [10]:
# Train the SVD model
# svd = SVD()
# svd.fit(trainset)
svd = svd_grid.best_estimator["rmse"]
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x14a537d90>

In [11]:

movies = data[['item', 'title'] + genre_cols]
movies[genre_cols] = movies[genre_cols].astype(bool)
movies[genre_cols] = movies[genre_cols].astype(int)
# movies = movies.rename(columns={'movieId': 'item'})
movies = movies.drop_duplicates(subset='item', keep='first')

# Check for duplicate 'item' IDs
duplicate_items = movies[movies.duplicated(subset='item', keep=False)]

# Display the duplicates
print(duplicate_items)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies[genre_cols] = movies[genre_cols].astype(bool)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies[genre_cols] = movies[genre_cols].astype(int)


Empty DataFrame
Columns: [item, title, Action, Adventure, Animation, Children, Comedy, Crime, Documentary, Drama, Fantasy, Film-Noir, Horror, Musical, Mystery, Romance, Sci-Fi, Thriller, War, Western]
Index: []


In [12]:
# 4. Build User Profiles for Content-Based Recommender
# Step 4.1: Merge ratings and movie genres
train_merged = pd.merge(train_df[['user', 'item']], data, on=['user', 'item'], how='inner')
train_merged = train_merged[['user', 'item', 'rating', "title"]+genre_cols]
train_merged[genre_cols] = train_merged[genre_cols].astype(bool)
train_merged[genre_cols] = train_merged[genre_cols].astype(int)

# Step 4.2: Create user profiles based on genres
user_profiles = train_merged.groupby('user')[genre_cols].mean()

# Normalize user profiles (optional, helps with cosine similarity)
user_profiles = user_profiles.div(user_profiles.sum(axis=1), axis=0)


# 5. Build Item Profile Matrix
item_profiles = movies.set_index('item')[genre_cols]
item_profiles = item_profiles.div(item_profiles.sum(axis=1), axis=0)

# Count the number of ratings per user
user_rating_counts = train_df['user'].value_counts()

# Debugging: Check the shape of item_profiles
print(f"Shape of item_profiles: {item_profiles.shape}")

# Check for duplicate item_ids in item_profiles
if item_profiles.index.duplicated().any():
    print("Duplicate item_ids found in item_profiles.")
else:
    print("No duplicate item_ids found in item_profiles.")

Shape of item_profiles: (15318, 18)
No duplicate item_ids found in item_profiles.


In [13]:
# Step 4: Make Predictions
# Helper function: Content-based rating prediction
def predict_content_based(user_id, item_id):
    try:
        # Get the user profile and item profile
        user_vector = user_profiles.loc[user_id]
        
        # Ensure item_id is a valid index in item_profiles
        if item_id in item_profiles.index:
            item_vector = item_profiles.loc[item_id]
            if item_vector.ndim == 1:
                # Cosine similarity between user and item vectors
                similarity = np.dot(user_vector, item_vector) / (np.linalg.norm(user_vector) * np.linalg.norm(item_vector))
                # Scale similarity to rating scale (1-5)
                # Since similarity can be from -1 to 1, we adjust it:
                predicted_rating = 2.5 + 2.5 * similarity  # Center at 2.5, range approx 1-5
                return np.clip(predicted_rating, 1.0, 5.0)
            else:
                print(f"item_vector for item {item_id} is not 1-dimensional.")
                return train_df['rating'].mean()
        else:
            print(f"Item {item_id} not found in item_profiles.")
            return train_df['rating'].mean()
    except KeyError:
        # If user or item not found (cold start for genre), return global mean
        return train_df['rating'].mean()




In [14]:
# Hybrid prediction function
def hybrid_predict(user_id, item_id, svd_weight=0.5, content_weight=0.5):
    # SVD prediction
    try:
        svd_pred = svd.predict(user_id, item_id).est
    except:
        svd_pred = train_df['rating'].mean()
    
    # Content-based prediction
    content_pred = predict_content_based(user_id, item_id)
    
    # Weighted combination
    hybrid_pred = (svd_weight * svd_pred) + (content_weight * content_pred)
    return np.clip(hybrid_pred, 1.0, 5.0)

In [15]:
# Predict on the testset
hybrid_predictions = []
true_ratings = []

for (user, item, true_r) in testset:
    pred = hybrid_predict(user, item, svd_weight=0.7, content_weight=0.3)
    # Ensure predictions and true ratings are not NaN
    if not np.isnan(pred) and not np.isnan(true_r):
        hybrid_predictions.append(pred)
        true_ratings.append(true_r)

In [16]:
# Step 5: Evaluate Hybrid Model
# Calculate RMSE
rmse = np.sqrt(mean_squared_error(true_ratings, hybrid_predictions))
mae = mean_absolute_error(true_ratings, hybrid_predictions)

print(f"Hybrid Model RMSE: {rmse:.4f}")
print(f"Hybrid Model MAE: {mae:.4f}")

Hybrid Model RMSE: 0.8569
Hybrid Model MAE: 0.6666


<h1>Same model with weighted average for cold start problem</h1>

In [17]:
# Hybrid prediction function
def weighted_hybrid_predict(user_id, item_id):
    # Determine the number of ratings for the user
    num_ratings = user_rating_counts.get(user_id, 0)
    
    # Set weights based on the number of ratings
    if num_ratings == 0:
        svd_weight = 0.0
        content_weight = 1.0
    elif 1 <= num_ratings <= 5:
        svd_weight = 0.2
        content_weight = 0.8
    elif 6 <= num_ratings <= 10:
        svd_weight = 0.5
        content_weight = 0.5
    else:
        svd_weight = 0.8
        content_weight = 0.2
    
    # SVD prediction
    try:
        svd_pred = svd.predict(user_id, item_id).est
    except:
        svd_pred = train_df['rating'].mean()
    
    # Content-based prediction
    content_pred = predict_content_based(user_id, item_id)
    
    # Weighted combination
    hybrid_pred = (svd_weight * svd_pred) + (content_weight * content_pred)
    return np.clip(hybrid_pred, 1.0, 5.0)


In [18]:
# Predict on the testset
hybrid_predictions = []
true_ratings = []

for (user, item, true_r) in testset:
    pred = weighted_hybrid_predict(user, item)
    # Ensure predictions and true ratings are not NaN
    if not np.isnan(pred) and not np.isnan(true_r):
        hybrid_predictions.append(pred)
        true_ratings.append(true_r)

# Step 5: Evaluate Hybrid Model
# Calculate RMSE
rmse = np.sqrt(mean_squared_error(true_ratings, hybrid_predictions))
mae = mean_absolute_error(true_ratings, hybrid_predictions)

print(f"Weighted Hybrid Model RMSE: {rmse:.4f}")
print(f"Weighted Hybrid Model MAE: {mae:.4f}")

Weighted Hybrid Model RMSE: 0.8472
Weighted Hybrid Model MAE: 0.6561


<h1>Hybrid using KNN modal over cosine</h1>

In [19]:

# 4. Train the KNN Model
knn = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})
knn.fit(trainset)

Computing the cosine similarity matrix...


KeyboardInterrupt: 

In [None]:
# Hybrid prediction function
def knn_hybrid_predict(user_id, item_id):
    # Determine the number of ratings for the user
    num_ratings = user_rating_counts.get(user_id, 0)
    
    # Set weights based on the number of ratings
    if num_ratings == 0:
        svd_weight = 0.0
        knn_weight = 1.0
        # knn_weight = 0.0
    elif 1 <= num_ratings <= 5:
        svd_weight = 0.2
        knn_weight = 0.8
        # knn_weight = 0.0
    elif 6 <= num_ratings <= 10:
        svd_weight = 0.5
        knn_weight = 0.5
        # knn_weight = 0.0
    else:
        svd_weight = 0.8
        knn_weight = 0.2
        # knn_weight = 0.0
    
    # SVD prediction
    try:
        svd_pred = svd.predict(user_id, item_id).est
    except:
        svd_pred = train_df['rating'].mean()
    
    # KNN prediction
    try:
        knn_pred = knn.predict(user_id, item_id).est
    except:
        knn_pred = train_df['rating'].mean()
    
    # Content-based prediction
    content_pred = predict_content_based(user_id, item_id)
    
    # Weighted combination
    hybrid_pred = (svd_weight * svd_pred) + (knn_weight * knn_pred) 
    return np.clip(hybrid_pred, 1.0, 5.0)

In [None]:
# Predict on the testset
hybrid_predictions = []
true_ratings = []

for (user, item, true_r) in testset:
    pred = knn_hybrid_predict(user, item)
    # Ensure predictions and true ratings are not NaN
    if not np.isnan(pred) and not np.isnan(true_r):
        hybrid_predictions.append(pred)
        true_ratings.append(true_r)

# Step 6: Evaluate Hybrid Model
# Calculate RMSE
rmse = np.sqrt(mean_squared_error(true_ratings, hybrid_predictions))
mae = mean_absolute_error(true_ratings, hybrid_predictions)
print(f"Knn Hybrid Model RMSE: {rmse:.4f}")
print(f"Knn Hybrid Model MAE: {mae:.4f}")

Knn Hybrid Model RMSE: 0.8313
Knn Hybrid Model MAE: 0.6430


<h1>Hybrid using cosine and SVD++</h1>

In [None]:
# Train the SVD model
svdpp = SVDpp()
svdpp.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x33b8b4af0>

In [None]:

# Hybrid prediction function
def svdpp_weighted_hybrid_predict(user_id, item_id):
    # Determine the number of ratings for the user
    num_ratings = user_rating_counts.get(user_id, 0)
    
    # Set weights based on the number of ratings
    if num_ratings == 0:
        svd_weight = 0.0
        content_weight = 1.0
    elif 1 <= num_ratings <= 5:
        svd_weight = 0.2
        content_weight = 0.8
    elif 6 <= num_ratings <= 10:
        svd_weight = 0.5
        content_weight = 0.5
    else:
        svd_weight = 0.8
        content_weight = 0.2
    
    # SVD prediction
    try:
        svdpp_pred = svdpp.predict(user_id, item_id).est
    except:
        svdpp_pred = train_df['rating'].mean()
    
    # Content-based prediction
    content_pred = predict_content_based(user_id, item_id)
    
    # Weighted combination
    hybrid_pred = (svd_weight * svdpp_pred) + (content_weight * content_pred)
    return np.clip(hybrid_pred, 1.0, 5.0)


In [None]:
# Predict on the testset
hybrid_predictions = []
true_ratings = []

for (user, item, true_r) in testset:
    pred = svdpp_weighted_hybrid_predict(user, item)
    # Ensure predictions and true ratings are not NaN
    if not np.isnan(pred) and not np.isnan(true_r):
        hybrid_predictions.append(pred)
        true_ratings.append(true_r)

# Step 6: Evaluate Hybrid Model
# Calculate RMSE
rmse = np.sqrt(mean_squared_error(true_ratings, hybrid_predictions))
mae = mean_absolute_error(true_ratings, hybrid_predictions)
print(f"SVDpp Hybrid Model RMSE: {rmse:.4f}")
print(f"SVDpp Hybrid Model MAE: {mae:.4f}")

SVDpp Hybrid Model RMSE: 0.8328
SVDpp Hybrid Model MAE: 0.6432
