In [23]:
import pandas as pd
# importing relevant libraries
from surprise.model_selection import cross_validate, GridSearchCV
from surprise.prediction_algorithms import SVD, KNNWithMeans, KNNBasic, KNNBaseline
# from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
# from surprise.model_selection import GridSearchCV
import numpy as np

from surprise import Dataset, Reader

In [14]:

# Load the ratings data
ratings_df = pd.read_csv('ratings.csv')

# Display the first few rows of the dataset
print(ratings_df.head())

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [15]:
ratings_df = ratings_df.drop(columns=['timestamp'], axis=1)

In [16]:

# Load the movies data
movies_df = pd.read_csv('movies.csv')

# Display the first few rows of the dataset
movies_df.head()

# we should be having our recommendation systems having similar genres

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [17]:

# Merge ratings with movie titles
movie_ratings = pd.merge(ratings_df, movies_df, on="movieId")

# Display the first few rows of the merged dataset
movie_ratings.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [29]:
movie_ratings.isna().sum()

userId     0
movieId    0
rating     0
title      0
genres     0
dtype: int64

In [18]:
movie_ratings['rating'].value_counts()

rating
4.0    26818
3.0    20047
5.0    13211
3.5    13136
4.5     8551
2.0     7551
2.5     5550
1.0     2811
1.5     1791
0.5     1370
Name: count, dtype: int64

In [None]:
# Define the rating scale (e.g., 0.5 to 5.0)
reader = Reader(rating_scale=(0.5, 5.0))

# Convert the DataFrame to a Surprise dataset
data = Dataset.load_from_df(movie_ratings[["userId", "movieId", "rating"]], reader)
data

# rating 

<surprise.dataset.DatasetAutoFolds at 0x287511d5b20>

In [20]:
dataset = data.build_full_trainset()
print('Number of Users: ', dataset.n_users, '\n')

print('Number of items: ', dataset.n_items)


# By considering the neighborhood-based methods use the majority.. the higher number of users or items
# hence use the item Based Recommender system

Number of Users:  610 

Number of items:  9724


In [21]:

from surprise.model_selection import train_test_split

# Split the dataset
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [22]:
trainset

<surprise.trainset.Trainset at 0x2874dfac430>

## Building the Item-Based Collaborative Filtering Model**

In this step, we will build the recommendation model using **item-based collaborative filtering**. This approach recommends items (movies) based on the similarities between them.

In [24]:
# perform the grid search with SVD..
grid_params = {'n_factors': [20, 50, 100],
               'reg_all': [0.02, 0.05, 0.1]}

g_s_svd = GridSearchCV(SVD, param_grid = grid_params, n_jobs = -1)

# fit the model 
g_s_svd.fit(data)

In [25]:
# print out optimal parameters for SVD after GridSearch
print(g_s_svd.best_params)
# print(g_s_svd.best_score)

# 'n_factors': 20: The number of latent factors in the factorization model. Latent factors are features extracted from 
# the data that represent underlying patterns.
# 'reg_all': 0.05: The regularization term to prevent overfitting by penalizing large coefficients.

# The RMSE value for the model indicates that the std of the residuals (predictions errors) for a lower value indicates a better fot

{'rmse': {'n_factors': 20, 'reg_all': 0.02}, 'mae': {'n_factors': 20, 'reg_all': 0.02}}


In [26]:
knn_basic = KNNBasic(sim_options = {'name': 'pearson', 'user_based': True})
cv_knn_basic = cross_validate(knn_basic, data, n_jobs = 1)

# for user_based: True indicates that the similarity would be computed between users

# for user_based: False indicates that the similarity would be computed between items

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


In [28]:
# here we print out the average RMSE for the test set
for i in cv_knn_basic.items():
    print(i)
    
print("-----")
print(np.mean(cv_knn_basic['test_rmse']))

# ('test_rmse', array([0.97007273, 0.97707591, 0.96604194, 0.97962937, 0.97465146]))
# these are the RMSE values obtained from each fold of the cross-validation. Lower values indicate better performance.

# 0.9734942842329734 this is the aggregated score the mean rmse across all the folds which provides a single value summarizing the performance of 
# the model

('test_rmse', array([0.97007273, 0.97707591, 0.96604194, 0.97962937, 0.97465146]))
('test_mae', array([0.74595899, 0.75328186, 0.74746559, 0.75922036, 0.75212552]))
('fit_time', (0.8752462863922119, 0.9752388000488281, 0.9298880100250244, 0.9385738372802734, 0.8902149200439453))
('test_time', (2.090041399002075, 2.0745530128479004, 2.319974184036255, 1.9959254264831543, 2.0770535469055176))
-----
0.9734942842329734
