In [27]:
!pip install scikit-surprise
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from sklearn.metrics import mean_squared_error
import numpy as np




In [28]:
movies = pd.read_csv('/Users/shreya/Desktop/sem2/cs578/project/movies.csv', sep=',', usecols=['movieId', 'title', 'genres'], encoding='latin-1')
ratings = pd.read_csv('/Users/shreya/Desktop/sem2/cs578/project/ratings.csv', sep=',', usecols=['userId', 'movieId', 'rating', 'timestamp'], encoding='latin-1')



In [29]:
def content_model_recommendation(userId, movies, ratings):
    tfimovies_matr = TfidfVectorizer(token_pattern='[a-zA-Z0-9\-]+').fit_transform(
        movies['genres'].replace("(no genres listed)", ""))
    movie_cosinesimilarity = linear_kernel(tfimovies_matr, tfimovies_matr)
    movie_recommendations = set()
    movies_list = []
    for _, r in ratings[ratings["userId"] == userId].iterrows():
        movie_name = movies.loc[movies['movieId'] == r['movieId'], 'title'].values[0]
        movies_list.append(movie_name)
        movie_index = movies.loc[movies['title'] == movie_name].index
        movie_similarity_scores = sorted(enumerate(movie_cosinesimilarity[movie_index][0]), key=lambda x: x[1],
                                         reverse=True)[1:3]
        movie_recommendations.update(movies['title'].iloc[[i[0] for i in movie_similarity_scores]])
    movie_recommendations.difference_update(movies_list)
    return movie_recommendations



In [30]:
def hybrid_model(user_id, input_ratings):
    recommended_movies = content_model_recommendation(user_id, movies, input_ratings)
    recommended_movies = movies[movies['title'].isin(recommended_movies)].copy()  # Create a copy
    
    data = Dataset.load_from_df(input_ratings[['userId', 'movieId', 'rating']], Reader())
    svd = SVD()
    trainset = data.build_full_trainset()
    svd.fit(trainset) 
    recommended_movie_ids = recommended_movies['movieId'].unique()
    dfs = []
    for movie_id in recommended_movie_ids:
        predicted_rating = svd.predict(user_id, movie_id).est
        dfs.append(pd.DataFrame({'userId': [user_id], 'movieId': [movie_id], 'predictedRating': [predicted_rating]}))
    result_df = pd.concat(dfs, ignore_index=True)
    result_df = result_df.sort_values(by='predictedRating', ascending=False).head(20)
    return result_df  


In [32]:
import sys
from sklearn.metrics import mean_squared_error

def _progress(num, users):
    sys.stdout.write('\rRating predictions. Progress status: %.1f%%' % (float(num / len(users)) * 100.0))
    sys.stdout.flush()

tot_loss, num = 0, 0
users = list(set(list(ratings['userId'])))

for n in users:
    n_5_ratings = ratings[ratings['userId'] == n].iloc[0:5].reset_index(drop=True)
    n_rest_ratings = ratings[ratings['userId'] == n].iloc[5:].reset_index(drop=True)
    not_n_ratings = ratings[ratings['userId'] != n].reset_index(drop=True)
    input_ratings = pd.concat([n_5_ratings, not_n_ratings], ignore_index=True)
    df = hybrid_model(n, input_ratings,)
    
    
    comparison_df = pd.merge(n_rest_ratings, df, how='inner', on=['userId', 'movieId'])   
    comparison_df = comparison_df[['userId', 'movieId', 'rating', 'predictedRating']]  
    if not comparison_df.empty:
        true_ratings = list(comparison_df['rating'])
        predicted_ratings = list(comparison_df['predictedRating'])
        tot_loss += mean_squared_error(true_ratings, predicted_ratings) * len(true_ratings)
    num += 1
    _progress(num, users)

print('\nTotal Loss:', round(tot_loss, 2))
print('Loss per user:', round(tot_loss / len(users), 2))

# Total Loss: 390.76
# Loss per user: 0.64

Rating predictions. Progress status: 100.0%
Total Loss: 390.76
Loss per user: 0.64
