****
**IMPORTING USEFUL LIBRARIES**
****

In [1]:
import numpy as np
import pandas as pd

****
**LOADING DATASETS**
****

In [2]:
#loading movies.csv

movies = pd.read_csv('Resources/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
#loading ratings.csv

ratings = pd.read_csv('Resources/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


****
**PREPROCESSING**
****

In [4]:
movies.shape

(87585, 3)

In [5]:
movies.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [6]:
#cleaning the title column in movies dataframe by removing any characters that are not alphanumeic or spaces.

import re

def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]","",title)

In [7]:
#cleaning the genres column in movies df by removing "|" pipe character.

movies['genres_list'] = movies['genres'].str.replace('|' , ' ')
movies['clean_title'] = movies['title'].apply(clean_title)

In [8]:
movies_data = movies[['movieId' , 'clean_title' , 'genres_list']]
movies_data.head()

Unnamed: 0,movieId,clean_title,genres_list
0,1,Toy Story 1995,Adventure Animation Children Comedy Fantasy
1,2,Jumanji 1995,Adventure Children Fantasy
2,3,Grumpier Old Men 1995,Comedy Romance
3,4,Waiting to Exhale 1995,Comedy Drama Romance
4,5,Father of the Bride Part II 1995,Comedy


In [9]:
ratings.shape

(32000204, 4)

In [10]:
ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [11]:
#as timestamps do not provide much helpful information in a movie recommendation system, we drop it.

ratings_data = ratings.drop(['timestamp'] , axis = 1)
ratings_data.head()

Unnamed: 0,userId,movieId,rating
0,1,17,4.0
1,1,25,1.0
2,1,29,2.0
3,1,30,5.0
4,1,32,5.0


In [12]:
final_data = pd.merge(ratings_data , movies_data , on = 'movieId')
final_data.head()

Unnamed: 0,userId,movieId,rating,clean_title,genres_list
0,1,17,4.0,Sense and Sensibility 1995,Drama Romance
1,1,25,1.0,Leaving Las Vegas 1995,Drama Romance
2,1,29,2.0,City of Lost Children The Cit des enfants perd...,Adventure Drama Fantasy Mystery Sci-Fi
3,1,30,5.0,Shanghai Triad Yao a yao yao dao waipo qiao 1995,Crime Drama
4,1,32,5.0,Twelve Monkeys aka 12 Monkeys 1995,Mystery Sci-Fi Thriller


In [13]:
final_data.shape

(32000204, 5)

In [14]:
final_data.isna().sum()

userId         0
movieId        0
rating         0
clean_title    0
genres_list    0
dtype: int64

****
**DEFINING SEARCH FUNCTION**
****

In [15]:
#searching for top n movies with similar title

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorizer_title = TfidfVectorizer(ngram_range=(1,2))
tfidf_title = vectorizer_title.fit_transform(movies_data['clean_title'])

def search_by_title(title):
    title = clean_title(title)
    query_vec = vectorizer_title.transform([title])
    similarity = cosine_similarity(query_vec, tfidf_title).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies_data.iloc[indices][::-1]
    return results

movie_results = search_by_title("Toy Story")
print(movie_results)


       movieId               clean_title  \
3021      3114          Toy Story 2 1999   
14815    78499          Toy Story 3 2010   
60708   201588          Toy Story 4 2019   
0            1            Toy Story 1995   
20505   106022  Toy Story of Terror 2013   

                                            genres_list  
3021        Adventure Animation Children Comedy Fantasy  
14815  Adventure Animation Children Comedy Fantasy IMAX  
60708               Adventure Animation Children Comedy  
0           Adventure Animation Children Comedy Fantasy  
20505                         Animation Children Comedy  


In [16]:
#searching for top n movies with similar genre

vectorizer_genres = TfidfVectorizer(ngram_range=(1,2))
tfidf_genres = vectorizer_genres.fit_transform(movies_data['genres_list'])

def search_similar_genres(genres):
    query_vec = vectorizer_genres.transform([genres])
    similarity = cosine_similarity(query_vec, tfidf_genres).flatten()
    indices = np.argpartition(similarity, -10)[-10:]
    results = movies_data.iloc[indices][::-1]
    return results

gen = 'Adventure Comedy'
print(search_similar_genres(gen))

       movieId                        clean_title       genres_list
72368   234267              A Dangerous Game 1941  Adventure Comedy
65077   211808              Seawards Journey 2003  Adventure Comedy
39425   156350                 Give Us Wings 1940  Adventure Comedy
9014     26827  Leningrad Cowboys Meet Moses 1994  Adventure Comedy
11353    50798                    Epic Movie 2007  Adventure Comedy
69704   224519                Brutus Vs Csar 2020  Adventure Comedy
34678   145358                       Zenabel 1969  Adventure Comedy
84048   281134                  Mud and Sand 1922  Adventure Comedy
30569   136208                         Noobz 2013  Adventure Comedy
25709   124232          The Fuller Brush Man 1948  Adventure Comedy


****
**MAKING THE RECOMMENDATION**
****

In [17]:
#creating a function to calculate scores for recommendation

def scores_calculator(movie_id):

    #recs from users who like the same movie
    similar_users = final_data[(final_data['movieId']== movie_id) & (final_data['rating']>=4)]['userId'].unique()
    similar_user_recs = final_data[(final_data['userId'].isin(similar_users)) & (final_data['rating']>=4)]['movieId']
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    #print(similar_user_recs)


    #recs from all users who have watched the movies above
    all_users = final_data[(final_data['movieId'].isin(similar_user_recs.index)) & (final_data['rating']>=4)]
    all_users_recs = all_users['movieId'].value_counts() / all_users['userId'].nunique()
    #print(all_users_recs)

    genres_of_selected_movie = final_data[final_data['movieId']==movie_id]['genres_list'].unique()
    genres_of_selected_movie = np.array2string(genres_of_selected_movie)
    movies_with_similar_genres = search_similar_genres(genres_of_selected_movie)

    
    #multiplying movies with similar genres and similar users by a afactor of 1.5
    indices = []
    for index in movies_with_similar_genres[(movies_with_similar_genres['movieId'].isin(similar_user_recs.index))]['movieId']:
        indices.append(index)

    similar_user_recs.loc[indices] = similar_user_recs.loc[indices]*1.5


    #multiplying movies with similar genres and all users by a afactor of 0.9
    indices = []
    for index in movies_with_similar_genres[(movies_with_similar_genres['movieId'].isin(similar_user_recs.index))]['movieId']:
        indices.append(index)

    all_users_recs.loc[indices] = all_users_recs.loc[indices]*0.9

    rec_percentages = pd.concat([similar_user_recs , all_users_recs] , axis=1)
    rec_percentages.columns = ['similar' , 'all']
    rec_percentages['score'] = rec_percentages['similar'] / rec_percentages['all']

    rec_percentages = rec_percentages.sort_values('score' , ascending=False)
    return rec_percentages

scores_calculator(3114)

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
276403,0.000075,0.000004,16.620794
3114,1.000000,0.100276,9.972476
285957,0.000050,0.000005,9.972476
258195,0.000050,0.000005,9.972476
253522,0.000050,0.000005,9.972476
...,...,...,...
601,0.000050,0.000249,0.199450
790,0.000050,0.000259,0.191778
797,0.000050,0.000269,0.184675
1075,0.000050,0.000294,0.169025


In [18]:
#listing 10 movies with highest score to user

def recommendation_results(user_input, title=0):

    #user_input = clean_title(user_input)
    title_candidates = search_by_title(user_input)
    movie_id = title_candidates.iloc[title]['movieId']
    scores = scores_calculator(movie_id)
    results = scores.head(10).merge(movies_data, left_index=True, right_on='movieId')[['clean_title', 'score', 'genres_list']]
    resutls = results.rename(columns={'clean_title': 'title', 'genres_list': 'genres'}, inplace=True)
    return results


user_input = 'Toy Story'
print("Are you looking for (please choose a number): ")
for i in range(5):
    print(i, ":" , search_by_title(user_input)['clean_title'].iloc[i])

title = 0
if int(title) in range(5):
    print("We have the following Recommendations: ")
    print(recommendation_results(user_input, int(title)))
else:
    print("Sorry! Could not find similar recommendations")
    

Are you looking for (please choose a number): 
0 : Toy Story 2 1999
1 : Toy Story 3 2010
2 : Toy Story 4 2019
3 : Toy Story 1995
4 : Toy Story of Terror 2013
We have the following Recommendations: 
                                              title      score  \
82245  Chickenhare and the Hamster of Darkness 2022  16.620794   
3021                               Toy Story 2 1999   9.972476   
85578                      That Kind of Summer 2022   9.972476   
77517                      A Problem with Fear 2003   9.972476   
76601                            Devils Island 2021   9.972476   
76085                                   Morgue 2019   9.972476   
79323                          Le club Vinland 2021   9.972476   
83990                                  Goliath 2022   9.972476   
85563    Bruno Reidal Confession of a Murderer 2022   9.972476   
33055                  Deaf Smith  Johnny Ears 1973   9.972476   

                                            genres  
82245  Adventure Anima

In [19]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, KFold
from math import sqrt

def evaluate_model(movie_id, cv=5):
    # Calculating recommendation scores
    rec_scores = scores_calculator(movie_id)

    # Simulated true ratings: we'll assume an average rating for the sake of evaluation
    true_ratings = np.full_like(rec_scores['score'], final_data[final_data['movieId'] == movie_id]['rating'].mean())

    # Predicted ratings are derived from the recommendation scores
    predicted_ratings = rec_scores['score'].values

    # Mean Squared Error (MSE)
    mse = mean_squared_error(true_ratings, predicted_ratings)
    
    # Root Mean Squared Error (RMSE)
    rmse = sqrt(mse)
    
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")

    # Cross-Validation to assess robustness
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    
    # Placeholder for cross-validation RMSE scores
    cross_val_rmse_scores = []

    for train_index, test_index in kf.split(predicted_ratings):
        X_train, X_test = predicted_ratings[train_index], predicted_ratings[test_index]
        y_train, y_test = true_ratings[train_index], true_ratings[test_index]
        
        # Since we do not have a model to fit, we'll just calculate RMSE directly
        fold_mse = mean_squared_error(y_test, X_test)
        fold_rmse = sqrt(fold_mse)
        cross_val_rmse_scores.append(fold_rmse)
    
    cross_val_rmse = np.mean(cross_val_rmse_scores)
    
    print(f"Cross-Validated RMSE: {cross_val_rmse}")

    return mse, rmse, cross_val_rmse

# Example usage with a movie_id (e.g., 'Toy Story 2' with movie_id 3114)
mse, rmse, cross_val_rmse = evaluate_model(3114)

Mean Squared Error (MSE): 6.713570323997582
Root Mean Squared Error (RMSE): 2.591055831895095
Cross-Validated RMSE: 2.591008688962836
