In [4]:
# Import all necessary packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import surprise
from surprise.prediction_algorithms import *
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
import numpy as np
import datetime as dt

In [5]:
# Read in the 'links.csv'.
df_links = pd.read_csv("Data/links.csv")
df_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [6]:
# Checking the data.
df_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


In [7]:
# Checking the movie ID column for duplicates.
df_links['movieId'].duplicated().sum()

0

# Movies.csv

In [8]:
# Read in the 'movies.csv'.
df_movies = pd.read_csv("Data/movies.csv")
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
# Checking the data.
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [10]:
# Checking the movie ID column for duplicates.
df_movies['movieId'].duplicated().sum()

0

# Rating.csv

In [11]:
# Read in the 'ratings.csv'.
df_ratings = pd.read_csv("Data/ratings.csv")
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [12]:
# Checking the data
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [13]:
# Checking the movie ID column for duplicates.
df_ratings['movieId'].duplicated().sum()

91112

In [14]:
# Checking the number of user ID's
df_ratings['userId']

0           1
1           1
2           1
3           1
4           1
         ... 
100831    610
100832    610
100833    610
100834    610
100835    610
Name: userId, Length: 100836, dtype: int64

In [15]:
# Double checking user ID's by counting the amount of users.
print(len(df_ratings['userId'].value_counts() > 1))

610


In [16]:
# Looking at the denomination breakdown of ratings.
print((df_ratings['rating'].value_counts()))

4.0    26818
3.0    20047
5.0    13211
3.5    13136
4.5     8551
2.0     7551
2.5     5550
1.0     2811
1.5     1791
0.5     1370
Name: rating, dtype: int64


# Tags.csv

In [17]:
# Read in 'tags.csv'.
df_tags = pd.read_csv("Data/tags.csv")
df_tags.head()


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [18]:
# Checking the tags data.
df_tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [19]:
# Checking the movie ID column for duplicates.
df_tags['movieId'].duplicated().sum()

2111

In [20]:
# Checking how many moveId's have a value count equal to 1.
df_tagid = df_tags['movieId']
df_tagid = df_tagid.to_frame()
print(len(df_tagid['movieId'].value_counts()==1))


1572


In [21]:
# Counting the number of userId's
print(len(df_tags['userId'].value_counts() > 1))

58


# Merging Dataframes

In [22]:
# Merge 'links' and 'movie' dataframes together and check new dataframe.
movies_df = df_links.merge(df_movies, on='movieId')
movies_df = movies_df.dropna()
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9734 entries, 0 to 9741
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9734 non-null   int64  
 1   imdbId   9734 non-null   int64  
 2   tmdbId   9734 non-null   float64
 3   title    9734 non-null   object 
 4   genres   9734 non-null   object 
dtypes: float64(1), int64(2), object(2)
memory usage: 456.3+ KB


In [23]:
# Check to see if there are any movieId duplicates.
movies_df['movieId'].duplicated().sum()

0

In [24]:
# Merge new dataframe with the 'ratings.csv'
movie_df = movies_df.merge(df_ratings, on='movieId')
movie_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 100823 entries, 0 to 100822
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   movieId    100823 non-null  int64  
 1   imdbId     100823 non-null  int64  
 2   tmdbId     100823 non-null  float64
 3   title      100823 non-null  object 
 4   genres     100823 non-null  object 
 5   userId     100823 non-null  int64  
 6   rating     100823 non-null  float64
 7   timestamp  100823 non-null  int64  
dtypes: float64(2), int64(4), object(2)
memory usage: 6.9+ MB


In [25]:
# Checking to see how many movieIds are duplicated
movie_df['movieId'].duplicated().sum()


91107

In [26]:
print("In this merged dataset we have", len(movie_df['movieId']) - movie_df['movieId'].duplicated().sum(), "individual movie titles.")

In this merged dataset we have 9716 individual movie titles.


In [27]:
movie_df.describe()

Unnamed: 0,movieId,imdbId,tmdbId,userId,rating,timestamp
count,100823.0,100823.0,100823.0,100823.0,100823.0,100823.0
mean,19435.437737,351570.4,20105.462633,326.130823,3.501637,1205945000.0
std,35532.291269,622092.2,53274.14362,182.618176,1.04243,216261300.0
min,1.0,417.0,2.0,1.0,0.5,828124600.0
25%,1199.0,99685.0,712.0,177.0,3.0,1019124000.0
50%,2991.0,118771.0,6957.0,325.0,3.5,1186087000.0
75%,8120.0,314979.0,11635.0,477.0,4.0,1435994000.0
max,193609.0,8391976.0,525662.0,610.0,5.0,1537799000.0


In [28]:
# We are dropping the 'imdbId' and 'tmdbId' columns because they point to information 
# we do not have access to.
df_cleaned = movie_df.drop(columns= ['imdbId', 'tmdbId'], axis=1)


In [29]:
# Checking the new dataframe.
df_cleaned.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 100823 entries, 0 to 100822
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   movieId    100823 non-null  int64  
 1   title      100823 non-null  object 
 2   genres     100823 non-null  object 
 3   userId     100823 non-null  int64  
 4   rating     100823 non-null  float64
 5   timestamp  100823 non-null  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 5.4+ MB


In [30]:
# Checking the value counts for different genre combinations.
df_cleaned['genres'].value_counts()


Comedy                                        7194
Drama                                         6290
Comedy|Romance                                3967
Comedy|Drama|Romance                          3000
Comedy|Drama                                  2851
                                              ... 
Adventure|Comedy|Fantasy|Musical                 1
Comedy|Crime|Drama|Musical|Mystery|Romance       1
Action|Comedy|Drama|Horror                       1
Crime|Horror|Sci-Fi                              1
Fantasy|Horror|Sci-Fi|Western                    1
Name: genres, Length: 951, dtype: int64

# Filtering by rating to recommend movie with a rating of 3.0 and above

In [31]:
# df_clean will only contain ratings 3.0 and greater.
df_clean = df_cleaned[df_cleaned['rating'] >= 3.0]

df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81754 entries, 0 to 100822
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   movieId    81754 non-null  int64  
 1   title      81754 non-null  object 
 2   genres     81754 non-null  object 
 3   userId     81754 non-null  int64  
 4   rating     81754 non-null  float64
 5   timestamp  81754 non-null  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 4.4+ MB


In [32]:
# Movies with the highest average rating into separate df.
avg_high_rated = df_clean.groupby(['title']).agg({"rating":"mean"})['rating'].sort_values(ascending=False)
avg_high_rated = avg_high_rated.to_frame()
avg_high_rated.reset_index(level=0, inplace=True)
avg_high_rated.columns = ['title', 'Average Rating']
avg_high_rated.head(2)


Unnamed: 0,title,Average Rating
0,"American Friend, The (Amerikanische Freund, De...",5.0
1,"Four Days in September (O Que É Isso, Companhe...",5.0


In [33]:
# Split the genres into separate columns with a binary result.
genres_split = df_clean.genres.apply(lambda x: x.split(sep='|')).apply(pd.value_counts, 1).fillna(0)
df_clean = pd.concat([df_clean.iloc[:,:], genres_split], axis=1)
df_clean.head(2)


Unnamed: 0,movieId,title,genres,userId,rating,timestamp,Fantasy,Comedy,Animation,Children,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
df_clean['(no genres listed)'].sum()

35.0

In [35]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81754 entries, 0 to 100822
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movieId             81754 non-null  int64  
 1   title               81754 non-null  object 
 2   genres              81754 non-null  object 
 3   userId              81754 non-null  int64  
 4   rating              81754 non-null  float64
 5   timestamp           81754 non-null  int64  
 6   Fantasy             81754 non-null  float64
 7   Comedy              81754 non-null  float64
 8   Animation           81754 non-null  float64
 9   Children            81754 non-null  float64
 10  Adventure           81754 non-null  float64
 11  Romance             81754 non-null  float64
 12  Drama               81754 non-null  float64
 13  Action              81754 non-null  float64
 14  Crime               81754 non-null  float64
 15  Thriller            81754 non-null  float64
 16  Hor

In [36]:
# Extract the release year from the 'title' column into a new 'year' column.
import re

def extract_year(title):
    match = re.search('\((\d{4})\)', title)
    if match:
        return match.group(1)
    else:
        return None

df_clean['year'] = df_clean['title'].apply(extract_year)

In [37]:
# Checking for missing year values.
df_clean['year'].isna().sum()


14

In [38]:
# Dropping missing values.
df_clean.dropna(subset=['year'], inplace=True)
df_clean['year'].isna().sum()


0

In [39]:
df_clean['num_viewers'] = df_clean.groupby('movieId')['userId'].transform('count')


In [40]:
df_clean.head(2)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,Fantasy,Comedy,Animation,Children,...,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed),year,num_viewers
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1995,199
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1995,199


In [41]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81740 entries, 0 to 100822
Data columns (total 28 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movieId             81740 non-null  int64  
 1   title               81740 non-null  object 
 2   genres              81740 non-null  object 
 3   userId              81740 non-null  int64  
 4   rating              81740 non-null  float64
 5   timestamp           81740 non-null  int64  
 6   Fantasy             81740 non-null  float64
 7   Comedy              81740 non-null  float64
 8   Animation           81740 non-null  float64
 9   Children            81740 non-null  float64
 10  Adventure           81740 non-null  float64
 11  Romance             81740 non-null  float64
 12  Drama               81740 non-null  float64
 13  Action              81740 non-null  float64
 14  Crime               81740 non-null  float64
 15  Thriller            81740 non-null  float64
 16  Hor

In [42]:
# Change year column to integer.
df_clean['year'] = df_clean['year'].astype('int')


In [43]:
# Creating df_clean2 to have copy before dividing the years into decades.
df_clean_col = df_clean.copy()
df_clean2 = df_clean_col.merge(avg_high_rated, on='title')
df_clean2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81740 entries, 0 to 81739
Data columns (total 29 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movieId             81740 non-null  int64  
 1   title               81740 non-null  object 
 2   genres              81740 non-null  object 
 3   userId              81740 non-null  int64  
 4   rating              81740 non-null  float64
 5   timestamp           81740 non-null  int64  
 6   Fantasy             81740 non-null  float64
 7   Comedy              81740 non-null  float64
 8   Animation           81740 non-null  float64
 9   Children            81740 non-null  float64
 10  Adventure           81740 non-null  float64
 11  Romance             81740 non-null  float64
 12  Drama               81740 non-null  float64
 13  Action              81740 non-null  float64
 14  Crime               81740 non-null  float64
 15  Thriller            81740 non-null  float64
 16  Horr

In [44]:
df_clean2 = df_clean2.drop(columns='timestamp', axis=1)

In [45]:
df_clean2 = df_clean2.drop_duplicates(subset='title', keep='first')
df_clean2.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 8432 entries, 0 to 81739
Data columns (total 28 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movieId             8432 non-null   int64  
 1   title               8432 non-null   object 
 2   genres              8432 non-null   object 
 3   userId              8432 non-null   int64  
 4   rating              8432 non-null   float64
 5   Fantasy             8432 non-null   float64
 6   Comedy              8432 non-null   float64
 7   Animation           8432 non-null   float64
 8   Children            8432 non-null   float64
 9   Adventure           8432 non-null   float64
 10  Romance             8432 non-null   float64
 11  Drama               8432 non-null   float64
 12  Action              8432 non-null   float64
 13  Crime               8432 non-null   float64
 14  Thriller            8432 non-null   float64
 15  Horror              8432 non-null   float64
 16  Myste

In [46]:
df_clean2['genres'] = df_clean2['genres'].str.replace('|', ' ')


In [47]:
# Creating columns for each decade span with a binary value for each movie.
df_clean['1900s'] = ((df_clean['year'] >= 1900) & (df_clean['year'] < 1910)).astype(float)
df_clean['1910s'] = ((df_clean['year'] >= 1910) & (df_clean['year'] < 1920)).astype(float)
df_clean['1920s'] = ((df_clean['year'] >= 1920) & (df_clean['year'] < 1930)).astype(float)
df_clean['1930s'] = ((df_clean['year'] >= 1930) & (df_clean['year'] < 1940)).astype(float)
df_clean['1940s'] = ((df_clean['year'] >= 1940) & (df_clean['year'] < 1950)).astype(float)
df_clean['1950s'] = ((df_clean['year'] >= 1950) & (df_clean['year'] < 1960)).astype(float)
df_clean['1960s'] = ((df_clean['year'] >= 1960) & (df_clean['year'] < 1970)).astype(float)
df_clean['1970s'] = ((df_clean['year'] >= 1970) & (df_clean['year'] < 1980)).astype(float)
df_clean['1980s'] = ((df_clean['year'] >= 1980) & (df_clean['year'] < 1990)).astype(float)
df_clean['1990s'] = ((df_clean['year'] >= 1990) & (df_clean['year'] < 2000)).astype(float)
df_clean['2000s'] = ((df_clean['year'] >= 2000) & (df_clean['year'] < 2010)).astype(float)
df_clean['2010s'] = ((df_clean['year'] >= 2010) & (df_clean['year'] < 2020)).astype(float)


In [48]:
df_clean.head(2)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,Fantasy,Comedy,Animation,Children,...,1920s,1930s,1940s,1950s,1960s,1970s,1980s,1990s,2000s,2010s
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [49]:
df_clean.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 81740 entries, 0 to 100822
Data columns (total 40 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movieId             81740 non-null  int64  
 1   title               81740 non-null  object 
 2   genres              81740 non-null  object 
 3   userId              81740 non-null  int64  
 4   rating              81740 non-null  float64
 5   timestamp           81740 non-null  int64  
 6   Fantasy             81740 non-null  float64
 7   Comedy              81740 non-null  float64
 8   Animation           81740 non-null  float64
 9   Children            81740 non-null  float64
 10  Adventure           81740 non-null  float64
 11  Romance             81740 non-null  float64
 12  Drama               81740 non-null  float64
 13  Action              81740 non-null  float64
 14  Crime               81740 non-null  float64
 15  Thriller            81740 non-null  float64
 16  Hor

In [50]:
#Dropping these two columns as they are not needed for further analysis
df_clean.drop(columns=['(no genres listed)','timestamp'],inplace=True)


# Modeling

In [51]:
#import necessary packages for modeling
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
import surprise
from surprise import KNNWithMeans, Dataset, accuracy, Reader, SVD
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from sklearn.preprocessing import MultiLabelBinarizer

from surprise.prediction_algorithms import SVD
from surprise.model_selection import GridSearchCV
from surprise.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

In [59]:
num_users = len(df_clean['userId'].value_counts())
num_items = len(df_clean['title'].value_counts())
print('Unique number of users in the dataset: {}'.format(num_users))
print('Unique number of movies in the dataset: {}'.format(num_items))

Unique number of users in the dataset: 609
Unique number of movies in the dataset: 8432


# Collaborative Filtering

Collaborative filtering assumes that users who have agreed in the past are likely to agree in the future and have similar preferences. This method generates recommendations by analyzing the rating profiles of different users or items. By identifying other users/items with similar rating histories to the current user/item, it generates recommendations using this information. This technique creates a model based on a user's past actions, including items purchased, selected, or rated. The model is then used to predict items or ratings that the user may be interested in. Collaborative filtering can be classified as memory-based or model-based.

In [60]:
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split

reader = Reader(rating_scale=(3.0, 5.0))
data = Dataset.load_from_df(df_clean[['userId', 'movieId', 'rating']], reader)

# split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=.25)

# create an instance of the SVD algorithm and fit it on the training set
algo = SVD()
algo.fit(trainset)

# use the fitted model to predict ratings on the testing set
predictions = algo.test(testset)

# evaluate the performance of the model using different metrics
accuracy.mae(predictions)
accuracy.mse(predictions)


MAE:  0.4888
MSE: 0.3605


0.3605061275445651

In [61]:
# create a dictionary to map movie IDs to their indices in the dataset
movie_to_idx = {movie_id: i for i, movie_id in enumerate(df_clean['movieId'].unique())}

# create an instance of the SVD algorithm and fit it on the training set
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2419b87a940>

In [63]:
def recommend_movies(user_id):
    # get all movies that the user has not rated yet
    user_movies = df_clean[df_clean['userId'] == user_id]['movieId']
    unrated_movies = [movie_id for movie_id in df_clean['movieId'].unique() if movie_id not in user_movies]

    # create a list of tuples containing the movie ID and the predicted rating
    movie_ratings = [(movie_id, algo.predict(user_id, movie_to_idx[movie_id]).est) for movie_id in unrated_movies]

    # sort the list of tuples by the predicted rating in descending order and get the top 5 movies
    top_movies = sorted(movie_ratings, key=lambda x: x[1], reverse=True)[:5]

    # map the recommended movie indices back to their titles
    recommended_movies = [(df_clean[df_clean['movieId'] == movie_id]['title'].iloc[0], rating) for movie_id, rating in top_movies]

    # create a dataframe with the recommended movie titles and predicted ratings
    recommended_movies_df = pd.DataFrame(recommended_movies, columns=['title', 'predicted_rating'])

    return recommended_movies_df

In [64]:
recommend_movies(400) 

Unnamed: 0,title,predicted_rating
0,Henry V (1989),4.898207
1,Georgia (1995),4.89068
2,"Grifters, The (1990)",4.884759
3,"Parent Trap, The (1961)",4.841632
4,Simpatico (1999),4.813435


In [65]:
user_id = 400 # replace with the ID of the user you want to look up

movies_rated_by_user = df_clean[df_clean['userId'] == user_id][['title', 'rating']].sort_values(by='rating', ascending=False)

print(movies_rated_by_user)

                                                    title  rating
501                                           Heat (1995)     5.0
17077                                        Fargo (1996)     5.0
58450                          Requiem for a Dream (2000)     5.0
2243                          Seven (a.k.a. Se7en) (1995)     5.0
25995   Star Wars: Episode VI - Return of the Jedi (1983)     5.0
82040                                   Inside Man (2006)     5.0
24787   Star Wars: Episode V - The Empire Strikes Back...     5.0
91434                                    Inception (2010)     5.0
19963                               Godfather, The (1972)     5.0
45192                                  Matrix, The (1999)     5.0
18793                                Trainspotting (1996)     5.0
16407                    Silence of the Lambs, The (1991)     5.0
8860                     Shawshank Redemption, The (1994)     5.0
8068                                  Pulp Fiction (1994)     5.0
7813    Lé

# Code for model tuning

This is done to prevent overfitting.

In [66]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(df_clean[['userId', 'title', 'rating']], test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

In [67]:
#initialize training,  validation and testing data
reader = Reader(rating_scale=(1.0, 5.0))
train_set = Dataset.load_from_df(train_data[['userId', 'title', 'rating']], reader)
test_set = Dataset.load_from_df(test_data[['userId', 'title', 'rating']], reader)
val_set = Dataset.load_from_df(val_data[['userId', 'title', 'rating']], reader)

In [68]:
#run SVD model
model = SVD()
train_set_full = train_set.build_full_trainset()
model.fit(train_set_full)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2419b85edc0>

In [69]:
test_predictions = model.test(test_set.build_full_trainset().build_testset())
val_predictions = model.test(val_set.build_full_trainset().build_testset())

accuracy.mae(test_predictions)
accuracy.mse(val_predictions)


MAE:  0.4879
MSE: 0.3644


0.36436999808902265

In [70]:
results = cross_validate(model, train_set, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.6009  0.5983  0.6073  0.6029  0.6008  0.6020  0.0030  
Fit time          6.45    6.76    6.42    6.45    7.24    6.67    0.31    
Test time         0.28    0.16    0.23    0.23    0.17    0.22    0.04    


In [71]:
param_grid = {'n_factors':[50, 100, 200],'n_epochs': [10, 20, 30], 'lr_all': [0.002, 0.005, 0.01],
               'reg_all': [0.02, 0.1, 0.4]}
gs_model = GridSearchCV(SVD,param_grid=param_grid,n_jobs = -1,joblib_verbose=5)
gs_model.fit(data)
best_rmse = gs_model.best_score['rmse']
best_params = gs_model.best_params['rmse']

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   53.8s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed: 19.9min finished


In [72]:
print('Best RMSE: ' + str(best_rmse))

Best RMSE: 0.5915132042812975


In [73]:
print('Best Params: ' + str(best_params))

Best Params: {'n_factors': 50, 'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.1}


In [74]:
svd = SVD(n_factors=100, n_epochs=10, lr_all=0.005, reg_all=0.4)
svd.fit(trainset)
predictions = svd.test(testset)
print(accuracy.rmse(predictions))

RMSE: 0.6020
0.6020199402696972


In [75]:
# Define the search space for hyperparameters
param_distributions = {'n_factors': [50, 100, 200],
                       'n_epochs': [10, 20, 30],
                       'lr_all': [0.002, 0.005, 0.01],
                       'reg_all': [0.02, 0.1, 0.4]}
# Create the randomized search object
rs = RandomizedSearchCV(SVD, param_distributions, n_iter=10, measures=['rmse', 'mae'], cv=5)

# Run the randomized search
rs.fit(data)

# Get the best RMSE score and the corresponding hyperparameters
best_rmse = rs.best_score['rmse']
best_params = rs.best_params['rmse']

In [76]:
print('Best RMSE: ' + str(best_rmse))

Best RMSE: 0.5918298213564797


In [77]:
print('Best Params: ' + str(best_params))

Best Params: {'n_factors': 100, 'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.1}


# Content-based Recommendation System

This type of recommender system uses specific characteristics or features of an item, such as its description, to suggest similar items to the user. It also takes into account the user's preferences and history to generate personalized recommendations. For example, it can recommend movies that are similar to a movie that the user has watched or based on all of the movies that the user has viewed. The system extracts relevant features from the item and uses the user's history to provide suggestions.

In [78]:
from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import train_test_split

reader = Reader(rating_scale=(3.0, 5.0))
data = Dataset.load_from_df(df_clean2[['title', 'year', 'Average Rating']], reader)

# split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=.25)

# create an instance of the SVD algorithm and fit it on the training set
algo = SVD()
algo.fit(trainset)

# use the fitted model to predict ratings on the testing set
predictions3 = algo.test(testset)

# evaluate the performance of the model using different metrics
accuracy.mae(predictions3)
accuracy.mse(predictions3)


MAE:  0.3789
MSE: 0.2282


0.2282254888768546

In [79]:
# define a reader to read the dataframe
reader = Reader(rating_scale=(0, 1))

# create a dataset from the dataframe
data = Dataset.load_from_df(df_clean2[['title', 'year', 'Average Rating']], reader)

# split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# train a content-based model using the KNN algorithm
k = 10
sim_options = {'name': 'cosine', 'user_based': False}
algo = KNNWithMeans(k=k, sim_options=sim_options)
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x2419b81fd00>

In [81]:
# use the fitted model to predict ratings on the testing set
predictions4 = algo.test(testset)

# evaluate the performance of the model using different metrics
accuracy.mae(predictions4)
accuracy.mse(predictions4)

MAE:  2.7444
MSE: 7.7544


7.754430308274189

# Hybrid Recommendation System

A hybrid recommender system combines multiple recommendation techniques to solve problems and improve accuracy. This approach can overcome common issues in recommender systems such as cold start, sparsity, and knowledge engineering bottlenecks. By integrating content-based and collaborative filtering techniques, the hybrid recommender system can leverage the strengths of both approaches and minimize their limitations. Research has shown that hybrid recommender systems perform better than pure content-based or collaborative filtering methods.

In [82]:
# create a reader to read the dataframe
reader = Reader(rating_scale=(1.0, 5.0))

# create a surprise dataset from the dataframe
data = Dataset.load_from_df(df_clean2[['userId', 'movieId', 'rating']], reader)

# split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.25)

# train a content-based model using the KNN algorithm
k = 10
sim_options = {'name': 'cosine', 'user_based': False}
algo_cb = KNNWithMeans(k=k, sim_options=sim_options)
algo_cb.fit(trainset)

# train a collaborative filtering model using SVD
algo_cf = SVD()
algo_cf.fit(trainset)

# for each user and item pair in the test set, make predictions using both models
predictions_cf = algo_cf.test(testset)
predictions_cb = algo_cb.test(testset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [83]:
# assign weights to the models
weight_cf = 0.5
weight_cb = 0.5

# combine the predictions from both models using a weighted average
predictions_combined = []

# iterate over each prediction in the test set
for pred_cf, pred_cb in zip(predictions_cf, predictions_cb):
    # extract the user ID and item ID from the prediction
    uid, iid = pred_cf.uid, pred_cf.iid
    
    # calculate the weighted sum of the predicted ratings
    rating_combined = (weight_cf * pred_cf.est) + (weight_cb * pred_cb.est)
    
    # create a new prediction object with the combined rating
    pred_combined = surprise.prediction_algorithms.predictions.Prediction(uid, iid, r_ui=None, est=rating_combined, details=None)
    
    # add the combined prediction to the list
    predictions_combined.append(pred_combined)

In [85]:
##make predictions for all the movies that the user has not yet rated using the hybrid model

# get the list of all movie IDs that the user has not yet rated
user_id = 1  # replace with the user ID you want to get recommendations for
movie_ids = df_clean2[~df_clean2['movieId'].isin(df_clean2[df_clean2['userId'] == user_id]['movieId'])]['movieId']

# create a list of (user_id, movie_id, 0) tuples to make predictions on
testset = [[user_id, movie_id, 0] for movie_id in movie_ids]

# make predictions using both models
predictions_cf = algo_cf.test(testset)
predictions_cb = algo_cb.test(testset)

# combine the predictions from both models using a weighted average
predictions_combined = [(weight_cf * pred_cf.est) + (weight_cb * pred_cb.est) for pred_cf, pred_cb in zip(predictions_cf, predictions_cb)]

# add the predictions to the dataframe
df_clean2.loc[~df_clean2['movieId'].isin(df_clean2[df_clean2['userId'] == user_id]['movieId']), 'hybrid_score'] = predictions_combined

In [86]:
#sort the movies by their hybrid scores and select the top 5 movies

# get the top 5 movie recommendations based on the hybrid scores
top_5_movies = df_clean2[df_clean2['userId'] != user_id].sort_values(by='hybrid_score', ascending=False).head(5)['title']
print(top_5_movies)

47388                       Autumn in New York (2000)
80425    Kung Fu Panda: Secrets of the Masters (2011)
65021              Eddie Izzard: Dress to Kill (1999)
66779                                     9/11 (2002)
52566                         Beastmaster, The (1982)
Name: title, dtype: object


# Recommendation for Top 5

In [None]:
# get the top 5 movie recommendations based on the hybrid scores
top_5_movies = df_clean2[df_clean2['userId'] != user_id].sort_values(by='hybrid_score', ascending=False).head(5)

# iterate over the top 5 recommendations and print the actual rating and predicted rating for each movie
for index, row in top_5_movies.iterrows():
    movie_title = row['title']
    actual_rating = row['rating']
    predicted_rating = row['hybrid_score']
    print(f"Movie: {movie_title}")
    print(f"Actual rating: {actual_rating}")
    print(f"Predicted rating: {predicted_rating}")
    print() 

In [None]:
# filter the dataframe to only include movies rated by user_id
movies_rated_by_user = df_clean2[df_clean2['userId'] == user_id]

# get the top 5 movie recommendations based on the hybrid scores
top_5_movies = df_clean2[df_clean2['userId'] != user_id].sort_values(by='hybrid_score', ascending=False).head(5)

# iterate over the top 5 recommendations and print the actual rating and predicted rating for each movie
for rating in [5.0, 4.0, 3.0]:
    movies_with_rating = movies_rated_by_user[movies_rated_by_user['rating'] == rating].head(2)
    for index, row in movies_with_rating.iterrows():
        movie_title = row['title']
        actual_rating = row['rating']
        predicted_rating = row['hybrid_score']
        print(f"Movie: {movie_title}")
        print(f"Actual rating: {actual_rating}")
        print(f"Predicted rating: {predicted_rating}")
        print()
        
    top_movies_with_rating = top_5_movies[top_5_movies['rating'] == rating].head(2)
    for index, row in top_movies_with_rating.iterrows():
        movie_title = row['title']
        actual_rating = "N/A"
        predicted_rating = row['hybrid_score']
        print(f"Movie: {movie_title}")
        print(f"Actual rating: {actual_rating}")
        print(f"Predicted rating: {predicted_rating}")
        print()

# Conclusion

Throughout this project, we built a hybrid recommendation model that combined collaborative and content based recommendation techniques to predict the top 5 movie recommendations for a user. Our results showed that the hybrid model outperformed traditional recommendation models and provided more accurate and diverse movie recommendations. Overall, this project demonstrates the effectiveness of hybrid recommendation models in improving the accuracy and diversity of recommendations and can be applied to other domains beyond movie recommendations.