In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

In [3]:
cd /content/drive/MyDrive/Recommender_system

/content/drive/MyDrive/Recommender_system


In [4]:
r=pd.read_csv('ratings.csv')
r.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
m=pd.read_csv('movies.csv')
m.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
m['genres'] = m['genres'].str.split('|')
m['genres'].head()

0    [Adventure, Animation, Children, Comedy, Fantasy]
1                       [Adventure, Children, Fantasy]
2                                    [Comedy, Romance]
3                             [Comedy, Drama, Romance]
4                                             [Comedy]
Name: genres, dtype: object

In [7]:
#drop the titled without year
m=m[(m['title'].str[-1]==')')&(m['title'].str[-6]=='(')]

In [8]:
m['year']=m['title'].str[-5:-1]
m['year'].head()

0    1995
1    1995
2    1995
3    1995
4    1995
Name: year, dtype: object

In [9]:
m['year']=pd.to_numeric(m['year'])
m.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9718 entries, 0 to 9741
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9718 non-null   int64 
 1   title    9718 non-null   object
 2   genres   9718 non-null   object
 3   year     9718 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 379.6+ KB


In [10]:
m.sort_values('year').head()

Unnamed: 0,movieId,title,genres,year
5868,32898,"Trip to the Moon, A (Voyage dans la lune, Le) ...","[Action, Adventure, Fantasy, Sci-Fi]",1902
6355,49389,The Great Train Robbery (1903),"[Crime, Western]",1903
9020,140541,The Electric Hotel (1908),"[Animation, Comedy, Sci-Fi]",1908
4743,7065,"Birth of a Nation, The (1915)","[Drama, War]",1915
8170,102747,"Rink, The (1916)",[Comedy],1916


In [11]:
#remove yr from title
m['title']=m['title'].str[:-7]

In [12]:
m.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [13]:
m.astype({'year': 'int32'}).dtypes

movieId     int64
title      object
genres     object
year        int32
dtype: object

In [14]:
m.to_csv('m_cleaned.csv')

In [15]:
del r['timestamp']

In [16]:
r.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [18]:
#Merge movies and ratings dataframes for easier reference.
df = pd.merge(r, m, on='movieId', how='left')

In [19]:
print(m.shape)
print(r.shape)
print(df.shape)

(9718, 4)
(100836, 3)
(100836, 6)


In [20]:
df.head()

Unnamed: 0,userId,movieId,rating,title,genres,year
0,1,1,4.0,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0
1,1,3,4.0,Grumpier Old Men,"[Comedy, Romance]",1995.0
2,1,6,4.0,Heat,"[Action, Crime, Thriller]",1995.0
3,1,47,5.0,Seven (a.k.a. Se7en),"[Mystery, Thriller]",1995.0
4,1,50,5.0,"Usual Suspects, The","[Crime, Mystery, Thriller]",1995.0


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 6 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   100836 non-null  int64  
 1   movieId  100836 non-null  int64  
 2   rating   100836 non-null  float64
 3   title    100805 non-null  object 
 4   genres   100805 non-null  object 
 5   year     100805 non-null  float64
dtypes: float64(2), int64(2), object(2)
memory usage: 5.4+ MB


In [22]:
df.describe()

Unnamed: 0,userId,movieId,rating,year
count,100836.0,100836.0,100836.0,100805.0
mean,326.127564,19435.295718,3.501557,1994.443708
std,182.618491,35530.987199,1.042529,14.361383
min,1.0,1.0,0.5,1902.0
25%,177.0,1199.0,3.0,1990.0
50%,325.0,2991.0,3.5,1997.0
75%,477.0,8122.0,4.0,2003.0
max,610.0,193609.0,5.0,2018.0


In [23]:
#Check sparsity of matrix
#The initial data set consisted of ~100,000 user ratings; then was reduced to ~47,000 once both users and movies with low number of ratings were removed to combat high matrix sparsity of around 98%.
numratings = len(df['rating'])
numusers = len(df['userId'].unique())
numitems = len(df['movieId'].unique())

sparse = 1 - (numratings / (numusers*numitems))
sparse

0.9830003169443864

In [24]:
#Filter out users who have rated less than 100 movies 
df = df.groupby('userId').filter(lambda x: len(x)>100)

In [25]:
#Filter out movies who have rated less than 5 movies 
df = df.groupby('movieId').filter(lambda x: len(x)>5)

In [26]:
#Check for NA values 
df.isna().sum()

userId     0
movieId    0
rating     0
title      0
genres     0
year       0
dtype: int64

In [27]:
df.shape

(71512, 6)

Content Based Filtering

In [28]:
link=pd.read_csv('links.csv')

In [29]:
movlink = pd.merge(link,m,on='movieId')

In [30]:
movlink.head()

Unnamed: 0,movieId,imdbId,tmdbId,title,genres,year
0,1,114709,862.0,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,113497,8844.0,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,113228,15602.0,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,114885,31357.0,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,113041,11862.0,Father of the Bride Part II,[Comedy],1995


In [31]:
k=pd.read_csv('movies_metadata.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [32]:
k['imdb_id']=k['imdb_id'].str[-6:]
k['imdb_id']=pd.to_numeric(k['imdb_id'])

In [33]:
k.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [34]:

k=k[['title', 'imdb_id','overview','tagline','vote_count','vote_average']]
k[['title','overview','tagline']].sort_values('tagline').head(10)

Unnamed: 0,title,overview,tagline
97,Heidi Fleiss: Hollywood Madam,A documentary crew from the BBC arrives in L.A...,
35913,Little Big Shot,A con man (Robert Armstrong) and his partner i...,"""A great kid!"" ""A great bet!"" ""A great show!"""
29947,Lolly-Madonna XXX,Laban Feather brews Tennessee moonshine with h...,"""A simple prank, a game nobody won..."""
5066,The Long Riders,"The origins, exploits and the ultimate fate of...","""All the world likes an outlaw. For some damn ..."
3371,The Son of the Sheik,"Ahmed, son of Diana and Sheik Ahmed Ben Hassan...","""An eye for an eye-a hate for a hate-that my g..."
7790,None But the Lonely Heart,When an itinerant reluctantly returns home to ...,"""Black as the Ace I am!"""
6929,I'm No Angel,The bold Tira works as dancing beauty and lion...,"""Come up and see me sometime - any time!"""
25386,Every Girl Should Be Married,Anabel Sims is determined to find the perfect ...,"""Every Girl Should be Married"" says Cary Grant..."
45204,Girls Trip,Four girlfriends take a trip to New Orleans fo...,"""Forgive us in advance for this wild weekend"""
26266,No Questions Asked,A young lawyer's primrose path to success gets...,"""Give her the works til she tells where the je..."


In [35]:
dfc= movlink.merge(k, left_on='imdbId', right_on='imdb_id', how='inner')

In [36]:
del dfc['imdb_id']
del dfc['title_x']
#del dfc['genres_y']
del dfc['tmdbId']
dfc.rename(columns={'title_y': 'title', 'genres_x': 'genres'}, inplace=True)

In [37]:
dfc.head()

Unnamed: 0,movieId,imdbId,genres,year,title,overview,tagline,vote_count,vote_average
0,1,114709,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,Toy Story,"Led by Woody, Andy's toys live happily in his ...",,5415.0,7.7
1,2,113497,"[Adventure, Children, Fantasy]",1995,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,2413.0,6.9
2,3,113228,"[Comedy, Romance]",1995,Grumpier Old Men,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,92.0,6.5
3,4,114885,"[Comedy, Drama, Romance]",1995,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,34.0,6.1
4,5,113041,[Comedy],1995,Father of the Bride Part II,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,173.0,5.7


In [38]:
dfc['tagline'] = dfc['tagline'].fillna('')
dfc['description'] = dfc['overview'] + dfc['tagline']+dfc['title']
dfc['description'] = dfc['description'].fillna('')

In [39]:
dfc['description'][0]

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.Toy Story"

In [40]:
#Here TfidfVectorizer is used to create raw documents to a matrix of TF-IDF features. 
#ngram_range=(1,2) means I only want unigrams and bigrams
#min_df=0 means to take words in the feature vectors even if its frequency is just 1

from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(dfc['description'])
tfidf_matrix.shape



(7455, 239285)

In [41]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[0]

array([1.        , 0.00645353, 0.        , ..., 0.00896567, 0.        ,
       0.        ])

In [42]:
cosine_sim[:4, :4]

array([[1.        , 0.00645353, 0.        , 0.        ],
       [0.00645353, 1.        , 0.01502204, 0.        ],
       [0.        , 0.01502204, 1.        , 0.        ],
       [0.        , 0.        , 0.        , 1.        ]])

In [43]:
# Build a 1-dimensional array with movie titles
titles = dfc['title']
indices = pd.Series(dfc.index, index=dfc['title'])

In [44]:
# Function that get movie recommendations based on the cosine similarity score of movie descriptions
def generate_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [45]:
generate_recommendations('Jumanji').head(20)

5218                       Big Game
6272                     Stay Alive
3899      The Giant Spider Invasion
3050             Dungeons & Dragons
5227       Night of the Living Dead
1379                    He Got Game
7303                    Geri's Game
1580                      Peter Pan
5701                        Nirvana
6205                  Grandma's Boy
3845                     Panic Room
3652                       Spy Game
2195           For Love of the Game
2003                       eXistenZ
4329               Poolhall Junkies
6161     Zathura: A Space Adventure
7367     Alan Partridge: Alpha Papa
1557                    BASEketball
4535        Spy Kids 3-D: Game Over
1029    Amityville: It's About Time
Name: title, dtype: object

In [46]:
generate_recommendations('Batman Forever')

2469                 Batman: Mask of the Phantasm
6798                              The Dark Knight
6014                                Batman Begins
1197                               Batman & Robin
517                                        Batman
1080                               Batman Returns
7368              Batman: Mystery of the Batwoman
5716    The Batman Superman Movie: World's Finest
5727           Batman Beyond: Return of the Joker
660                           Eyes Without a Face
6083                                     Cry_Wolf
2492                              Wayne's World 2
144                                       Hackers
1998                               Open Your Eyes
2579                                          JFK
6027                                At the Circus
6847                          The Incredible Hulk
2491                                Wayne's World
6150                                 Just Friends
4529                                Loose Cannons


We can see that the model returns similar movies in the Batman franchise, but it is not taking into consideration movie ratings at all. For example, Wayne’s world and Batman and Robin are two such movies that have extremely low ratings and should not be recommended. Hence, we modify our system to include movies with good reviews.

For this, we use the  IMDB's weighted rating formula


In [47]:
def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:35]
    movie_indices = [i[0] for i in sim_scores]
    global n
    global C
    movies = dfc.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    n = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= n) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['weightedrating'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('weightedrating', ascending=False).head(15)
    return qualified

In [48]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+n) * R) + (n/(n+v) * C)

In [49]:
improved_recommendations('Batman Forever')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,title,vote_count,vote_average,year,weightedrating
6798,The Dark Knight,12269,8,2008,7.91925
6014,Batman Begins,7511,7,2005,6.935694
1515,Back to the Future Part II,3926,7,1989,6.883796
517,Batman,2145,7,1989,6.806027
6562,Fracture,908,7,2007,6.637551
4764,WarGames,517,7,1983,6.500387
4439,Bruce Almighty,3121,6,2003,6.0
1080,Batman Returns,1706,6,1992,6.0
6847,The Incredible Hulk,3086,6,2008,6.0
2491,Wayne's World,738,6,1992,6.0


In [50]:
improved_recommendations('Jumanji')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,title,vote_count,vote_average,year,weightedrating
6445,Casino Royale,3930,7,2006,6.900262
1580,Peter Pan,1380,7,1953,6.748169
5495,Love Me If You Dare,531,7,2003,6.488445
7303,Geri's Game,309,7,1997,6.299548
6171,"The Chronicles of Narnia: The Lion, the Witch ...",2709,6,2005,5.957047
3845,Panic Room,1303,6,2002,5.919188
6161,Zathura: A Space Adventure,808,6,2005,5.882829
2819,The Running Man,713,6,1987,5.871755
3652,Spy Game,592,6,2001,5.854205
2003,eXistenZ,487,6,1999,5.834559


Collaborative Filtering


In [51]:
pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 2.9 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1633711 sha256=4deea59e729b429530041981dc0c06319adfd5e52c8485d7b904daed675bc32b
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [52]:
#Import necessary libraries
import numpy as np
import pandas as pd
from surprise import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

In [53]:
#Import data into a DataFrame and drop unnecessary columns 
df2 = df[['userId', 'movieId', 'rating']]

In [54]:
#Instansiate reader and data 
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df2, reader) 

In [55]:
df2['rating'].describe()

count    71512.000000
mean         3.506034
std          1.015955
min          0.500000
25%          3.000000
50%          3.500000
75%          4.000000
max          5.000000
Name: rating, dtype: float64

In [56]:
trainset, testset = train_test_split(data, test_size=.2)

1. Baseline Model

In [57]:
baseline = KNNBaseline(random_state=42)

In [58]:
baseline.fit(trainset)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x7f3ac85784d0>

In [59]:
baselinepreds = baseline.test(testset)

In [60]:
#Check RMSE and MAE results 
accuracy.rmse(baselinepreds)
accuracy.mae(baselinepreds)

RMSE: 0.8361
MAE:  0.6389


0.6388537809360513

In [61]:
cv_baseline = cross_validate(baseline, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8460  0.8410  0.8440  0.8436  0.0021  
MAE (testset)     0.6473  0.6440  0.6458  0.6457  0.0014  
Fit time          0.27    0.34    0.42    0.34    0.06    
Test time         3.49    3.96    4.51    3.99    0.42    


In [62]:
for i in cv_baseline.items():
    print(i)

('test_rmse', array([0.84595664, 0.84096573, 0.84399596]))
('test_mae', array([0.64732345, 0.64400801, 0.64583123]))
('fit_time', (0.26674723625183105, 0.34377217292785645, 0.4190504550933838))
('test_time', (3.4888038635253906, 3.955693006515503, 4.511263132095337))


In [63]:
#Find the average test RMSE from the 3-Fold cross-validation
np.mean(cv_baseline['test_rmse'])

0.8436394440408316

2. Gridsearch + SVD

In [64]:
#Set parameters for GridSearch on SVD model 
parameters = {'n_factors': [20, 50, 80],
             'reg_all': [0.04, 0.06],
             'n_epochs': [10, 20, 30],
             'lr_all': [.002, .005, .01]}
gridsvd = GridSearchCV(SVD, param_grid=parameters, n_jobs=-1)

In [65]:
#Fit SVD model on data
gridsvd.fit(data)

In [66]:
#Print best score and best parameters from the GridSearch 
print(gridsvd.best_score)
print(gridsvd.best_params)

{'rmse': 0.8240002525831558, 'mae': 0.6312104190405059}
{'rmse': {'n_factors': 80, 'reg_all': 0.06, 'n_epochs': 30, 'lr_all': 0.01}, 'mae': {'n_factors': 80, 'reg_all': 0.06, 'n_epochs': 30, 'lr_all': 0.01}}


In [67]:
#Reinstantiate the model with the best parameters fromGridSearch 
svdtuned = SVD(n_factors=80,
               reg_all=0.06,
               n_epochs=30,
               lr_all=0.01)

In [68]:
#Fit and predict the model 
svdtuned.fit(trainset)
svdpreds = svdtuned.test(testset)

In [69]:
#Print RMSE and MAE results 
accuracy.rmse(svdpreds)
accuracy.mae(svdpreds)

RMSE: 0.8234
MAE:  0.6288


0.6288297882268268

In [70]:
#Perform 3-Fold cross validation for SVD tuned model
cv_svd_tuned = cross_validate(svdtuned, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8340  0.8417  0.8279  0.8345  0.0057  
MAE (testset)     0.6395  0.6461  0.6344  0.6400  0.0048  
Fit time          3.71    3.85    3.85    3.80    0.06    
Test time         0.28    0.18    0.17    0.21    0.05    


In [None]:
#Display the results for all 3-folds 
for i in cv_svd_tuned.items():
    print(i)

In [None]:
# Print out the average RMSE score for the test set
np.mean(cv_svd_tuned['test_rmse'])

3. Gridsearch + KNN Basic

In [None]:
# Set parameters to be used in KNN models 
knn_params = {'name': ['cosine', 'pearson'],
              'user_based':[True, False], 
              'min_support':[True, False],
            'min_k' : [1, 2]}

In [None]:
# Apply GridSearch to the KNN Basic model to identify the best parameters
gsknnbasic = GridSearchCV(KNNBasic, knn_params, measures=['rmse', 'mae'], cv=3)
gsknnbasic.fit(data)

In [75]:
#Display the best scores and parameters from GridSearch
print(gsknnbasic.best_score)
print(gsknnbasic.best_params)

{'rmse': 0.9165510298218832, 'mae': 0.7070307599173494}
{'rmse': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 2}, 'mae': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 2}}


In [None]:
#Reinstantiate the model with the best parameters from GridSearch 
knnbasic_tuned = KNNBasic(sim_options={'name': 'cosine', 
                                       'user_based': True, 
                                       'min_support':True, 
                                       'min_k':1, })

In [None]:
#Fit on the train set and predict on the test set 
knnbasic_tuned.fit(trainset)
knnbpreds = knnbasic_tuned.test(testset)

In [None]:
#Print RMSE and MAE results 
accuracy.rmse(knnbpreds)
accuracy.mae(knnbpreds)

In [None]:
#Conduct cross validation for the KNNBasic tuned model 
cv_knn_basic = cross_validate(knnbasic_tuned, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

In [None]:
# Print out results from the cross-valdiatoin 
for i in cv_knn_basic.items():
    print(i)

In [81]:
# Print out the average RMSE score for the test set
np.mean(cv_knn_basic['test_rmse'])

0.9378539796423725

4. GridSearch + KNNBaseline

In [82]:
#Apply KNN GridSearch parameters on the KNNBaseline model 
gsknnbaseline = GridSearchCV(KNNBaseline, knn_params, measures=['rmse', 'mae'], cv=3)
gsknnbaseline.fit(data)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matr

In [None]:
#Display the best score and the best parameters 
print(gsknnbaseline.best_score)
print(gsknnbaseline.best_params)

In [None]:
#Reinstantiate the model with the best parameters from GridSearch 
knnbaseline_tuned = KNNBaseline(sim_options={'name': 'cosine', 
                                       'user_based': True, 
                                       'min_support':True, 
                                       'min_k':2, })

In [None]:
#Fit the trainset and predict on the test set 
knnbaseline_tuned.fit(trainset)
knnbaselinepreds = knnbaseline_tuned.test(testset)

In [None]:
#Print the RMSE and MAE scores 
accuracy.rmse(knnbaselinepreds)
accuracy.mae(knnbaselinepreds)

In [None]:
#Perform 3 fold cross validation 
cv_knn_baseline = cross_validate(knnbaseline_tuned, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

In [None]:
#Show the mean RMSE score for the test set 
np.mean(cv_knn_baseline['test_rmse'])

Model comparison

In [None]:
#Create a dictionary for each models' results 
baselineresult = {'model': 'baseline','RMSE': accuracy.rmse(baselinepreds), 'MAE': accuracy.mae(baselinepreds), 'CV': np.mean(cv_baseline['test_rmse'])}
svdresult = {'model':'svd', 'RMSE': accuracy.rmse(svdpreds), 'MAE': accuracy.mae(svdpreds), 'CV': np.mean(cv_svd_tuned['test_rmse'])}
knnbasicresult = {'model':'knnbasic','RMSE': accuracy.rmse(knnbpreds), 'MAE': accuracy.mae(knnbpreds), 'CV': np.mean(cv_knn_basic['test_rmse'])}
knnbaselineresult = {'model':'knnbaseline','RMSE': accuracy.rmse(knnbaselinepreds), 'MAE': accuracy.mae(knnbaselinepreds), 'CV': np.mean(cv_knn_baseline['test_rmse'])}

In [None]:
#Combine all the results into a list 
result_list = [baselineresult, svdresult, knnbasicresult, knnbaselineresult]
#Transform the results lists into a DataFrame 
df_results_updated = pd.DataFrame.from_dict(result_list, orient='columns')
df_results_updated = df_results_updated.set_index('model')

In [91]:
#Display the results for all of the models 
df_results_updated

Unnamed: 0_level_0,RMSE,MAE,CV
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
baseline,0.836054,0.638854,0.843639
svd,0.823351,0.62883,0.834515
knnbasic,0.933039,0.721458,0.937854
knnbaseline,0.839337,0.641865,0.847625


Get Predictions

In [92]:
#Create list of unique userIds and movieIds 
userids = df2['userId'].unique()
movieids = df2['movieId'].unique()

In [93]:
#Create a list and append the userId, movieId, and estimated ratings 
predictions = []
for u in userids:
    for n in movieids:
        predicted = svdtuned.predict(u, n)
        predictions.append([u, n, predicted[3]])

In [None]:
#Convert the list to a dataframe
estimated = pd.DataFrame(predictions)

In [None]:
#rename columns of DataFrame 
estimated.rename(columns={0: 'userId', 1: 'movieId', 2:'estimatedrating'}, inplace=True)

In [None]:
#Export the estimated data to a csv file 
estimated.to_csv('estimated.csv')

In [97]:
estimated.head()

Unnamed: 0,userId,movieId,estimatedrating
0,1,1,4.481462
1,1,3,4.068197
2,1,6,4.416878
3,1,47,4.749033
4,1,50,4.999811


In [None]:
estimatedx= estimated.merge(m, left_on='movieId', right_on='movieId', how='inner')

In [None]:
estimatedx.head()

In [100]:
def generate(user,n):
  k=estimatedx[estimatedx["userId"]==user]
  k=k.sort_values("estimatedrating",ascending=False)
  k=k.reset_index()
  del k['index']
  return k.head(n)
  # else:
  #   print("User does not exist")

In [101]:
generate(610,10)

Unnamed: 0,userId,movieId,estimatedrating,title,genres,year
0,610,3836,4.900878,Kelly's Heroes,"[Action, Comedy, War]",1970
1,610,3030,4.87885,Yojimbo,"[Action, Adventure]",1961
2,610,858,4.868218,"Godfather, The","[Crime, Drama]",1972
3,610,720,4.844019,Wallace & Gromit: The Best of Aardman Animation,"[Adventure, Animation, Comedy]",1996
4,610,2959,4.811357,Fight Club,"[Action, Crime, Drama, Thriller]",1999
5,610,1196,4.778372,Star Wars: Episode V - The Empire Strikes Back,"[Action, Adventure, Sci-Fi]",1980
6,610,1204,4.773598,Lawrence of Arabia,"[Adventure, Drama, War]",1962
7,610,58559,4.753331,"Dark Knight, The","[Action, Crime, Drama, IMAX]",2008
8,610,3201,4.751866,Five Easy Pieces,[Drama],1970
9,610,296,4.718121,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",1994


In [102]:
generate(10,20)

Unnamed: 0,userId,movieId,estimatedrating,title,genres,year
0,10,91529,4.505953,"Dark Knight Rises, The","[Action, Adventure, Crime, IMAX]",2012
1,10,7458,4.485755,Troy,"[Action, Adventure, Drama, War]",2004
2,10,49272,4.478872,Casino Royale,"[Action, Adventure, Thriller]",2006
3,10,89904,4.453364,The Artist,"[Comedy, Drama, Romance]",2011
4,10,81845,4.376803,"King's Speech, The",[Drama],2010
5,10,49286,4.357368,"Holiday, The","[Comedy, Romance]",2006
6,10,3451,4.356546,Guess Who's Coming to Dinner,[Drama],1967
7,10,4878,4.338246,Donnie Darko,"[Drama, Mystery, Sci-Fi, Thriller]",2001
8,10,3083,4.330734,All About My Mother (Todo sobre mi madre),[Drama],1999
9,10,52281,4.32257,Grindhouse,"[Action, Crime, Horror, Sci-Fi, Thriller]",2007
