## Netflix Recommendation Competition Dataset

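Prerequisite: install the `surprise` package with `pip install surprise`.
If the installation has to compile the package on Windows, the Microsoft C++ Build Tools may be required: https://visualstudio.microsoft.com/visual-cpp-build-tools/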

In [1]:
import pandas as pd
import numpy as np
import math
import re
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD, evaluate
# Select seaborn's "white" plotting style.
sns.set_style("white")

### Load data set

In [3]:
# Read the ratings table and discard its "Unnamed: 0" column
# (presumably an index column left over from an earlier export — TODO confirm).
df = pd.read_csv("netflix-prize-data/netflix.csv").drop("Unnamed: 0", axis=1)

In [5]:
# Report the dimensions first and keep `df.head()` as the final expression:
# a notebook cell only renders its last expression, so a `head()` call placed
# before the print is evaluated and then silently discarded.
print("Shape of data is {}".format(df.shape))
df.head()

Shape of data is (24053764, 3)


### Find movies and customers with too few ratings

In [7]:
def _summarise_ratings(ratings, key, stats, quantile=0.8):
    """Summarise `Rating` per `key` and find the ids with few ratings.

    Returns a tuple ``(summary, benchmark, low_count_ids)``:
    * summary       -- `stats` of `Rating` grouped by `key`, index mapped through int
    * benchmark     -- the `quantile` of the per-group rating counts, rounded to 0 decimals
    * low_count_ids -- index of the groups whose count is strictly below the benchmark
    """
    summary = ratings.groupby(key)['Rating'].agg(stats)
    summary.index = summary.index.map(int)
    benchmark = round(summary['count'].quantile(quantile), 0)
    low_count_ids = summary[summary['count'] < benchmark].index
    return summary, benchmark, low_count_ids


# Statistics gathered for each movie and each customer.
f = ['count', 'mean']

# Movies: drop_movie_list holds the movies rated fewer times than the benchmark.
df_movie_summary, movie_benchmark, drop_movie_list = _summarise_ratings(df, 'Movie_Id', f)
print('Movie minimum times of review: {}'.format(movie_benchmark))

# Customers: drop_cust_list holds the customers with fewer ratings than the benchmark.
df_cust_summary, cust_benchmark, drop_cust_list = _summarise_ratings(df, 'Cust_Id', f)
print('Customer minimum times of review: {}'.format(cust_benchmark))

Movie minimum times of review: 3884.0
Customer minimum times of review: 79.0


### Use Singular Value Decomposition (SVD) to predict movie preference

- Using surprise package to train SVD and predict movies for a user
ref: https://surprise.readthedocs.io/en/stable/

In [8]:
# Cross-validate SVD on a subset only — the first 500,000 ratings — to keep
# the run time manageable.
first_ratings = df.loc[:, ['Cust_Id', 'Movie_Id', 'Rating']].iloc[:500000]
reader = Reader()
data = Dataset.load_from_df(first_ratings, reader)
data.split(n_folds=3)  # three cross-validation folds

# NOTE(review): Dataset.split() and evaluate() belong to the older Surprise
# API; newer releases appear to offer surprise.model_selection.cross_validate
# instead — confirm against the installed version before upgrading.
svd = SVD()
evaluate(svd, data, measures=['RMSE', 'MAE'])



Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.9987
MAE:  0.7851
------------
Fold 2
RMSE: 0.9964
MAE:  0.7839
------------
Fold 3
RMSE: 1.0015
MAE:  0.7878
------------
------------
Mean RMSE: 0.9989
Mean MAE : 0.7856
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.9987090051817689,
                             0.9964101402201426,
                             1.0014962695520255],
                            'mae': [0.7850540988857813,
                             0.7838643740513767,
                             0.7877793846444987]})

### Load the movie titles dataset

In [9]:
# Movie titles keyed by Movie_Id. The file is read with header=None, so the
# column names are supplied explicitly.
df_title = pd.read_csv(
    'netflix-prize-data/movie_titles.csv',
    encoding="ISO-8859-1",
    header=None,
    names=['Movie_Id', 'Year', 'Name'],
).set_index('Movie_Id')
print(df_title.head(10))

            Year                          Name
Movie_Id                                      
1         2003.0               Dinosaur Planet
2         2004.0    Isle of Man TT 2004 Review
3         1997.0                     Character
4         1994.0  Paula Abdul's Get Up & Dance
5         2004.0      The Rise and Fall of ECW
6         1997.0                          Sick
7         1992.0                         8 Man
8         2004.0    What the #$*! Do We Know!?
9         1991.0      Class of Nuke 'Em High 2
10        2001.0                       Fighter


In [10]:
# Titles that customer 785314 rated 5 stars: select the matching rating rows,
# key them by Movie_Id and look the names up in the title table.
five_star_by_785314 = (df['Cust_Id'] == 785314) & (df['Rating'] == 5)
df_785314 = df.loc[five_star_by_785314].set_index('Movie_Id').join(df_title)['Name']
print(df_785314[:10])

Movie_Id
57         Richard III
175     Reservoir Dogs
311            Ed Wood
329              Dogma
331        Chasing Amy
395      Captain Blood
788             Clerks
798               Jaws
907    Animal Crackers
985          The Mummy
Name: Name, dtype: object


In [12]:
# Let's predict which movies user 785314 would love to watch.

# Refit the existing `svd` model on the first 1,000,000 ratings
# (a subset of df, not the full dataset).
data = Dataset.load_from_df(df[['Cust_Id', 'Movie_Id', 'Rating']][:1000000], reader)
trainset = data.build_full_trainset()
svd.fit(trainset)

# Score every title that is not in drop_movie_list, then rank the titles by
# their estimated rating for this user.
user_785314 = (
    df_title.reset_index()
    .loc[lambda titles: ~titles['Movie_Id'].isin(drop_movie_list)]
    .assign(
        Estimate_Score=lambda titles: titles['Movie_Id'].map(
            lambda movie_id: svd.predict(785314, movie_id).est
        )
    )
    .drop('Movie_Id', axis=1)
    .sort_values('Estimate_Score', ascending=False)
)
print(user_785314.head(10))

       Year                                  Name  Estimate_Score
208  1996.0  Star Trek: Deep Space Nine: Season 5        4.377619
137  1995.0          Star Trek: Voyager: Season 1        4.325194
151  1995.0                     A Little Princess        3.588695
32   2000.0        Aqua Teen Hunger Force: Vol. 1        3.584963
27   2002.0                       Lilo and Stitch        3.500838
190  2003.0                      X2: X-Men United        3.473903
142  1997.0                              The Game        3.473758
174  1992.0                        Reservoir Dogs        3.384928
166  2004.0                            The Chorus        3.379664
45   1964.0        Rudolph the Red-Nosed Reindeer        3.284995


### Writing a function to automate the above steps

In [14]:
def predict_movie(user_id, movie_list, df, max_rows=1000000, top_n=10):
    """Print a user's 5-star history and the movies SVD rates highest for them.

    Parameters
    ----------
    user_id
        Customer id; compared with ``df['Cust_Id']`` and passed to
        ``svd.predict``.
    movie_list : pandas.DataFrame
        Title table indexed by ``Movie_Id`` and holding a ``Name`` column
        (for example ``df_title``). It is used both to name the user's liked
        movies and as the pool of candidate movies.
    df : pandas.DataFrame
        Ratings with ``Cust_Id``, ``Movie_Id`` and ``Rating`` columns.
    max_rows : int or None, optional
        Number of leading rows of ``df`` used to train the model. Defaults to
        1,000,000; ``None`` trains on every row.
    top_n : int, optional
        Number of rows shown in each of the two printed listings (default 10).

    Returns
    -------
    None
        Both listings are printed rather than returned.

    Notes
    -----
    * Candidates listed in the notebook-level ``drop_movie_list`` are skipped.
    * Movies the user has already rated are not removed from the candidates.
    * A new ``SVD`` model is trained on every call.
    """
    # View the historical preference of the user: their 5-star ratings.
    liked_mask = (df['Cust_Id'] == user_id) & (df['Rating'] == 5)
    liked = df[liked_mask].set_index('Movie_Id')
    # Look names up in the caller-supplied title table, not the global df_title,
    # so the function honours its `movie_list` argument.
    liked_names = liked.join(movie_list)['Name']
    print("Movies Previously liked by user.....................")
    print(liked_names[:top_n])

    # Candidate movies: every title except those in drop_movie_list.
    # .copy() makes the filtered frame independent before a column is added.
    candidates = movie_list.reset_index()
    candidates = candidates[~candidates['Movie_Id'].isin(drop_movie_list)].copy()

    # Train a fresh SVD model on the leading `max_rows` ratings.
    reader = Reader()
    ratings = df[['Cust_Id', 'Movie_Id', 'Rating']][:max_rows]
    trainset = Dataset.load_from_df(ratings, reader).build_full_trainset()
    svd = SVD()
    svd.fit(trainset)

    # Estimate this user's rating for each candidate and rank best-first.
    candidates['Estimate_Score'] = candidates['Movie_Id'].apply(
        lambda movie_id: svd.predict(user_id, movie_id).est
    )
    ranked = candidates.drop('Movie_Id', axis=1).sort_values('Estimate_Score', ascending=False)
    print("Recommended Movies for User are as follows.........\n")
    print(ranked.head(top_n))

In [17]:
# Show the 5-star history and the top predicted titles for customer 512536.
predict_movie(user_id=512536, movie_list= df_title, df = df)

Movies Previously liked by user.....................
Movie_Id
175                       Reservoir Dogs
223           Chappelle's Show: Season 1
281                           The Legend
312                        High Fidelity
416                             Elephant
457                    Kill Bill: Vol. 2
919                             Comedian
996                              Yojimbo
1479                    Man on the Train
1642    Casino: 10th Anniversary Edition
Name: Name, dtype: object
Recommended Movies for User are as follows.........

       Year                                  Name  Estimate_Score
222  2003.0            Chappelle's Show: Season 1        4.665305
151  1995.0                     A Little Princess        4.622339
208  1996.0  Star Trek: Deep Space Nine: Season 5        4.571597
32   2000.0        Aqua Teen Hunger Force: Vol. 1        4.544464
17   1994.0                      Immortal Beloved        4.463925
174  1992.0                        Reservoir Dogs    

In [31]:
# The last five customer ids in the ratings table — use any of them as the
# user_id argument of predict_movie to check that user's preferences.

df["Cust_Id"].tail()

24053759    2591364
24053760    1791000
24053761     512536
24053762     988963
24053763    1704416
Name: Cust_Id, dtype: int64