## Movie Recommendation System - Collaborative Filtering


In [1]:
# DataSource :  https://grouplens.org/datasets/movielens/latest/
# Details: http://files.grouplens.org/datasets/movielens/ml-latest-README.html
# It contains 27753444 ratings and 1108997 tag applications across 58098 movies. 
# These data were created by 283228 users between January 09, 1995 and September 26, 2018. 
# This dataset was generated on September 26, 2018. 
# movies.csv , ratings.csv, tags.csv, links.csv ,genome-scores.csv ,genome-tags.csv

# consider user who has provided mimimum 100 movies rating
# consider movie whic has more than 1000 ratings


In [2]:
# During deployment of this model in to heroko seeing problem due to huge size of modle and trained vector
# model size give approxmatly 200MB this much huge data not accepted in github during commt time also
# we are going to reduce this dataset size and number of year of movies

# collect data from 2013 to till 2018 - 10 yrs movies 
# instead of 20 years data we taking 10yrs data

In [3]:
# import required libraries
import pandas as pd
import numpy as np

In [4]:
# Load movies data csv
movies_df_original= pd.read_csv('ml-latest/movies.csv',usecols=['movieId','title','genres'],dtype={'movieId': 'int32', 'title': 'str','genres': 'str'})
#movies_df

In [5]:
# Load ratings csv
ratings_df= pd.read_csv('ml-latest/ratings.csv',usecols=['userId','movieId','rating','timestamp'],dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32', 'timestamp': 'str'})
#ratings_df

In [6]:
#ratings_df.groupby(by="userId")["movieId"].count().sort_values(ascending=True)

In [7]:
# find each user given ratig with respect to movie
user_ratingCount= ratings_df.groupby(by="userId")["movieId"].count().reset_index()
user_ratingCount

Unnamed: 0,userId,movieId
0,1,16
1,2,15
2,3,11
3,4,736
4,5,72
...,...,...
283223,283224,329
283224,283225,20
283225,283226,11
283226,283227,17


In [8]:
# choose valid users out of 283228 , considering who provided mimimum 150 movies rating
user_threshold = 150
valuable_user_ratingCount= user_ratingCount.query('movieId >= @user_threshold')
valuable_user_ratingCount.rename(columns={'movieId': 'totalMovieRated'}, inplace=True)
valuable_user_ratingCount

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,userId,totalMovieRated
3,4,736
13,14,174
14,15,162
18,19,262
41,42,308
...,...,...
283183,283184,469
283184,283185,222
283194,283195,1512
283203,283204,278


In [9]:
# merger rating and total movie reated 
ratings_movie_rated_df  = pd.merge(ratings_df, valuable_user_ratingCount, how='inner', on="userId")

In [10]:
# drop unnecessary column timestamp
ratings_movie_rated_df.drop("timestamp",axis=1,inplace=True)

### Null Check

In [11]:
# movies_df, ratings_movie_rated_df, tags_df, links_df, genome_scores_df, genome_tags_df
movies_df_original.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [12]:
ratings_movie_rated_df.isnull().sum()

userId             0
movieId            0
rating             0
totalMovieRated    0
dtype: int64

### Joins  Dataframes 
#### movies_df, ratings_movie_rated_df, tags_df, links_df, genome_scores_df, genome_tags_df

In [13]:
# Merge movies_df and ratings_movie_rated_df with inner join
# movies_ratings=movies_df.merge(ratings_df, left_on = 'movieId', right_on = 'movieId', how = 'left')
movies_ratings  = pd.merge(movies_df_original, ratings_movie_rated_df,  how='inner', on=['movieId'])
movies_ratings

Unnamed: 0,movieId,title,genres,userId,rating,totalMovieRated
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4,4.0,736
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,14,4.5,174
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,4.0,162
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,42,4.0,308
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,43,5.0,246
...,...,...,...,...,...,...
18992668,193876,The Great Glinka (1946),(no genres listed),103565,3.0,838
18992669,193878,Les tribulations d'une caissière (2011),Comedy,176871,2.0,607
18992670,193880,Her Name Was Mumu (2016),Drama,81710,2.0,1878
18992671,193882,Flora (2017),Adventure|Drama|Horror|Sci-Fi,33330,2.0,1881


In [14]:
# Null check for movies_ratings if anything is missed juring merge time
movies_ratings.isnull().sum()

movieId            0
title              0
genres             0
userId             0
rating             0
totalMovieRated    0
dtype: int64

In [15]:
# convert title to lower case
movies_ratings["title"]=movies_ratings["title"].str.lower()
# remove white spaces from title
movies_ratings["title"]=movies_ratings["title"].str.strip()

In [16]:
# parse movie title and get title name
def getTitle(title):
    try:
        _title=title.split("(")[0]
    except:
        _title="No_Title"
    return _title


In [17]:
# parse movie title and get movie year
def getYear(title):
    try:
        _year=title.split("(")[1].split(")")[0]
    except:
        _year =2013 # unkonw yere treated as 2013
    return _year

In [18]:
print(getTitle("Toy Story (1995)"))
print(getYear("Toy Story (1995)"))

Toy Story 
1995


In [19]:
movies_ratings["year"]=movies_ratings["title"].apply(getYear)

In [20]:
movies_ratings["title"]=movies_ratings["title"].apply(getTitle)

In [21]:
# userRatedMovies - a user given total movies rating
# user id 4 - given rating to total 736 movies 
movies_ratings.rename(columns = {'totalMovieRated': 'userRatedMovies'}, inplace=True)
movies_ratings[movies_ratings["userId"]==4]

Unnamed: 0,movieId,title,genres,userId,rating,userRatedMovies,year
0,1,toy story,Adventure|Animation|Children|Comedy|Fantasy,4,4.0,736,1995
30478,2,jumanji,Adventure|Children|Fantasy,4,4.0,736,1995
54824,5,father of the bride part ii,Comedy,4,2.0,736,1995
60482,6,heat,Action|Crime|Thriller,4,4.5,736,1995
84614,10,goldeneye,Action|Adventure|Thriller,4,4.0,736,1995
...,...,...,...,...,...,...,...
15843490,53972,live free or die hard,Action|Adventure|Crime|Thriller,4,3.5,736,2007
15852235,53996,transformers,Action|Sci-Fi|Thriller|IMAX,4,4.5,736,2007
15891115,54286,"bourne ultimatum, the",Action|Crime|Thriller,4,5.0,736,2007
15997210,55765,american gangster,Crime|Drama|Thriller,4,4.5,736,2007


In [22]:
movies_ratings.shape

(18992673, 7)

In [23]:
# need to convert year string to numeric datatype to run queries
movies_ratings.year = pd.to_numeric(movies_ratings.year, errors='coerce').fillna(0).astype(np.int64)
movies_ratings_filtered = movies_ratings[movies_ratings["year"]>=2013]
movies_ratings_filtered

Unnamed: 0,movieId,title,genres,userId,rating,userRatedMovies,year
14793233,32930,category 6: day of destruction,Action|Drama,72115,5.0,307,2013
14793234,32930,category 6: day of destruction,Action|Drama,112491,1.5,2912,2013
14793235,32930,category 6: day of destruction,Action|Drama,273842,3.0,3534,2013
15110312,40697,babylon 5,Sci-Fi,3846,4.0,1015,2013
15110313,40697,babylon 5,Sci-Fi,7066,5.0,170,2013
...,...,...,...,...,...,...,...
18992663,193863,cocaine godmother,Documentary|Drama,117654,2.0,1020,2017
18992665,193866,tales from the hood 2,Horror,19924,1.0,3312,2018
18992670,193880,her name was mumu,Drama,81710,2.0,1878,2016
18992671,193882,flora,Adventure|Drama|Horror|Sci-Fi,33330,2.0,1881,2017


In [24]:
movies_ratings = movies_ratings_filtered.copy()

##### rows 24498900 is causing below error during pivot table, so we need to reduce with help of totalRatingCount limitation
##### ValueError: Unstacked DataFrame is too big, causing int32 overflow
##### Accpet movie if it has more than 50 ratings

In [25]:
# we have taken user_threshold = 50 and to filter weighted users out of 283228  ie.final users are 109672
# find movie total rating - toy story  total ratings = 50624 
movie_ratingCount= movies_ratings.groupby(by="movieId")["userId"].count().reset_index()
movie_ratingCount.rename(columns = {'userId': 'totalRatingCount'}, inplace=True)
movie_ratingCount # Total Movies 50007


Unnamed: 0,movieId,totalRatingCount
0,32930,3
1,40697,175
2,95595,7
3,99007,1751
4,99335,6
...,...,...
11198,193863,1
11199,193866,1
11200,193880,1
11201,193882,1


In [26]:
# consider a movie which has minimum 2000 ratings
popularity_threshold = 2000
rating_popular_movie= movie_ratingCount.query('totalRatingCount >= @popularity_threshold')
rating_popular_movie

Unnamed: 0,movieId,totalRatingCount
60,101864,4054
69,102123,2376
70,102125,5814
80,102407,3374
81,102445,5496
...,...,...
6555,168250,2554
6556,168252,3301
7278,171763,2153
7598,174055,2249


In [27]:
# Merge movies_ratings and rating_popular_movie
movies_ratings_final=pd.merge(movies_ratings,rating_popular_movie, on="movieId", how ="inner") 
movies_ratings_final

Unnamed: 0,movieId,title,genres,userId,rating,userRatedMovies,year,totalRatingCount
0,101864,oblivion,Action|Adventure|Sci-Fi|IMAX,48,3.5,170,2013,4054
1,101864,oblivion,Action|Adventure|Sci-Fi|IMAX,81,4.0,1093,2013,4054
2,101864,oblivion,Action|Adventure|Sci-Fi|IMAX,134,4.5,1208,2013,4054
3,101864,oblivion,Action|Adventure|Sci-Fi|IMAX,173,3.5,1352,2013,4054
4,101864,oblivion,Action|Adventure|Sci-Fi|IMAX,176,4.0,702,2013,4054
...,...,...,...,...,...,...,...,...
444883,176371,blade runner 2049,Sci-Fi,282748,4.0,1033,2017,2606
444884,176371,blade runner 2049,Sci-Fi,282869,4.5,801,2017,2606
444885,176371,blade runner 2049,Sci-Fi,283000,3.5,2133,2017,2606
444886,176371,blade runner 2049,Sci-Fi,283183,4.0,253,2017,2606


In [28]:
movies_ratings_final.isnull().sum()

movieId             0
title               0
genres              0
userId              0
rating              0
userRatedMovies     0
year                0
totalRatingCount    0
dtype: int64

In [29]:
movies_ratings_final.shape

(444888, 8)

## # Create NearestNeighbors Model with cosine similarity
#### 1) create pivot matrix with respect to movie title, user id and user rating 
#### 2) Create CSR matrix
#### 3) create KNN model
#### 4) Test model

In [30]:
##create a Pivot matrix
movies_ratings_pivot=movies_ratings_final.pivot_table(index='title',columns='userId',values='rating',fill_value=0)
movies_ratings_pivot

userId,14,48,56,72,73,81,119,134,173,176,...,283000,283016,283095,283117,283125,283131,283164,283183,283184,283195
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 cloverfield lane,0.0,0.0,0.0,0.0,0.0,0.0,0,3.5,3.5,4.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12 years a slave,4.0,4.0,0.0,0.0,4.0,0.0,0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
22 jump street,0.0,0.0,0.0,0.0,0.0,0.5,0,3.0,3.0,0.0,...,4.0,0.0,2.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
about time,0.0,4.0,0.0,0.0,0.0,0.0,0,4.5,5.0,0.0,...,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
american hustle,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,3.5,...,4.0,0.0,4.5,2.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
world war z,4.0,2.0,0.0,5.0,0.0,0.5,0,3.5,3.5,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0
"world's end, the",0.0,0.0,0.0,0.0,0.0,0.0,0,4.0,3.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
x-men: apocalypse,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,3.0,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
x-men: days of future past,0.0,0.0,0.0,5.0,0.0,0.0,0,4.5,3.5,3.0,...,4.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0


In [31]:
# compressed sparse row (CSR) or compressed row storage (CRS) or compressed sparse matrix
from scipy.sparse import csr_matrix
movies_ratings_pivot_matrix = csr_matrix(movies_ratings_pivot.values)

In [32]:
# Create NearestNeighbors model
from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(movies_ratings_pivot_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [33]:
# with random index find title from movies_ratings_pivot
query_index = np.random.choice(movies_ratings_pivot.shape[0])
print("{0} = {1}".format(query_index,movies_ratings_pivot.index[query_index]))

37 = godzilla 


In [34]:
def getRecomendedMoviesByIndex(queryIndex,numberOfMovies):
    distances, indices = model_knn.kneighbors(movies_ratings_pivot.iloc[queryIndex,:].values.reshape(1, -1), 
                                              n_neighbors = numberOfMovies)
    return distances,indices

In [35]:
queryIndex=20 
numberOfMovies=6
distances, indices =getRecomendedMoviesByIndex(queryIndex,numberOfMovies)
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(movies_ratings_pivot.index[queryIndex]))
    else:
        print('{0}: {1} =  {2} distance '.format(i, movies_ratings_pivot.index[indices.flatten()[i]], distances.flatten()[i]))


Recommendations for conjuring, the :

1: 10 cloverfield lane  =  0.6125001218021 distance 
2: world war z  =  0.6393269054012438 distance 
3: prisoners  =  0.6639329204102649 distance 
4: get out  =  0.6651519356113855 distance 
5: gone girl  =  0.666960619446064 distance 


In [36]:
# This KNN model requires queryIndex from pivot table to find best recommended movies
# we will use NLP model to get query index by passing moive title from pivot table
# So we need to save this pivot table and model

#### Save Model and Pivot table - model trained by pivot table that will be used in next NLP model

In [37]:
# save model
import pickle
fileName="collaborativeFiltering_model.pkl"
pickle.dump(model_knn,open(fileName,'wb'))

In [38]:
# Save pivot table
movies_ratings_pivot.to_csv('movie_user_rating_pivottable.csv', index=False,encoding='utf-8',compression='gzip')

In [39]:
# Save some part of pivot table as test csv - can used during testing time
# pivottable loading to dataframe taking too much time due to huge dataset.
movies_ratings_pivot_test=movies_ratings_pivot.iloc[:10,:]
#movies_ratings_pivot_test
#movie_features_df_test
movies_ratings_pivot_test.to_csv('movie_user_rating_pivottable_test.csv', index=False,encoding='utf-8',compression='gzip') 

In [40]:
#movies_ratings_pivot[movies_ratings_pivot.isnull()]

In [41]:
movies_ratings_pivot.index

Index(['10 cloverfield lane ', '12 years a slave ', '22 jump street ',
       'about time ', 'american hustle ', 'american sniper ', 'ant-man ',
       'arrival ', 'avengers: age of ultron ', 'baby driver ',
       ...
       'we're the millers ', 'whiplash ', 'wolf of wall street, the ',
       'wolverine, the ', 'wonder woman ', 'world war z ', 'world's end, the ',
       'x-men: apocalypse ', 'x-men: days of future past ', 'zootopia '],
      dtype='object', name='title', length=112)

In [42]:
# We are going to use NLP model to get query index by passing movie title.
# Here index is coming from pivot table
# save this pivot table index ie movie title name and it's index

title=movies_ratings_pivot.index
index=np.arange(0,len(movies_ratings_pivot.index))

pivot_df = pd.DataFrame({'index': index,'title': title})
pivot_df
pivot_df.to_csv('movie_user_rating_index.csv', index=False,encoding='utf-8',compression='gzip') 