## Movie Recommendation System


### Installing Data

 Data Source : https://www.kaggle.com/netflix-inc/netflix-prize-data

#### Data Preproccessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
from scipy import sparse
from scipy.sparse import csr_matrix

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import random

In [2]:
def readFile(file_path, rows=100000):
    data_dict = {'Cust_Id' : [], 'Rating' : [],  'Movie_Id' : []}
    f = open(file_path, "r")
    count = 0
    for line in f:
        count += 1
        if count > rows:
            break
            
        if ':' in line:
            movidId = line[:-2] # remove the last character ':'
            movieId = int(movidId)
        else:
            customerID, rating, date = line.split(',')
            data_dict['Cust_Id'].append(customerID)
            data_dict['Rating'].append(rating)
            data_dict['Movie_Id'].append(movieId)
    f.close()
            
    return pd.DataFrame(data_dict)
df1 = readFile('../Data/Netflix/combined_data_1.txt', rows=100000)
df2 = readFile('../Data/Netflix/combined_data_2.txt', rows=100000)
df3 = readFile('../Data/Netflix/combined_data_3.txt', rows=100000)
df4 = readFile('../Data/Netflix/combined_data_4.txt', rows=100000)
df1['Rating'] = df1['Rating'].astype(float)
df2['Rating'] = df2['Rating'].astype(float)
df3['Rating'] = df3['Rating'].astype(float)
df4['Rating'] = df4['Rating'].astype(float)

In [3]:
df=df1
df.append(df2)
df.append(df3)
df.append(df4)
df.head()

Unnamed: 0,Cust_Id,Rating,Movie_Id
0,1488844,3.0,1
1,822109,5.0,1
2,885013,4.0,1
3,30878,4.0,1
4,823519,3.0,1


In [4]:
print("Total Data:")
print("Total number of movie ratings = "+str(df.shape[0]))
print("Number of unique users = "+str(len(np.unique(df["Cust_Id"]))))
print("Number of unique movies = "+str(len(np.unique(df["Movie_Id"]))))

Total Data:
Total number of movie ratings = 99970
Number of unique users = 81472
Number of unique movies = 30


In [5]:
df_title = pd.read_csv('../Data/Netflix/movie_titles.csv', encoding = "ISO-8859-1", header = None, names = ['Movie_Id', 'Year', 'Name'])
df_title.head()

Unnamed: 0,Movie_Id,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW


In [6]:
df = pd.merge(df,df_title,on='Movie_Id')
df.head()

Unnamed: 0,Cust_Id,Rating,Movie_Id,Year,Name
0,1488844,3.0,1,2003.0,Dinosaur Planet
1,822109,5.0,1,2003.0,Dinosaur Planet
2,885013,4.0,1,2003.0,Dinosaur Planet
3,30878,4.0,1,2003.0,Dinosaur Planet
4,823519,3.0,1,2003.0,Dinosaur Planet


In [7]:
df.groupby('Name')['Rating'].count().sort_values(ascending=False).head()


Name
Lilo and Stitch               39752
What the #$*! Do We Know!?    14910
Immortal Beloved              10722
Something's Gotta Give         7172
7 Seconds                      7108
Name: Rating, dtype: int64

In [8]:
df.groupby('Name')['Rating'].mean().sort_values(ascending=False).head()

Name
Lord of the Rings: The Return of the King: Extended Edition: Bonus Material    4.552000
Inspector Morse 31: Death Is Now My Neighbour                                  3.970174
The Rise and Fall of ECW                                                       3.919298
Lilo and Stitch                                                                3.823254
Immortal Beloved                                                               3.784369
Name: Rating, dtype: float64

In [9]:
util_mat = df.pivot_table(index = 'Cust_Id', columns = 'Name', values = 'Rating')
util_mat.head()  

Name,7 Seconds,8 Man,Boycott,By Dawn's Early Light,Character,Chump Change,Class of Nuke 'Em High 2,Clifford: Clifford Saves the Day! / Clifford's Fluffiest Friend Cleo,Dinosaur Planet,Fighter,...,Never Die Alone,Paula Abdul's Get Up & Dance,Screamers,Seeta Aur Geeta,Sesame Street: Elmo's World: The Street We Live On,Sick,Something's Gotta Give,Strange Relations,The Rise and Fall of ECW,What the #$*! Do We Know!?
Cust_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100006,,,,,3.0,,,,,,...,,,,,,,,,,
1000079,,,,,,,,,,,...,,,,,,,,,,
1000105,,,,,,,,,,,...,,,,,,,,,,
1000158,,,,,,,,,,,...,,,,,,,,,,
1000192,,,,,,,,,,,...,,,,,,,,,,


In [10]:
type(util_mat)

pandas.core.frame.DataFrame

In [11]:
item_util_matrix = util_mat.copy()

# We will fill the row wise NaN's with the corresponding user's mean ratings, so that we can carry out Pearson correlation.
# Here we assume avg ratings for the movie that is not rated.
item_util_matrix = item_util_matrix.apply(lambda col: col.fillna(col.mean()), axis=0)
item_util_matrix.head()



Name,7 Seconds,8 Man,Boycott,By Dawn's Early Light,Character,Chump Change,Class of Nuke 'Em High 2,Clifford: Clifford Saves the Day! / Clifford's Fluffiest Friend Cleo,Dinosaur Planet,Fighter,...,Never Die Alone,Paula Abdul's Get Up & Dance,Screamers,Seeta Aur Geeta,Sesame Street: Elmo's World: The Street We Live On,Sick,Something's Gotta Give,Strange Relations,The Rise and Fall of ECW,What the #$*! Do We Know!?
Cust_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100006,2.903208,2.129032,3.59847,3.324675,3.0,2.246305,2.621053,3.556098,3.749543,3.180723,...,2.793721,2.739437,3.098555,3.146552,3.527473,3.084396,3.758226,3.463303,3.919298,3.189805
1000079,2.903208,2.129032,3.59847,3.324675,3.641153,2.246305,2.621053,3.556098,3.749543,3.180723,...,2.793721,2.739437,3.098555,3.146552,3.527473,3.084396,3.758226,3.463303,3.919298,3.189805
1000105,2.903208,2.129032,3.59847,3.324675,3.641153,2.246305,2.621053,3.556098,3.749543,3.180723,...,2.793721,2.739437,3.098555,3.146552,3.527473,3.084396,3.758226,3.463303,3.919298,3.189805
1000158,2.903208,2.129032,3.59847,3.324675,3.641153,2.246305,2.621053,3.556098,3.749543,3.180723,...,2.793721,2.739437,3.098555,3.146552,3.527473,3.084396,3.758226,3.463303,3.919298,3.189805
1000192,2.903208,2.129032,3.59847,3.324675,3.641153,2.246305,2.621053,3.556098,3.749543,3.180723,...,2.793721,2.739437,3.098555,3.146552,3.527473,3.084396,3.758226,3.463303,3.919298,3.189805


In [12]:
item_util_matrix.isna().sum().sum()


0

In [13]:
item_corr_matrix = item_util_matrix.corr()

In [14]:
item_corr_matrix.head()

Name,7 Seconds,8 Man,Boycott,By Dawn's Early Light,Character,Chump Change,Class of Nuke 'Em High 2,Clifford: Clifford Saves the Day! / Clifford's Fluffiest Friend Cleo,Dinosaur Planet,Fighter,...,Never Die Alone,Paula Abdul's Get Up & Dance,Screamers,Seeta Aur Geeta,Sesame Street: Elmo's World: The Street We Live On,Sick,Something's Gotta Give,Strange Relations,The Rise and Fall of ECW,What the #$*! Do We Know!?
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7 Seconds,1.0,0.014689,0.017389,0.010115,0.006853,0.002539,0.003418,0.009068,0.005539,0.009481,...,0.047314,0.006895,0.013027,0.007355,0.013739,0.00409,0.006654,0.007741,0.011152,0.005469
8 Man,0.014689,1.0,0.050871,0.051361,0.031564,0.019776,0.024524,0.067978,0.058701,0.053807,...,0.012755,0.034086,0.057651,0.067369,0.053334,0.018826,0.000279,-0.002641,0.029554,0.006465
Boycott,0.017389,0.050871,1.0,0.084472,0.059338,0.015542,0.041833,0.07216,0.043366,0.050144,...,0.033329,0.030638,0.017198,0.072136,0.080414,0.060476,0.002543,0.072615,0.068024,0.013735
By Dawn's Early Light,0.010115,0.051361,0.084472,1.0,0.051568,0.019393,0.022737,0.064702,0.038627,0.050087,...,0.01047,0.035622,0.024103,0.073294,0.074011,0.029957,-0.00145,0.089208,0.045883,0.012773
Character,0.006853,0.031564,0.059338,0.051568,1.0,0.013345,0.02318,0.041292,0.03576,0.06231,...,0.009064,0.0238,0.010179,0.048018,0.051062,0.035808,0.00173,0.06169,0.050896,0.013678


In [15]:
def moviecorr(moviename):
    movie_corr = item_corr_matrix[str(moviename)]
    movie_corr = movie_corr.sort_values(ascending=False)
    movie_corr.dropna(inplace=True)
    movie_corr = pd.DataFrame(movie_corr)


    return movie_corr.drop(labels=str(moviename),).head()


In [16]:
moviecorr('The Rise and Fall of ECW')

Unnamed: 0_level_0,The Rise and Fall of ECW
Name,Unnamed: 1_level_1
Isle of Man TT 2004 Review,0.158534
Sesame Street: Elmo's World: The Street We Live On,0.086018
Seeta Aur Geeta,0.068152
Boycott,0.068024
Dinosaur Planet,0.054877


In [17]:
id_util_mat=util_mat.T
id_util_mat = id_util_mat.apply(lambda col: col.fillna(col.mean()), axis=0)



In [18]:
id_util_mat_1 = id_util_mat.iloc[:10000,:10000]
#id_util_mat_2 = id_util_mat.iloc[10000:20000,10000:20000]
#id_util_mat_3 = id_util_mat.iloc[20000:30000,20000:30000]
#id_util_mat_4 = id_util_mat.iloc[30000:40000,30000:40000]
#id_util_mat_5 = id_util_mat.iloc[40000:50000,40000:50000]
#id_util_mat_6 = id_util_mat.iloc[50000:60000,50000:60000]
#id_util_mat_7 = id_util_mat.iloc[60000:70000,60000:70000]
#id_util_mat_8 = id_util_mat.iloc[70000:,70000:]


In [33]:
id_util_mat_1=id_util_mat_1.corr()
#id_util_mat_2=id_util_mat_2.corr()
#id_util_mat_3=id_util_mat_3.corr()
#id_util_mat_4=id_util_mat_4.corr()
#id_util_mat_5=id_util_mat_5.corr()
#id_util_mat_6=id_util_mat_6.corr()
#id_util_mat_7=id_util_mat_7.corr()
#id_util_mat_8=id_util_mat_8.corr()


In [28]:
id_util_mat_1.head()

Cust_Id,100006,1000079,1000105,1000158,1000192,1000232,100029,1000301,1000303,1000328,...,1292216,1292272,1292299,1292338,1292345,1292357,1292384,1292387,129250,1292571
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7 Seconds,3.0,2.0,4.0,3.0,2.0,5.0,5.0,3.5,3.0,2.0,...,3.0,5.0,3.0,3.0,4.0,2.0,4.0,2.0,3.0,3.0
8 Man,3.0,2.0,4.0,3.0,2.0,5.0,5.0,3.5,3.0,2.0,...,3.0,5.0,3.0,3.0,4.0,2.0,4.0,2.0,3.0,3.0
Boycott,3.0,2.0,4.0,3.0,2.0,5.0,5.0,3.5,3.0,2.0,...,3.0,5.0,3.0,3.0,4.0,2.0,4.0,2.0,3.0,3.0
By Dawn's Early Light,3.0,2.0,4.0,3.0,2.0,5.0,5.0,3.5,3.0,2.0,...,3.0,5.0,3.0,3.0,4.0,2.0,4.0,2.0,3.0,3.0
Character,3.0,2.0,4.0,3.0,2.0,5.0,5.0,3.5,3.0,2.0,...,3.0,5.0,3.0,3.0,4.0,2.0,4.0,2.0,3.0,3.0


In [40]:
def idcorr(moviename):
    id_corr = id_util_mat_1[str(moviename)]
    id_corr = id_corr.sort_values(ascending=False)
    id_corr.dropna(inplace=True)
    id_corr = pd.DataFrame(id_corr)


    return id_corr.drop(labels=str(moviename),).head()

In [48]:
id_corr = id_util_mat_1['1000105']
id_corr = id_corr.sort_values(ascending=False)
id_corr.dropna(inplace=True)
id_corr = pd.DataFrame(id_corr)

id_corr.head()

Unnamed: 0_level_0,1000105
Name,Unnamed: 1_level_1
What the #$*! Do We Know!?,4.0
The Rise and Fall of ECW,4.0
8 Man,4.0
Boycott,4.0
By Dawn's Early Light,4.0


In [None]:
i