In [89]:
import pandas as pd
from math import sqrt
import numpy as np

In [90]:
moviesdf = pd.read_csv("Netflix_Dataset_Movie.csv")
moviesdf


Unnamed: 0,Movie_ID,Year,Name
0,1,2003,Dinosaur Planet
1,2,2004,Isle of Man TT 2004 Review
2,3,1997,Character
3,4,1994,Paula Abdul's Get Up & Dance
4,5,2004,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004,Fidel Castro: American Experience
17767,17768,2000,Epoch
17768,17769,2003,The Company


In [91]:
ratingdf =pd.read_csv('Netflix_Dataset_Rating.csv')
ratingdf

Unnamed: 0,User_ID,Rating,Movie_ID
0,712664,5,3
1,1331154,4,3
2,2632461,3,3
3,44937,5,3
4,656399,4,3
...,...,...,...
17337453,520675,3,4496
17337454,1055714,5,4496
17337455,2643029,4,4496
17337456,1559566,3,4496


In [92]:
userInput = [{'Name': 'Men in Black II','Rating' : 5},
            {'Name': 'Beverly Hills Cop','Rating' : 3},
            {'Name': 'Character','Rating' : 2},
            {'Name': 'Ricochet','Rating' : 4},
            {'Name': "High Fidelity",'Rating' : 1},]
inputMovie = pd.DataFrame(userInput)
inputMovie

Unnamed: 0,Name,Rating
0,Men in Black II,5
1,Beverly Hills Cop,3
2,Character,2
3,Ricochet,4
4,High Fidelity,1


In [93]:
inputId = moviesdf[moviesdf['Name'].isin(inputMovie['Name'].tolist())]
inputMovie = pd.merge(inputId, inputMovie)
inputMovie = inputMovie.drop('Year', 1)
inputMovie = inputMovie[['Movie_ID','Name','Rating']]
inputMovie

  inputMovie = inputMovie.drop('Year', 1)


Unnamed: 0,Movie_ID,Name,Rating
0,3,Character,2
1,312,High Fidelity,1
2,1744,Beverly Hills Cop,3
3,2199,Ricochet,4
4,3427,Men in Black II,5


In [94]:
userSubset = ratingdf[ratingdf['Movie_ID'].isin(inputMovie['Movie_ID'].tolist())]
userSubset.groupby('Movie_ID').count()

Unnamed: 0_level_0,User_ID,Rating
Movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
3,1524,1524
312,52726,52726
1744,57920,57920
2199,4676,4676
3427,81371,81371


In [95]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['User_ID'])

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])

In [96]:
#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[(1333,           User_ID  Rating  Movie_ID
66           1333       4         3
1098826      1333       3       312
6303895      1333       3      1744
8349609      1333       2      2199
12909250     1333       1      3427), (15420,           User_ID  Rating  Movie_ID
1188        15420       2         3
1138867     15420       2       312
6347897     15420       2      1744
8353173     15420       3      2199
12970903    15420       2      3427), (16272,           User_ID  Rating  Movie_ID
936         16272       4         3
1129175     16272       4       312
6337253     16272       4      1744
8352286     16272       3      2199
12955998    16272       3      3427), (32902,           User_ID  Rating  Movie_ID
1309        32902       4         3
1142826     32902       5       312
6352228     32902       5      1744
8353513     32902       1      2199
12977046    32902       3      3427), (57633,           User_ID  Rating  Movie_ID
1389        57633       4         3
1145242     5763

In [97]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='Movie_ID')
    inputMovie = inputMovie.sort_values(by='Movie_ID')

    #Get the N for the formula
    nRatings = len(group)

    #Get the review scores for the movies that they both have in common
    temp_df = inputMovie[inputMovie['Movie_ID'].isin(group['Movie_ID'].tolist())]

    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['Rating'].tolist()
   
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['Rating'].tolist()
   
    
    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0
    

In [98]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['User_ID'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex  User_ID
0        -0.832050     1333
1         0.353553    15420
2        -0.866025    16272
3        -0.661438    32902
4        -0.121268    57633


In [99]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head(100))

    similarityIndex  User_ID
51         0.878310    16273
38         0.866025  2118461
62         0.764471    29815
64         0.764471    33430
43         0.693375  2457095
67         0.662541    38879
40         0.639602  2220732
45         0.606339  2606799
42         0.577350  2439493
22         0.566947  1314869
54         0.529150    21364
87         0.478091    62691
17         0.426401   789014
1          0.353553    15420
14         0.288675   603277
12         0.176777   387418
50         0.169031    14642
93         0.169031    72607
90         0.169031    66109
88         0.169031    63858
65         0.152894    33696
7          0.138675   305344
91         0.097590    66852
73         0.097590    44434
46         0.050965     8135
70         0.000000    42630
48         0.000000    11043
97         0.000000    80176
60         0.000000    27061
39         0.000000  2143500
66         0.000000    34762
78         0.000000    47075
20         0.000000  1037245
28         0.0

In [100]:
topUsersRating=topUsers.merge(ratingdf, left_on='User_ID', right_on='User_ID', how='inner')
print(topUsersRating.head(100))

    similarityIndex  User_ID  Rating  Movie_ID
0           0.87831    16273       3        17
1           0.87831    16273       4        30
2           0.87831    16273       3        55
3           0.87831    16273       3        58
4           0.87831    16273       5        76
..              ...      ...     ...       ...
95          0.87831    16273       3       896
96          0.87831    16273       3       907
97          0.87831    16273       5       918
98          0.87831    16273       3       931
99          0.87831    16273       3       937

[100 rows x 4 columns]


In [101]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['Rating']
print(topUsersRating.head())

   similarityIndex  User_ID  Rating  Movie_ID  weightedRating
0          0.87831    16273       3        17         2.63493
1          0.87831    16273       4        30         3.51324
2          0.87831    16273       3        55         2.63493
3          0.87831    16273       3        58         2.63493
4          0.87831    16273       5        76         4.39155


In [102]:
#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('Movie_ID').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

          sum_similarityIndex  sum_weightedRating
Movie_ID                                         
3                    4.668661            7.370004
8                    4.209713            7.086718
16                   5.303888           17.535616
17                   4.148485           10.983048
18                   3.721425            9.982439


In [103]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['Movie_ID'] = tempTopUsersRating.index
print(recommendation_df.head(10))

          weighted average recommendation score  Movie_ID
Movie_ID                                                 
3                                      1.578612         3
8                                      1.683421         8
16                                     3.306181        16
17                                     2.647484        17
18                                     2.682424        18
26                                     3.100058        26
28                                     3.399868        28
30                                     3.916766        30
32                                     3.888461        32
33                                     2.489440        33


In [104]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df

Unnamed: 0_level_0,weighted average recommendation score,Movie_ID
Movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
2782,4.623715,2782
2452,4.611710,2452
3624,4.576816,3624
1757,4.564346,1757
985,4.481604,985
...,...,...
47,1.586866,47
4086,1.586499,4086
3,1.578612,3
3875,1.413342,3875


In [105]:
recommendedMovie=moviesdf.loc[moviesdf['Movie_ID'].isin(recommendation_df['Movie_ID'])]

#we don't want to recommend the same movie
recommendedMovie=recommendedMovie.loc[~recommendedMovie.Movie_ID.isin(userSubset['Movie_ID'])]

print(recommendedMovie)

      Movie_ID  Year                        Name
7            8  2004  What the #$*! Do We Know!?
15          16  1996                   Screamers
16          17  2005                   7 Seconds
17          18  1994            Immortal Beloved
25          26  2004             Never Die Alone
...        ...   ...                         ...
4487      4488  2000                 Wonder Boys
4489      4490  2004                   Ned Kelly
4491      4492  2004                  Club Dread
4492      4493  2003           Ju-on: The Grudge
4495      4496  1993       Farewell My Concubine

[1345 rows x 3 columns]
