In [30]:
import pandas as pd
from math import sqrt
import numpy as np

In [31]:
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')
print(movies_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None


In [32]:
userInput = [{'title':'Tom and Huck (1995)', 'rating':5},
             {'title':'Dangerous Minds (1995)', 'rating':3.5},
             {'title':'Copycat (1995)', 'rating':4},
             {'title':'Assassins (1995)', 'rating':4},
             {'title':'Sense and Sensibility (1995)', 'rating':4.5}]
inputMovies = pd.DataFrame(userInput)
print(inputMovies)

                          title  rating
0           Tom and Huck (1995)     5.0
1        Dangerous Minds (1995)     3.5
2                Copycat (1995)     4.0
3              Assassins (1995)     4.0
4  Sense and Sensibility (1995)     4.5


In [33]:
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
inputMovies = inputMovies[['movieId','title','rating']]
print(inputMovies)

   movieId                         title  rating
0        8           Tom and Huck (1995)     5.0
1       17  Sense and Sensibility (1995)     4.5
2       22                Copycat (1995)     4.0
3       23              Assassins (1995)     4.0
4       31        Dangerous Minds (1995)     3.5


In [34]:
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
print(userSubset.groupby('movieId').count())

         userId  rating  timestamp
movieId                           
8             8       8          8
17           67      67         67
22           36      36         36
23           16      16         16
31           38      38         38


In [35]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId'])

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])


#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[(414,        userId  movieId  rating  timestamp
62300     414        8     3.0  961594849
62305     414       17     4.0  961513829
62308     414       22     3.0  961518227
62309     414       23     2.0  961682276
62313     414       31     3.0  961518520), (6,      userId  movieId  rating  timestamp
566       6        8     3.0  845555281
572       6       17     4.0  845553559
575       6       22     5.0  845553875
580       6       31     3.0  845553819), (314,        userId  movieId  rating  timestamp
48587     314       17     4.0  834398442
48589     314       22     3.0  834398622
48590     314       23     3.0  834241586
48592     314       31     3.0  834241586), (274,        userId  movieId  rating   timestamp
39232     274        8     3.0  1172030892
39238     274       22     3.5  1171759024
39239     274       23     3.5  1171829251), (474,        userId  movieId  rating   timestamp
73100     474       17     5.0   974668666
73102     474       22     3.0  1046896006


  userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)


In [36]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')

    #Get the N for the formula
    nRatings = len(group)

    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]

    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()

    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()


    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [37]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex  userId
0         0.310087     414
1        -0.134840       6
2         0.816497     314
3        -1.000000     274
4         0.866025     474


In [38]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())

    similarityIndex  userId
20              1.0     323
19              1.0     307
8               1.0      40
18              1.0     182
16              1.0     162


In [39]:
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
print(topUsersRating.head(100))

    similarityIndex  userId  movieId  rating   timestamp
0               1.0     323        1     3.5  1422640363
1               1.0     323        2     4.0  1422640110
2               1.0     323       17     3.5  1422640288
3               1.0     323       19     2.5  1422640116
4               1.0     323       22     3.0  1422640551
..              ...     ...      ...     ...         ...
95              1.0     323   115617     3.5  1422640828
96              1.0     323   116797     4.5  1422640799
97              1.0     323   117176     4.0  1422640837
98              1.0     307        1     4.0  1186160893
99              1.0     307        2     2.5  1186161482

[100 rows x 5 columns]


In [40]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())

   similarityIndex  userId  movieId  rating   timestamp  weightedRating
0              1.0     323        1     3.5  1422640363             3.5
1              1.0     323        2     4.0  1422640110             4.0
2              1.0     323       17     3.5  1422640288             3.5
3              1.0     323       19     2.5  1422640116             2.5
4              1.0     323       22     3.0  1422640551             3.0


In [41]:
#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

         sum_similarityIndex  sum_weightedRating
movieId                                         
1                   8.088648           30.942057
2                   6.272152           20.518436
3                   1.906126            5.634407
4                   2.000000            6.000000
5                   4.676112           10.919212


In [42]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
print(recommendation_df.head(10))

         weighted average recommendation score  movieId
movieId                                                
1                                     3.825368        1
2                                     3.271355        2
3                                     2.955946        3
4                                     3.000000        4
5                                     2.335105        5
6                                     3.666986        6
7                                     3.330920        7
8                                     3.000000        8
9                                     1.500000        9
10                                    2.821204       10


In [43]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
print(recommendation_df)

         weighted average recommendation score  movieId
movieId                                                
89118                                      5.0    89118
56251                                      5.0    56251
611                                        5.0      611
43376                                      5.0    43376
5075                                       5.0     5075
...                                        ...      ...
116897                                     NaN   116897
117533                                     NaN   117533
161127                                     NaN   161127
165101                                     NaN   165101
175303                                     NaN   175303

[5811 rows x 2 columns]


In [44]:
recommended_movie=movies_df.loc[movies_df['movieId'].isin(recommendation_df['movieId'])]

#we don't want to recommend the same movie
recommended_movie=recommended_movie.loc[~recommended_movie.movieId.isin(userSubset['movieId'])]

print(recommended_movie)

      movieId                                      title  \
0           1                           Toy Story (1995)   
1           2                             Jumanji (1995)   
2           3                    Grumpier Old Men (1995)   
3           4                   Waiting to Exhale (1995)   
4           5         Father of the Bride Part II (1995)   
...       ...                                        ...   
9656   180985                The Greatest Showman (2017)   
9660   181315                      Phantom Thread (2017)   
9679   183301        The Tale of the Bunny Picnic (1986)   
9695   184791  Fred Armisen: Standup for Drummers (2018)   
9710   187595             Solo: A Star Wars Story (2018)   

                                           genres  
0     Adventure|Animation|Children|Comedy|Fantasy  
1                      Adventure|Children|Fantasy  
2                                  Comedy|Romance  
3                            Comedy|Drama|Romance  
4                  