# Collaborative filtering movie recommendation

In [1]:
import numpy as np
import pandas as pd
from math import sqrt
import matplotlib.pyplot as plt
%matplotlib inline

In [9]:
pd.__version__

'1.5.3'

In [4]:
movie = pd.read_csv('dataset/movies.csv')
rating= pd.read_csv('dataset/ratings.csv')
movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
movie['year'] = movie.title.str.extract('(\\d\d\d\d\))',
expand=False)
#Removing the parentheses
movie['year'] = movie.year.str.extract('(\d\d\d\d)',expand=False)
#Removing the years from the 'title' column
movie['title'] = movie.title.str.replace('(\(\d\d\d\d\))', '')
#Applying the strip function to get rid of any ending whitespace characters that may have appeared
movie['title'] = movie['title'].apply(lambda x: x.strip())

  movie['title'] = movie.title.str.replace('(\(\d\d\d\d\))', '')


In [6]:
movie

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,Action|Animation|Comedy|Fantasy,2017
9738,193583,No Game No Life: Zero,Animation|Comedy|Fantasy,2017
9739,193585,Flint,Drama,2017
9740,193587,Bungo Stray Dogs: Dead Apple,Action|Animation,2018


In [7]:
# Collaborative filtering doesn't recommend based on the features of the movie. The recommendation is based on the likes and dislikes or ratings of the neighbours or other users. So we will drop the genre column, since there is no use of it.

movie.drop(columns=['genres'], inplace=True)

In [8]:
movie.head()


Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [10]:
rating.drop(columns=['timestamp'],inplace=True)

rating.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [11]:
# Our target User with its ratings
user = [
            {'title':'Breakfast Club, The', 'rating':4},
            {'title':'Toy Story', 'rating':2.5},
            {'title':'Jumanji', 'rating':3},
            {'title':"Pulp Fiction", 'rating':4.5},
            {'title':'Akira', 'rating':5}
         ] 
inputMovie = pd.DataFrame(user)
inputMovie

Unnamed: 0,title,rating
0,"Breakfast Club, The",4.0
1,Toy Story,2.5
2,Jumanji,3.0
3,Pulp Fiction,4.5
4,Akira,5.0


In [12]:
# Add movieId to input user

#Filtering out the movies by title
Id = movie[movie['title'].isin(inputMovie['title'].tolist())]
#Then merging it so we can get the movieId. It's implicitly merging it by title.
inputMovie = pd.merge(Id, inputMovie)
#Dropping information we won't use from the input dataframe
inputMovie = inputMovie.drop('year', 1)
inputMovie

  inputMovie = inputMovie.drop('year', 1)


Unnamed: 0,movieId,title,rating
0,1,Toy Story,2.5
1,2,Jumanji,3.0
2,296,Pulp Fiction,4.5
3,1274,Akira,5.0
4,1968,"Breakfast Club, The",4.0


In [13]:
#Filtering out users that have watched movies that the input has watched and storing it
users = rating[rating['movieId'].isin(inputMovie['movieId'].tolist())]
users.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
16,1,296,3.0
320,4,296,1.0
422,4,1968,4.0
516,5,1,4.0


In [15]:
users.shape

(784, 3)

In [16]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = users.groupby(['userId'])

In [24]:
#showing one such group example by getting all the users of a particular uderId
userSubsetGroup.get_group(1)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
16,1,296,3.0


In [25]:
#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)

  userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)


In [26]:
userSubsetGroup[0:3]


[(91,
         userId  movieId  rating
  14121      91        1     4.0
  14122      91        2     3.0
  14173      91      296     4.5
  14316      91     1274     5.0
  14383      91     1968     3.0),
 (177,
         userId  movieId  rating
  24900     177        1     5.0
  24901     177        2     3.5
  24930     177      296     5.0
  25069     177     1274     2.0
  25129     177     1968     3.5),
 (219,
         userId  movieId  rating
  31524     219        1     3.5
  31525     219        2     2.5
  31554     219      296     4.0
  31628     219     1274     2.5
  31680     219     1968     3.0)]

In [27]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:
    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovie = inputMovie.sort_values(by='movieId')
    #Get the N for the formula
    n = len(group)
    #Get the review scores for the movies that they both have in common
    temp = inputMovie[inputMovie['movieId'].isin(group['movieId'].tolist())]
    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp['rating'].tolist()
    #put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
    #Now let's calculate the pearson correlation between two users, so called, x and y
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(n)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(n)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(n)
    
    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorDict[name] = 0

In [28]:
pearsonCorDict.items()


dict_items([(91, 0.5796011559684829), (177, -0.5187513759338123), (219, -0.05547950410914763), (274, 0.48424799847909017), (298, 0.8329565184432136), (414, 0.48993185504860093), (474, -0.48993185504860093), (477, 0.835703992326648), (480, 0.9644856443408245), (483, 0.0), (599, 0.9007334537569819), (608, 0.8367179328930429), (50, 0.6172133998483676), (57, -0.9669875568304563), (68, -0.22360679774997896), (103, 0.7302967433402214), (135, 0.3651483716701107), (182, 0.9258200997725514), (202, 0.3651483716701107), (217, -0.31622776601683794), (226, 0.848528137423857), (288, 0.26519741765271837), (307, 0.5786913866044946), (318, 0.8783100656536799), (322, 0.5786913866044946), (330, 0.42799248836102016), (357, 0.0), (434, 0.7407610636824496), (448, 0.0), (469, 0.2672612419124244), (561, 0.7302967433402214), (600, 0.5329480400990121), (606, 0.8233293074216317), (610, 0.0), (18, 0.7205766921228924), (19, -0.9819805060619652), (21, 0), (45, 0.69337524528154), (63, -0.2773500981126157), (64, 0.24

In [29]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,0.579601,91
1,-0.518751,177
2,-0.05548,219
3,0.484248,274
4,0.832957,298


In [30]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
topUsers.head()

Unnamed: 0,similarityIndex,userId
114,1.0,62
170,1.0,325
103,1.0,15
146,1.0,195
168,1.0,290


In [31]:
topUsersRating=topUsers.merge(rating, left_on='userId', right_on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,1.0,62,2,4.0
1,1.0,62,6,4.5
2,1.0,62,47,4.5
3,1.0,62,110,4.5
4,1.0,62,260,4.5


In [34]:
print(topUsersRating.shape)
print(topUsers.shape)

(9089, 4)
(50, 2)


In [35]:
#Multiplies the similarity by the user's ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,1.0,62,2,4.0,4.0
1,1.0,62,6,4.5,4.5
2,1.0,62,47,4.5,4.5
3,1.0,62,110,4.5,4.5
4,1.0,62,260,4.5,4.5


In [37]:
topUsersRating.describe()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
count,9089.0,9089.0,9089.0,9089.0,9089.0
mean,1.0,295.530531,22744.678953,3.665365,3.665365
std,0.0,186.990214,39528.949729,0.9399,0.9399
min,1.0,5.0,1.0,0.5,0.5
25%,1.0,124.0,1120.0,3.0,3.0
50%,1.0,279.0,2791.0,4.0,4.0
75%,1.0,490.0,27773.0,4.0,4.0
max,1.0,609.0,188751.0,5.0,5.0


In [38]:
# Applies a sum to the topUsers after grouping it by userId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']

# Creates an empty dataframe
recommendation_df = pd.DataFrame()

# Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating'] / tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index

# Sorting recommendations
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)

# Displaying top recommendations
recommendation_df.head(10)

# Matching movie information
movie.loc[movie['movieId'].isin(recommendation_df.head(10)['movieId'].tolist())]


Unnamed: 0,movieId,title,year
211,247,Heavenly Creatures,1994
650,838,Emma,1996
973,1274,Akira,1988
1529,2064,Roger & Me,1989
1532,2067,Doctor Zhivago,1965
5399,9010,Love Me If You Dare (Jeux d'enfants),2003
5400,9018,Control Room,2004
6401,50804,Hannibal Rising,2007
6523,54004,I Now Pronounce You Chuck and Larry,2007
6527,54190,Across the Universe,2007
