# Collaborative Filtering based recommendation system

In [1]:
#Dataframe manipulation library
import pandas as pd
#Math functions, we'll only need the sqrt function so let's import only that
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#Folder that holds movies.csv and ratings.csv; change this single value to point at your own copy of the data
DATA_DIR = 'J:/sets'
#Storing the movie information into a pandas dataframe
movies_df = pd.read_csv(DATA_DIR + '/movies.csv')
#Storing the user information into a pandas dataframe
ratings_df = pd.read_csv(DATA_DIR + '/ratings.csv')

In [3]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
#Using a regular expression to capture the four-digit year stored between parentheses.
#Requiring the parentheses keeps us from matching titles that merely contain a year.
#Raw strings are used so the backslashes are not read as (invalid) Python escape sequences.
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)', expand=False)
#Removing the parenthesised year from the 'title' column.
#regex=True is explicit because newer pandas treats the pattern as a literal string by default,
#which would silently leave the year in every title.
movies_df['title'] = movies_df['title'].str.replace(r'\(\d{4}\)', '', regex=True)
#Stripping the whitespace left at either end of the title; .str.strip() also tolerates missing titles
movies_df['title'] = movies_df['title'].str.strip()

In [5]:
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [6]:
#Dropping the genres column.
#The column is named by keyword because newer pandas no longer accepts the axis as a second positional argument.
movies_df = movies_df.drop(columns='genres')

In [7]:
movies_df.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [8]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [9]:
#Dropping the timestamp column.
#The column is named by keyword because newer pandas no longer accepts the axis as a second positional argument.
ratings_df = ratings_df.drop(columns='timestamp')

In [10]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


## Collaborative Filtering


In [11]:
#Movies rated by the input user, written as (title, rating) pairs
ratedTitles = [
    ('Breakfast Club, The', 5),
    ('Toy Story', 3.5),
    ('Jumanji', 2),
    ("Pulp Fiction", 5),
    ('Akira', 4.5),
    ('Heat', 3.0),
    ('American President, The', 3.5),
]
#Expanding the pairs into one record per movie, then loading the records into a dataframe
userInput = [{'title': movieTitle, 'rating': movieRating} for movieTitle, movieRating in ratedTitles]
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,rating,title
0,5.0,"Breakfast Club, The"
1,3.5,Toy Story
2,2.0,Jumanji
3,5.0,Pulp Fiction
4,4.5,Akira
5,3.0,Heat
6,3.5,"American President, The"


### Add the movieId to the input user's movies

In [12]:
#Keeping only the catalogue rows whose title matches one of the input titles
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
#Then merging it so we can get the movieId. No key is given, so pandas joins on the
#columns the two frames share, which here is 'title'.
inputMovies = pd.merge(inputId, inputMovies)
#Dropping information we won't use from the input dataframe.
#The column is named by keyword because newer pandas no longer accepts the axis as a second positional argument.
inputMovies = inputMovies.drop(columns='year')
#Final input dataframe
#If a movie you added above isn't here, then it might not be in the original
#dataframe or it might be spelled differently, please check capitalisation.
#NOTE(review): titles are not unique in the catalogue, so a single input title can match
#several movieIds and the one rating is then copied onto each of them - confirm this is intended.
inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,6,Heat,3.0
3,73608,Heat,3.0
4,131274,Heat,3.0
5,11,"American President, The",3.5
6,296,Pulp Fiction,5.0
7,1274,Akira,4.5
8,1968,"Breakfast Club, The",5.0


### Users who have seen the same movies

In [13]:
#Keeping only the ratings that were given to movies the input user has also rated
inputMovieIds = inputMovies['movieId'].tolist()
ratedByInput = ratings_df['movieId'].isin(inputMovieIds)
userSubset = ratings_df[ratedByInput]
userSubset.head()

Unnamed: 0,userId,movieId,rating
19,4,296,4.0
441,12,1968,3.0
479,13,2,2.0
531,13,1274,5.0
632,14,6,3.0


In [14]:
#Groupby creates one sub dataframe per distinct value of the column given as the parameter.
#The key is passed as a plain string, not a one-element list: with a list, newer pandas
#yields 1-tuples such as (1130,) as the group names when iterating, and those tuples would
#end up wherever the group name is later used as a user id.
userSubsetGroup = userSubset.groupby('userId')

**Let's look at one of the users, e.g. the one with userId=1130**

In [15]:
userSubsetGroup.get_group(1130)

Unnamed: 0,userId,movieId,rating
104167,1130,1,0.5
104168,1130,2,4.0
104169,1130,6,3.5
104214,1130,296,4.0
104363,1130,1274,4.5
104443,1130,1968,4.5


In [16]:
def sharedMovieCount(userGroup):
    """Return how many ratings the (userId, ratings dataframe) pair holds."""
    return len(userGroup[1])

#Ordering the groups so that users with the most movies in common with the input come first
userSubsetGroup = sorted(userSubsetGroup, key=sharedMovieCount, reverse=True)

In [17]:
userSubsetGroup[0:3]

[(133624,           userId  movieId  rating
  12360335  133624        1     2.5
  12360336  133624        2     2.5
  12360339  133624        6     3.5
  12360342  133624       11     2.5
  12360473  133624      296     3.5
  12360825  133624     1274     4.0
  12361041  133624     1968     3.5
  12362444  133624    73608     3.5),
 (135877,           userId  movieId  rating
  12569790  135877        1     3.0
  12569791  135877        2     3.0
  12569792  135877        6     3.5
  12569794  135877       11     3.0
  12569844  135877      296     4.0
  12570147  135877     1274     3.5
  12570299  135877     1968     4.0
  12573863  135877    73608     2.5),
 (165000,           userId  movieId  rating
  15282821  165000        1     3.5
  15282822  165000        2     2.5
  15282823  165000        6     3.5
  15282826  165000       11     2.5
  15282871  165000      296     3.5
  15283037  165000     1274     3.5
  15283140  165000     1968     3.5
  15283653  165000    73608     3.0)

In [18]:
#Keeping only the first 100 entries of the user list
userSubsetGroup = userSubsetGroup[:100]

In [19]:
def pearson_correlation(xs, ys):
    """Return the Pearson correlation coefficient of two paired rating sequences.

    Parameters
    ----------
    xs, ys : sequence of numbers
        Paired observations; xs[i] and ys[i] must describe the same movie.

    Returns
    -------
    float
        The coefficient, or 0.0 when it is undefined (fewer than two pairs,
        or at least one of the sequences has no variance).

    Raises
    ------
    ValueError
        If the two sequences do not have the same length.
    """
    x = np.asarray(xs, dtype=float)
    y = np.asarray(ys, dtype=float)
    if x.shape != y.shape:
        raise ValueError('rating sequences must have the same length')
    if x.size < 2:
        return 0.0
    #Mean-centred sums of squares can never be negative, so the square root below is always defined
    dx = x - x.mean()
    dy = y - y.mean()
    denominator = sqrt(float(np.dot(dx, dx)) * float(np.dot(dy, dy)))
    #A zero denominator means one side rated everything the same: treat it as no correlation
    if denominator == 0:
        return 0.0
    return float(np.dot(dx, dy)) / denominator


#Sorting the input once, outside the loop (it does not change between users)
inputMovies = inputMovies.sort_values(by='movieId')
inputRatings = inputMovies[['movieId', 'rating']]

#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:
    #Pairing the two users' ratings explicitly by movieId, so each pair is guaranteed to
    #refer to the same movie instead of relying on two lists happening to line up
    paired = group.merge(inputRatings, on='movieId', suffixes=('_user', '_input'))
    #x is the input user, y is the current user
    pearsonCorrelationDict[name] = pearson_correlation(paired['rating_input'], paired['rating_user'])

In [20]:
pearsonCorrelationDict.items()

dict_items([(133624, 0.5566003248010438), (135877, 0.7652993938020273), (165000, 0.6485438263538637), (2065, 0.15504341823650913), (3186, 0.7146943350099456), (4099, -0.015314097295791779), (4208, -0.047178209756025366), (4282, -0.37210420376762865), (4818, 0.36899177027969143), (5104, 0.7991568119073308), (6482, -0.14303239192265596), (7403, 0.1834498464263357), (9994, 0.4302652729749452), (11769, 0.5253137831729979), (11827, 0.4586246160658392), (12325, 0.9508255313768973), (12921, 0.5742786485922085), (14551, 0.48121773951145436), (14984, 0.7198740330653568), (15157, 0.7904952493291819), (15466, 0.6705366440588023), (15670, 0.4134615384615381), (17666, 0.5399055247990172), (17897, 0.7343031958475685), (17944, 0.318729711138673), (19208, 0.6358670639975957), (21242, 0.7126829645199957), (23297, 0.7015859538778817), (23600, 0.17541160386140509), (26155, 0.2830692585361492), (26516, 0.3072549338995147), (27131, -0.04003203845127055), (27719, 0.45790546988962705), (29218, 0.647184621628

In [21]:
#One row per user: the dictionary keys become the index, the coefficients the only column
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
#Copying the user ids out of the index into their own column, then renumbering the rows from 0
pearsonDF = pearsonDF.assign(userId=pearsonDF.index).reset_index(drop=True)
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,0.5566,133624
1,0.765299,135877
2,0.648544,165000
3,0.155043,2065
4,0.714694,3186


In [22]:
#Ranking the users from most to least similar and keeping the first 50
rankedUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False)
topUsers = rankedUsers.head(50)
topUsers.head()

Unnamed: 0,similarityIndex,userId
56,0.95984,40527
15,0.950826,12325
40,0.884615,33797
9,0.799157,5104
19,0.790495,15157


In [23]:
#Attaching every rating made by the selected users (inner join on the shared userId column)
topUsersRating = pd.merge(topUsers, ratings_df, on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,0.95984,40527,1,3.5
1,0.95984,40527,2,2.5
2,0.95984,40527,3,2.5
3,0.95984,40527,6,3.0
4,0.95984,40527,11,3.0


In [24]:
#Weighting each rating by the similarity of the user who gave it
similarityWeights = topUsersRating['similarityIndex']
topUsersRating['weightedRating'] = similarityWeights.mul(topUsersRating['rating'])
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,0.95984,40527,1,3.5,3.359441
1,0.95984,40527,2,2.5,2.399601
2,0.95984,40527,3,2.5,2.399601
3,0.95984,40527,6,3.0,2.879521
4,0.95984,40527,11,3.0,2.879521


In [25]:
#Summing the similarity weights and the weighted ratings per movie (the grouping key is movieId).
#The two needed columns are selected before aggregating, so userId and rating are not summed needlessly.
tempTopUsersRating = topUsersRating.groupby('movieId')[['similarityIndex','weightedRating']].sum()
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,32.366424,121.577876
2,32.366424,79.857134
3,17.953874,46.454164
4,4.573025,8.822548
5,17.280468,38.885711


In [26]:
#Weighted average per movie: summed weighted ratings divided by the summed similarity weights
weightedAverage = tempTopUsersRating['sum_weightedRating'].div(tempTopUsersRating['sum_similarityIndex'])
#Turning the scores into a dataframe (still indexed by movieId) and exposing the id as a column too
recommendation_df = weightedAverage.to_frame(name='weighted average recommendation score')
recommendation_df['movieId'] = recommendation_df.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.756296,1
2,2.467283,2
3,2.587417,3
4,1.929258,4
5,2.25027,5


In [27]:
#Putting the highest-scored movies first
scoreColumn = 'weighted average recommendation score'
recommendation_df = recommendation_df.sort_values(by=scoreColumn, ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
4468,5.0,4468
127021,5.0,127021
6144,5.0,6144
128366,5.0,128366
134583,5.0,134583
128320,5.0,128320
128091,5.0,128091
39481,5.0,39481
79311,5.0,79311
27423,5.0,27423


In [28]:
#Looking up the catalogue rows of the ten movies at the top of recommendation_df
topMovieIds = recommendation_df['movieId'].head(10).tolist()
movies_df.loc[movies_df['movieId'].isin(topMovieIds)]

Unnamed: 0,movieId,title,year
4374,4468,Apartment Zero,1988
6046,6144,"Traviata, La",1982
9328,27423,"O Auto da Compadecida (Dog's Will, A)",2000
10538,39481,Will Success Spoil Rock Hunter?,1957
15604,79311,Hamster Factor and Other Tales of Twelve Monke...,1997
27366,127021,Rewind This!,2013
27602,128091,Craig Ferguson: A Wee Bit o' Revolution,2009
27654,128320,Monty Python: Almost the Truth - Lawyers Cut,2009
27662,128366,Patton Oswalt: Tragedy Plus Comedy Equals Time,2014
29398,134583,Misery Loves Comedy,2015
