In [79]:
import pandas as pd
from math import sqrt
import numpy as np

In [98]:
df = pd.read_csv("HorrorSVMCookies.csv")
df = df[['Title', 'Rating']]
df
ratingdf = pd.read_csv("userRating.csv")
ratingdf

Unnamed: 0,userId,bookId,Rating
0,1,1,4.0
1,1,1041,3.0
2,1,591,4.5
3,4,1041,1.0
4,4,591,4.5
...,...,...,...
872,610,1,5.0
873,610,1041,5.0
874,610,70,5.0
875,610,82,4.0


In [81]:
df['bookId'] = df.groupby(['Title','Rating']).ngroup()
df

Unnamed: 0,Title,Rating,bookId
0,The Shining (Paperback),4.26,1041
1,It (Paperback),4.25,418
2,Dracula (Paperback),4.01,244
3,Carrie (Mass Market Paperback),3.98,134
4,Pet Sematary (Kindle Edition),4.04,591
...,...,...,...
1243,The Other Black Girl (Hardcover),3.40,997
1244,"Blood Countess (Lady Slayers, #1)",3.47,105
1245,"The Cuckoo Clock of Doom (Goosebumps, #28)",3.64,797
1246,Färjan (Hardcover),3.45,312


In [82]:
userInput = [{'Title': 'Pet Sematary (Kindle Edition)','Rating' : 5.00},
            {'Title': 'The Turn of the Screw (Paperback)','Rating' : 3.00},
            {'Title': 'The Mist (Paperback)','Rating' : 4.00},
            {'Title': 'Through the Woods (Hardcover)','Rating' : 4.00},
            {'Title': 'Hell House (Hardcover)','Rating' : 5.00},]
inputBooks = pd.DataFrame(userInput)
inputBooks

Unnamed: 0,Title,Rating
0,Pet Sematary (Kindle Edition),5.0
1,The Turn of the Screw (Paperback),3.0
2,The Mist (Paperback),4.0
3,Through the Woods (Hardcover),4.0
4,Hell House (Hardcover),5.0


In [83]:
inputId = df[df['Title'].isin(inputBooks['Title'].tolist())]
inputId

Unnamed: 0,Title,Rating,bookId
4,Pet Sematary (Kindle Edition),4.04,591
42,The Turn of the Screw (Paperback),3.4,1084
60,The Mist (Paperback),3.95,971
70,Through the Woods (Hardcover),3.94,1153
72,Hell House (Hardcover),3.77,357


In [84]:
userSubset = ratingdf[ratingdf['bookId'].isin(inputId['bookId'].tolist())]
print(userSubset.groupby('bookId').count())

        userId  Rating
bookId                
357          7       7
591         23      23
971          3       3
1084        15      15
1153        16      16


In [85]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId'])

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])

In [86]:
#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[(19,     userId  bookId  Rating
25      19     591     2.0
26      19     591     3.0), (42,     userId  bookId  Rating
55      42     591     4.0
56      42     591     5.0), (57,     userId  bookId  Rating
76      57     591     4.0
77      57    1084     5.0), (63,     userId  bookId  Rating
83      63     591     4.0
84      63    1084     3.0), (1,    userId  bookId  Rating
2       1     591     4.5)]


In [87]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='bookId')
    inputId = inputId.sort_values(by='bookId')

    #Get the N for the formula
    nRatings = len(group)

    #Get the review scores for the movies that they both have in common
    temp_df = inputId[inputId['bookId'].isin(group['bookId'].tolist())]

    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['Rating'].tolist()
   
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['Rating'].tolist()
   
    
    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0
    

In [88]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex  userId
0             -1.0      19
1             -1.0      42
2             -1.0      57
3              1.0      63
4              0.0       1


In [96]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head(100))

    similarityIndex  userId
3               1.0      63
30              0.0     314
45              0.0     415
33              0.0     328
34              0.0     331
35              0.0     346
36              0.0     352
37              0.0     362
38              0.0     363
39              0.0     372
40              0.0     399
41              0.0     400
42              0.0     404
43              0.0     405
44              0.0     410
46              0.0     418
31              0.0     317
47              0.0     426
48              0.0     437
49              0.0     447
50              0.0     448
51              0.0     453
52              0.0     458
53              0.0     474
54              0.0     476
55              0.0     606
56              0.0     607
57              0.0     608
58              0.0     610
32              0.0     322
59              0.0    1997
16              0.0      66
29              0.0     292
5               0.0       4
6               0.0 

In [90]:
topUsersRating=topUsers.merge(ratingdf, left_on='userId', right_on='userId', how='inner')
print(topUsersRating.head(100))

    similarityIndex  userId  bookId  Rating
0               1.0      63       1     5.0
1               1.0      63     296     5.0
2               1.0      63     591     4.0
3               1.0      63    1084     3.0
4               0.0     314       1     3.0
..              ...     ...     ...     ...
95              0.0      18    1041     4.0
96              0.0      18     591     4.0
97              0.0      21       1     3.0
98              0.0      21       2     3.0
99              0.0      21    1041     3.5

[100 rows x 4 columns]


In [91]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['Rating']
print(topUsersRating.head())

   similarityIndex  userId  bookId  Rating  weightedRating
0              1.0      63       1     5.0             5.0
1              1.0      63     296     5.0             5.0
2              1.0      63     591     4.0             4.0
3              1.0      63    1084     3.0             3.0
4              0.0     314       1     3.0             0.0


In [92]:
#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('bookId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

        sum_similarityIndex  sum_weightedRating
bookId                                         
1                       1.0                 5.0
2                       0.0                 0.0
70                      0.0                 0.0
78                      0.0                 0.0
82                      0.0                 0.0


In [93]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['bookId'] = tempTopUsersRating.index
print(recommendation_df.head(10))

        weighted average recommendation score  bookId
bookId                                               
1                                         5.0       1
2                                         NaN       2
70                                        NaN      70
78                                        NaN      78
82                                        NaN      82
91                                        NaN      91
110                                       NaN     110
120                                       NaN     120
296                                       5.0     296
357                                       NaN     357


In [94]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df

Unnamed: 0_level_0,weighted average recommendation score,bookId
bookId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,5.0,1
296,5.0,296
591,4.0,591
1084,3.0,1084
2,,2
70,,70
78,,78
82,,82
91,,91
110,,110


In [95]:
recommendedBook=df.loc[df['bookId'].isin(recommendation_df['bookId'])]

#we don't want to recommend the same movie
recommendedBook=recommendedBook.loc[~recommendedBook.bookId.isin(userSubset['bookId'])]

print(recommendedBook)

                                                  Title  Rating  bookId
0                               The Shining (Paperback)    4.26    1041
7                              'Salem's Lot (Paperback)    4.05       1
190                                11/22/63 (Hardcover)    4.32       2
602                              Black Hole (Hardcover)    3.84      91
950                                   Breed (Hardcover)    3.16     120
985                                 Autumn (Autumn, #1)    3.67      70
1040                 Blue World (Mass Market Paperback)    3.96     110
1107                             Bec (The Demonata, #4)    4.17      82
1238                  Fool Moon (The Dresden Files, #2)    3.99     296
1243                   The Other Black Girl (Hardcover)    3.40     997
1247  Batman: Arkham Asylum - A Serious House on Ser...    4.10      78
