In [34]:
import pandas as pd
from math import sqrt
import numpy as np

In [35]:
books_df = pd.read_csv('Books.csv')
ratings_df = pd.read_csv('Ratings.csv')
books_df.head()
ratings_df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


# Data Preprocessing

In [36]:
ratings_df = ratings_df.groupby('ISBN').filter(lambda x: len(x)>=100)
ratings_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
2,276727,0446520802,0
8,276744,038550120X,7
10,276746,0425115801,0
11,276746,0449006522,0
12,276746,0553561618,0


In [37]:
ratings_df = ratings_df[ratings_df.groupby('User-ID').ISBN.transform('nunique')>=10]
ratings_df

Unnamed: 0,User-ID,ISBN,Book-Rating
796,277042,0061097101,0
799,277042,0312983271,0
800,277042,0380731851,0
801,277042,0446605484,7
802,277042,0446611212,8
...,...,...,...
1149553,276680,0446670251,0
1149564,276680,0452283205,7
1149577,276680,0679731725,0
1149581,276680,0679781587,9


# Model Input

In [61]:
userInput = [{'Book-Title':'Mog\'s Christmas', 'Book-Rating':6},
             {'Book-Title':'Brave New World', 'Book-Rating':9},
             {'Book-Title':'Autumn Story Brambly Hedge', 'Book-Rating':1},
             {'Book-Title':'Life of Pi', 'Book-Rating':9.5},
             {'Book-Title':'THE COAL HOUSE T/PB', 'Book-Rating':7}]
inputBooks = pd.DataFrame(userInput)
print(inputBooks)

                   Book-Title  Book-Rating
0             Mog's Christmas          6.0
1             Brave New World          9.0
2  Autumn Story Brambly Hedge          1.0
3                  Life of Pi          9.5
4         THE COAL HOUSE T/PB          7.0


In [62]:
inputId = books_df[books_df['Book-Title'].isin(inputBooks['Book-Title'].tolist())]
inputBooks = pd.merge(inputId, inputBooks)
inputBooks = inputBooks.drop('Year-Of-Publication', 1) #we don't really need this at the moment
inputBooks = inputBooks[['ISBN','Book-Title','Book-Rating']]
print(inputBooks)

          ISBN                  Book-Title  Book-Rating
0   0151008116                  Life of Pi          9.5
1   0156027321                  Life of Pi          9.5
2   1565117794                  Life of Pi          9.5
3   184195425X                  Life of Pi          9.5
4   0060809833             Brave New World          9.0
5   0060929871             Brave New World          9.0
6   0001047973             Brave New World          9.0
7   0060830956             Brave New World          9.0
8   0582060168             Brave New World          9.0
9   0001848445         THE COAL HOUSE T/PB          7.0
10  0001837397  Autumn Story Brambly Hedge          1.0
11  0001010565             Mog's Christmas          6.0


  inputBooks = inputBooks.drop('Year-Of-Publication', 1) #we don't really need this at the moment


In [63]:
userSubset = ratings_df[ratings_df['ISBN'].isin(inputBooks['ISBN'].tolist())]
print(userSubset.groupby('ISBN').count())

            User-ID  Book-Rating
ISBN                            
0060929871       63           63
0151008116       89           89
0156027321      240          240


In [64]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['User-ID'])

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])
    

#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[(89602,         User-ID        ISBN  Book-Rating
373179    89602  0060929871            0
373240    89602  0151008116            0
373246    89602  0156027321            0), (4017,        User-ID        ISBN  Book-Rating
20763     4017  0060929871            9
20784     4017  0156027321            0), (11676,        User-ID        ISBN  Book-Rating
46638    11676  0151008116            6
46678    11676  0156027321            6), (23768,         User-ID        ISBN  Book-Rating
101262    23768  0060929871            8
101377    23768  0156027321            0), (36606,         User-ID        ISBN  Book-Rating
165095    36606  0060929871            0
165206    36606  0156027321            0)]


In [65]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='ISBN')
    inputBooks = inputBooks.sort_values(by='ISBN')

    #Get the N for the formula
    nRatings = len(group)

    #Get the review scores for the movies that they both have in common
    temp_df = inputBooks[inputBooks['ISBN'].isin(group['ISBN'].tolist())]

    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['Book-Rating'].tolist()
   
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['Book-Rating'].tolist()
   
    
    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [66]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['User-ID'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex  User-ID
0              0.0    89602
1             -1.0     4017
2              0.0    11676
3             -1.0    23768
4              0.0    36606


In [67]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())

    similarityIndex  User-ID
8               1.0    48046
10              1.0    69971
5               1.0    39467
12              1.0    80538
7               1.0    46398


In [68]:
topUsersRating=topUsers.merge(ratings_df, left_on='User-ID', right_on='User-ID', how='inner')
print(topUsersRating.head(100))

    similarityIndex  User-ID        ISBN  Book-Rating
0               1.0    48046  0060199652           10
1               1.0    48046  0060391626           10
2               1.0    48046  0060915544            0
3               1.0    48046  0060922532            0
4               1.0    48046  0060929871            0
..              ...      ...         ...          ...
95              1.0    69971  0525947647            9
96              1.0    69971  0553208845            0
97              1.0    69971  0553273914            0
98              1.0    69971  0553582747            0
99              1.0    69971  0671021001            0

[100 rows x 4 columns]


In [69]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['Book-Rating']
print(topUsersRating.head())

   similarityIndex  User-ID        ISBN  Book-Rating  weightedRating
0              1.0    48046  0060199652           10            10.0
1              1.0    48046  0060391626           10            10.0
2              1.0    48046  0060915544            0             0.0
3              1.0    48046  0060922532            0             0.0
4              1.0    48046  0060929871            0             0.0


In [70]:
#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('ISBN').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

            sum_similarityIndex  sum_weightedRating
ISBN                                               
002542730X                  1.0                 7.0
0060008032                  0.0                 0.0
0060096195                  0.0                 0.0
006016848X                  1.0                 0.0
0060173289                  0.0                 0.0


In [71]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['ISBN'] = tempTopUsersRating.index
print(recommendation_df.head(10))

            weighted average recommendation score        ISBN
ISBN                                                         
002542730X                                    7.0  002542730X
0060008032                                    NaN  0060008032
0060096195                                    NaN  0060096195
006016848X                                    0.0  006016848X
0060173289                                    NaN  0060173289
0060175400                                    NaN  0060175400
006019491X                                    NaN  006019491X
0060199652                                   10.0  0060199652
0060391626                                   10.0  0060391626
0060392452                                    NaN  0060392452


In [72]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
print(recommendation_df)

            weighted average recommendation score        ISBN
ISBN                                                         
0451524934                                   10.0  0451524934
080410753X                                   10.0  080410753X
0345413350                                   10.0  0345413350
0312243022                                   10.0  0312243022
076790592X                                   10.0  076790592X
...                                           ...         ...
1558744150                                    NaN  1558744150
1559029838                                    NaN  1559029838
1573227331                                    NaN  1573227331
1573229571                                    NaN  1573229571
1592400876                                    NaN  1592400876

[704 rows x 2 columns]


In [74]:
recommended_movie=books_df.loc[books_df['ISBN'].isin(recommendation_df['ISBN'])]

#we don't want to recommend the same movie
recommended_movie=recommended_movie.loc[~recommended_movie.ISBN.isin(userSubset['ISBN'])]

recommended_movie.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
18,440234743,The Testament,John Grisham,1999,Dell,http://images.amazon.com/images/P/0440234743.0...,http://images.amazon.com/images/P/0440234743.0...,http://images.amazon.com/images/P/0440234743.0...
19,452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,1994,Plume,http://images.amazon.com/images/P/0452264464.0...,http://images.amazon.com/images/P/0452264464.0...,http://images.amazon.com/images/P/0452264464.0...
26,971880107,Wild Animus,Rich Shapero,2004,Too Far,http://images.amazon.com/images/P/0971880107.0...,http://images.amazon.com/images/P/0971880107.0...,http://images.amazon.com/images/P/0971880107.0...
27,345402871,Airframe,Michael Crichton,1997,Ballantine Books,http://images.amazon.com/images/P/0345402871.0...,http://images.amazon.com/images/P/0345402871.0...,http://images.amazon.com/images/P/0345402871.0...
28,345417623,Timeline,MICHAEL CRICHTON,2000,Ballantine Books,http://images.amazon.com/images/P/0345417623.0...,http://images.amazon.com/images/P/0345417623.0...,http://images.amazon.com/images/P/0345417623.0...
