In [1]:
import pandas as pd

In [2]:
dataFile='C:/Anaconda3/Notebooks/machine-learning-demo/Recommender/BX-Book-Ratings.csv'

In [3]:
# Load the ratings file. header=0 together with names= replaces the file's own
# header row with the short column names used in the rest of the notebook.
data = pd.read_csv(
    dataFile,
    sep=";",
    encoding='ISO-8859-1',
    header=0,
    names=["user", "isbn", "rating"],
)

In [4]:
data.head()


Unnamed: 0,user,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [5]:
bookFile='C:/Anaconda3/Notebooks/machine-learning-demo/Recommender/BX-Books.csv'
# Load only the first three columns of the books file (named isbn, title and
# author here) and index the resulting table by ISBN.
_bookReadArgs = dict(sep=";", header=0, usecols=[0,1,2], index_col=0,
                     names=['isbn',"title","author"], encoding='ISO-8859-1')
try:
    # pandas >= 1.3: malformed rows are reported and skipped. This is the
    # replacement for error_bad_lines=False, which pandas 2.0 removed.
    books = pd.read_csv(bookFile, on_bad_lines="warn", **_bookReadArgs)
except TypeError:
    # Older pandas rejects the unknown keyword; fall back to the original flag.
    books = pd.read_csv(bookFile, error_bad_lines=False, **_bookReadArgs)

In [6]:
books.head()

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
195153448,Classical Mythology,Mark P. O. Morford
2005018,Clara Callan,Richard Bruce Wright
60973129,Decision in Normandy,Carlo D'Este
374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
393045218,The Mummies of Urumchi,E. J. W. Barber


In [7]:
def bookMeta(isbn):
    """Look up a book's title and author in the global ``books`` table.

    Parameters
    ----------
    isbn : index label of the book in ``books`` (the table is indexed by ISBN).

    Returns
    -------
    tuple
        ``(title, author)`` for the given ISBN.

    Any error raised by ``books.at`` for an unknown label is propagated.
    """
    return tuple(books.at[isbn, field] for field in ("title", "author"))
bookMeta("0671027360")

('Angels &amp; Demons', 'Dan Brown')

In [8]:
# Keep only the ratings whose ISBN appears in the index of the books table.
data = data.loc[data["isbn"].isin(books.index)]

In [9]:
data.count()

user      1031175
isbn      1031175
rating    1031175
dtype: int64

In [10]:
def faveBooks(user,N):
    """Return the ``N`` highest-rated books of ``user`` with their metadata.

    Parameters
    ----------
    user : value matched against the ``user`` column of the global ``data``.
    N : int
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        The user's rows of ``data`` sorted by ``rating`` (highest first),
        truncated to ``N`` rows, plus a ``title`` column holding the
        ``(title, author)`` tuple returned by ``bookMeta`` for each ISBN.
    """
    userRatings = data[data["user"] == user]
    topRatings = userRatings.sort_values("rating", ascending=False).head(N)
    # assign() returns a new frame, so the extra column is never written into
    # a slice of ``data`` (the original in-place assignment on a slice is what
    # triggers pandas' SettingWithCopyWarning).
    return topRatings.assign(title=topRatings["isbn"].apply(bookMeta))

In [11]:
faveBooks(204622,5)

Unnamed: 0,user,isbn,rating,title
844955,204622,0967560500,10,"(Natural Hormonal Enhancement, Rob Faigin)"
844935,204622,0671027360,10,"(Angels &amp; Demons, Dan Brown)"
844926,204622,0385504209,10,"(The Da Vinci Code, Dan Brown)"
844958,204622,097173660X,9,"(Life After School Explained, Cap &amp; Compass)"
844920,204622,0060935464,9,"(To Kill a Mockingbird, Harper Lee)"


In [12]:
data.shape

(1031175, 3)

In [13]:
# Number of rating rows per ISBN, most frequently rated first.
usersPerISBN = data["isbn"].value_counts()
usersPerISBN.head()

0971880107    2502
0316666343    1295
0385504209     883
0060928336     732
0312195516     723
Name: isbn, dtype: int64

In [14]:
# Likewise, count the number of rating rows contributed by each user.
isbnsPerUser = data["user"].value_counts()
isbnsPerUser.head()

11676     11144
198711     6456
153662     5814
98391      5779
35859      5646
Name: user, dtype: int64

In [15]:
# Keep only ratings for ISBNs (books) that have more than 20 ratings according
# to usersPerISBN; less frequently rated books are removed.
data = data[data["isbn"].isin(usersPerISBN[usersPerISBN>20].index)]
data.shape

(377635, 3)

In [16]:
# Keep only ratings from users that have more than 20 ratings according to
# isbnsPerUser; less active users are removed.
# NOTE(review): isbnsPerUser was counted before the per-book filter, so the
# threshold applies to each user's original rating count - confirm intended.
data = data[data["user"].isin(isbnsPerUser[isbnsPerUser>20].index)]
data.shape

(275654, 3)

In [17]:
# Reshape the ratings into a user x ISBN matrix: one row per user, one column
# per ISBN, with the rating in each cell.
userItemRatingMatrix = data.pivot_table(
    values='rating',
    index=['user'],
    columns=['isbn'],
)

In [18]:
userItemRatingMatrix.head()

isbn,000649840X,0006547834,0006550576,0006550789,0007110928,0007141076,0007154615,0020198817,0020198906,0020199600,...,880781210X,8807813025,8817106100,8817106259,8817131628,8845205118,8845247414,884590184X,8885989403,950491036X
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
242,,,,,,,,,,,...,,,,,,,,,,
243,,,,,,,,,,,...,,,,,,,,,,
254,,,,,,,,,,,...,,,,,,,,,,
383,,,,,,,,,,,...,,,,,,,,,,
388,,,,,,,,,,,...,,,,,,,,,,


In [19]:
userItemRatingMatrix.shape

(6290, 6863)

In [20]:
user1 = 1435
user2 = 1424

In [21]:
# Ratings of user1 for every ISBN. .loc reads the user's row directly instead
# of transposing the whole matrix just to pick one column.
user1Ratings = userItemRatingMatrix.loc[user1]
user1Ratings.head()

isbn
000649840X   NaN
0006547834   NaN
0006550576   NaN
0006550789   NaN
0007110928   NaN
Name: 1435, dtype: float64

In [22]:
# Ratings of user2 for every ISBN, read as a row (no full-matrix transpose).
user2Ratings = userItemRatingMatrix.loc[user2]

In [23]:
from scipy.spatial.distance import hamming 
hamming(user1Ratings,user2Ratings)

1.0

In [24]:
import numpy as np
def distance(user1,user2):
    """Hamming distance between the rating rows of two users.

    Parameters
    ----------
    user1, user2 : row labels of the global ``userItemRatingMatrix``.

    Returns
    -------
    float
        ``scipy.spatial.distance.hamming`` of the two rating rows, or NaN
        when a user is not in the matrix or the rows cannot be compared.

    NOTE(review): cells that are NaN in both rows compare as different
    (NaN != NaN), so books neither user rated count as disagreements -
    confirm this is the intended similarity measure.
    """
    try:
        # Read each user's row directly; transposing the whole matrix on every
        # call is very costly when this runs once per candidate user.
        user1Ratings = userItemRatingMatrix.loc[user1]
        user2Ratings = userItemRatingMatrix.loc[user2]
        return hamming(user1Ratings, user2Ratings)
    except (KeyError, ValueError):
        # Unknown user or incomparable rows: keep the best-effort NaN result,
        # but no longer swallow KeyboardInterrupt and other unrelated errors.
        return np.nan

In [25]:
distance(1435,1424)

1.0

In [26]:
# Choose the active user, list every user id found in the index of the
# user-item matrix, and drop the active user from that list.
user = 1435
allUsers = pd.DataFrame(userItemRatingMatrix.index)
allUsers = allUsers.loc[allUsers["user"] != user]
allUsers.head()

Unnamed: 0,user
0,242
1,243
2,254
3,383
4,388


In [27]:
# Calculate the distance between the active user and every other user by
# applying distance() to each id in the user list, and store the result in
# the "distance" column.
allUsers["distance"] = allUsers["user"].apply(lambda other: distance(user, other))

In [28]:
#allUsers.sort_values(["distance"],ascending=True)
allUsers.head()

Unnamed: 0,user,distance
0,242,1.0
1,243,0.999854
2,254,0.999854
3,383,1.0
4,388,1.0


In [29]:
# The K nearest neighbours are the K users with the smallest distance:
# sort allUsers by distance (ascending) and keep the first K user ids.
K = 10
KnearestUsers = allUsers.sort_values("distance", ascending=True)["user"].head(K)

In [30]:
KnearestUsers

1675     76352
5987    265313
3908    175003
1246     56856
773      35859
1218     55492
209      11676
2558    115120
4431    198711
2291    102967
Name: user, dtype: int64

In [31]:
def nearestNeighbors(user,K=10):
    """Return the ids of the ``K`` users closest to ``user``.

    Parameters
    ----------
    user : id of the active user; it is excluded from the candidates.
    K : int, default 10
        Number of neighbours to return.

    Returns
    -------
    pandas.Series
        Up to ``K`` user ids taken from the index of the global
        ``userItemRatingMatrix``, ordered by ascending ``distance(user, id)``.
    """
    others = pd.DataFrame(userItemRatingMatrix.index)
    # Explicit copy: the column added below must not be written into a
    # filtered slice (that is what triggers SettingWithCopyWarning).
    others = others[others["user"] != user].copy()
    others["distance"] = others["user"].apply(lambda other: distance(user, other))
    return others.sort_values("distance", ascending=True)["user"].head(K)

In [32]:
KnearestUsers = nearestNeighbors(user)
KnearestUsers

1675     76352
5987    265313
3908    175003
1246     56856
773      35859
1218     55492
209      11676
2558    115120
4431    198711
2291    102967
Name: user, dtype: int64

In [33]:
KnearestUsers.shape

(10,)

In [34]:
# Rows of the rating matrix that belong to the K nearest neighbours.
NNRatings = userItemRatingMatrix.loc[userItemRatingMatrix.index.isin(KnearestUsers)]
NNRatings

isbn,000649840X,0006547834,0006550576,0006550789,0007110928,0007141076,0007154615,0020198817,0020198906,0020199600,...,880781210X,8807813025,8817106100,8817106259,8817131628,8845205118,8845247414,884590184X,8885989403,950491036X
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11676,8.0,5.0,,7.0,7.0,0.0,7.0,,,,...,,9.0,8.0,,,,,,0.0,
35859,,,,,,,,,,,...,,,,,,,,,,
55492,,,,,,,,,,,...,,,,,,,,,,
56856,,,,,,,,,,,...,,,,,,,,,,
76352,,,,,,,,,0.0,,...,,,,,,,,,,
102967,,,,,,,,,,,...,,,,,,,,,,
115120,,,,,,,,0.0,,,...,,,,,,,,,,
175003,,,,,,,,,,,...,,,,,,,,,,
198711,,,,,,,,,,,...,,,,,,,,,,
265313,,,,,,,,,,0.0,...,,,,,,,,,,


In [35]:
# Average rating per ISBN over the neighbours, ignoring missing ratings.
# DataFrame.mean() skips NaN by default and yields NaN for columns nobody
# rated, without the "Mean of empty slice" RuntimeWarning that np.nanmean
# emits for those columns; dropna() then removes them.
avgRating = NNRatings.mean().dropna()
avgRating.head()

  labels=labels)


isbn
000649840X    8.0
0006547834    5.0
0006550789    7.0
0007110928    7.0
0007141076    0.0
dtype: float64

In [36]:
# ISBNs the active user has rated: the non-NaN entries of the user's row.
# .loc reads the row directly instead of transposing the whole matrix.
booksAlreadyRead = userItemRatingMatrix.loc[user].dropna().index
booksAlreadyRead

Index(['0060915544', '0060922532', '0060977477', '0140092501', '0345338545',
       '0345370775', '0345378490', '0345380371', '0345387651', '034540288X',
       '0375400117', '0380718340', '0380728249', '0385240406', '038548951X',
       '0399133143', '0425133516', '0425161242', '0425169863', '0425180638',
       '0425182908', '0440150167', '044016205X', '0440211727', '044023722X',
       '0441172695', '0441172717', '0446611239', '0446672211', '0451176812',
       '0451181468', '0451183665', '0451195922', '0451208765', '0515127221',
       '0517149257', '0553258524', '0553265520', '0553568760', '0671024108',
       '0671024248', '0671028383', '0671042262', '0671537458', '068484267X',
       '0684848783', '0684855631', '0688177859', '0743222008', '0743437810',
       '080410753X', '0812590236', '0971880107', '155874262X', '1573221775'],
      dtype='object', name='isbn')

In [37]:
# Drop the books the active user has already rated from the candidates.
avgRating = avgRating.loc[~avgRating.index.isin(booksAlreadyRead)]

In [38]:
# ISBNs of the N candidates with the highest average neighbour rating.
N = 3
topNISBNs = avgRating.sort_values(ascending=False).head(N).index

In [39]:
pd.Series(topNISBNs).apply(bookMeta)

0              (Are You My Mother?, Philip D. Eastman)
1    (Total Recall: A V.I. Warshawski Novel, Sara P...
2    (Are You Somebody? : The Accidental Memoir of ...
Name: isbn, dtype: object

In [40]:
def topN(user,N=3):
    """Recommend up to ``N`` books for ``user`` from the nearest neighbours.

    Parameters
    ----------
    user : row label of the active user in the global ``userItemRatingMatrix``.
    N : int, default 3
        Maximum number of recommendations.

    Returns
    -------
    pandas.Series
        ``bookMeta`` results, i.e. ``(title, author)`` tuples, for the books
        with the highest average rating among the users returned by
        ``nearestNeighbors(user)``, excluding books ``user`` already rated.
    """
    KnearestUsers = nearestNeighbors(user)
    NNRatings = userItemRatingMatrix[userItemRatingMatrix.index.isin(KnearestUsers)]
    # mean() skips NaN and returns NaN for books no neighbour rated, without
    # the "Mean of empty slice" RuntimeWarning np.nanmean raises per column.
    avgRating = NNRatings.mean().dropna()
    # Read the user's row directly rather than transposing the full matrix.
    booksAlreadyRead = userItemRatingMatrix.loc[user].dropna().index
    candidates = avgRating[~avgRating.index.isin(booksAlreadyRead)]
    topNISBNs = candidates.sort_values(ascending=False).index[:N]
    return pd.Series(topNISBNs).apply(bookMeta)

In [41]:
faveBooks(user,10)

Unnamed: 0,user,isbn,rating,title
12866,1435,0060915544,10,"(The Bean Trees, Barbara Kingsolver)"
12887,1435,0380728249,10,"(Almost Adam, Petru Popescu)"
12975,1435,155874262X,10,(Chicken Soup for the Soul (Chicken Soup for t...
12957,1435,080410753X,10,"(The Kitchen God's Wife, Amy Tan)"
12924,1435,0517149257,10,"(The Ultimate Hitchhiker's Guide, Douglas Adams)"
12911,1435,0446611239,10,"(The Cabinet of Curiosities, Douglas Preston)"
12947,1435,0684855631,9,"(The Empty Chair, Jeffery Deaver)"
12912,1435,0446672211,8,(Where the Heart Is (Oprah's Book Club (Paperb...
12945,1435,068484267X,8,"(Angela's Ashes: A Memoir, Frank McCourt)"
12916,1435,0451183665,7,"(A Case of Need, Michael Crichton)"


In [42]:
topN(user, 10)

  labels=labels)


0              (Are You My Mother?, Philip D. Eastman)
1    (Total Recall: A V.I. Warshawski Novel, Sara P...
2    (Are You Somebody? : The Accidental Memoir of ...
3    (It Was on Fire When I Lay Down on It, Robert ...
4                          (Monkeewrench, P. J. Tracy)
5                  (The Giving Tree, Shel Silverstein)
6    (How Wal-Mart is Destroying America and The Wo...
7                           (Dirty Work, Stuart Woods)
8             (To Dance With the White Dog, Terry Kay)
9                (84 Charing Cross Road, Helene Hanff)
Name: isbn, dtype: object