### Dataset URL

http://www2.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip

# Workflow

<img src='../img/reco10.JPG'>



# Setting up data

In [43]:
import pandas as pd
# BX ratings file: semicolon-separated, Latin-1 encoded.
dataFile = '../datasets/BX-Book-Ratings.csv'
# header=0 consumes the file's own header row; `names` supplies the labels used in the rest of the notebook.
data = pd.read_csv(
    dataFile,
    sep=";",
    header=0,
    names=["user", "isbn", "rating"],
    encoding="latin1",
)

In [2]:
data.head()

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [3]:
bookFile = '../datasets/BX-Books.csv'
# Load only the first three columns, relabel them, and index the frame by ISBN so
# that bookMeta can look titles/authors up by ISBN.
# Malformed rows are skipped with a warning. `error_bad_lines` was deprecated in
# pandas 1.3 and removed in 2.0 in favour of `on_bad_lines`, so try the current
# keyword first and fall back to the legacy one on older pandas.
try:
    books = pd.read_csv(
        bookFile,
        sep=";",
        header=0,
        on_bad_lines="warn",
        usecols=[0, 1, 2],
        index_col=0,
        names=['isbn', "title", "author"],
        encoding="latin1",
    )
except TypeError:
    # pandas < 1.3 does not know `on_bad_lines`.
    books = pd.read_csv(
        bookFile,
        sep=";",
        header=0,
        error_bad_lines=False,
        usecols=[0, 1, 2],
        index_col=0,
        names=['isbn', "title", "author"],
        encoding="latin1",
    )

In [4]:
books.head()

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
195153448,Classical Mythology,Mark P. O. Morford
2005018,Clara Callan,Richard Bruce Wright
60973129,Decision in Normandy,Carlo D'Este
374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
393045218,The Mummies of Urumchi,E. J. W. Barber


In [5]:
def bookMeta(isbn):
    """Return the ``(title, author)`` pair recorded in ``books`` for ``isbn``.

    ``books`` is the module-level frame indexed by ISBN. The lookup uses
    ``.at``, so an ISBN that is not in that index raises ``KeyError``.
    """
    return tuple(books.at[isbn, field] for field in ("title", "author"))
bookMeta("0671027360")

('Angels &amp; Demons', 'Dan Brown')

In [6]:
# Drop ratings whose ISBN is absent from the books index, so every remaining ISBN can be looked up in `books`.
data = data.loc[data["isbn"].isin(books.index)]

In [7]:
def faveBooks(user, N):
    """Return the ``N`` highest-rated rows of ``data`` for ``user``.

    The result keeps the ``user``/``isbn``/``rating`` columns and adds a
    ``title`` column holding the ``(title, author)`` tuple that ``bookMeta``
    returns for each ISBN.
    """
    ratingsByUser = data[data["user"] == user]
    topRated = ratingsByUser.sort_values(["rating"], ascending=[0]).iloc[:N]
    # assign() builds a new frame rather than writing a column into the slice.
    return topRated.assign(title=topRated["isbn"].apply(bookMeta))

In [8]:
faveBooks(204622,5)

Unnamed: 0,user,isbn,rating,title
844955,204622,0967560500,10,"(Natural Hormonal Enhancement, Rob Faigin)"
844935,204622,0671027360,10,"(Angels &amp; Demons, Dan Brown)"
844926,204622,0385504209,10,"(The Da Vinci Code, Dan Brown)"
844958,204622,097173660X,9,"(Life After School Explained, Cap &amp; Compass)"
844920,204622,0060935464,9,"(To Kill a Mockingbird, Harper Lee)"


# Now proceeding to create rating matrix

In [9]:
data.shape

(1031175, 3)

In [10]:
# Number of rating rows per ISBN, most-rated first.
usersPerISBN = data["isbn"].value_counts()
usersPerISBN.head(10)

0971880107    2502
0316666343    1295
0385504209     883
0060928336     732
0312195516     723
044023722X     647
0142001740     615
067976402X     614
0671027360     586
0446672211     585
Name: isbn, dtype: int64

In [11]:
usersPerISBN.shape

(270170,)

In [12]:
# Number of rating rows per user, most-active first.
ISBNsPerUser = data["user"].value_counts()

In [13]:
ISBNsPerUser.shape

(92107,)

# Since rating matrix will be very sparse, subsetting the rating matrix to ISBNs read by more than 10 users

In [44]:
# Keep only ratings for ISBNs that have more than 10 rating rows.
data = data.loc[data["isbn"].isin(usersPerISBN.loc[usersPerISBN > 10].index)]

# Doing same at user side

In [45]:
# Keep only users with more than 10 rating rows. ISBNsPerUser was computed before the
# ISBN popularity filter, so the count includes ratings on ISBNs that filter removed.
data = data.loc[data["user"].isin(ISBNsPerUser.loc[ISBNsPerUser > 10].index)]

# Creating a Pivot table

In [46]:
# One row per user, one column per ISBN; cells with no rating are NaN.
# pivot_table's default aggregation (mean) applies if a (user, isbn) pair occurs more than once.
userItemRatingMatrix = data.pivot_table(
    values='rating',
    index=['user'],
    columns=['isbn'],
)

In [47]:
userItemRatingMatrix.head()

isbn,0002005018,0002251760,0002259834,0002558122,0006480764,000648302X,0006485200,000649840X,000651202X,0006512062,...,8845906884,8845915611,8878188212,8885989403,9074336329,9074336469,950491036X,9681500830,9681500954,9871138016
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,5.0,,,,,,,,,,...,,,,,,,,,,
99,,,,,,,,,,,...,,,,,,,,,,
242,,,,,,,,,,,...,,,,,,,,,,
243,,,,,,,,,,,...,,,,,,,,,,
254,,,,,,,,,,,...,,,,,,,,,,


# Before calling algo, checking results for a couple of users

In [49]:
userItemRatingMatrix.shape

(10706, 15451)

In [19]:
user1 = 204622
user2 = 255489

# Computing hamming distance

In [20]:
# Row `user1` of the user x ISBN matrix: one rating (or NaN) per ISBN.
user1Ratings = userItemRatingMatrix.loc[user1]
user1Ratings.head()

isbn
0002005018   NaN
0002251760   NaN
0002259834   NaN
0002558122   NaN
0006480764   NaN
Name: 204622, dtype: float64

In [50]:
# Row `user2` of the user x ISBN matrix: one rating (or NaN) per ISBN.
user2Ratings = userItemRatingMatrix.loc[user2]

In [51]:
from scipy.spatial.distance import hamming 
hamming(user1Ratings,user2Ratings)

0.9999352792699502

# This shows how to compute the Hamming distance between two users based on the ratings they have provided


# Converting that into a function

In [52]:
import numpy as np
def distance(user1, user2):
    """Return the Hamming distance between two users' rating rows.

    Each user's row of ``userItemRatingMatrix`` (one entry per ISBN, NaN where
    the user gave no rating) is compared position by position; the result is
    the fraction of ISBNs on which the two rows differ. Because NaN never
    compares equal to anything, an ISBN that either user has not rated counts
    as a difference.

    Parameters
    ----------
    user1, user2 : labels in ``userItemRatingMatrix.index``

    Returns
    -------
    float
        Distance in ``[0, 1]``, or NaN when either user is not in the matrix
        or the rows cannot be compared.
    """
    try:
        # .loc[user] is the same row the original obtained via transpose()[user],
        # without building the transposed frame on every call.
        ratings1 = userItemRatingMatrix.loc[user1]
        ratings2 = userItemRatingMatrix.loc[user2]
        return hamming(ratings1, ratings2)
    except (KeyError, ValueError):
        # KeyError: unknown user; ValueError: hamming rejected the inputs.
        # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
        return np.nan

In [53]:
distance(204622,10118)

0.9998705585399004

In [54]:
# Active user for the walk-through.
user = 204622
# One row per user in the rating matrix, excluding the active user.
allUsers = pd.DataFrame(userItemRatingMatrix.index)
allUsers = allUsers[allUsers["user"] != user]
allUsers.head()

Unnamed: 0,user
0,8
1,99
2,242
3,243
4,254


Now we can take the allUsers data frame and add a new column, which will be the distance between each of the users and the active user. 


To create this distance column, take the user column from allUsers, and simply apply the distance function that we had set up earlier. 


We are doing this by applying a lambda function to the user column. 


This will take each user and calculate that user's distance from the active user. 


Let's print out all users again, and see that a new column called distance has been added with the distance from each user to the active user. 

In [57]:
# Distance from the active user to every other user.
allUsers["distance"] = allUsers["user"].apply(lambda other: distance(user, other))

In [58]:
allUsers.head()

Unnamed: 0,user,distance
0,8,1.0
1,99,1.0
2,242,0.999935
3,243,0.999935
4,254,1.0




Now we can find the K nearest neighbors for the active user by simply sorting this data frame. 


We need to pick a value for K. Here we'll pick 10. 


So the code will pick the 10 nearest neighbors for the active user. 


Then we take the allUsers data frame and sort the values in ascending order of distance. 


Then pick the user column from the allUsers data frame and the top K users from that set. 


We can print out the KnearestUsers after running this to see the list of users which represent the K nearest neighbors for that active user.

In [59]:
# Neighbourhood size.
K = 10
# Users ordered by increasing distance; keep the ids of the first K.
KnearestUsers = allUsers.sort_values("distance")["user"].iloc[:K]

In [60]:
KnearestUsers

3201     82893
3368     87555
2624     68555
1813     48046
5401    140036
7584    198711
565      16795
8866    232131
239       7346
9693    251422
Name: user, dtype: int64

In [61]:
def nearestNeighbors(user, K=10):
    """Return the ids of the ``K`` users closest to ``user``.

    Every other user in ``userItemRatingMatrix.index`` is scored with
    ``distance(user, other)``; the users are sorted by ascending distance and
    the first ``K`` ids are returned as a Series named ``user``, labelled by
    each user's position in the matrix index.
    """
    others = pd.DataFrame(userItemRatingMatrix.index)
    others = others[others["user"] != user]
    scores = others["user"].apply(lambda other: distance(user, other))
    ranked = others.assign(distance=scores).sort_values("distance")
    return ranked["user"].iloc[:K]

In [64]:
KnearestUsers = nearestNeighbors(user)

In [65]:
KnearestUsers

3201     82893
3368     87555
2624     68555
1813     48046
5401    140036
7584    198711
565      16795
8866    232131
239       7346
9693    251422
Name: user, dtype: int64

# Get the ratings of nearest neighbors for all books

In [33]:
# Rows of the rating matrix that belong to the K nearest neighbours.
NNRatings = userItemRatingMatrix.loc[userItemRatingMatrix.index.isin(KnearestUsers)]
NNRatings

isbn,0002005018,0002251760,0002259834,0002558122,0006480764,000648302X,0006485200,000649840X,000651202X,0006512062,...,8845906884,8845915611,8878188212,8885989403,9074336329,9074336469,950491036X,9681500830,9681500954,9871138016
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7346,,,,,,,,,,,...,,,,,,,,,,
16795,,,,,,,,,,,...,,,,,,,,,,
48046,,,,,,,,,,,...,,,,,,,,,,
68555,,,,,,,,,,,...,,,,,,,,,,
82893,,,,,,,,,,,...,,,,,,,,,,
87555,,,,,,,,,,,...,,,,,,,,,,
140036,,,,,,,,,,,...,,,,,,,,,,
198711,,,,,,,,,,,...,,,,,,,,,,
232131,,,,,,,,,,,...,,,,,,,,,,
251422,,,,,,,,,,,...,,,,,,,,,,


# Averaging the rating for each book using a NaN-aware mean, which ignores missing ratings

In [34]:
# Per-ISBN mean of the neighbours' ratings. DataFrame.mean() skips NaN by default and
# returns NaN for an ISBN none of the neighbours rated, without the "Mean of empty slice"
# RuntimeWarning that np.nanmean emits for such columns; dropna() then removes those ISBNs.
avgRating = NNRatings.mean().dropna()
avgRating.head()

  labels=labels)


isbn
0007154615    1.5
0020125305    0.0
0020125607    0.0
0020198817    0.0
0020198906    8.0
dtype: float64

In [35]:
# ISBNs for which the active user already has a rating.
booksAlreadyRead = userItemRatingMatrix.loc[user].dropna().index
booksAlreadyRead

Index(['006016848X', '0060935464', '0140042598', '0140178724', '0142004278',
       '0380732238', '0385504209', '0425109720', '0425152898', '0440136482',
       '0440241162', '0451191145', '0451197127', '0553096060', '0671027360',
       '0671027387', '0671666258', '0688174574', '0743225708', '076790592X',
       '0785264280', '0786868716', '0802131867', '0802132952', '0971880107',
       '1853260045', '1853260126', '1853260207', '185326041X', '1878424114'],
      dtype='object', name='isbn')

# Remove the average ratings of the books already read by the user

In [67]:
# Keep only ISBNs the active user has not rated yet.
avgRating = avgRating.loc[~avgRating.index.isin(booksAlreadyRead)]

# Sorting the ratings in descending order and picking the top N

In [70]:
# Number of recommendations to keep.
N = 3
# ISBNs with the highest average neighbour rating come first.
rankedRatings = avgRating.sort_values(ascending=False)
topNISBNs = rankedRatings.index[:N]

In [71]:
pd.Series(topNISBNs).apply(bookMeta)

0              (Love, Greg &amp; Lauren, Greg Manning)
1    (The Two Towers (The Lord of the Rings, Part 2...
2    (Harry Potter and the Sorcerer's Stone (Book 1...
Name: isbn, dtype: object

# Creating a function which will do all this for us

### Get the ratings of the nearest neighbours for all books

### Find average rating of the nearest neighbours for all books

### Remove the average ratings of books the user has already rated (and has therefore already read)

### Finally, sort the ratings in descending order and pick the top N

### Return top N books

In [72]:
def topN(user, N=3, K=10):
    """Recommend up to ``N`` books for ``user`` from the ratings of similar users.

    Steps:
      1. find the ``K`` nearest neighbours of ``user``;
      2. average the neighbours' ratings per ISBN, ignoring missing ratings;
      3. drop ISBNs that ``user`` has already rated;
      4. return ``bookMeta`` for the ``N`` ISBNs with the highest average.

    Parameters
    ----------
    user : label in ``userItemRatingMatrix.index``
    N : int, default 3
        Maximum number of recommendations.
    K : int, default 10
        Neighbourhood size passed to ``nearestNeighbors``.

    Returns
    -------
    pandas.Series
        ``(title, author)`` tuples, best first.

    Raises
    ------
    KeyError
        If ``user`` is not a row of ``userItemRatingMatrix``.
    """
    neighbours = nearestNeighbors(user, K)
    neighbourRatings = userItemRatingMatrix[userItemRatingMatrix.index.isin(neighbours)]
    # DataFrame.mean() skips NaN and returns NaN for ISBNs no neighbour rated,
    # without the "Mean of empty slice" RuntimeWarning np.nanmean raises there.
    meanRatings = neighbourRatings.mean().dropna()
    # .loc[user] is the user's row; same values as transpose()[user].
    alreadyRated = userItemRatingMatrix.loc[user].dropna().index
    candidates = meanRatings[~meanRatings.index.isin(alreadyRated)]
    bestISBNs = candidates.sort_values(ascending=False).index[:N]
    return pd.Series(bestISBNs).apply(bookMeta)

In [73]:
faveBooks(204813,10)

Unnamed: 0,user,isbn,rating,title
845417,204813,399149848,10,"(Birthright, Nora Roberts)"
845407,204813,385504209,10,"(The Da Vinci Code, Dan Brown)"
845382,204813,373218036,10,"(Truly, Madly Manhattan, Nora Roberts)"
845359,204813,142001805,10,"(The Eyre Affair: A Novel, Jasper Fforde)"
845431,204813,446527793,10,"(The Guardian, Nicholas Sparks)"
845416,204813,399149392,10,"(Chesapeake Blue (Quinn Brothers (Hardcover)),..."
845432,204813,446531332,9,"(Nights in Rodanthe, Nicholas Sparks)"
845434,204813,446606243,9,"(The Tenth Justice, Brad Meltzer)"
845451,204813,671027360,9,"(Angels &amp; Demons, Dan Brown)"
845433,204813,446532452,9,"(The Wedding, Nicholas Sparks)"


In [74]:
topN(204813,10)

  labels=labels)


0    (Waiting For Nick (Silhouette Special Edition)...
1           (Wringer (Trophy Newbery), Jerry Spinelli)
2    (The Star Wars Trilogy: Star Wars, the Empire ...
3          (One, Two, Buckle My Shoe, Agatha Christie)
4                          (On the Road, Jack Kerouac)
5                 (Dead Poets Society, N.H. Kleinbaum)
6     (Go Ask Alice (Avon/Flare Book), James Jennings)
7                        (Carolina Moon, Nora Roberts)
8    (Illusions: The Adventures of a Reluctant Mess...
9    (You Just Don't Duct Tape a Baby!: True Tales ...
Name: isbn, dtype: object