In [1]:
import pandas as pd
import numpy as np
import scipy

In [2]:
# NOTE: these absolute Windows paths only resolve on the original author's machine;
# point them at a local copy of the BX-CSV-Dump files before running.
datafile= r'C:\Users\Hp\Desktop\scool\BX-CSV-Dump\BX-Book-Ratings.csv'
# The dump is ';'-separated and latin-1 encoded. on_bad_lines='warn' skips malformed
# rows with a warning; it replaces error_bad_lines=False, which was deprecated in
# pandas 1.3 and removed in pandas 2.0.
data=pd.read_csv(datafile, sep=';', encoding='latin-1', on_bad_lines='warn')
data.columns=["User","ISBN","Rating"]
bookfile= r'C:\Users\Hp\Desktop\scool\BX-CSV-Dump\BX-Books.csv'
# Read only the first three columns, rename them ISBN/Title/Author (header=0 discards
# the file's own header row) and index the frame by ISBN so books can be looked up by ISBN.
books=pd.read_csv(bookfile, sep=';',header=0, usecols=[0,1,2], encoding='latin-1',index_col=0, on_bad_lines='warn',names=["ISBN","Title","Author"])

In [3]:
# Only the last expression of a cell is rendered, so the books.head() result is
# discarded and just the ratings preview (data.head()) is shown below.
books.head()
data.head()

Unnamed: 0,User,ISBN,Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [4]:
def bookMeta(isbn):
    """Look up a book's metadata by ISBN.

    Reads the notebook-level ``books`` frame (indexed by ISBN) and returns a
    ``(title, author)`` tuple. An ISBN that is not in ``books`` makes the
    lookup raise ``KeyError``.
    """
    return tuple(books.at[isbn, field] for field in ("Title", "Author"))

bookMeta('0060973129') #TEST CALL OF FUNCTION

('Decision in Normandy', "Carlo D'Este")

In [5]:
def favbooks(user,N):
    """Return the ``N`` highest-rated books of ``user``.

    Parameters
    ----------
    user : user id as stored in the ``User`` column of the ratings frame ``data``.
    N : maximum number of rows to return.

    Returns
    -------
    DataFrame with the user's top ``N`` rating rows (``User``, ``ISBN``,
    ``Rating``) sorted by rating in descending order, plus a ``title`` column
    holding the ``(title, author)`` tuple produced by ``bookMeta``.
    """
    userRatings = data[data["User"] == user]  # ratings made by this user only
    # .copy() detaches the top-N slice from the sorted frame so that adding a
    # column below does not trigger pandas' SettingWithCopyWarning.
    sortedRatings = userRatings.sort_values("Rating", ascending=False).iloc[:N].copy()
    sortedRatings["title"] = sortedRatings["ISBN"].apply(bookMeta)
    return sortedRatings

In [6]:
# Drop ratings whose ISBN has no row in `books`, so every remaining ISBN can be
# resolved by bookMeta().
data = data.loc[data["ISBN"].isin(books.index)]

In [7]:
favbooks(204622,5) #TEST CALL OF FUNCTION

Unnamed: 0,User,ISBN,Rating,title
844955,204622,0967560500,10,"(Natural Hormonal Enhancement, Rob Faigin)"
844935,204622,0671027360,10,"(Angels &amp; Demons, Dan Brown)"
844926,204622,0385504209,10,"(The Da Vinci Code, Dan Brown)"
844958,204622,097173660X,9,"(Life After School Explained, Cap &amp; Compass)"
844920,204622,0060935464,9,"(To Kill a Mockingbird, Harper Lee)"


In [8]:
# Number of ratings per ISBN: one entry per distinct ISBN, most-rated first.
UsersPerISBN = data["ISBN"].value_counts()
UsersPerISBN.head(10)
# Displayed value of the cell: its length is the number of distinct ISBNs rated.
UsersPerISBN.shape

(270170,)

In [9]:
# Number of ratings per user: one entry per distinct user, most active first.
ISBNPerUser = data["User"].value_counts()
# Displayed value of the cell: its length is the number of distinct users.
ISBNPerUser.shape

(92107,)

In [10]:
# Keep only books rated by more than 10 users ...
data = data.loc[data["ISBN"].isin(UsersPerISBN[UsersPerISBN.gt(10)].index)]
# ... and only users who rated more than 10 books. Both thresholds use the counts
# taken in the earlier cells, i.e. before either of these two filters was applied.
data = data.loc[data["User"].isin(ISBNPerUser[ISBNPerUser.gt(10)].index)]

In [11]:
# Sparse user x book matrix: one row per user, one column per ISBN, cell = rating
# (NaN where the user has not rated the book).
UserItemRatingMatrix = data.pivot_table(values='Rating', index=['User'], columns=['ISBN'])

In [12]:
UserItemRatingMatrix.head()

ISBN,0002005018,0002251760,0002259834,0002558122,0006480764,000648302X,0006485200,000649840X,000651202X,0006512062,...,8845906884,8845915611,8878188212,8885989403,9074336329,9074336469,950491036X,9681500830,9681500954,9871138016
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,5.0,,,,,,,,,,...,,,,,,,,,,
99,,,,,,,,,,,...,,,,,,,,,,
242,,,,,,,,,,,...,,,,,,,,,,
243,,,,,,,,,,,...,,,,,,,,,,
254,,,,,,,,,,,...,,,,,,,,,,


Calculating the distance between two users in N-dimensional space (note: the code below uses the Hamming distance rather than the Euclidean distance):

In [13]:
# Two example users to compare.
user1 = 204622
user2 = 255489
# A user's row of the matrix: that user's rating for every ISBN (NaN where unrated).
user1Ratings = UserItemRatingMatrix.loc[user1]
user1Ratings.head()

ISBN
0002005018   NaN
0002251760   NaN
0002259834   NaN
0002558122   NaN
0006480764   NaN
Name: 204622, dtype: float64

In [14]:
# Same per-ISBN rating vector for the second example user.
user2Ratings = UserItemRatingMatrix.loc[user2]
user2Ratings.head()

ISBN
0002005018   NaN
0002251760   NaN
0002259834   NaN
0002558122   NaN
0006480764   NaN
Name: 255489, dtype: float64

In [15]:
from scipy.spatial.distance import hamming
# Hamming distance is the fraction of positions at which the two vectors differ.
# NaN never compares equal to NaN, so books that neither user rated count as
# differences as well -- presumably why the value below is so close to 1: only
# books both users rated identically lower the distance.
hamming(user1Ratings,user2Ratings)

0.9999352792699502

In [16]:
#FUNCTION TO CALCULATE HAMMING DISTANCE BTW 2 USERS
def distance(user1,user2):
    """Return the Hamming distance between the rating vectors of two users.

    Each user's vector is their row of ``UserItemRatingMatrix`` (one entry per
    ISBN, NaN where unrated). The result is the fraction of ISBNs on which the
    two vectors differ; since NaN never equals NaN, books neither user rated
    count as differing, so only identically rated books reduce the distance.

    Returns ``np.nan`` when a user cannot be looked up or the vectors cannot
    be compared. Other errors (for example ``hamming`` not having been
    imported yet, or a keyboard interrupt during a long run) now propagate
    instead of being silently turned into NaN.
    """
    try:
        # Select each user's row directly; this yields the same Series as
        # transposing the whole matrix and picking a column, without the copy.
        user1Ratings = UserItemRatingMatrix.loc[user1]
        user2Ratings = UserItemRatingMatrix.loc[user2]
        return hamming(user1Ratings, user2Ratings)
    except (KeyError, TypeError, ValueError):
        # np.nan is the spelling that survives NumPy 2.0 (np.NaN was removed).
        return np.nan
        

In [17]:
distance(204622,255489) #test run of distance function

0.9999352792699502

In [22]:
# FUNCTION TO FIND K nearest neighbors:
def nearestNeighbors(user,K=10):
    """Return the ``K`` users closest to ``user`` by ``distance``.

    Parameters
    ----------
    user : id of the active user (a label of ``UserItemRatingMatrix.index``).
    K : number of neighbours to return (default 10).

    Returns
    -------
    Series named ``User`` holding the neighbours' user ids, ordered from the
    smallest distance upwards; its index is each neighbour's position in the
    matrix's user index. Users whose distance is NaN sort last.
    """
    allUsers = pd.DataFrame(UserItemRatingMatrix.index)  # one row per user in the matrix
    # Exclude the active user. .copy() makes the filtered frame independent so
    # the column assignment below does not raise SettingWithCopyWarning.
    allUsers = allUsers[allUsers.User != user].copy()
    # Distance from the active user to every other user.
    allUsers["distance"] = allUsers["User"].apply(lambda other: distance(user, other))
    # Smallest distance first; keep the first K user ids.
    Knearestusers = allUsers.sort_values(["distance"], ascending=True)["User"].iloc[:K]
    return Knearestusers
    

In [23]:
knearestusers= nearestNeighbors(204622) #TEST RUN OF KNN Function
knearestusers

3201     82893
3368     87555
2624     68555
1813     48046
5401    140036
7584    198711
565      16795
8866    232131
239       7346
9693    251422
Name: User, dtype: int64

In [46]:
# FIND THE TOP "N" RECOMMENDATIONS FOR THE ACTIVE USER:
def topN(user,N=3,K=10):
    """Recommend up to ``N`` books for ``user`` from their nearest neighbours.

    Parameters
    ----------
    user : id of the active user (a label of ``UserItemRatingMatrix.index``).
    N : maximum number of recommendations to return (default 3).
    K : number of nearest neighbours whose ratings are averaged (default 10,
        the neighbourhood size that was previously fixed).

    Returns
    -------
    Series of ``(title, author)`` tuples for the books with the highest mean
    rating among the neighbours, excluding books the user has already rated.

    Raises
    ------
    KeyError if ``user`` is not a row of ``UserItemRatingMatrix``.
    """
    knearestusers = nearestNeighbors(user, K)
    # Rating rows of the neighbours only.
    NNRatings = UserItemRatingMatrix[UserItemRatingMatrix.index.isin(knearestusers)]
    # Per-book mean over the neighbours, ignoring NaN. Books that no neighbour
    # rated come out as NaN and are dropped. DataFrame.mean gives the same values
    # as applying np.nanmean column by column, without the "Mean of empty slice"
    # RuntimeWarning for all-NaN columns.
    avgRatings = NNRatings.mean().dropna()
    # ISBNs the active user has already rated (non-NaN entries of their row).
    booksAlreadyRead = UserItemRatingMatrix.loc[user].dropna().index
    avgRatings = avgRatings[~avgRatings.index.isin(booksAlreadyRead)]
    topNISBNs = avgRatings.sort_values(ascending=False).index[:N]
    return pd.Series(topNISBNs).apply(bookMeta)


In [47]:
favbooks(204813,10) #favorite books of the active user

Unnamed: 0,User,ISBN,Rating,title
845417,204813,399149848,10,"(Birthright, Nora Roberts)"
845407,204813,385504209,10,"(The Da Vinci Code, Dan Brown)"
845382,204813,373218036,10,"(Truly, Madly Manhattan, Nora Roberts)"
845359,204813,142001805,10,"(The Eyre Affair: A Novel, Jasper Fforde)"
845431,204813,446527793,10,"(The Guardian, Nicholas Sparks)"
845416,204813,399149392,10,"(Chesapeake Blue (Quinn Brothers (Hardcover)),..."
845432,204813,446531332,9,"(Nights in Rodanthe, Nicholas Sparks)"
845434,204813,446606243,9,"(The Tenth Justice, Brad Meltzer)"
845451,204813,671027360,9,"(Angels &amp; Demons, Dan Brown)"
845433,204813,446532452,9,"(The Wedding, Nicholas Sparks)"


In [48]:
topN(204813,10) #TOP 10 recommendation of books for the active user

0    (Waiting For Nick (Silhouette Special Edition)...
1           (Wringer (Trophy Newbery), Jerry Spinelli)
2    (The Star Wars Trilogy: Star Wars, the Empire ...
3          (One, Two, Buckle My Shoe, Agatha Christie)
4                          (On the Road, Jack Kerouac)
5                 (Dead Poets Society, N.H. Kleinbaum)
6     (Go Ask Alice (Avon/Flare Book), James Jennings)
7                        (Carolina Moon, Nora Roberts)
8    (Illusions: The Adventures of a Reluctant Mess...
9    (You Just Don't Duct Tape a Baby!: True Tales ...
Name: ISBN, dtype: object