# Book Recommender System using collaborative system

In [1]:
# Import the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# load and read data
book = pd.read_csv('Books.csv',low_memory = False)
rating = pd.read_csv('Ratings.csv')
users = pd.read_csv('Users.csv')

In [3]:
book.head(2)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...


## The shape of each dataset:

In [4]:
print('The shape of tthe Book dateset is',book.shape)
print('The shape of tthe Rating dateset is',rating.shape)
print('The shape of tthe Users dateset is',users.shape)

The shape of tthe Book dateset is (271360, 8)
The shape of tthe Rating dateset is (1149780, 3)
The shape of tthe Users dateset is (278858, 3)


## Analyzing The Data 

In [5]:
# droping Unnecessary columns 
book = book.drop(columns=['Image-URL-S','Image-URL-M'])
book.head(2)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...


In [6]:
#check for null value
book.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
Image-URL-L            3
dtype: int64

In [7]:
users.isnull().sum()

User-ID          0
Location         0
Age         110762
dtype: int64

In [8]:
rating.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [9]:
rating['User-ID'].value_counts()

11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
          ...  
116180        1
116166        1
116154        1
116137        1
276723        1
Name: User-ID, Length: 105283, dtype: int64

In [10]:
#filtering the rating  
r = rating['User-ID'].value_counts()>200

In [11]:
r = r[r].index

In [12]:
rating = rating[rating['User-ID'].isin(r)]

In [13]:
rating.shape

(526356, 3)

In [14]:
# merging book and rating datesets
df = pd.merge(rating,book,on = 'ISBN')

In [15]:
df.head(2)

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...


In [16]:
numberOfrating = df.groupby('Book-Title')['Book-Rating'].count().reset_index()

In [17]:
numberOfrating = numberOfrating.rename(columns={'Book-Rating':'number-rating'})

In [18]:
numberOfrating.head(2)

Unnamed: 0,Book-Title,number-rating
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1


In [19]:
# merging df and numberOfrating datesets
df1 = pd.merge(numberOfrating,df,on = 'Book-Title')
df1.head(2)

Unnamed: 0,Book-Title,number-rating,User-ID,ISBN,Book-Rating,Book-Author,Year-Of-Publication,Publisher,Image-URL-L
0,A Light in the Storm: The Civil War Diary of ...,2,35859,590567330,0,Karen Hesse,1999,Hyperion Books for Children,http://images.amazon.com/images/P/0590567330.0...
1,A Light in the Storm: The Civil War Diary of ...,2,96448,590567330,9,Karen Hesse,1999,Hyperion Books for Children,http://images.amazon.com/images/P/0590567330.0...


In [20]:
df1 = df1[df1['number-rating']>=50]
df1.head(2)

Unnamed: 0,Book-Title,number-rating,User-ID,ISBN,Book-Rating,Book-Author,Year-Of-Publication,Publisher,Image-URL-L
1073,1984,76,254,451524934,9,George Orwell,1990,Signet Book,http://images.amazon.com/images/P/0451524934.0...
1074,1984,76,11676,451524934,0,George Orwell,1990,Signet Book,http://images.amazon.com/images/P/0451524934.0...


In [21]:
df1.shape

(61853, 9)

In [22]:
df1[df1.duplicated(['User-ID','Book-Title'])]
 

Unnamed: 0,Book-Title,number-rating,User-ID,ISBN,Book-Rating,Book-Author,Year-Of-Publication,Publisher,Image-URL-L
1126,1984,76,11676,0451519841,0,George Orwell,1980,New Amer Library,http://images.amazon.com/images/P/0451519841.0...
1127,1984,76,12538,0451519841,0,George Orwell,1980,New Amer Library,http://images.amazon.com/images/P/0451519841.0...
1131,1984,76,87555,0451519841,0,George Orwell,1980,New Amer Library,http://images.amazon.com/images/P/0451519841.0...
1142,1984,76,11676,0452262933,10,George Orwell,1983,Plume Books,http://images.amazon.com/images/P/0452262933.0...
1332,1st to Die: A Novel,162,11676,0316666009,8,James Patterson,2001,Little Brown and Company,http://images.amazon.com/images/P/0316666009.0...
...,...,...,...,...,...,...,...,...,...
487410,"\O\"" Is for Outlaw""",105,36606,0805059555,5,Sue Grafton,1999,Henry Holt &amp; Company,http://images.amazon.com/images/P/0805059555.0...
487411,"\O\"" Is for Outlaw""",105,55492,0805059555,0,Sue Grafton,1999,Henry Holt &amp; Company,http://images.amazon.com/images/P/0805059555.0...
487419,"\O\"" Is for Outlaw""",105,155147,0805059555,0,Sue Grafton,1999,Henry Holt &amp; Company,http://images.amazon.com/images/P/0805059555.0...
487421,"\O\"" Is for Outlaw""",105,158295,0805059555,6,Sue Grafton,1999,Henry Holt &amp; Company,http://images.amazon.com/images/P/0805059555.0...


In [23]:
df1 = df1.drop_duplicates(subset=['User-ID','Book-Title'],keep='first')

In [24]:
df1.shape

(59850, 9)

In [25]:
df_piovt = df1.pivot_table(columns= 'User-ID',index = 'Book-Title',values = 'Book-Rating')

In [26]:
df_piovt.head(4)

User-ID,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,,,,,,,,,,...,,,,,,0.0,,,,
1st to Die: A Novel,,,,,,,,,,,...,,,,,,,,,,
2nd Chance,,10.0,,,,,,,,,...,,,,0.0,,,,,0.0,
4 Blondes,,,,,,,,,,0.0,...,,,,,,,,,,


In [27]:
df_piovt.fillna(0,inplace=True)
df_piovt.head(2)

User-ID,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
from scipy.sparse import csr_matrix
df_sparse = csr_matrix(df_piovt)

In [29]:
from sklearn.neighbors import NearestNeighbors

In [30]:
nner = NearestNeighbors(algorithm='brute')
nner.fit(df_sparse)

In [31]:
#nner.kneighbors(df_piovt.iloc[105, :])

In [32]:
distances, suggestions = nner.kneighbors(df_piovt.iloc[105, :].values.reshape(1, -1),n_neighbors=6)

In [33]:
for i in range(len(suggestions)):
    print(df_piovt.index[suggestions[i]])

Index(['Catering to Nobody', 'Exclusive', 'The First Counsel',
       'Jacob Have I Loved', 'No Safe Place', 'The Cradle Will Fall'],
      dtype='object', name='Book-Title')


In [34]:
distances, suggestions = nner.kneighbors(df_piovt.iloc[254, :].values.reshape(1, -1),n_neighbors=6)

In [35]:
for i in range(len(suggestions)):
    print(df_piovt.index[suggestions[i]])

Index(['High Fidelity', 'About a Boy', 'Pleading Guilty', 'No Safe Place',
       'Long After Midnight', 'Exclusive'],
      dtype='object', name='Book-Title')


In [36]:
df_piovt.index[4]

'84 Charing Cross Road'

In [38]:
import pickle

In [39]:
df1_pIk = pickle.load(open('df1.pIk','rb'))

In [42]:
type(
 df1_pIk)

pandas.core.frame.DataFrame

In [43]:
pd.__version__

'1.5.3'

## The website

In [42]:
books_name = df_piovt.index

In [43]:
books_name

Index(['1984', '1st to Die: A Novel', '2nd Chance', '4 Blondes',
       '84 Charing Cross Road', 'A Bend in the Road', 'A Case of Need',
       'A Child Called \It\": One Child's Courage to Survive"',
       'A Civil Action', 'A Cry In The Night',
       ...
       'Winter Solstice', 'Wish You Well', 'Without Remorse',
       'Wizard and Glass (The Dark Tower, Book 4)', 'Wuthering Heights',
       'Year of Wonders', 'You Belong To Me',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"'],
      dtype='object', name='Book-Title', length=742)

In [44]:
import pickle
pickle.dump(nner,open('bookweb/nner.pIk','wb'))
pickle.dump(books_name,open('bookweb/books_name.pIk','wb'))
pickle.dump(df1,open('bookweb/df1.pIk','wb'))
pickle.dump(df_piovt,open('bookweb/df_piovt.pIk','wb'))

In [45]:
def recommend(book_name):
    book_id = np.where(df_piovt.index == book_name)[0][0]
    distances, suggestions = nner.kneighbors(df_piovt.iloc[book_id, :].values.reshape(1, -1),n_neighbors=6)
    
    for i in range(len(suggestions)):
        books = df_piovt.index[suggestions[i]]
        for b in books:
            print(b)
            
            

In [46]:
#testing 
recommend('1st to Die: A Novel')

1st to Die: A Novel
Exclusive
The Cradle Will Fall
The Clinic (Alex Delaware Novels (Paperback))
No Safe Place
The Sum of All Fears (Jack Ryan Novels)


#  method for a Item-based collaborative system

In [47]:
df_piovt.head(4)

User-ID,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
# Get the average rating for each user 
avg_ratings = df_piovt.mean(axis=1)

# Center each users ratings around 0
user_ratings_table_centered = df_piovt.sub(avg_ratings, axis=0)

# Fill in the missing data with 0s
user_ratings_table_normed = user_ratings_table_centered.fillna(0)

In [56]:
user_ratings_table_normed

User-ID,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,8.717342,-0.282658,-0.282658,-0.282658,-0.282658,-0.282658,-0.282658,-0.282658,-0.282658,-0.282658,...,-0.282658,-0.282658,-0.282658,-0.282658,-0.282658,-0.282658,-0.282658,-0.282658,-0.282658,-0.282658
1st to Die: A Novel,-0.420045,-0.420045,-0.420045,-0.420045,-0.420045,-0.420045,-0.420045,-0.420045,-0.420045,-0.420045,...,-0.420045,-0.420045,-0.420045,-0.420045,-0.420045,-0.420045,-0.420045,-0.420045,-0.420045,-0.420045
2nd Chance,-0.309685,9.690315,-0.309685,-0.309685,-0.309685,-0.309685,-0.309685,-0.309685,-0.309685,-0.309685,...,-0.309685,-0.309685,-0.309685,-0.309685,-0.309685,-0.309685,-0.309685,-0.309685,-0.309685,-0.309685
4 Blondes,-0.077703,-0.077703,-0.077703,-0.077703,-0.077703,-0.077703,-0.077703,-0.077703,-0.077703,-0.077703,...,-0.077703,-0.077703,-0.077703,-0.077703,-0.077703,-0.077703,-0.077703,-0.077703,-0.077703,-0.077703
84 Charing Cross Road,-0.262387,-0.262387,-0.262387,-0.262387,-0.262387,-0.262387,-0.262387,-0.262387,-0.262387,-0.262387,...,-0.262387,-0.262387,-0.262387,-0.262387,-0.262387,9.737613,-0.262387,-0.262387,-0.262387,-0.262387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,-0.213964,-0.213964,-0.213964,6.786036,-0.213964,-0.213964,-0.213964,-0.213964,6.786036,-0.213964,...,-0.213964,-0.213964,-0.213964,-0.213964,-0.213964,-0.213964,-0.213964,-0.213964,-0.213964,-0.213964
You Belong To Me,-0.095721,-0.095721,-0.095721,-0.095721,-0.095721,-0.095721,-0.095721,-0.095721,-0.095721,-0.095721,...,-0.095721,-0.095721,-0.095721,-0.095721,-0.095721,-0.095721,-0.095721,-0.095721,-0.095721,-0.095721
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,-0.134009,-0.134009,-0.134009,-0.134009,-0.134009,-0.134009,-0.134009,-0.134009,-0.134009,-0.134009,...,-0.134009,-0.134009,-0.134009,-0.134009,-0.134009,-0.134009,-0.134009,-0.134009,-0.134009,-0.134009
Zoya,-0.072072,-0.072072,-0.072072,-0.072072,-0.072072,-0.072072,-0.072072,-0.072072,-0.072072,-0.072072,...,-0.072072,-0.072072,-0.072072,-0.072072,-0.072072,-0.072072,-0.072072,-0.072072,-0.072072,-0.072072


In [50]:
from sklearn.metrics.pairwise import cosine_similarity

In [58]:
# finding the similaritie between books dy useing cosinesimilarity thta give us a value btween 1 and -1 1 is very similer and -1 no similarity
similarities = cosine_similarity(user_ratings_table_normed)

cosine_similarity_df = pd.DataFrame(similarities,index=user_ratings_table_normed.index,columns=user_ratings_table_normed.index)

cosine_similarity_df.head()


Book-Title,1984,1st to Die: A Novel,2nd Chance,4 Blondes,84 Charing Cross Road,A Bend in the Road,A Case of Need,"A Child Called \It\"": One Child's Courage to Survive""",A Civil Action,A Cry In The Night,...,Winter Solstice,Wish You Well,Without Remorse,"Wizard and Glass (The Dark Tower, Book 4)",Wuthering Heights,Year of Wonders,You Belong To Me,Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,Zoya,"\O\"" Is for Outlaw"""
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,1.0,0.038007,-0.035126,-0.020824,0.044899,0.006933,-0.022563,0.045025,0.086834,-0.019518,...,0.008248,-0.024506,-0.024047,0.067889,0.029511,0.024434,-0.019691,0.072743,0.028496,0.004993
1st to Die: A Novel,0.038007,1.0,0.241489,-0.026868,0.000855,0.063111,0.075969,0.074141,0.113254,0.126466,...,0.029532,0.140364,0.139608,0.050246,0.005851,0.02575,0.166723,0.046608,0.146514,0.11927
2nd Chance,-0.035126,0.241489,1.0,-0.022744,0.003614,0.031713,0.08745,-0.031814,0.046535,0.140652,...,0.11351,0.163758,0.015812,0.119859,-0.026019,0.036388,0.151701,0.023032,0.034363,0.083888
4 Blondes,-0.020824,-0.026868,-0.022744,1.0,-0.01971,-0.019472,0.098608,-0.01886,-0.00872,-0.012638,...,-0.015357,-0.015868,-0.01557,-0.005291,0.093138,0.088556,0.064466,-0.01518,-0.01116,-0.018667
84 Charing Cross Road,0.044899,0.000855,0.003614,-0.01971,1.0,0.001014,0.106424,0.011346,-0.012746,-0.018474,...,0.011502,0.034879,0.031034,0.031595,-0.022549,-0.026829,0.031502,0.027558,0.051688,0.083717


In [59]:
cosine_similarity_series = cosine_similarity_df.loc['4 Blondes']
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)
print(ordered_similarities)


Book-Title
4 Blondes                                                           1.000000
The House of the Spirits                                            0.192057
Pleading Guilty                                                     0.173924
Seabiscuit                                                          0.168302
Bridget Jones: The Edge of Reason                                   0.153907
                                                                      ...   
Snow Falling on Cedars                                             -0.027091
Life of Pi                                                         -0.027859
Interview with the Vampire                                         -0.029733
Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))   -0.032462
Harry Potter and the Chamber of Secrets (Book 2)                   -0.034884
Name: 4 Blondes, Length: 742, dtype: float64


In [60]:
cosine_similarity_series = cosine_similarity_df.loc['84 Charing Cross Road']
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)
print(ordered_similarities)


Book-Title
84 Charing Cross Road           1.000000
Notes from a Small Island       0.231114
Under the Tuscan Sun            0.200310
Cold Mountain                   0.186368
ANGELA'S ASHES                  0.173669
                                  ...   
Suzanne's Diary for Nicholas   -0.035607
The Chamber                    -0.036262
Silence of the Lambs           -0.036872
The Partner                    -0.037630
The Pilot's Wife : A Novel     -0.038434
Name: 84 Charing Cross Road, Length: 742, dtype: float64


In [61]:
cosine_similarity_series = cosine_similarity_df.loc['Zoya']
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)
print(ordered_similarities)

Book-Title
Zoya                                                1.000000
Fine Things                                         0.657328
Secrets                                             0.537575
Kaleidoscope                                        0.499397
Tell Me Your Dreams                                 0.445788
                                                      ...   
Nickel and Dimed: On (Not) Getting By in America   -0.020583
The Joy Luck Club                                  -0.020648
Silence of the Lambs                               -0.020877
Divine Secrets of the Ya-Ya Sisterhood: A Novel    -0.023584
To Kill a Mockingbird                              -0.028438
Name: Zoya, Length: 742, dtype: float64
