In [3]:
import pandas as pd
import numpy as np

# NOTE(review): this is a machine-specific absolute path. Other cells in this
# notebook use paths relative to the notebook ("../data/...", "../Pickle files/...");
# presumably "../cleaned data/dataset.csv" is the portable equivalent — confirm
# against the repository layout before switching.
DATASET_PATH = "D:/M@hii/WSU/Sem 1/Data Science/Project - Book recommendation System/Code/Book-Recommendation-System/cleaned data/dataset.csv"

# Every column is read as text (dtype="unicode"), so identifiers such as ISBNs
# keep their leading zeros; numeric columns are converted explicitly below.
cleaned_data = pd.read_csv(DATASET_PATH, sep=",", dtype="unicode")

## Keep only explicit ratings: drop rows whose rating is 0.
## Values that cannot be parsed become NaN (errors='coerce'); NaN != 0 evaluates
## to True, so those rows must be excluded explicitly as well.
cleaned_data['book_rating'] = pd.to_numeric(cleaned_data['book_rating'], errors='coerce')
has_explicit_rating = cleaned_data['book_rating'].notna() & (cleaned_data['book_rating'] != 0)
cleaned_data = cleaned_data[has_explicit_rating].reset_index(drop=True)
cleaned_data.shape

(384074, 16)

In [11]:
# Preview the first rows of the filtered ratings frame.
cleaned_data.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,user_id,isbn,book_rating,User-ID,Age,City,State,Country
0,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,8,0002005018,5,8,35.0,timmins,ontario,canada
1,074322678X,Where You'll Find Me: And Other Stories,Ann Beattie,2002,Scribner,http://images.amazon.com/images/P/074322678X.0...,http://images.amazon.com/images/P/074322678X.0...,http://images.amazon.com/images/P/074322678X.0...,8,074322678X,5,8,35.0,timmins,ontario,canada
2,0887841740,The Middle Stories,Sheila Heti,2004,House of Anansi Press,http://images.amazon.com/images/P/0887841740.0...,http://images.amazon.com/images/P/0887841740.0...,http://images.amazon.com/images/P/0887841740.0...,8,0887841740,5,8,35.0,timmins,ontario,canada
3,1552041778,Jane Doe,R. J. Kaiser,1999,Mira Books,http://images.amazon.com/images/P/1552041778.0...,http://images.amazon.com/images/P/1552041778.0...,http://images.amazon.com/images/P/1552041778.0...,8,1552041778,5,8,35.0,timmins,ontario,canada
4,1567407781,The Witchfinder (Amos Walker Mystery Series),Loren D. Estleman,1998,Brilliance Audio - Trade,http://images.amazon.com/images/P/1567407781.0...,http://images.amazon.com/images/P/1567407781.0...,http://images.amazon.com/images/P/1567407781.0...,8,1567407781,6,8,35.0,timmins,ontario,canada


In [60]:
## Book recommendation using nearest neighbors

In [12]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# To run interactively, replace the two assignments below with:
#   bookName = input("Enter a book name: ")
#   number = int(input("Enter number of books to recommend: "))
bookName = "Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))"
number = 5

print(bookName)

# Minimum summed rating a title needs in order to be kept in the model.
popularity_threshold = 50

# Summed rating per title.
data = (cleaned_data.groupby(by=['Book-Title'])['book_rating']
        .sum()
        .reset_index()
        .rename(columns={'book_rating': 'Total-Rating'})
        [['Book-Title', 'Total-Rating']])

# Attach the per-title total to every rating row and keep the popular titles only.
result = pd.merge(data, cleaned_data, on='Book-Title')
result = result[result['Total-Rating'] >= popularity_threshold].reset_index(drop=True)

# Title x user matrix of ratings; a missing rating is encoded as 0.
matrix = result.pivot_table(index='Book-Title', columns='User-ID', values='book_rating').fillna(0)
up_matrix = csr_matrix(matrix)

model = NearestNeighbors(metric='cosine', algorithm='brute')
model.fit(up_matrix)

if bookName not in matrix.index:
    raise KeyError(f"{bookName!r} is not among the titles that pass the popularity threshold")

# Ask for one extra neighbour because the query title is normally returned too;
# never request more neighbours than there are titles in the matrix.
distances, indices = model.kneighbors(
    matrix.loc[bookName].values.reshape(1, -1),
    n_neighbors=min(number + 1, matrix.shape[0]),
)

# Exclude the query title by name rather than by position: when another title
# has an identical rating vector (distance 0), the query title is not
# guaranteed to be the first neighbour.
neighbour_titles = matrix.index[indices.flatten()]
recommended_titles = [title for title in neighbour_titles if title != bookName][:number]

print("\nRecommended books:\n")
for title in recommended_titles:
    print(title)

Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))

Recommended books:

Harry Potter and the Chamber of Secrets (Book 2)
Harry Potter and the Prisoner of Azkaban (Book 3)
Harry Potter and the Goblet of Fire (Book 4)
Harry Potter and the Order of the Phoenix (Book 5)
The Sorcerer's Companion: A Guide to the Magical World of Harry Potter


In [13]:
# Number of (non-null) ratings each title received, as a two-column frame:
# 'Book-Title' and 'num_ratings'.
number_ratings_df = (
    cleaned_data.groupby('Book-Title')['book_rating']
    .count()
    .rename('num_ratings')
    .reset_index()
)
number_ratings_df

Unnamed: 0,Book-Title,num_ratings
0,A Light in the Storm: The Civil War Diary of ...,1
1,"Ask Lily (Young Women of Faith: Lily Series, ...",1
2,Dark Justice,1
3,Earth Prayers From around the World: 365 Pray...,7
4,Final Fantasy Anthology: Official Strategy Gu...,2
...,...,...
135586,Ã?Â?bernachten mit Stil. Die schÃ?Â¶nsten Coun...,1
135587,Ã?Â?rger mit Produkt X. Roman.,3
135588,Ã?Â?sterlich leben.,1
135589,Ã?Â?stlich der Berge.,1


In [14]:
# Mean rating per title, as a two-column frame: 'Book-Title' and 'avg_ratings'.
avg_ratings_df = (
    cleaned_data.groupby('Book-Title')['book_rating']
    .mean()
    .rename('avg_ratings')
    .reset_index()
)
avg_ratings_df

Unnamed: 0,Book-Title,avg_ratings
0,A Light in the Storm: The Civil War Diary of ...,9.000000
1,"Ask Lily (Young Women of Faith: Lily Series, ...",8.000000
2,Dark Justice,10.000000
3,Earth Prayers From around the World: 365 Pray...,7.142857
4,Final Fantasy Anthology: Official Strategy Gu...,10.000000
...,...,...
135586,Ã?Â?bernachten mit Stil. Die schÃ?Â¶nsten Coun...,8.000000
135587,Ã?Â?rger mit Produkt X. Roman.,7.000000
135588,Ã?Â?sterlich leben.,7.000000
135589,Ã?Â?stlich der Berge.,8.000000


In [15]:
# One row per title carrying both its rating count and its mean rating.
popular_df = pd.merge(number_ratings_df, avg_ratings_df, on='Book-Title')
popular_df

Unnamed: 0,Book-Title,num_ratings,avg_ratings
0,A Light in the Storm: The Civil War Diary of ...,1,9.000000
1,"Ask Lily (Young Women of Faith: Lily Series, ...",1,8.000000
2,Dark Justice,1,10.000000
3,Earth Prayers From around the World: 365 Pray...,7,7.142857
4,Final Fantasy Anthology: Official Strategy Gu...,2,10.000000
...,...,...,...
135586,Ã?Â?bernachten mit Stil. Die schÃ?Â¶nsten Coun...,1,8.000000
135587,Ã?Â?rger mit Produkt X. Roman.,3,7.000000
135588,Ã?Â?sterlich leben.,1,7.000000
135589,Ã?Â?stlich der Berge.,1,8.000000


In [16]:
# Restrict to titles with at least 250 ratings, order them by mean rating
# (best first) and keep the first 50 rows.
popular_df = (
    popular_df
    .loc[lambda frame: frame['num_ratings'] >= 250]
    .sort_values(by='avg_ratings', ascending=False)
    .iloc[:50]
)
popular_df

Unnamed: 0,Book-Title,num_ratings,avg_ratings
45399,Harry Potter and the Prisoner of Azkaban (Book 3),277,9.043321
123703,To Kill a Mockingbird,267,8.977528
45409,Harry Potter and the Sorcerer's Stone (Harry P...,315,8.936508
45380,Harry Potter and the Chamber of Secrets (Book 2),326,8.840491
125586,"Tuesdays with Morrie: An Old Man, a Young Man,...",250,8.588
116217,The Secret Life of Bees,413,8.479419
102723,The Da Vinci Code,495,8.430303
110249,The Lovely Bones: A Novel,707,8.18529
114981,The Red Tent (Bestselling Backlist),383,8.182768
131363,Where the Heart Is (Oprah's Book Club (Paperba...,295,8.142373


In [17]:
# Book catalogue; every column is read as text.
books = pd.read_csv("../data/Books.csv", sep=',', dtype='unicode')

# Add author and cover image to the popular titles. A title may match several
# catalogue rows, so only the first match per title is kept.
popular_df = (
    pd.merge(popular_df, books, on='Book-Title')
    .drop_duplicates(subset='Book-Title')
    .loc[:, ['Book-Title', 'Book-Author', 'Image-URL-M', 'num_ratings', 'avg_ratings']]
)

In [18]:
# Show the popular titles together with their author and cover-image columns.
popular_df

Unnamed: 0,Book-Title,Book-Author,Image-URL-M,num_ratings,avg_ratings
0,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,http://images.amazon.com/images/P/0439136350.0...,277,9.043321
3,To Kill a Mockingbird,Harper Lee,http://images.amazon.com/images/P/0446310786.0...,267,8.977528
11,Harry Potter and the Sorcerer's Stone (Harry P...,J. K. Rowling,http://images.amazon.com/images/P/059035342X.0...,315,8.936508
13,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,http://images.amazon.com/images/P/0439064872.0...,326,8.840491
16,"Tuesdays with Morrie: An Old Man, a Young Man,...",MITCH ALBOM,http://images.amazon.com/images/P/0385484518.0...,250,8.588
19,The Secret Life of Bees,Sue Monk Kidd,http://images.amazon.com/images/P/0142001740.0...,413,8.479419
25,The Da Vinci Code,Dan Brown,http://images.amazon.com/images/P/0385504209.0...,495,8.430303
31,The Lovely Bones: A Novel,Alice Sebold,http://images.amazon.com/images/P/0316666343.0...,707,8.18529
32,The Red Tent (Bestselling Backlist),Anita Diamant,http://images.amazon.com/images/P/0312195516.0...,383,8.182768
33,Where the Heart Is (Oprah's Book Club (Paperba...,Billie Letts,http://images.amazon.com/images/P/0446672211.0...,295,8.142373


In [19]:
# Display the filtered ratings frame that the collaborative-filtering cells below start from.
cleaned_data

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,user_id,isbn,book_rating,User-ID,Age,City,State,Country
0,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,8,0002005018,5,8,35.0,timmins,ontario,canada
1,074322678X,Where You'll Find Me: And Other Stories,Ann Beattie,2002,Scribner,http://images.amazon.com/images/P/074322678X.0...,http://images.amazon.com/images/P/074322678X.0...,http://images.amazon.com/images/P/074322678X.0...,8,074322678X,5,8,35.0,timmins,ontario,canada
2,0887841740,The Middle Stories,Sheila Heti,2004,House of Anansi Press,http://images.amazon.com/images/P/0887841740.0...,http://images.amazon.com/images/P/0887841740.0...,http://images.amazon.com/images/P/0887841740.0...,8,0887841740,5,8,35.0,timmins,ontario,canada
3,1552041778,Jane Doe,R. J. Kaiser,1999,Mira Books,http://images.amazon.com/images/P/1552041778.0...,http://images.amazon.com/images/P/1552041778.0...,http://images.amazon.com/images/P/1552041778.0...,8,1552041778,5,8,35.0,timmins,ontario,canada
4,1567407781,The Witchfinder (Amos Walker Mystery Series),Loren D. Estleman,1998,Brilliance Audio - Trade,http://images.amazon.com/images/P/1567407781.0...,http://images.amazon.com/images/P/1567407781.0...,http://images.amazon.com/images/P/1567407781.0...,8,1567407781,6,8,35.0,timmins,ontario,canada
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384069,8441407487,Sobre La Utilidad y Los Perjuicios de La Historia,Friedrich Wilhelm Nietzsche,2000,Edaf,http://images.amazon.com/images/P/8441407487.0...,http://images.amazon.com/images/P/8441407487.0...,http://images.amazon.com/images/P/8441407487.0...,246590,8441407487,6,246590,20.0,madrid,madrid,spain
384070,087004124X,Anthem,Ayn Rand,1966,Caxton Press,http://images.amazon.com/images/P/087004124X.0...,http://images.amazon.com/images/P/087004124X.0...,http://images.amazon.com/images/P/087004124X.0...,259589,087004124X,8,259589,19.0,venice,florida,usa
384071,158243123X,A Student of Weather,Elizabeth Hay,2001,Counterpoint Press,http://images.amazon.com/images/P/158243123X.0...,http://images.amazon.com/images/P/158243123X.0...,http://images.amazon.com/images/P/158243123X.0...,259591,158243123X,8,259591,39.0,tioga,pennsylvania,usa
384072,8485900057,Rimas y leyendas (ClÃ¡sicos Fraile ; 3),Gustavo Adolfo BÃ©cquer,1981,Ediciones Fraile,http://images.amazon.com/images/P/8485900057.0...,http://images.amazon.com/images/P/8485900057.0...,http://images.amazon.com/images/P/8485900057.0...,259614,8485900057,10,259614,22.0,madrid,madrid,spain


In [None]:
## Collaborative Filtering Based Recommender System

In [20]:
# Boolean Series indexed by User-ID: True for users with more than 200 ratings.
x = cleaned_data.groupby('User-ID')['book_rating'].count().gt(200)
# IDs of the users flagged above.
high_rated_users = x.index[x.to_numpy()]

In [21]:
# Ratings given by the users selected in `high_rated_users`.
is_selected_user = cleaned_data['User-ID'].isin(high_rated_users)
filtered_rating = cleaned_data.loc[is_selected_user]

In [22]:
# Boolean Series indexed by Book-Title: True for titles with at least 50 ratings.
y = cleaned_data.groupby('Book-Title')['book_rating'].count().ge(50)
# Titles flagged above.
famous_books = y.index[y.to_numpy()]

In [23]:
# Of the selected users' ratings, keep those for titles listed in `famous_books`.
is_famous_title = filtered_rating['Book-Title'].isin(famous_books)
final_rating = filtered_rating.loc[is_famous_title]

In [24]:
# Title x user rating matrix: one row per book title, one column per user.
table = pd.pivot_table(
    final_rating,
    values='book_rating',
    index='Book-Title',
    columns='User-ID',
)

In [25]:
# A (title, user) pair without a rating is encoded as 0.
table = table.fillna(0)
table

User-ID,100906,101851,104636,105517,107784,110934,110973,112001,114368,114988,...,81560,87141,88677,88733,89602,93047,94347,95359,97874,98391
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0
2nd Chance,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84 Charing Cross Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Wizard and Glass (The Dark Tower, Book 4)",0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
Wuthering Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Year of Wonders,9.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5.0,0.0,6.0,0.0,0.0,0.0


In [26]:
from sklearn.metrics.pairwise import cosine_similarity

In [27]:
# Pairwise cosine similarity between the rows of `table` (one row per book title),
# so entry [i, j] compares the rating vectors of titles i and j.
score_similarity = cosine_similarity(table)

In [28]:
def recommend_books(bookName, number=5):
    """Return the titles most similar to ``bookName``.

    Similarity is read from the module-level ``score_similarity`` matrix, whose
    rows/columns follow the order of ``table.index``; author and cover image
    are looked up in the module-level ``books`` frame.

    Parameters
    ----------
    bookName : str
        Title to look up; must be present in ``table.index``.
    number : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list of list
        One ``[title, author, image_url_m]`` entry per recommendation, most
        similar first. An entry is an empty list when the title has no row in
        ``books``.

    Raises
    ------
    IndexError
        If ``bookName`` is not in ``table.index``.
    """
    matches = np.where(table.index == bookName)[0]
    if len(matches) == 0:
        raise IndexError(f"{bookName!r} is not in the similarity table")
    index = matches[0]

    # Drop the queried book by position in the table instead of assuming it
    # sorts first: another title with an identical rating vector ties with it
    # and could otherwise push the queried book into its own recommendations.
    candidates = [
        (position, score)
        for position, score in enumerate(score_similarity[index])
        if position != index
    ]
    similar_books = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:number]

    data = []
    for position, _score in similar_books:
        # A title can appear several times in the catalogue; use its first row.
        details = books.loc[
            books['Book-Title'] == table.index[position],
            ['Book-Title', 'Book-Author', 'Image-URL-M'],
        ].drop_duplicates('Book-Title')
        data.append(details.iloc[0].tolist() if len(details) else [])

    return data

In [30]:
# Example: titles most similar to the first Lord of the Rings volume.
recommend_books('The Fellowship of the Ring (The Lord of the Rings, Part 1)')

[['The Two Towers (The Lord of the Rings, Part 2)',
  'J.R.R. TOLKIEN',
  'http://images.amazon.com/images/P/0345339711.01.MZZZZZZZ.jpg'],
 ['Fahrenheit 451',
  'Ray Bradbury',
  'http://images.amazon.com/images/P/3257208626.01.MZZZZZZZ.jpg'],
 ['The Return of the King (The Lord of the Rings, Part 3)',
  'J.R.R. TOLKIEN',
  'http://images.amazon.com/images/P/0345339738.01.MZZZZZZZ.jpg'],
 ["The Hitchhiker's Guide to the Galaxy",
  'Douglas Adams',
  'http://images.amazon.com/images/P/0671461494.01.MZZZZZZZ.jpg'],
 ['1984',
  'George Orwell',
  'http://images.amazon.com/images/P/0451524934.01.MZZZZZZZ.jpg']]

In [31]:
# Example: titles most similar to the second Harry Potter book.
recommend_books('Harry Potter and the Chamber of Secrets (Book 2)')

[['Harry Potter and the Prisoner of Azkaban (Book 3)',
  'J. K. Rowling',
  'http://images.amazon.com/images/P/0439136350.01.MZZZZZZZ.jpg'],
 ['Harry Potter and the Goblet of Fire (Book 4)',
  'J. K. Rowling',
  'http://images.amazon.com/images/P/0439139597.01.MZZZZZZZ.jpg'],
 ["Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))",
  'J. K. Rowling',
  'http://images.amazon.com/images/P/059035342X.01.MZZZZZZZ.jpg'],
 ["Harry Potter and the Sorcerer's Stone (Book 1)",
  'J. K. Rowling',
  'http://images.amazon.com/images/P/0590353403.01.MZZZZZZZ.jpg'],
 ['The Alienist',
  'Caleb Carr',
  'http://images.amazon.com/images/P/0553572997.01.MZZZZZZZ.jpg']]

In [177]:
import pickle
# Persist the popular-books frame. The context manager closes the file handle
# (and flushes the pickle to disk) even if dumping fails.
with open('../Pickle files/popular_df.pkl', 'wb') as pickle_file:
    pickle.dump(popular_df, pickle_file)

In [None]:
# Persist the objects recommend_books() depends on. Each file is opened in a
# context manager so its handle is closed (and its contents flushed) right
# after the dump instead of being left open.
with open('../Pickle files/table.pkl', 'wb') as pickle_file:
    pickle.dump(table, pickle_file)
with open('../Pickle files/books.pkl', 'wb') as pickle_file:
    pickle.dump(books, pickle_file)
with open('../Pickle files/score_similarity.pkl', 'wb') as pickle_file:
    pickle.dump(score_similarity, pickle_file)