In [1]:
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [2]:
books = pd.read_csv("data/books_info.csv")
rating = pd.read_csv("data/ratings.csv")

In [3]:
books.head()

Unnamed: 0,Book_ID,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16
0,1,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,,,,,,,,
1,2,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp,Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,,,,,,,
2,3,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,http://images.amazon.com/images/P/0425176428.0...,http://images.amazon.com/images/P/0425176428.0...,http://images.amazon.com/images/P/0425176428.0...,,,,,,,,
3,4,452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,1994,Plume,http://images.amazon.com/images/P/0452264464.0...,http://images.amazon.com/images/P/0452264464.0...,http://images.amazon.com/images/P/0452264464.0...,,,,,,,,
4,5,609804618,Our Dumb Century: The Onion Presents 100 Years...,The Onion,1999,Three Rivers Press,http://images.amazon.com/images/P/0609804618.0...,http://images.amazon.com/images/P/0609804618.0...,http://images.amazon.com/images/P/0609804618.0...,,,,,,,,


In [4]:
books.drop(["Unnamed: 9", "Unnamed: 10", "Unnamed: 11", "Unnamed: 12", "Unnamed: 13", "Unnamed: 14", "Unnamed: 15", "Unnamed: 16"], axis=1, inplace=True)

In [5]:
books.head(3)

Unnamed: 0,Book_ID,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,1,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
1,2,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp,Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...
2,3,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,http://images.amazon.com/images/P/0425176428.0...,http://images.amazon.com/images/P/0425176428.0...,http://images.amazon.com/images/P/0425176428.0...


In [6]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17384 entries, 0 to 17383
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Book_ID              17384 non-null  int64 
 1   ISBN                 17384 non-null  object
 2   Book-Title           17384 non-null  object
 3   Book-Author          17380 non-null  object
 4   Year-Of-Publication  17384 non-null  object
 5   Publisher            17384 non-null  object
 6   Image-URL-S          17384 non-null  object
 7   Image-URL-M          17382 non-null  object
 8   Image-URL-L          17383 non-null  object
dtypes: int64(1), object(8)
memory usage: 1.2+ MB


In [7]:
books.isna().sum()

Book_ID                0
ISBN                   0
Book-Title             0
Book-Author            4
Year-Of-Publication    0
Publisher              0
Image-URL-S            0
Image-URL-M            2
Image-URL-L            1
dtype: int64

In [8]:
# get the features 
books.drop(["ISBN", "Image-URL-L", "Image-URL-S", "Image-URL-M"], axis=1, inplace=True)

In [9]:
books.head(3)

Unnamed: 0,Book_ID,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,1,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
1,2,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp
2,3,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group


In [10]:
# rename some columns
books.rename(columns={"Book_ID":"book_id","Book-Title":"title"}, inplace=True)

In [11]:
books.columns

Index(['book_id', 'title', 'Book-Author', 'Year-Of-Publication', 'Publisher'], dtype='object')

In [12]:
books = books.dropna()

In [13]:
books.isna().sum()

book_id                0
title                  0
Book-Author            0
Year-Of-Publication    0
Publisher              0
dtype: int64

In [14]:
books.shape

(17380, 5)

In [27]:
print(books["title"].value_counts())
print(len(books["title"].value_counts()))

title
Love                                                               9
Pet Sematary                                                       6
Wuthering Heights                                                  6
The Subtle Knife (His Dark Materials                               5
I                                                                  5
                                                                  ..
Vernon God Little                                                  1
How the Light Gets in                                              1
Interesting Facts About the State of Arizona                       1
Shark Tales : True (and Amazing) Stories from America's Lawyers    1
Gandhi's Way: A Handbook of Conflict Resolution                    1
Name: count, Length: 16484, dtype: int64
16484


In [15]:
rating.head()

Unnamed: 0,user,item,rating
0,1,6264,7.0
1,1,4350,7.0
2,1,6252,5.0
3,1,202,9.0
4,1,6266,6.0


In [16]:
# renaming columns
rating.rename(columns={"user":"user_id", "item":"book_id"}, inplace=True)

In [17]:
rating.head(3)

Unnamed: 0,user_id,book_id,rating
0,1,6264,7.0
1,1,4350,7.0
2,1,6252,5.0


In [18]:
print(rating["user_id"].value_counts())
print(len(rating["user_id"].value_counts()))
print(len(rating["user_id"].value_counts() == 20))

user_id
1003    1092
1614     599
154      483
1180     425
965      417
        ... 
2901      20
711       20
2839      20
2598      20
1370      20
Name: count, Length: 1295, dtype: int64
1295
1295


In [19]:
rating.shape

(62656, 3)

In [20]:
# merge books and rating columns
books_with_rating = books.merge(rating, on="book_id")

In [21]:
books_with_rating.head(3)

Unnamed: 0,book_id,title,Book-Author,Year-Of-Publication,Publisher,user_id,rating
0,1,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,2789,6.0
1,2,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp,337,6.0
2,2,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp,1144,10.0


In [22]:
books_with_rating.shape

(62646, 7)

In [23]:
# make a new dataframe with title and number of rating of each book
no_of_rating = books_with_rating.groupby("title")["rating"].count().reset_index()

In [24]:
no_of_rating.head(3)

Unnamed: 0,title,rating
0,'48,12
1,'N Sync,3
2,'Salem's Lot,7


In [25]:
no_of_rating.shape

(13968, 2)

In [28]:
# rename rating column to num_of_rating
no_of_rating.rename(columns={"rating":"num_of_rating"}, inplace=True)

In [29]:
no_of_rating.head(3)

Unnamed: 0,title,num_of_rating
0,'48,12
1,'N Sync,3
2,'Salem's Lot,7


In [30]:
# merge no_of_rating with books_with_rating dataframe
books_df = books_with_rating.merge(no_of_rating, on="title")

In [31]:
books_df.head(3)

Unnamed: 0,book_id,title,Book-Author,Year-Of-Publication,Publisher,user_id,rating,num_of_rating
0,1,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,2789,6.0,1
1,2,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp,337,6.0,3
2,2,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp,1144,10.0,3


In [32]:
books_df.shape

(62646, 8)

In [33]:
# we will get the books which got at least 7 ratings of user
books_df = books_df[books_df["num_of_rating"] > 6]

In [34]:
books_df.sample(3)

Unnamed: 0,book_id,title,Book-Author,Year-Of-Publication,Publisher,user_id,rating,num_of_rating
37873,6452,Flowers for My Friend (Peter Pauper Petite Ser),Christina M. Anello,1992,Peter Pauper Press,1090,5.0,15
4706,374,Simple Abundance: A Daybook of Comfort and Joy,Sarah Ban Breathnach,1995,Warner Books,170,7.0,28
10866,1082,Winter Moon,Dean R. Koontz,2001,Bantam Books,252,8.0,33


In [35]:
books_df.isna().sum()

book_id                0
title                  0
Book-Author            0
Year-Of-Publication    0
Publisher              0
user_id                0
rating                 0
num_of_rating          0
dtype: int64

In [39]:
# books.drop_duplicates(["title", "user_id"], inplace=True)
books_df.drop_duplicates(["user_id", "title"], inplace=True)

In [40]:
books_df.shape

(32699, 8)

In [41]:
# let's create a pivot table
pivot_books = books_df.pivot_table(columns="user_id",
                                   index="title",
                                   values="rating")

In [42]:
pivot_books

user_id,1,2,5,7,9,11,14,16,18,20,...,2924,2925,2927,2928,2930,2937,2939,2942,2943,2945
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'48,,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
1st to Die: A Novel,,,,,,,,,,,...,,,,,,,,,,
2000 What Color is Your Parachute,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Young Wives,,,,,,,,,,,...,,,,,,,,,,
Your Blues Ain't Like Mine,,,,,,,,,,,...,,,,,,,,,,
Zen and the Art of Motorcycle Maintenance : An Inquiry into Values,,,,,,,,,,,...,,,,,,,,,,
Zodiac: The Eco-Thriller,,,,,,,,,,,...,,,,,,,,,,


In [43]:
# let's fill NAN values with zeros
pivot_books.fillna(0, inplace=True)

In [44]:
pivot_books

user_id,1,2,5,7,9,11,14,16,18,20,...,2924,2925,2927,2928,2930,2937,2939,2942,2943,2945
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2000 What Color is Your Parachute,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Young Wives,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Your Blues Ain't Like Mine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance : An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zodiac: The Eco-Thriller,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
pivot_books.shape

(2329, 1293)

In [46]:
from scipy.sparse import csr_matrix
# to handle zeroes values
sparse_books = csr_matrix(pivot_books)

In [47]:
# using KNN algo to make a recommendation
from sklearn.neighbors import NearestNeighbors

model = NearestNeighbors(algorithm="brute")

In [48]:
model.fit(sparse_books)

In [49]:
# make a recommendation function
def recommend_book(book_name=None):
    if book_name is None:
        return "No book name provided."

    # Check if the book exists in the pivot_books index
    if book_name not in pivot_books.index:
        return f"Book '{book_name}' not found in the database."

    book_id = np.where(pivot_books.index == book_name)[0][0]
    distances, indices = model.kneighbors(pivot_books.iloc[book_id,:].values.reshape(1, -1), n_neighbors=6)
    
    recommended_books = []
    for i in range(len(indices)):
        books_names = pivot_books.index[indices[i]]
        recommended_books.extend(books_names)

    # Exclude the input book from recommendations
    recommended_books.remove(book_name)

    return recommended_books

In [53]:
print(recommend_book("Young Wives"))
print(recommend_book("Your Blues Ain't Like Mine"))

['CRY THE BELOVED COUNTRY (Scribner Classic)', 'Pacific Blues', 'The Perfect Husband', 'The London Blitz Murders', 'Learning to Say No: Establishing Healthy Boundaries']
['The DIETER', 'Who Killed Precious?', "Gerald's Game", 'Name All the Animals : A Memoir', 'More Adventures of the Great Brain (Dell Yearling Book)']


In [54]:
books_names = pivot_books.index

In [55]:
books_names

Index([''48', ''Salem's Lot', '1', '1st to Die: A Novel',
       '2000 What Color is Your Parachute', '2nd Chance', '30 Days',
       '311 Pelican Court',
       '36 Views of Mount Fuji: On Finding Myself in Japan',
       '69 Things to Do with a Dead Princess',
       ...
       'Yerma', 'You Belong to Me',
       'You Belong to Me and Other True Cases (Ann Rule's Crime Files: Vol. 2)',
       'You Have To Kiss A Lot Of Frogs', 'You Just Don't Understand',
       'Young Wives', 'Your Blues Ain't Like Mine',
       'Zen and the Art of Motorcycle Maintenance : An Inquiry into Values',
       'Zodiac: The Eco-Thriller', 'Zuflucht im Teehaus. Roman.'],
      dtype='object', name='title', length=2329)

In [56]:
with open("files/model.pkl", 'wb') as f:
    pickle.dump(model, f)

with open("files/books_name.pkl", 'wb') as f:
    pickle.dump(books_names, f)

with open("files/book_pivot.pkl", 'wb') as f:
    pickle.dump(pivot_books, f)