# **Book Recommendation System**

### Importing the necessary **modules**

In [2]:
import os
import warnings

import numpy as np
import opendatasets as od
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity as c_score

# Suppress all warnings
warnings.filterwarnings("ignore")

In [12]:
# Fetch the Kaggle dataset next to the notebook; per the cell output,
# opendatasets skips the download when the files are already present.
DATASET_URL = "https://www.kaggle.com/datasets/arashnic/book-recommendation-dataset"
od.download(DATASET_URL)

# Example title reused by later lookup cells.
ex = "Harry Potter and the Prisoner of Azkaban"

# Columns kept when building the popularity table.
cols = [
    'Book-Title',
    'Book-Author',
    'Image-URL-M',
    'Votes',
    'Avg-rating',
]

def df_info(df):
    """Print the first rows of ``df`` followed by its shape.

    Parameters
    ----------
    df : object exposing ``head()`` and ``shape`` (e.g. a pandas DataFrame)

    Returns
    -------
    None
        The summary is written to standard output only.
    """
    preview = df.head()
    dims = df.shape
    print(f"{preview}\n\nShape\t:\t{dims}")

Skipping, found downloaded files in ".\book-recommendation-dataset" (use force=True to force download)


### Importing the **datasets**

In [3]:
# Load the three CSV tables shipped with the downloaded dataset.
books, users, ratings = (
    pd.read_csv(csv_path)
    for csv_path in (
        'book-recommendation-dataset/Books.csv',
        'book-recommendation-dataset/Users.csv',
        'book-recommendation-dataset/Ratings.csv',
    )
)

### Fixing Image URLs

In [4]:
# Upgrade only a leading "http://" scheme to "https://".
# A plain 'http' -> 'https' substitution would also hit URLs that are already
# https (yielding "httpss://") and would corrupt the column when this cell is
# re-run; anchoring on the scheme keeps the cell idempotent.
books['Image-URL-M'] = books['Image-URL-M'].str.replace(r'^http://', 'https://', regex=True)

## **Popularity-based** Recommendation System

In [5]:
# Join book metadata with the ratings on their shared ISBN column.
temp_br = pd.merge(books, ratings, on="ISBN")
temp_br.shape

(1031136, 10)

### Number of Ratings

In [7]:
# Count the (non-null) ratings each title received and expose them as `Votes`.
temp_num = (
    temp_br.groupby('Book-Title')['Book-Rating']
    .count()
    .reset_index(name='Votes')
)

temp_num.head()

Unnamed: 0,Book-Title,Votes
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1


### Average Rating

In [8]:
# Mean rating per title, exposed as `Avg-rating`.
temp_avg = (
    temp_br.groupby('Book-Title')['Book-Rating']
    .mean()
    .rename('Avg-rating')
    .reset_index()
)

temp_avg.head()

Unnamed: 0,Book-Title,Avg-rating
0,A Light in the Storm: The Civil War Diary of ...,2.25
1,Always Have Popsicles,0.0
2,Apple Magic (The Collector's series),0.0
3,"Ask Lily (Young Women of Faith: Lily Series, ...",8.0
4,Beyond IBM: Leadership Marketing and Finance ...,0.0


### Popular books - combining `Votes` and `Avg-rating`

In [9]:
# One row per title carrying both its vote count and its average rating.
pop_ = pd.merge(temp_num, temp_avg, on='Book-Title')
pop_.head()

Unnamed: 0,Book-Title,Votes,Avg-rating
0,A Light in the Storm: The Civil War Diary of ...,4,2.25
1,Always Have Popsicles,1,0.0
2,Apple Magic (The Collector's series),1,0.0
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1,8.0
4,Beyond IBM: Leadership Marketing and Finance ...,1,0.0


### Filtering books based on `Votes`
Keeping only books with `Votes >= 250`, then taking the **top 50** by `Avg-rating`

In [13]:
# Keep titles with at least 250 votes, ordered from highest to lowest average rating.
has_enough_votes = pop_['Votes'].ge(250)
temp__ = pop_.loc[has_enough_votes].sort_values(by='Avg-rating', ascending=False)

# The first 50 rows of that ordering form the popularity list.
pop = temp__.iloc[:50]
pop.shape

(50, 3)

### Merging with `books` to obtain more data

In [14]:
# Attach author / cover data from `books`, then keep one row per title
# (a title can match several editions in `books`).
top50 = (
    pop.merge(books, on='Book-Title')[cols]
    .drop_duplicates('Book-Title')
)

top50.head()

Unnamed: 0,Book-Title,Book-Author,Image-URL-M,Votes,Avg-rating
0,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,https://images.amazon.com/images/P/0439136350....,428,5.852804
3,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,https://images.amazon.com/images/P/0439139597....,387,5.824289
5,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,https://images.amazon.com/images/P/0590353403....,278,5.73741
9,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,https://images.amazon.com/images/P/043935806X....,347,5.501441
13,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,https://images.amazon.com/images/P/0439064872....,556,5.183453


### Cleaning the `top50`

In [15]:
# Two-decimal ratings for display.
top50['Avg-rating'] = top50['Avg-rating'].round(2)

# Trim surrounding whitespace and drop a parenthesised suffix such as " (Book 3)".
top50['Book-Title'] = (
    top50['Book-Title']
    .str.strip()
    .replace(r'\s{1,}\(.*\)', '', regex=True)
)

top50.head()

Unnamed: 0,Book-Title,Book-Author,Image-URL-M,Votes,Avg-rating
0,Harry Potter and the Prisoner of Azkaban,J. K. Rowling,https://images.amazon.com/images/P/0439136350....,428,5.85
3,Harry Potter and the Goblet of Fire,J. K. Rowling,https://images.amazon.com/images/P/0439139597....,387,5.82
5,Harry Potter and the Sorcerer's Stone,J. K. Rowling,https://images.amazon.com/images/P/0590353403....,278,5.74
9,Harry Potter and the Order of the Phoenix,J. K. Rowling,https://images.amazon.com/images/P/043935806X....,347,5.5
13,Harry Potter and the Chamber of Secrets,J. K. Rowling,https://images.amazon.com/images/P/0439064872....,556,5.18


### Outputting the `top50` dataset

In [16]:
# `to_csv` does not create missing parent directories, so make sure the output
# folder exists; otherwise a fresh "Restart & Run All" fails at this first write.
os.makedirs('processed-dataset', exist_ok=True)
top50.to_csv('processed-dataset/top50.csv')

## User-based **Collaborative Filtering**

### Obtaining `User-ID` with **more than 200** reviews

In [19]:
# Number of ratings submitted by each user.
x = temp_br.groupby('User-ID')['Book-Rating'].count()

# IDs of the users with more than 200 ratings.
top_users = x.loc[x.gt(200)].index

top_users[:5]

Index([254, 2276, 2766, 2977, 3363], dtype='int64', name='User-ID')

### Obtaining the records made by `top_users`

In [20]:
# Restrict the merged ratings to rows written by the heavy raters.
is_top_user = temp_br['User-ID'].isin(top_users)
filtered_users = temp_br.loc[is_top_user]
df_info(filtered_users)

          ISBN                                         Book-Title  \
3   0002005018                                       Clara Callan   
6   0002005018                                       Clara Callan   
7   0002005018                                       Clara Callan   
10  0002005018                                       Clara Callan   
21  0374157065  Flu: The Story of the Great Influenza Pandemic...   

             Book-Author Year-Of-Publication              Publisher  \
3   Richard Bruce Wright                2001  HarperFlamingo Canada   
6   Richard Bruce Wright                2001  HarperFlamingo Canada   
7   Richard Bruce Wright                2001  HarperFlamingo Canada   
10  Richard Bruce Wright                2001  HarperFlamingo Canada   
21      Gina Bari Kolata                1999   Farrar Straus Giroux   

                                          Image-URL-S  \
3   http://images.amazon.com/images/P/0002005018.0...   
6   http://images.amazon.com/images/P/000200

### Obtaining the `Book-Title` with **at least 50** ratings

In [21]:
# Ratings per title, counted among the heavy raters only.
y = filtered_users.groupby('Book-Title')['Book-Rating'].count()

# Titles that collected at least 50 of those ratings, as a NumPy array.
famous_books = y.loc[y.ge(50)].index.values

### Obtaining the `famous_books` records

In [22]:
# Keep only the rating rows whose title is in `famous_books`.
is_famous = filtered_users['Book-Title'].isin(famous_books)
filtered_books = filtered_users.loc[is_famous]
filtered_books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating
31,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,https://images.amazon.com/images/P/0399135782....,http://images.amazon.com/images/P/0399135782.0...,11676,9
33,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,https://images.amazon.com/images/P/0399135782....,http://images.amazon.com/images/P/0399135782.0...,36836,0
34,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,https://images.amazon.com/images/P/0399135782....,http://images.amazon.com/images/P/0399135782.0...,46398,9
38,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,https://images.amazon.com/images/P/0399135782....,http://images.amazon.com/images/P/0399135782.0...,113270,0
39,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,https://images.amazon.com/images/P/0399135782....,http://images.amazon.com/images/P/0399135782.0...,113519,0


### Cleaning the `filtered_books` book titles

In [24]:
# Normalise the titles in one pass:
#   1. trim whitespace and drop a parenthesised suffix such as " (Book 3)",
#   2. replace the HTML entity "&amp;" with "and",
#   3. repair the mangled title of "O is for Outlaw".
# `regex=False` is explicit for the literal replacements: the second pattern
# contains backslashes that are not a valid regular expression, and the default
# of `regex` for `Series.str.replace` differs between pandas versions.
cleaned_titles = (
    filtered_books['Book-Title']
    .str.strip()
    .replace(r'\s{1,}\(.*\)', '', regex=True)
    .str.replace('&amp;', 'and', regex=False)
    .str.replace('\\O\\" Is for Outlaw"', "O is for Outlaw", regex=False)
)

# `filtered_books` was obtained by boolean filtering of another frame; build a
# new frame instead of assigning columns on that slice (chained assignment).
filtered_books = filtered_books.assign(**{'Book-Title': cleaned_titles})

filtered_books[filtered_books['Book-Title'] == ex].head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating
142493,439136350,Harry Potter and the Prisoner of Azkaban,J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439136350.0...,https://images.amazon.com/images/P/0439136350....,http://images.amazon.com/images/P/0439136350.0...,254,9
142499,439136350,Harry Potter and the Prisoner of Azkaban,J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439136350.0...,https://images.amazon.com/images/P/0439136350....,http://images.amazon.com/images/P/0439136350.0...,6251,10
142501,439136350,Harry Potter and the Prisoner of Azkaban,J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439136350.0...,https://images.amazon.com/images/P/0439136350....,http://images.amazon.com/images/P/0439136350.0...,8681,0
142503,439136350,Harry Potter and the Prisoner of Azkaban,J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439136350.0...,https://images.amazon.com/images/P/0439136350....,http://images.amazon.com/images/P/0439136350.0...,11676,0
142505,439136350,Harry Potter and the Prisoner of Azkaban,J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439136350.0...,https://images.amazon.com/images/P/0439136350....,http://images.amazon.com/images/P/0439136350.0...,16106,0


### **Pivoting** the table `filtered_books`
On the `Book-Title` column as **index**, with `User-ID` as **columns** and `Book-Rating` as **values**

In [25]:
# Book x user rating matrix; a (book, user) pair without a rating becomes 0.0.
pt = (
    filtered_books
    .pivot_table(values='Book-Rating', index='Book-Title', columns='User-ID')
    .fillna(0.0)
)

pt.head()

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Computing the **cosine similarity** scores

In [27]:
# `c_score` is sklearn's `cosine_similarity` (see the import cell); it is applied
# to `pt`, whose rows are book titles, so the result is a square book-by-book matrix.
sim_scores = c_score(pt)
sim_scores.shape

(698, 698)

### **Recommender** function

In [28]:
def recommend(book, top_n=5):
    """Return the titles most similar to ``book`` according to ``sim_scores``.

    Row ``i`` of the module-level ``sim_scores`` matrix is read as the
    similarity of ``pt.index[i]`` to every title in ``pt.index``.

    Parameters
    ----------
    book : str
        Title to look up; it must match an entry of ``pt.index`` exactly.
    top_n : int, default 5
        Maximum number of suggestions to return.

    Returns
    -------
    list
        Up to ``top_n`` titles taken from ``pt.index``, most similar first.
        The queried title itself is never part of the result.

    Raises
    ------
    IndexError
        If ``book`` is not present in ``pt.index``.
    """
    matches = np.where(pt.index == book)[0]
    if matches.size == 0:
        raise IndexError(f"{book!r} not found in pt.index")
    index = matches[0]

    ranked = sorted(enumerate(sim_scores[index]), key=lambda pair: pair[1], reverse=True)

    # Drop the queried book by position rather than assuming it sorts first:
    # a tie in score (or an all-zero rating row) can place another title ahead
    # of it, in which case slicing off element 0 would return the book itself.
    return [pt.index[pos] for pos, _ in ranked if pos != index][:top_n]

### Obtaining recommended books i.e. **top 5** suggestions

In [29]:
# Example query; the title must match an entry of `pt.index` exactly.
recommend("Harry Potter and the Sorcerer's Stone")

['Harry Potter and the Chamber of Secrets',
 'Harry Potter and the Prisoner of Azkaban',
 'Harry Potter and the Goblet of Fire',
 'Harry Potter and the Order of the Phoenix',
 'The Mists of Avalon']

### Rough Work

#### Enumerated list of `SIMILARITY SCORES`, i.e. each index represents the corresponding row index in the `pt` table

In [31]:
# Pair every row of the similarity matrix with its positional index.
s = list(enumerate(sim_scores))
s[0][1][:5]                    # first 5 SIMILARITY SCORES of row 0 (`1984`, the first title in pt)

array([1.        , 0.10255025, 0.01220856, 0.        , 0.05367224])

#### Fetching the `index` of each book

In [32]:
# Report the positional index of the example title `ex` inside `pt`.
# The label is built from `ex` itself so it always names the title being looked up.
print(f"Index of '{ex}' : {np.where(pt.index == ex)[0][0]}")

Index of 'The Da Vinci Code' : 228


#### Sorting the `SIMILARITY SCORES` in **descending order**

In [34]:
# (position, score) pairs for row 0 of the similarity matrix, i.e. the book '1984'.
sample = list(enumerate(sim_scores[0]))

# Display only the first five of those pairs.
for p, q in sample[:5]:
    print(f"Book# : {p} - SCORE : {q}")

# Rank the pairs by score, highest first, skip the first entry and keep the next five.
z = sorted(sample, key=lambda pair: pair[1], reverse=True)[1:6]

Book# : 0 - SCORE : 0.9999999999999994
Book# : 1 - SCORE : 0.10255024829874465
Book# : 2 - SCORE : 0.012208555646509366
Book# : 3 - SCORE : 0.0
Book# : 4 - SCORE : 0.053672244225442485


#### Fetching the `Book-Title` using the **index** of `pt`  table

In [35]:
# Print each ranked neighbour; a blank line follows every entry except the fifth.
for position, (index, book_score) in enumerate(z):
    if position >= 4:
        print(f"Book Name : {pt.index[index]}\nScore : {book_score}")
    else:
        print(f"Book Name : {pt.index[index]}\nScore : {book_score}\n")

Book Name : Animal Farm
Score : 0.2702651417103732

Book Name : The Handmaid's Tale
Score : 0.26396193711234966

Book Name : Brave New World
Score : 0.2366937434740099

Book Name : The Vampire Lestat
Score : 0.23299389358170394

Book Name : The Hours : A Novel
Score : 0.2262639743141286


### Fetching all the details for each of the `suggestions`

#### Removing **duplicate names** from `filtered_books`
Only `698` records remain

In [36]:
# One row per cleaned title, restricted to the columns needed to describe a book.
detail_columns = ['Book-Title', 'Book-Author', 'Publisher', 'Year-Of-Publication', 'Image-URL-M']
temp_books = filtered_books.drop_duplicates('Book-Title')[detail_columns]

temp_books.to_csv('processed-dataset/final.csv')

#### **Top 5**  suggestions

In [37]:
# Map every title in the pivot table to its list of recommended titles.
all_suggestions = {name: recommend(name) for name in pt.index}

# One row per title, with its suggestions spread over rank-named columns.
suggestions = (
    pd.DataFrame(all_suggestions)
    .T
    .reset_index()
    .rename(columns={'index': 'book-title', 0: "1st", 1: "2nd", 2: "3rd", 3: "4th", 4: "5th"})
)

suggestions.to_csv('processed-dataset/sugg.csv')

#### Fetching the details of each suggestion for `ex` from `temp_books`

In [43]:
# Suggested titles for the example book: the row's values after the `book-title` column.
suggested_titles = suggestions[suggestions['book-title'] == ex].values[0][1:]

# Collect the matching `temp_books` rows, in suggestion order.
book_data = [
    data
    for name in suggested_titles
    for data in temp_books[temp_books['Book-Title'] == name].values
]

# Print every field on its own line, with a blank line between books.
for item in book_data:
    for i in item:
        print(i)
    print()

Harry Potter and the Goblet of Fire
J. K. Rowling
Scholastic
2000
https://images.amazon.com/images/P/0439139597.01.MZZZZZZZ.jpg

Harry Potter and the Chamber of Secrets
J. K. Rowling
Scholastic
2000
https://images.amazon.com/images/P/0439064872.01.MZZZZZZZ.jpg

Harry Potter and the Sorcerer's Stone
J. K. Rowling
Arthur A. Levine Books
1999
https://images.amazon.com/images/P/059035342X.01.MZZZZZZZ.jpg

Harry Potter and the Order of the Phoenix
J. K. Rowling
Scholastic
2003
https://images.amazon.com/images/P/043935806X.01.MZZZZZZZ.jpg

The Fellowship of the Ring
J.R.R. TOLKIEN
Del Rey
1986
https://images.amazon.com/images/P/0345339703.01.MZZZZZZZ.jpg

