<a href="https://www.kaggle.com/code/nirmit27/book-recommender-system?scriptVersionId=163577974" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np
import pandas as pd

# For Collaborative Filtering
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

## Importing the **datasets**

In [2]:
books = pd.read_csv('/kaggle/input/book-recommendation-dataset/Books.csv')
users = pd.read_csv('/kaggle/input/book-recommendation-dataset/Users.csv')
ratings = pd.read_csv('/kaggle/input/book-recommendation-dataset/Ratings.csv')

  books = pd.read_csv('/kaggle/input/book-recommendation-dataset/Books.csv')


## Image **URLs** updation

In [3]:
books['Image-URL-S'] = books['Image-URL-S'].str.replace('http', 'https')
books['Image-URL-M'] = books['Image-URL-M'].str.replace('http', 'https')
books['Image-URL-L'] = books['Image-URL-L'].str.replace('http', 'https')

books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,https://images.amazon.com/images/P/0195153448....,https://images.amazon.com/images/P/0195153448....,https://images.amazon.com/images/P/0195153448....
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,https://images.amazon.com/images/P/0002005018....,https://images.amazon.com/images/P/0002005018....,https://images.amazon.com/images/P/0002005018....
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,https://images.amazon.com/images/P/0060973129....,https://images.amazon.com/images/P/0060973129....,https://images.amazon.com/images/P/0060973129....
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,https://images.amazon.com/images/P/0374157065....,https://images.amazon.com/images/P/0374157065....,https://images.amazon.com/images/P/0374157065....
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,https://images.amazon.com/images/P/0393045218....,https://images.amazon.com/images/P/0393045218....,https://images.amazon.com/images/P/0393045218....


# Data **Preprocessing**

## Converting **dates**

In [4]:
books['Year-Of-Publication'] = pd.to_datetime(books['Year-Of-Publication'], errors='coerce')
books['Year-Of-Publication'] = books['Year-Of-Publication'].fillna(books['Year-Of-Publication'].mean())
books['Year-Of-Publication'] = books['Year-Of-Publication'].astype(np.int64)

books['Year-Of-Publication']

0         2002
1         2001
2         1991
3         1999
4         1999
          ... 
271355    1988
271356    1991
271357    2004
271358    1996
271359    2000
Name: Year-Of-Publication, Length: 271360, dtype: int64

## **Merging** `books_data` and `ratings`

In [5]:
books_data_ratings = books.merge(ratings, on='ISBN')

books_data_ratings.shape

(1031136, 10)

## Average ratings

In [6]:
avg_ratings = books_data_ratings.groupby('Book-Title')['Book-Rating'].mean().sort_values(ascending=False).reset_index()
avg_ratings.rename(columns={'Book-Rating': 'avg_rating'}, inplace=True)

avg_ratings

Unnamed: 0,Book-Title,avg_rating
0,Film Is: The International Free Cinema,10.0
1,More Secrets of Happy Children: Embrace Your P...,10.0
2,Jo's Boys : From the Original Publisher,10.0
3,The Vanished Priestess : An Annie Szabo Mystery,10.0
4,Game and Hunting,10.0
...,...,...
241066,Invaders of the Planet Earth (Choose Your Own ...,0.0
241067,Sammy Discovers the Alphabet (Learn-With-Sammy),0.0
241068,Sammy Carducci's Guide to Women,0.0
241069,Sammle mir Kiesel am FluÃ?Â?. Mehr als eine Li...,0.0


## Number of ratings

In [7]:
num_ratings = books_data_ratings.groupby('Book-Title')['Book-Rating'].count().sort_values(ascending=False).reset_index()
num_ratings.rename(columns={'Book-Rating': 'num_rating'}, inplace=True)

num_ratings

Unnamed: 0,Book-Title,num_rating
0,Wild Animus,2502
1,The Lovely Bones: A Novel,1295
2,The Da Vinci Code,898
3,A Painted House,838
4,The Nanny Diaries: A Novel,828
...,...,...
241066,Real Love: The Truth About Finding Uncondition...,1
241067,Real Love: The Drawings for Sean,1
241068,"Real Love or Fake (Camfield Novel of Love, No 78)",1
241069,Fabulous Food for Family and Friends: Healthy ...,1


## Merging the data of **ratings** into one table

In [8]:
merged = num_ratings.merge(avg_ratings, on='Book-Title').sort_values(by='avg_rating', ascending=False)
merged

Unnamed: 0,Book-Title,num_rating,avg_rating
131743,The Adventures and Memoirs of Sherlock Holmes ...,1,10.0
152099,Unexpected News: Reading the Bible With Third ...,1,10.0
152134,A pictorial history of sex in films,1,10.0
86504,"The Violet Keystone (The Seventh Tower, Book 6)",2,10.0
222327,Stick and Rudder: An Explanation of the Art of...,1,10.0
...,...,...,...
136778,Confessions To My Mother-Cathy Guisewite,1,0.0
136777,Confessions Dun Porte Drapeau Dechu,1,0.0
136776,Confess-O-Rama,1,0.0
136775,Writing and Analysis in the Law,1,0.0


## Filter for `num_rating >= 250` **sorted** by `avg_rating`
------------------------------------------------------------
### We'll use this data for **Top 50** books' content.

In [9]:
popular_books = merged[merged['num_rating'] >= 250].sort_values(by='avg_rating', ascending=False)
popular_books = popular_books.merge(books, on='Book-Title').drop_duplicates('Book-Title')

top_50 = popular_books[popular_books['num_rating'] >= 250][['Book-Title', 'Book-Author', 'Image-URL-M', 'num_rating', 'avg_rating']]
top_50.info()

<class 'pandas.core.frame.DataFrame'>
Index: 186 entries, 0 to 739
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Book-Title   186 non-null    object 
 1   Book-Author  186 non-null    object 
 2   Image-URL-M  186 non-null    object 
 3   num_rating   186 non-null    int64  
 4   avg_rating   186 non-null    float64
dtypes: float64(1), int64(1), object(3)
memory usage: 8.7+ KB


# **Collaborative Filtering**
-----------------------------
### Building the **Recommender System** model

### Finding users who have rated **over 200** books

In [10]:
chosen_ratings = books_data_ratings.groupby('User-ID').count()['Book-Rating'] > 200
chosen_users = chosen_ratings[chosen_ratings].index
chosen_users

Index([   254,   2276,   2766,   2977,   3363,   4017,   4385,   6251,   6323,
         6543,
       ...
       271705, 273979, 274004, 274061, 274301, 274308, 275970, 277427, 277639,
       278418],
      dtype='int64', name='User-ID', length=811)

### **Filtering** out the ratings from the **chosen users** ONLY

In [11]:
filtered_ratings = books_data_ratings[books_data_ratings['User-ID'].isin(chosen_users)]
filtered_ratings

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating
3,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,https://images.amazon.com/images/P/0002005018....,https://images.amazon.com/images/P/0002005018....,https://images.amazon.com/images/P/0002005018....,11676,8
6,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,https://images.amazon.com/images/P/0002005018....,https://images.amazon.com/images/P/0002005018....,https://images.amazon.com/images/P/0002005018....,85526,0
7,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,https://images.amazon.com/images/P/0002005018....,https://images.amazon.com/images/P/0002005018....,https://images.amazon.com/images/P/0002005018....,96054,0
10,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,https://images.amazon.com/images/P/0002005018....,https://images.amazon.com/images/P/0002005018....,https://images.amazon.com/images/P/0002005018....,177458,0
21,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,https://images.amazon.com/images/P/0374157065....,https://images.amazon.com/images/P/0374157065....,https://images.amazon.com/images/P/0374157065....,110912,10
...,...,...,...,...,...,...,...,...,...,...
1031124,0231128444,Slow Food(The Case For Taste),Carlo Petrini,2003,Columbia University Press,https://images.amazon.com/images/P/0231128444....,https://images.amazon.com/images/P/0231128444....,https://images.amazon.com/images/P/0231128444....,275970,0
1031125,0520242335,Strong Democracy : Participatory Politics for ...,Benjamin R. Barber,2004,University of California Press,https://images.amazon.com/images/P/0520242335....,https://images.amazon.com/images/P/0520242335....,https://images.amazon.com/images/P/0520242335....,275970,0
1031126,0762412119,"Burpee Gardening Cyclopedia: A Concise, Up to ...",Allan Armitage,2002,Running Press Book Publishers,https://images.amazon.com/images/P/0762412119....,https://images.amazon.com/images/P/0762412119....,https://images.amazon.com/images/P/0762412119....,275970,0
1031127,1582380805,Tropical Rainforests: 230 Species in Full Colo...,"Allen M., Ph.D. Young",2001,Golden Guides from St. Martin's Press,https://images.amazon.com/images/P/1582380805....,https://images.amazon.com/images/P/1582380805....,https://images.amazon.com/images/P/1582380805....,275970,0


### **Book titles** having **more than 50** ratings

In [12]:
filter_book_titles = filtered_ratings.groupby('Book-Title').count()['Book-Rating'] >= 50
famous_books = filter_book_titles[filter_book_titles].index
famous_books

Index(['1984', '1st to Die: A Novel', '2nd Chance', '4 Blondes',
       'A Bend in the Road', 'A Case of Need',
       'A Child Called \It\": One Child's Courage to Survive"',
       'A Civil Action', 'A Day Late and a Dollar Short', 'A Fine Balance',
       ...
       'Winter Solstice', 'Wish You Well', 'Without Remorse',
       'Wizard and Glass (The Dark Tower, Book 4)', 'Wuthering Heights',
       'Year of Wonders', 'You Belong To Me',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"'],
      dtype='object', name='Book-Title', length=706)

### **Filtering** out the books that have **more than 50** ratings

In [13]:
final_df = filtered_ratings[filtered_ratings['Book-Title'].isin(famous_books)]
final_df.tail()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating
1028414,1878702831,Echoes,Nancy Morse,1992,Meteor Publishing Corporation,https://images.amazon.com/images/P/1878702831....,https://images.amazon.com/images/P/1878702831....,https://images.amazon.com/images/P/1878702831....,238781,0
1028600,394429869,I Know Why the Caged Bird Sings,Maya Angelou,1996,Random House,https://images.amazon.com/images/P/0394429869....,https://images.amazon.com/images/P/0394429869....,https://images.amazon.com/images/P/0394429869....,239594,8
1028602,449001164,The Promise,CHAIM POTOK,1997,Ballantine Books,https://images.amazon.com/images/P/0449001164....,https://images.amazon.com/images/P/0449001164....,https://images.amazon.com/images/P/0449001164....,239594,7
1028815,743527631,The Pillars of the Earth,Ken Follett,2002,Encore,https://images.amazon.com/images/P/0743527631....,https://images.amazon.com/images/P/0743527631....,https://images.amazon.com/images/P/0743527631....,240144,0
1028817,745168086,The Handmaid's Tale,Margaret Atwood,1999,Chivers Audio Books,https://images.amazon.com/images/P/0745168086....,https://images.amazon.com/images/P/0745168086....,https://images.amazon.com/images/P/0745168086....,240144,0


### **Processing** the final subset of data
-------------------------------------------
**Pivoting** the table on the `Book-Title` as **index**, `User-ID` as **columns** with `Book-Rating` as **values**.

In [14]:
pt = final_df.pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating').fillna(0.0)
pt.index

Index(['1984', '1st to Die: A Novel', '2nd Chance', '4 Blondes',
       'A Bend in the Road', 'A Case of Need',
       'A Child Called \It\": One Child's Courage to Survive"',
       'A Civil Action', 'A Day Late and a Dollar Short', 'A Fine Balance',
       ...
       'Winter Solstice', 'Wish You Well', 'Without Remorse',
       'Wizard and Glass (The Dark Tower, Book 4)', 'Wuthering Heights',
       'Year of Wonders', 'You Belong To Me',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"'],
      dtype='object', name='Book-Title', length=706)

### Computing the **cosine similarity scores** of the books
We are computing the **Euclidean distances** of **706** books from **706** books.

In [15]:
sims = cos_sim(pt)
sims

array([[1.        , 0.10255025, 0.01220856, ..., 0.12110367, 0.07347567,
        0.04316046],
       [0.10255025, 1.        , 0.2364573 , ..., 0.07446129, 0.16773875,
        0.14263397],
       [0.01220856, 0.2364573 , 1.        , ..., 0.04558758, 0.04938579,
        0.10796119],
       ...,
       [0.12110367, 0.07446129, 0.04558758, ..., 1.        , 0.07085128,
        0.0196177 ],
       [0.07347567, 0.16773875, 0.04938579, ..., 0.07085128, 1.        ,
        0.10602962],
       [0.04316046, 0.14263397, 0.10796119, ..., 0.0196177 , 0.10602962,
        1.        ]])

### **Recommender** function
This function will ***suggest*** the **top 5** books based on their **similarity scores** closest to the **given** book.

In [16]:
def recommend(book_name):
    
    suggestions = []
    
    # Fetching the book's index number
    i = np.where(pt.index == book_name)[0][0]
    
    # Fetching the similarity scores
    top_5_scores = sorted(list(enumerate(sims[i])), key=lambda x: x[1], reverse=True)[1:6]
    
    # Fetching the book title suggestions that match the similarity scores
    for i in sorted(list(enumerate(sims[i])), key=lambda x: x[1], reverse=True)[1:6]:
        suggestions.append(pt.index[i[0]])
    
    return suggestions

### **Output**
--------------
1. Table of **top 5** suggestions for each of the **706 most famous** books.
2. Table of **top-rated** books with their **image urls**.

In [17]:
all_suggestions = {}

for name in pt.index:
    all_suggestions.update({name: recommend(name)})
    
suggestions_df = pd.DataFrame(all_suggestions).T
suggestions_df.rename(columns={0: "1st", 1: "2nd", 2: "3rd", 3: "4th", 4: "5th"}, inplace=True)

suggestions_df.to_csv('suggestions.csv')
top_50.to_csv('top50.csv')