## Importing the modules

In [1]:
import numpy as np
import pandas as pd

# For Collaborative Filtering
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

# Repo URL : https://github.com/nirmit27/kaggle-notebooks/tree/main/Models

## Importing the datasets

In [2]:
books = pd.read_csv('/kaggle/input/book-recommendation-dataset/Books.csv')
users = pd.read_csv('/kaggle/input/book-recommendation-dataset/Users.csv')
ratings = pd.read_csv('/kaggle/input/book-recommendation-dataset/Ratings.csv')

  books = pd.read_csv('/kaggle/input/book-recommendation-dataset/Books.csv')


# Data **Preprocessing**

## Removing **columns**

In [3]:
books_data = books.drop(columns=['Image-URL-S', 'Image-URL-M', 'Image-URL-L'])
books_data.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


## Converting **dates**

In [4]:
books_data['Year-Of-Publication'] = pd.to_datetime(books_data['Year-Of-Publication'], errors='coerce').dt.year

books_data['Year-Of-Publication'].fillna(books_data['Year-Of-Publication'].mean(), inplace=True)

books_data['Year-Of-Publication'] = books_data['Year-Of-Publication'].astype(np.int64)

books_data['Year-Of-Publication']

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  books_data['Year-Of-Publication'].fillna(books_data['Year-Of-Publication'].mean(), inplace=True)


0         1970
1         1970
2         1970
3         1970
4         1970
          ... 
271355    1970
271356    1970
271357    1970
271358    1970
271359    1970
Name: Year-Of-Publication, Length: 271360, dtype: int64

## **Merging** `books_data` and `ratings`

In [5]:
books_data_ratings = books_data.merge(ratings, on='ISBN')

books_data_ratings.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating
0,195153448,Classical Mythology,Mark P. O. Morford,1970,Oxford University Press,2,0
1,2005018,Clara Callan,Richard Bruce Wright,1970,HarperFlamingo Canada,8,5
2,2005018,Clara Callan,Richard Bruce Wright,1970,HarperFlamingo Canada,11400,0
3,2005018,Clara Callan,Richard Bruce Wright,1970,HarperFlamingo Canada,11676,8
4,2005018,Clara Callan,Richard Bruce Wright,1970,HarperFlamingo Canada,41385,0


# **Collaborative Filtering**
-----------------------------
### Building the **Recommender System** model

### Finding users who have rated **over 200** books

In [6]:
chosen_ratings = books_data_ratings.groupby('User-ID').count()['Book-Rating'] > 200
chosen_users = chosen_ratings[chosen_ratings].index
chosen_users

Index([   254,   2276,   2766,   2977,   3363,   4017,   4385,   6251,   6323,
         6543,
       ...
       271705, 273979, 274004, 274061, 274301, 274308, 275970, 277427, 277639,
       278418],
      dtype='int64', name='User-ID', length=811)

### **Filtering** out the ratings from the **chosen users** ONLY

In [7]:
filtered_ratings = books_data_ratings[books_data_ratings['User-ID'].isin(chosen_users)]
filtered_ratings

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating
3,0002005018,Clara Callan,Richard Bruce Wright,1970,HarperFlamingo Canada,11676,8
6,0002005018,Clara Callan,Richard Bruce Wright,1970,HarperFlamingo Canada,85526,0
7,0002005018,Clara Callan,Richard Bruce Wright,1970,HarperFlamingo Canada,96054,0
10,0002005018,Clara Callan,Richard Bruce Wright,1970,HarperFlamingo Canada,177458,0
21,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1970,Farrar Straus Giroux,110912,10
...,...,...,...,...,...,...,...
1031124,0231128444,Slow Food(The Case For Taste),Carlo Petrini,1970,Columbia University Press,275970,0
1031125,0520242335,Strong Democracy : Participatory Politics for ...,Benjamin R. Barber,1970,University of California Press,275970,0
1031126,0762412119,"Burpee Gardening Cyclopedia: A Concise, Up to ...",Allan Armitage,1970,Running Press Book Publishers,275970,0
1031127,1582380805,Tropical Rainforests: 230 Species in Full Colo...,"Allen M., Ph.D. Young",1970,Golden Guides from St. Martin's Press,275970,0


### **Book titles** having **more than 50** ratings

In [8]:
filter_book_titles = filtered_ratings.groupby('Book-Title').count()['Book-Rating'] >= 50
famous_books = filter_book_titles[filter_book_titles].index
famous_books

Index(['1984', '1st to Die: A Novel', '2nd Chance', '4 Blondes',
       'A Bend in the Road', 'A Case of Need',
       'A Child Called \It\": One Child's Courage to Survive"',
       'A Civil Action', 'A Day Late and a Dollar Short', 'A Fine Balance',
       ...
       'Winter Solstice', 'Wish You Well', 'Without Remorse',
       'Wizard and Glass (The Dark Tower, Book 4)', 'Wuthering Heights',
       'Year of Wonders', 'You Belong To Me',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"'],
      dtype='object', name='Book-Title', length=706)

### **Filtering** out the books that have **more than 50** ratings

In [9]:
final_df = filtered_ratings[filtered_ratings['Book-Title'].isin(famous_books)]
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 58586 entries, 31 to 1028817
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ISBN                 58586 non-null  object
 1   Book-Title           58586 non-null  object
 2   Book-Author          58586 non-null  object
 3   Year-Of-Publication  58586 non-null  int64 
 4   Publisher            58586 non-null  object
 5   User-ID              58586 non-null  int64 
 6   Book-Rating          58586 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 3.6+ MB


### **Processing** the final subset of data
-------------------------------------------
**Pivoting** the table on the `Book-Title` as **index**, `User-ID` as **columns** with `Book-Rating` as **values**.

In [10]:
pt = final_df.pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating').fillna(0.0)
pt.index

Index(['1984', '1st to Die: A Novel', '2nd Chance', '4 Blondes',
       'A Bend in the Road', 'A Case of Need',
       'A Child Called \It\": One Child's Courage to Survive"',
       'A Civil Action', 'A Day Late and a Dollar Short', 'A Fine Balance',
       ...
       'Winter Solstice', 'Wish You Well', 'Without Remorse',
       'Wizard and Glass (The Dark Tower, Book 4)', 'Wuthering Heights',
       'Year of Wonders', 'You Belong To Me',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"'],
      dtype='object', name='Book-Title', length=706)

### Computing the **cosine similarity scores** of the books
We are computing the **Euclidean distances** of **706** books from **706** books.

In [11]:
sims = cos_sim(pt)
sims

array([[1.        , 0.10255025, 0.01220856, ..., 0.12110367, 0.07347567,
        0.04316046],
       [0.10255025, 1.        , 0.2364573 , ..., 0.07446129, 0.16773875,
        0.14263397],
       [0.01220856, 0.2364573 , 1.        , ..., 0.04558758, 0.04938579,
        0.10796119],
       ...,
       [0.12110367, 0.07446129, 0.04558758, ..., 1.        , 0.07085128,
        0.0196177 ],
       [0.07347567, 0.16773875, 0.04938579, ..., 0.07085128, 1.        ,
        0.10602962],
       [0.04316046, 0.14263397, 0.10796119, ..., 0.0196177 , 0.10602962,
        1.        ]])

### **Recommender** function
This function will ***suggest*** the **top 5** books based on their **similarity scores** closest to the **given** book.

In [12]:
def recommend(book_name):
    
    suggestions = []
    
    # Fetching the book's index number
    i = np.where(pt.index == book_name)[0][0]
    
    # Fetching the similarity scores
    top_5_scores = sorted(list(enumerate(sims[i])), key=lambda x: x[1], reverse=True)[1:6]
    
    # Fetching the book title suggestions that match the similarity scores
    for i in sorted(list(enumerate(sims[i])), key=lambda x: x[1], reverse=True)[1:6]:
        suggestions.append(pt.index[i[0]])
    
    return suggestions

### **Output**
--------------
A table of **top 5** suggestions for each of the **706** most famous books.

In [13]:
all_suggestions = {}

for name in pt.index:
    all_suggestions.update({name: recommend(name)})
    
suggestions_df = pd.DataFrame(all_suggestions).T
suggestions_df.rename(columns={0: "1st", 1: "2nd", 2: "3rd", 3: "4th", 4: "5th"}, inplace=True)

suggestions_df.to_csv('suggestions.csv')