### Modules and other utils

In [None]:
from pathlib import Path

import numpy as np
import opendatasets as od
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity as c_score

# Fetch the Kaggle book-recommendation dataset through opendatasets.
# NOTE(review): presumably this writes into ./book-recommendation-dataset,
# the folder the loading cell reads from — confirm.
od.download("https://www.kaggle.com/datasets/arashnic/book-recommendation-dataset")

# Sample title used by the inspection cells further down.
ex = "Harry Potter and the Prisoner of Azkaban"
# Columns kept when the `top50` table is built.
cols = ['Book-Title', 'Book-Author', 'Image-URL-M', 'Votes', 'Avg-rating']

def df_info(df):
    """Print the head of `df`, a blank line, and then its shape."""
    preview = df.head()
    dims = df.shape
    print(f"{preview}\n\nShape\t:\t{dims}")

### Importing the **datasets**

In [None]:
# Load the three CSV files from the downloaded dataset folder into DataFrames.
books = pd.read_csv('book-recommendation-dataset/Books.csv')
users = pd.read_csv('book-recommendation-dataset/Users.csv')
ratings = pd.read_csv('book-recommendation-dataset/Ratings.csv')

### Fixing Image URLs

In [3]:
# Upgrade cover-image links to HTTPS. The pattern is anchored to a leading
# "http://" scheme so links that already use HTTPS are left untouched; an
# unanchored 'http' -> 'https' replacement would turn them into "httpss://".
books['Image-URL-M'] = books['Image-URL-M'].str.replace(r'^http://', 'https://', regex=True)

## **Popularity-based** Recommendation System

In [None]:
# Attach each rating row to its book's metadata through the shared ISBN key.
temp_br = pd.merge(books, ratings, on="ISBN")
temp_br.shape

### Number of Ratings

In [None]:
# Votes = number of rating rows recorded for each title.
temp_num = (
    temp_br.groupby('Book-Title')['Book-Rating']
    .count()
    .reset_index()
    .rename(columns={'Book-Rating': 'Votes'})
)
temp_num

### Average Rating

In [None]:
# Avg-rating = mean of `Book-Rating` over all rating rows of a title.
temp_avg = (
    temp_br.groupby('Book-Title')['Book-Rating']
    .mean()
    .reset_index()
    .rename(columns={'Book-Rating': 'Avg-rating'})
)
temp_avg

### Popular books  -  based on `Avg-rating`

In [None]:
# One row per title carrying both its vote count and its average rating.
pop_ = pd.merge(temp_num, temp_avg, on='Book-Title')
pop_

### Filtering books based on `Votes`
Keeping the books with `Votes >= 250` and taking the **top 50** of them by `Avg-rating`

In [None]:
# Keep titles with at least 250 votes, best average rating first ...
temp__ = pop_.loc[pop_['Votes'] >= 250].sort_values(by='Avg-rating', ascending=False)
# ... and retain the first 50 of them.
pop = temp__.iloc[:50]
pop.shape

### Merging with `books` to obtain more data

In [None]:
# Pull author and cover URL in from `books`, keep only the display columns and
# one row per title (a title can match several `books` rows).
# The trailing .copy() makes `top50` an independent frame: the cleaning cell
# assigns into its columns, and mutating a frame derived by selection
# (previously via an in-place drop_duplicates) risks SettingWithCopyWarning.
top50 = (
    pop.merge(books, on='Book-Title')[cols]
    .drop_duplicates('Book-Title')
    .copy()
)
top50

### Cleaning the `top50`

In [None]:
# Two-decimal averages for display.
top50['Avg-rating'] = top50['Avg-rating'].round(2)
# Trim surrounding whitespace, then remove a whitespace-led parenthesised part
# from each title.
stripped_titles = top50['Book-Title'].str.strip()
top50['Book-Title'] = stripped_titles.replace(to_replace=r'\s{1,}\(.*\)', value='', regex=True)
top50.head()

### Outputting the `top50` dataset

In [11]:
# `to_csv` does not create missing directories, so make sure the output
# folder exists before writing.
Path('processed-dataset').mkdir(parents=True, exist_ok=True)
top50.to_csv('processed-dataset/top50.csv')

## User-based **Collaborative Filtering**

### Obtaining `User-ID` with **more than 200** reviews

In [None]:
# Rating rows per user ...
x = temp_br.groupby('User-ID')['Book-Rating'].count()
# ... keeping the IDs of users with more than 200 of them.
top_users = x.loc[x > 200].index
top_users

### Obtaining the records made by `top_users`

In [None]:
# Restrict the merged table to rows written by the users in `top_users`.
filtered_users = temp_br.loc[temp_br['User-ID'].isin(top_users)]
df_info(filtered_users)

### Obtaining the `Book-Title` with **at least 50** ratings

In [14]:
# Rating rows per title among the retained users ...
y = filtered_users.groupby('Book-Title')['Book-Rating'].count()
# ... keeping the titles that reach 50 or more, as a plain array.
famous_books = y.loc[y >= 50].index.values

### Obtaining the `famous_books` records

In [None]:
# Keep only the rows whose title is listed in `famous_books`.
filtered_books = filtered_users.loc[filtered_users['Book-Title'].isin(famous_books)]
filtered_books.head(5)

### Cleaning the `filtered_books` book titles

In [None]:
# `filtered_books` is the result of boolean filtering; take an explicit copy
# so the column assignment below modifies an independent frame rather than a
# slice (avoids chained-assignment warnings).
filtered_books = filtered_books.copy()

# Title clean-up, in order:
#   1. trim whitespace and remove a whitespace-led parenthesised part (regex);
#   2. replace the HTML entity '&amp;' with 'and';
#   3. normalise one mangled title.
# Steps 2 and 3 are literal replacements, so `regex=False` is passed
# explicitly: the literal in step 3 contains '\O', which is an invalid escape
# if pandas interprets the pattern as a regular expression.
filtered_books['Book-Title'] = (
    filtered_books['Book-Title']
    .str.strip()
    .replace(r'\s{1,}\(.*\)', '', regex=True)
    .str.replace('&amp;', 'and', regex=False)
    .str.replace('\\O\\" Is for Outlaw"', "O is for Outlaw", regex=False)
)
filtered_books[filtered_books['Book-Title'] == ex]

### **Pivoting** the table `filtered_books`
On the `Book-Title` column as **index**, with `User-ID` as **columns** and `Book-Rating` as **values**

In [None]:
# Title x user rating matrix; user/title pairs without a rating become 0.0.
pt = pd.pivot_table(
    filtered_books,
    values='Book-Rating',
    index='Book-Title',
    columns='User-ID',
).fillna(0.0)
pt

### Computing the **cosine similarity** scores

In [None]:
# Pairwise cosine similarity between the rows of `pt` (one row per title), so
# row/column i of the result corresponds to `pt.index[i]`.
sim_scores = c_score(pt)
sim_scores, sim_scores.shape

### **Recommender** function

In [19]:
def recommend(book, top_n=5):
    """Return the titles most similar to `book`, best match first.

    Looks `book` up in the index of the module-level pivot table `pt`, reads
    the matching row of the module-level `sim_scores` matrix and ranks every
    other row by descending score.

    Args:
        book: Title to look up; must equal an entry of `pt.index`.
        top_n: Maximum number of suggestions to return (default 5).

    Returns:
        A list with at most `top_n` entries of `pt.index`.

    Raises:
        IndexError: If `book` is not present in `pt.index`.
    """
    matches = np.flatnonzero(pt.index == book)
    if matches.size == 0:
        raise IndexError(f"{book!r} is not a title in the pivot table")
    index = matches[0]

    # Stable sort on the negated scores gives descending order while keeping
    # tied entries in their original row order.
    ranked = np.argsort(-np.asarray(sim_scores[index]), kind="stable")

    # Exclude the queried book by position instead of assuming it sorts first:
    # with a tied (or zero) self-score, dropping "the first element" would
    # discard a real neighbour and return the queried book itself.
    return [pt.index[i] for i in ranked if i != index][:top_n]

### Obtaining recommended books i.e. **top 5** suggestions

In [None]:
# Example lookup; as the cell's last expression the returned list of
# suggested titles is displayed by the notebook.
recommend("Harry Potter and the Sorcerer's Stone")

### Rough Work

#### Enumerated list of `SIMILARITY SCORES`, i.e. each index represents the corresponding index in the `pt` table

In [None]:
# Pair every row of the similarity matrix with its row position.
s = [(position, row) for position, row in enumerate(sim_scores)]
# Scores of the first row against every row of `sim_scores`.
# NOTE(review): the original note says this row is `1984` and that `pt` holds
# 698 books — confirm against the data.
s[0][1]

#### Fetching the `index` of each book

In [None]:
# Row position of the sample title `ex` in `pt`. The label is built from `ex`
# itself so the printed title always matches the one that was looked up.
print(f"Index of '{ex}' : {np.where(pt.index == ex)[0][0]}")

#### Sorting the `SIMILARITY SCORES` in **descending order**

In [None]:
# Print the score of every book (row position on the left) against the book
# in the first row of `sim_scores`.
for p, q in enumerate(s[0][1]):
    print(f"Book# : {p} - SCORE : {q}")

# (position, score) pairs for the first row ...
sample = [*enumerate(sim_scores[0])]
# ... sorted by descending score; entries 1 to 5 of that ordering are kept.
z = sorted(sample, key=lambda pair: pair[1], reverse=True)[1:6]

#### Fetching the `Book-Title` using the **index** of `pt`  table

In [None]:
# Print each match as "Book Name / Score", separated by one blank line and
# with no blank line after the last entry. Joining the entries keeps this
# correct for any length of `z`, instead of relying on a hand-maintained
# counter tied to exactly five results.
entries = [
    f"Book Name : {pt.index[index]}\nScore : {book_score}"
    for index, book_score in z
]
if entries:
    print("\n\n".join(entries))

### Fetching all the details for each of the `suggestions`

#### Removing **duplicate names** from `filtered_books`
Only `698` records remain

In [25]:
# One row per (cleaned) title with the descriptive columns needed to present
# a suggestion.
temp_books = filtered_books.drop_duplicates('Book-Title')[['Book-Title', 'Book-Author', 'Publisher', 'Year-Of-Publication', 'Image-URL-M']]
# `to_csv` does not create missing directories, so make sure the output
# folder exists before writing.
Path('processed-dataset').mkdir(parents=True, exist_ok=True)
temp_books.to_csv('processed-dataset/final.csv')

#### **Top 5**  suggestions

In [26]:
# Suggestions for every title in the pivot table, keyed by title.
all_suggestions = {name: recommend(name) for name in pt.index}

# Transpose so each title becomes a row and its ranked suggestions become
# columns, then move the title out of the index into a regular column.
suggestions = pd.DataFrame(all_suggestions).T.reset_index()
suggestions = suggestions.rename(
    columns={'index': 'book-title', 0: "1st", 1: "2nd", 2: "3rd", 3: "4th", 4: "5th"}
)

# `to_csv` does not create missing directories, so make sure the output
# folder exists before writing.
Path('processed-dataset').mkdir(parents=True, exist_ok=True)
suggestions.to_csv('processed-dataset/sugg.csv')

#### Fetching details from `temp_books` for the suggestions of the example book

In [None]:
# Collect the `temp_books` rows for each suggestion stored against `ex`
# (the first column of the `suggestions` row is the title itself, so it is
# skipped).
book_data = []
for name in suggestions.loc[suggestions['book-title'] == ex].values[0][1:]:
    book_data.extend(temp_books.loc[temp_books['Book-Title'] == name].values)

# One field per line, with a blank line after each book.
for item in book_data:
    for field in item:
        print(field)
    print()