### Book Recommendation System Using K-Nearest Neighbors | Collaborative Filtering

- **Dataset**: https://www.kaggle.com/datasets/ra4u12/bookrecommendation

#### 1. Data Loading

In [8]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

# Load the three Book-Crossing CSV files with identical parser settings:
# ';' as separator, latin-1 text encoding, and on_bad_lines='skip' so rows
# that cannot be parsed are dropped instead of aborting the load.
books, users, ratings = (
    pd.read_csv(csv_path, sep=';', encoding='latin-1', on_bad_lines='skip')
    for csv_path in ('data/BX-Books.csv', 'data/BX-Users.csv', 'data/BX-Book-Ratings.csv')
)

In [17]:
# Preview the first two rows of the books dataset
print("Books Dataset:")
books.head(2)

Books Dataset:


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...


In [20]:
# Preview the first two rows of the users dataset
print("Users Dataset:")
users.head(2)

Users Dataset:


Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0


In [22]:
# Preview the first two rows of the ratings dataset
print("Ratings Dataset:")
ratings.head(2)

Ratings Dataset:


Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5


#### 2. Data Cleaning and Renaming
- We keep only the necessary columns and rename them for easier access.

In [27]:
# Keep only the book columns used later and give them short snake_case names.
# The rename is chained onto the column selection (instead of inplace=True on
# the selected slice) so a sliced frame is never mutated in place, which is the
# pattern that triggers pandas' SettingWithCopyWarning.
# 'ISBN' keeps its original name because it is the join key with the ratings.
books = books[
    ['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'Image-URL-L']
].rename(columns={
    'Book-Title': 'title',
    'Book-Author': 'author',
    'Year-Of-Publication': 'year',
    'Publisher': 'publisher',
    'Image-URL-L': 'image_url'
})

# Same snake_case convention for the user and rating tables.
users = users.rename(columns={'User-ID': 'user_id', 'Location': 'location', 'Age': 'age'})
ratings = ratings.rename(columns={'User-ID': 'user_id', 'Book-Rating': 'rating'})

In [31]:
# Summarise the size of each dataset: one record (name, rows, columns) per frame
shape_table = pd.DataFrame(
    [
        (label, *frame.shape)
        for label, frame in (('Books', books), ('Users', users), ('Ratings', ratings))
    ],
    columns=['Dataset', 'Rows', 'Columns'],
)

shape_table

Unnamed: 0,Dataset,Rows,Columns
0,Books,271360,6
1,Users,278858,3
2,Ratings,1149780,3


#### 3. Filter Active Users
- To reduce noise, we focus only on users who have rated more than 200 books.

In [32]:
# Number of ratings submitted by each user
ratings['user_id'].value_counts()

user_id
11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
          ...  
119573        1
276706        1
276697        1
276679        1
276676        1
Name: count, Length: 105283, dtype: int64

In [35]:
# Count ratings per user, keep only the users with more than 200 ratings,
# and take the index of that Series, i.e. the user IDs without their counts.
active_users = (
    ratings['user_id']
    .value_counts()
    .loc[lambda counts: counts > 200]
    .index
)
active_users

Index([ 11676, 198711, 153662,  98391,  35859, 212898, 278418,  76352, 110973,
       235105,
       ...
        88793,   9856, 155916,  44296,  28634, 188951,  59727,  73681, 268622,
       274808],
      dtype='int64', name='user_id', length=899)

In [43]:
# Restrict the ratings table to rows written by the active users selected above
ratings = ratings.loc[lambda frame: frame['user_id'].isin(active_users)]
ratings.head()

Unnamed: 0,user_id,ISBN,rating
1456,277427,002542730X,10
1457,277427,0026217457,0
1458,277427,003008685X,8
1459,277427,0030615321,0
1460,277427,0060002050,0


In [44]:
# Ratings per user after filtering: every remaining user should have more than 200
ratings['user_id'].value_counts()

user_id
11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
          ...  
188951      201
59727       201
73681       201
268622      201
274808      201
Name: count, Length: 899, dtype: int64

In [45]:
# (rows, columns) of the ratings table after keeping only active users
ratings.shape

(526356, 3)

#### 4. Merge Ratings with Books and Filter Popular Books

In [46]:
# Attach book metadata to every rating by joining on the shared 'ISBN' column.
# merge defaults to an inner join, so ratings whose ISBN has no matching book row are dropped.
ratings_books = pd.merge(ratings, books, on='ISBN')
ratings_books.head(2)

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,image_url
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
1,277427,0026217457,0,Vegetarian Times Complete Cookbook,Lucy Moll,1995,John Wiley &amp; Sons,http://images.amazon.com/images/P/0026217457.0...


In [47]:
# Count how many rating rows each title received and expose the result as a
# two-column frame: 'title' and 'num_of_rating'.
book_rating_counts = (
    ratings_books
    .groupby('title')['rating']
    .count()
    .rename('num_of_rating')
    .reset_index()
)
book_rating_counts.head(2)

Unnamed: 0,title,num_of_rating
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1


In [49]:
# Add each title's rating count to the rating rows, then keep only titles
# that received at least 50 ratings.
final_rating = (
    ratings_books
    .merge(book_rating_counts, on='title')
    .query('num_of_rating >= 50')
)
final_rating.head(2)

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,image_url,num_of_rating
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
13,277427,0060930535,0,The Poisonwood Bible: A Novel,Barbara Kingsolver,1999,Perennial,http://images.amazon.com/images/P/0060930535.0...,133


In [50]:
# Keep a single row per (user, title) pair so a user who rated several
# editions of the same title contributes only one rating (the first one kept).
# Reassign instead of using inplace=True: final_rating is a filtered slice of a
# merged frame, and mutating such a slice in place is what pandas warns about
# with SettingWithCopyWarning.
final_rating = final_rating.drop_duplicates(['user_id', 'title'])
final_rating.shape

(59850, 9)

#### 5. Create Book-User Pivot Table
- Each row = book
- Each column = user
- Each cell = rating (user–book pairs without a rating are filled with 0)

In [54]:
# Reshape the long rating table into a book x user matrix:
# rows are titles, columns are user IDs, values are ratings,
# and user-book pairs with no rating are filled with 0.
book_pivot = pd.pivot_table(
    final_rating,
    values='rating',
    index='title',
    columns='user_id',
    fill_value=0,
)
book_pivot.head(2)

user_id,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
# (number of books, number of users) in the pivot table
book_pivot.shape

(742, 888)

#### 6. Train a KNN Model
- We use cosine distance to find similar books.
- csr_matrix: Used to efficiently store large, sparse rating matrices by keeping only non-zero values to save memory and speed up computations.
- algorithm='brute': Forces the Nearest Neighbors model to compute all pairwise distances exhaustively — ideal for small to medium datasets and compatible with complex metrics like cosine similarity.

In [58]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Store the mostly-zero book x user matrix in compressed sparse row format
book_sparse = csr_matrix(book_pivot)

# Brute-force nearest-neighbour index using cosine distance between book rows;
# fit() returns the fitted estimator, which is then displayed as the cell output.
model = NearestNeighbors(algorithm='brute', metric='cosine').fit(book_sparse)
model

#### 7. Book Recommendation Function

In [None]:
def recommend_book(book_name, n_recommendations=5):
    """Print the books most similar to ``book_name``.

    Relies on two notebook-level objects: ``book_pivot`` (book x user rating
    matrix whose index holds the titles) and ``model`` (the nearest-neighbour
    model fitted on that matrix).

    Parameters
    ----------
    book_name : str
        Title to look up; it must match an entry of ``book_pivot.index`` exactly.
    n_recommendations : int, default 5
        Maximum number of similar titles to print.

    Raises
    ------
    IndexError
        If ``book_name`` is not present in ``book_pivot.index``.
    """
    matches = np.where(book_pivot.index == book_name)[0]
    if len(matches) == 0:
        raise IndexError(f"Book '{book_name}' was not found in book_pivot.index")
    book_id = matches[0]

    # Ask for one extra neighbour because the queried book is normally returned
    # as its own nearest neighbour; never ask for more rows than the matrix has.
    n_neighbors = min(n_recommendations + 1, len(book_pivot.index))
    query = book_pivot.iloc[book_id, :].values.reshape(1, -1)
    _, suggestion = model.kneighbors(query, n_neighbors=n_neighbors)

    # Only one query row was passed, so the neighbour indices are in suggestion[0].
    # Filter the queried title out explicitly instead of assuming it comes first.
    similar_titles = [
        title for title in book_pivot.index[suggestion[0]] if title != book_name
    ][:n_recommendations]

    # Always print the header before the suggestions, whatever the neighbour order.
    print(f"You searched '{book_name}'\n")
    print("The suggestion books are: \n")
    for title in similar_titles:
        print(title)

#### 8. Test the Function

In [60]:
# Try the recommender on one title; the string must match an entry of
# book_pivot.index exactly, because recommend_book looks it up by equality.
book_name = "Harry Potter and the Chamber of Secrets (Book 2)"
recommend_book(book_name)

You searched 'Harry Potter and the Chamber of Secrets (Book 2)'

The suggestion books are: 

Harry Potter and the Prisoner of Azkaban (Book 3)
Harry Potter and the Goblet of Fire (Book 4)
Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))
Harry Potter and the Sorcerer's Stone (Book 1)
Harry Potter and the Order of the Phoenix (Book 5)


#### 9. Save Trained Model and Data

In [61]:
import os
import pickle

# Create the output folder if it is missing so the dumps below do not fail
# with FileNotFoundError on a fresh checkout.
os.makedirs('resources', exist_ok=True)

# Each artifact is written inside a `with` block so its file handle is
# flushed and closed as soon as the dump finishes.
with open('resources/model.pkl', 'wb') as f:
    pickle.dump(model, f)  # fitted nearest-neighbour model

with open('resources/book_names.pkl', 'wb') as f:
    pickle.dump(book_pivot.index, f)  # titles, in the row order of book_pivot

with open('resources/final_rating.pkl', 'wb') as f:
    pickle.dump(final_rating, f)  # filtered rating rows with book metadata

with open('resources/book_pivot.pkl', 'wb') as f:
    pickle.dump(book_pivot, f)  # book x user rating matrix