In [1]:
import kagglehub

path = kagglehub.dataset_download("somnambwl/bookcrossing-dataset")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\Unaiza\.cache\kagglehub\datasets\somnambwl\bookcrossing-dataset\versions\1


In [2]:
import os

print(os.listdir(path))

['Books.csv', 'Ratings.csv', 'Users.csv']


In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import Normalizer

In [4]:
books = pd.read_csv(path + '/Books.csv', sep=';', encoding='latin-1', on_bad_lines='skip')
ratings = pd.read_csv(path + '/Ratings.csv', sep=';', encoding='latin-1', on_bad_lines='skip')
users = pd.read_csv(path + '/Users.csv', sep=';', encoding='latin-1', on_bad_lines='skip')

  users = pd.read_csv(path + '/Users.csv', sep=';', encoding='latin-1', on_bad_lines='skip')


In [5]:
print(books.shape)
print(ratings.shape)
print(users.shape)

(271379, 5)
(1149780, 3)
(278859, 2)


In [6]:
print(books.columns)
print(ratings.columns)
print(users.columns)

Index(['ISBN', 'Title', 'Author', 'Year', 'Publisher'], dtype='object')
Index(['User-ID', 'ISBN', 'Rating'], dtype='object')
Index(['User-ID', 'Age'], dtype='object')


In [7]:
books.head()

Unnamed: 0,ISBN,Title,Author,Year,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company


In [8]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [9]:
users.head()

Unnamed: 0,User-ID,Age
0,1,
1,2,18.0
2,3,
3,4,17.0
4,5,


In [10]:
print(books.isnull().sum(), '\n-------------')
print(ratings.isnull().sum(), '\n-------------')
print(users.isnull().sum(), '\n-------------')

ISBN         0
Title        0
Author       2
Year         0
Publisher    2
dtype: int64 
-------------
User-ID    0
ISBN       0
Rating     0
dtype: int64 
-------------
User-ID         0
Age        110232
dtype: int64 
-------------


In [11]:
print(books.dtypes)
print(ratings.dtypes)
print(users.dtypes)

ISBN         object
Title        object
Author       object
Year          int64
Publisher    object
dtype: object
User-ID     int64
ISBN       object
Rating      int64
dtype: object
User-ID    object
Age        object
dtype: object


In [12]:
books['Author'] = books['Author'].fillna('Unknown')
books['Publisher'] = books['Publisher'].fillna('Unknown')

In [13]:
users['Age'] = users['Age'].fillna(0)

In [14]:
print(books.isnull().sum(), '\n-------------')
print(ratings.isnull().sum(), '\n-------------')
print(users.isnull().sum(), '\n-------------')

ISBN         0
Title        0
Author       0
Year         0
Publisher    0
dtype: int64 
-------------
User-ID    0
ISBN       0
Rating     0
dtype: int64 
-------------
User-ID    0
Age        0
dtype: int64 
-------------


In [15]:
ratings = ratings[ratings['Rating'] > 0]

In [16]:
book_ratings = ratings.merge(books, on='ISBN')

In [17]:
book_ratings.head()

Unnamed: 0,User-ID,ISBN,Rating,Title,Author,Year,Publisher
0,276726,0155061224,5,Rites of Passage,Judith Rae,2001,Heinle
1,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press
2,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001,Cambridge University Press
3,276744,038550120X,7,A Painted House,JOHN GRISHAM,2001,Doubleday
4,11676,038550120X,10,A Painted House,JOHN GRISHAM,2001,Doubleday


In [18]:
# Create combined features for content-based filtering
def combine_features(row):
    return row['Title'] + " " + row['Author'] + " " + row['Publisher']

In [19]:
books['combined_features'] = books.apply(combine_features, axis=1)

In [20]:
# TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english')

In [21]:
tfidf_matrix = vectorizer.fit_transform(books['combined_features'])

In [22]:
normalizer = Normalizer()

In [23]:
tfidf_matrix = normalizer.fit_transform(tfidf_matrix)

In [24]:
# 1. Content-based recommendation
def content_based_recommendation(user_id, ratings_df, books_df, tfidf_matrix, threshold=8, n=5):
    print("Trying content-based filtering...")
    rated_books = ratings_df[(ratings_df['User-ID'] == user_id) & (ratings_df['Rating'] >= threshold)]['ISBN']
    indices = books_df[books_df['ISBN'].isin(rated_books)].index

    if len(indices) == 0:
        print("No highly-rated books found for content-based filtering.")
        return None

    user_profile = tfidf_matrix[indices].mean(axis=0)
    cosine_sim = cosine_similarity(user_profile, tfidf_matrix).flatten()
    cosine_sim[books_df[books_df['ISBN'].isin(rated_books)].index] = -1
    top_indices = cosine_sim.argsort()[::-1][:n]
    return books_df.iloc[top_indices][['ISBN', 'Title']]

In [25]:
# 2. User-based collaborative filtering
def user_based_collaborative_filtering(user_id, ratings_df, books_df, n=5):
    print("Trying user-based collaborative filtering...")
    user_item_matrix = ratings_df.pivot_table(index='User-ID', columns='ISBN', values='Rating').fillna(0)
    if user_id not in user_item_matrix.index:
        print("User not found in user-item matrix.")
        return None

    sparse_matrix = csr_matrix(user_item_matrix.values)
    cosine_sim = cosine_similarity(sparse_matrix)
    user_index = user_item_matrix.index.get_loc(user_id)
    sim_scores = list(enumerate(cosine_sim[user_index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_users = [user_item_matrix.index[i] for i, _ in sim_scores[1:6]]

    user_books = set(ratings_df[ratings_df['User-ID'] == user_id]['ISBN'])
    similar_users_ratings = ratings_df[ratings_df['User-ID'].isin(top_users)]
    unseen_books = similar_users_ratings[~similar_users_ratings['ISBN'].isin(user_books)]
    if unseen_books.empty:
        print("No unseen books by similar users.")
        return None

    top_recs = unseen_books.groupby('ISBN')['Rating'].mean().sort_values(ascending=False).head(n)
    return books_df[books_df['ISBN'].isin(top_recs.index)][['ISBN', 'Title']]

In [26]:
# 3. Item-based collaborative filtering
def item_based_collaborative_filtering(user_id, ratings_df, books_df, n=5):
    print("Trying item-based collaborative filtering...")
    user_ratings = ratings_df[ratings_df['User-ID'] == user_id]
    if len(user_ratings) != 1:
        print("Item-based filtering only used when exactly one book is rated.")
        return None

    liked_book = user_ratings.iloc[0]['ISBN']
    similar_users = ratings_df[ratings_df['ISBN'] == liked_book]['User-ID'].unique()
    if len(similar_users) == 0:
        print("No similar users found.")
        return None

    similar_users_ratings = ratings_df[(ratings_df['User-ID'].isin(similar_users)) & (ratings_df['ISBN'] != liked_book)]
    if similar_users_ratings.empty:
        print("No additional books rated by similar users.")
        return None

    top_recs = similar_users_ratings.groupby('ISBN')['Rating'].mean().sort_values(ascending=False).head(n)
    return books_df[books_df['ISBN'].isin(top_recs.index)][['ISBN', 'Title']]

In [27]:
# 4. Global fallback for users with no ratings
def global_top_books(ratings_df, books_df, n=5):
    print("Falling back to global top-rated books...")
    top_books = ratings_df.groupby('ISBN')['Rating'].mean().sort_values(ascending=False).head(n)
    return books_df[books_df['ISBN'].isin(top_books.index)][['ISBN', 'Title']]

In [28]:
def recommend_books(user_id, ratings_df, books_df, tfidf_matrix, n=5):
    user_ratings = ratings_df[ratings_df['User-ID'] == user_id]
    print(f"User {user_id} has rated {len(user_ratings)} book(s).")

    if len(user_ratings) >= 3:
        result = user_based_collaborative_filtering(user_id, ratings_df, books_df, n)
        if result is not None:
            return result

    if len(user_ratings) == 1:
        result = item_based_collaborative_filtering(user_id, ratings_df, books_df, n)
        if result is not None:
            return result

    if len(user_ratings) >= 1:
        result = content_based_recommendation(user_id, ratings_df, books_df, tfidf_matrix, threshold=8, n=n)
        if result is not None:
            return result

    return global_top_books(ratings_df, books_df, n)

In [29]:
user_id = 10

In [30]:
recommended_books = recommend_books(user_id, ratings, books, tfidf_matrix)

User 10 has rated 1 book(s).
Trying item-based collaborative filtering...
No additional books rated by similar users.
Trying content-based filtering...
No highly-rated books found for content-based filtering.
Falling back to global top-rated books...


In [31]:
print(f"Recommended Books for User ID {user_id}:")
print(recommended_books)

Recommended Books for User ID 10:
              ISBN                                      Title
141169  8472452786                La Conciencia Sin Fronteras
156804  1559702028  Auschwitz : A Doctor's Eyewitness Account
202493  1559700904                                  The Loser
232831  1559634634         Alone: The Classic Polar Adventure
