# Book Recommendation System

# Part III: Content-Based Filtering

### Importing Libraries

In [1]:
import pandas as pd               # pandas is used for data manipulation and analysis, providing data structures like DataFrames.
import numpy as np                # numpy is used for numerical operations on large, multi-dimensional arrays and matrices.
import ast                        # ast is used for processing trees of the Python abstract syntax grammar.
import matplotlib.pyplot as plt   # matplotlib.pyplot is used for creating static, interactive, and animated visualizations in Python.

from scipy.sparse import csr_matrix                             # csr_matrix is used for creating compressed sparse row matrices, which are efficient for arithmetic and matrix operations on sparse data.
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras

# To have input from a dropdown
import tkinter as tk
from tkinter import simpledialog

# IPython's display module is used to display images within Jupyter Notebooks.
from IPython.display import Markdown, display, Image  
from IPython.display import clear_output

import dask.dataframe as dd
import sys

2024-12-05 22:43:16.891478: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-05 22:43:18.008643: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-05 22:43:20.806165: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



### Loading the Data

In [2]:
books = pd.read_csv("data/Books_cleaned.csv").drop('Unnamed: 0', axis = 1)

ratings_files = [f'data/Ratings_cleaned_part_{i}.csv' for i in range(1,6+1)]
ratings_dfs = [pd.read_csv(file) for file in ratings_files]
ratings = pd.concat(ratings_dfs, ignore_index=True).drop('Unnamed: 0', axis = 1)
del ratings_files, ratings_dfs

books_genres = pd.read_csv("data/Books_genres_cleaned.csv").drop('Unnamed: 0', axis = 1)
books_genres_list = pd.read_csv("data/Books_genres_list_cleaned.csv").drop('Unnamed: 0', axis = 1)

## Modelling

### Step 1. Preparing the datasets

Content-Based Filtering is a well-known method for making recommendations, TODO

In [3]:
# Get the average rating and number of ratings per book
avg_ratings_books = ratings.groupby('BookID').agg(
    Average_Rating = ('Rating', 'mean'),
    Num_Ratings = ('Rating', 'sum')
).reset_index()

# Remove the books in books_genres_list for which we do not have ratings
list_bookIDs = ratings['BookID'].unique()
books_genres_list = books_genres_list[books_genres_list['BookID'].isin(list_bookIDs)]

# Add the previous DataFrame to the genres_list DataFrame
avg_ratings_books = pd.merge(books_genres_list[['Genre', 'BookID']], avg_ratings_books, on='BookID', how='left')

ratings_genres = avg_ratings_books.groupby('Genre').agg(
    Num_Books = ('BookID', 'count'),
    Average_Rating = ('Average_Rating', 'mean'),
    Ratings_per_genre = ('Num_Ratings', 'sum')
).reset_index()

ratings_genres

Unnamed: 0,Genre,Num_Books,Average_Rating,Ratings_per_genre
0,12th Century,1,4.262712,503
1,15th Century,1,4.338624,820
2,16th Century,8,3.826460,5999
3,17th Century,3,3.736342,1606
4,18th Century,17,3.705869,32837
...,...,...,...,...
554,Young Adult Romance,10,3.806438,10344
555,Young Readers,1,4.309091,948
556,Zen,3,4.069333,2126
557,Zimbabwe,2,3.785771,2919


Let me now see what is the minimum number of books per genre that we can impose so that all the original books have, at least, one of the genres satisfying the constraint.

In [4]:
original_unique_books = books_genres_list['BookID'].nunique()
print(f'Original number of unique books in books_genres_list: {original_unique_books}\n')

minimum_books = 445
unique_genres = ratings_genres[ratings_genres['Num_Books'] > minimum_books]['Genre'].unique()
print(f'Number of unique genres with at least {minimum_books} different books: {unique_genres.shape[0]}\n')

new_unique_books = books_genres_list[books_genres_list['Genre'].isin(unique_genres)]['BookID'].nunique()
print(f'Number of books that have at least one of the remaining genres after imposing a minimum number of books per genre: {new_unique_books}\n')

Original number of unique books in books_genres_list: 9826

Number of unique genres with at least 445 different books: 36

Number of books that have at least one of the remaining genres after imposing a minimum number of books per genre: 9826



From the cell above we can conclude that we can safely impose that each genre that we consider needs to have at least 445 books. Above this number, we begin having books whose genres have been dropped.

In [5]:
minimum_books = 445
ratings_genres_min_books = ratings_genres[ratings_genres['Num_Books'] > minimum_books].reset_index().drop(['index'], axis=1)

genres_list = ratings_genres_min_books['Genre'].to_list()

print(f'The number of genres with, at least {minimum_books} books is {len(genres_list)}.')

The number of genres with, at least 445 books is 36.


I start by reducing the number of ratings, since 6 million would be too much.

In [6]:
ratings_aux = ratings.sample(n=int(len(ratings) / 6))
ratings_aux = ratings_aux.sort_values(by=['UserID']).reset_index()
ratings_aux = ratings_aux.drop(['index'], axis=1)

ratings = ratings_aux

del ratings_aux

ratings

Unnamed: 0,UserID,BookID,Rating
0,1,1644,5
1,1,11,5
2,1,867,3
3,1,66,4
4,1,2738,3
...,...,...,...
1006963,53424,2032,4
1006964,53424,641,5
1006965,53424,76,5
1006966,53424,83,5


In [7]:
# Keep just the genres that satisfy the previous constraint
items_pre = books_genres_list[books_genres_list['Genre'].isin(genres_list)].reset_index().drop(['index', 'Goodreads_BookID'], axis=1)

# Add the year of the book and the average rating
items = pd.merge(items_pre, books[['BookID', 'Year', 'Average_Rating']], on='BookID', how='left')

# Convert the categories in the Genre column into dummy features
items = pd.get_dummies(items, columns=['Genre'], dtype=int)

# Change the columns names
names = ['BookID', 'Year', 'Average_Rating']
[names.append(genre) for genre in genres_list]
items.columns = names

# Now I have a row for each of the genres of the books, so let me combine them in one row
items = items.groupby('BookID').agg({
    'Year': 'first',
    'Average_Rating': 'first',
    **{genre: 'max' for genre in genres_list}
}).reset_index()

# Make the BookID and Year columns of integer type
items[['BookID', 'Year']] = items[['BookID', 'Year']].astype(int)

items

Unnamed: 0,BookID,Year,Average_Rating,Adult,Adult Fiction,Adventure,Audiobook,Biography,Chick Lit,Childrens,...,Nonfiction,Novels,Paranormal,Romance,Science Fiction,Science Fiction Fantasy,Suspense,Thriller,Urban Fantasy,Young Adult
0,1,2008,4.34,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
1,2,1997,4.44,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,3,2005,3.57,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1
3,4,1960,4.25,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,1925,3.89,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9821,9996,2010,4.09,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,1,0
9822,9997,1990,4.25,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
9823,9998,1977,4.35,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9824,9999,2011,3.65,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [8]:
# Add the genres to the ratings dataframe
users = pd.merge(ratings, items_pre, on='BookID', how='left')

# Get the mean rating per genre of each user
users = users.groupby(['UserID','Genre']).agg(
    Average_Rating = ('Rating', 'mean')
).reset_index()

# Convert the dataframe into a new dataframe where the columns are the genres 
# and the rows are the mean rating for each genre of each user
users = users.pivot(index='UserID', columns='Genre', values='Average_Rating').fillna(0).reset_index().rename_axis(None, axis=1)

The target, y, will be the ratings. I need the users and items dataframes to have the same length as the target. This is because for each rating there must be a users row with the user information and an items row with the book information. And all of them must be ordered in the same way.

In [9]:
# Size of the dataframes in memory
ratings_size = sys.getsizeof(ratings)
items_size = sys.getsizeof(items)
users_size = sys.getsizeof(users)

# Approximate the partitions
ratings_partitions = max(1, ratings_size // 100_000_000)
items_partitions = max(1, items_size // 100_000_000)
users_partitions = max(1, users_size // 100_000_000)

ratings_dd = dd.from_pandas(ratings, npartitions=ratings_partitions)
items_dd = dd.from_pandas(items, npartitions=items_partitions)
users_dd = dd.from_pandas(users, npartitions=users_partitions)

# Merge the dataframes
items_merged_dd = dd.merge(ratings_dd, items_dd, on="BookID", how="left")
users_merged_dd = dd.merge(ratings_dd, users_dd, on="UserID", how="left")

In [10]:
# Convert to pandas again
items = items_merged_dd.compute()
users = users_merged_dd.compute()

In [11]:
# Target, y. It includes the ratings
y = items['Rating'].reset_index().drop(['index'], axis=1)

### Step 2. Training Data

In [12]:
# Scale data

# items
scaler_items = StandardScaler()
scaler_items.fit(items)
items_scaled = scaler_items.transform(items)

# users
scaler_users = StandardScaler()
scaler_users.fit(users)
users_scaled = scaler_users.transform(users)

# targets
scaler_targets = StandardScaler()
scaler_targets.fit(y)
y_scaled = scaler_targets.transform(y)

# Checks
print(np.allclose(items, scaler_items.inverse_transform(items_scaled)))
print(np.allclose(users, scaler_users.inverse_transform(users_scaled)))
print(np.allclose(y, scaler_targets.inverse_transform(y_scaled)))

True
True
True


In [13]:
# Split the data and shuffle identically
items_train, items_test = train_test_split(items_scaled, train_size=0.80, shuffle=True, random_state=1)
users_train, users_test = train_test_split(users_scaled, train_size=0.80, shuffle=True, random_state=1)
y_train, y_test = train_test_split(y_scaled, train_size=0.80, shuffle=True, random_state=1)

print(f"Items training data shape: {items_train.shape}")
print(f"Items test data shape: {items_test.shape}")

Items training data shape: (805574, 41)
Items test data shape: (201394, 41)


### Step 3. Neural Network for content-based filtering

In [29]:
# Create a personalized layer
class L2NormalizeLayer(tf.keras.layers.Layer):
    def call(self, inputs):
        return tf.linalg.l2_normalize(inputs, axis=1)

In [37]:
num_outputs = 32
tf.random.set_seed(1)

# Users Neural Network
users_NN = tf.keras.models.Sequential([ 
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs), 
])

# Items Neural Network
items_NN = tf.keras.models.Sequential([ 
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs), 
])

# Define the normalization layer
l2_normalize = L2NormalizeLayer()

# Create the users input and point to the base network
num_users_features = users.shape[1] - 3 # Remove UserID, BookID, Rating
input_users = tf.keras.layers.Input(shape=(num_users_features,))
vu = users_NN(input_users)
vu = l2_normalize(vu)

# create the item input and point to the base network
num_items_features = items.shape[1] - 3 # Remove UserID, BookID, Rating
input_items = tf.keras.layers.Input(shape=(num_items_features,))
vm = items_NN(input_items)
vm = l2_normalize(vm)

# Compute the dot product of the two vectors vu and vm
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# Specify the inputs and output of the model
model = tf.keras.Model([input_users, input_items], output)

model.summary()

In [38]:
# Using a mean squared error loss and an Adam optimizer
tf.random.set_seed(1)
cost_fn = tf.keras.losses.MeanSquaredError()
opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt,
              loss=cost_fn)

In [41]:
# Train the model
tf.random.set_seed(1)
model.fit([users_train[:, 3:], items_train[:, 3:]], y_train, batch_size=16, epochs=30)

[1m50349/50349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m220s[0m 4ms/step - loss: 0.6879


<keras.src.callbacks.history.History at 0x7f07e0365d50>