# Book Recommender System

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# For Pre-Processing
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer


# For GNN
import torch
import torch.nn as nn
import torch.nn.functional as F

import networkx as nx
import dgl
import dgl.function as fn
from dgl.nn import GraphConv
from dgl.nn import HeteroGNNExplainer
import dgl.nn.pytorch as dglnn


# Data Exploration

In [2]:
ratings = pd.read_csv("Data/Ratings.csv")
books = pd.read_csv("Data/Books.csv")
users = pd.read_csv("Data/Users.csv")

  books = pd.read_csv("Data/Books.csv")


# Data Preprocessing

- We will only use users and books present in the ratings dataset 

In [3]:
def extract_country(location):
    if not location:
        return None
    parts = [part.strip() for part in location.split(',')]
    
    return parts[-1] if parts and parts[-1] else None

users['Country'] = users['Location'].apply(extract_country)

# label_encoder = LabelEncoder()
# users['Country'] = label_encoder.fit_transform(users['Country'])

# Removing Book Ratings that have a 0 rating
ratings = ratings[ratings['Book-Rating'] > 0]

### Filtering for Books and Users that have a rating

In [4]:
# Renaming User IDs
rename_user_ids = {userid: idx for idx, userid in enumerate(ratings['User-ID'].unique())}
# Mapping new User IDs to Users that have a rating
ratings['New-User-ID'] = ratings['User-ID'].map(rename_user_ids)
# Getting the unique User IDs Ratings
ratings_user_ids = list(ratings['New-User-ID'].unique())
print(f"Number of User IDs in raitngs: {len(ratings_user_ids)}")

# ISBN in Ratings Data sets
isbn_to_id = {isbn: idx for idx, isbn in enumerate(ratings['ISBN'].unique())}
# Map new ISBN to Books
ratings['New-Book-ISBN'] = ratings['ISBN'].map(isbn_to_id)
# Get unique Book Ratings 
ratings_book_ids = list(ratings['New-Book-ISBN'].unique())
print(f"There are: {len(ratings_book_ids)}, unique book IDs")

print(f'There are {len(ratings_user_ids)} unique users, and {len(ratings_book_ids)} unique books in the ratings dataset.\n')

# ===========================================================================================================================
# ===========================================================================================================================

# ISBN in Books Data set
books['New-Book-ISBN'] = books['ISBN'].map(isbn_to_id)
# Filtering for books that have a rating
books_clean = books[books['New-Book-ISBN'].isin(ratings_book_ids)]

print(f"There are: {len(books_clean['New-Book-ISBN'].unique())}, books that have an ISBN")

books_clean_ids = books_clean['New-Book-ISBN'].unique()
percent_books_missing = round((len(ratings_book_ids)-len(books_clean_ids))/len(ratings_book_ids)*100, 0)

print(f'There are around {percent_books_missing}% of books in the graph missing in the books data')

users['New-User-ID'] = users['User-ID'].map(rename_user_ids)
users_clean = users[users['New-User-ID'].isin(ratings_user_ids)]
print(f"There are: {len(users_clean['New-User-ID'])}, who have rated at least one book")

Number of User IDs in raitngs: 77805
There are: 185973, unique book IDs
There are 77805 unique users, and 185973 unique books in the ratings dataset.

There are: 149836, books that have an ISBN
There are around 19.0% of books in the graph missing in the books data
There are: 77805, who have rated at least one book


Aroung 1/5 of the books that have rating information do not have further information on the books dataset. However, as our objective is to investigate an user-based recommender system, this is irrelevant. We are able to embed the age and location data of users. As the age data is sparse, location data will be our main source of information.

In [5]:
# Including average rating
avg_rating = ratings.groupby("New-Book-ISBN")["Book-Rating"].mean().reset_index()
avg_rating = avg_rating.rename(columns={'Book-Rating': 'AVG_Rating'})

ratings = ratings.merge(avg_rating, on="New-Book-ISBN", how='left')

ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,New-User-ID,New-Book-ISBN,AVG_Rating
0,276726,0155061224,5,0,0,5.0
1,276729,052165615X,3,1,1,3.0
2,276729,0521795028,6,1,2,6.0
3,276736,3257224281,8,2,3,6.75
4,276737,0600570967,6,3,4,6.0


#### Fill in missing value Age with simple imputer

In [6]:
knn_imputer = KNNImputer(n_neighbors=5)
users_clean['Age'] = knn_imputer.fit_transform(users_clean[['Age']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_clean['Age'] = knn_imputer.fit_transform(users_clean[['Age']])


## Creating the Graph

In [7]:
src_tensor = torch.tensor(ratings['New-User-ID'].values)
dst_tensor = torch.tensor(ratings['New-Book-ISBN'].values)

# Users and Books datasets are metadata

print(len(src_tensor), len(dst_tensor))

num_users = len(ratings_user_ids)
num_books = len(ratings_book_ids)
print(f"There are {num_users} users, and {num_books} books")

# # Initialize the adjacency matrix with zeros
# adjacency_matrix = np.zeros((num_users, num_books))
# # Populate the adjacency matrix
# for user, book, rating in zip(src_tensor, dst_tensor, ratings_values):
#     adjacency_matrix[user, book] = rating
# print(f"Size of the adjacency matrix: {adjacency_matrix.shape}")

# Dictionary which defines the Heterograph
edges = {
    ('user', 'rating', 'book'): (src_tensor, dst_tensor)
}
g = dgl.heterograph(edges, num_nodes_dict={'user': num_users, 'book': num_books})
print(g)

433671 433671
There are 77805 users, and 185973 books
Graph(num_nodes={'book': 185973, 'user': 77805},
      num_edges={('user', 'rating', 'book'): 433671},
      metagraph=[('user', 'book', 'rating')])


In [8]:
# Weigth the edges by ratings
rating_data = ratings['Book-Rating'].values
g.edges['rating'].data['rating'] = torch.tensor(rating_data, dtype=torch.float32) 

In [9]:
# Add age to user feature
ages = users_clean.set_index('New-User-ID')['Age'].sort_index().values
g.nodes['user'].data['age'] =  torch.tensor(ages, dtype=torch.float32)

We extract the country from the location by obtaining the expression after the last comma in e.g. nyc, new york, usa

In [10]:
# Extracting only the Country from the Location
users_clean['Country'] = users_clean['Location'].str.rsplit(',', n=1).str[-1].str.strip()
# Country Frequency
country_counts = users_clean['Country'].value_counts(normalize=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_clean['Country'] = users_clean['Location'].str.rsplit(',', n=1).str[-1].str.strip()


We see that less frequent locations do not always contain country names, so we remove values of locations representing less than 1%

In [11]:
rare_countries = country_counts[country_counts < 0.01].index
users_clean.loc[users_clean['Country'].isin(rare_countries), 'Country'] = np.nan

Encoding Countries to a unique interger (same as label encoding)

In [12]:
# country_ids = {country: idx for idx, country in enumerate(users_clean['Country'].unique())}  # map country to a unique integer
# users_clean['CountryId'] = users_clean['Country'].map(country_ids)

country_ids = {country: idx for idx, country in enumerate(users_clean['Country'].dropna().unique())}
users_clean['CountryId'] = users_clean['Country'].map(country_ids).fillna(-1).astype(int)

countries_value_count = users_clean['CountryId'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_clean['CountryId'] = users_clean['Country'].map(country_ids).fillna(-1).astype(int)


In [13]:
print(countries_value_count)

CountryId
 1    45368
 0     6986
-1     6273
 4     4445
 6     4040
 2     2643
 8     2499
 3     2386
 7     2191
 5      974
Name: count, dtype: int64


In [14]:
countries = users_clean['CountryId'].values
countries_tensor = torch.tensor(countries, dtype=torch.float32)
g.nodes['user'].data['country'] = countries_tensor

#### Adding Average Ratings as Meta Data to books nodes

In [15]:
unique_book_ratings = ratings.drop_duplicates(subset='New-Book-ISBN')

In [16]:
# Adding Average Books
book_avg_rating = unique_book_ratings["AVG_Rating"]
avg_rating_tensor = torch.tensor(book_avg_rating, dtype=torch.float32)
g.nodes['book'].data['AVG_Rating'] = avg_rating_tensor

### Basic graph info

In [17]:
print(g.nodes['user'])
print(g.nodes['book'])

NodeSpace(data={'age': tensor([35.8179, 16.0000, 35.8179,  ..., 38.0000, 14.0000, 12.0000]), 'country': tensor([0., 1., 2.,  ..., 1., 8., 1.])})
NodeSpace(data={'AVG_Rating': tensor([5., 3., 6.,  ..., 5., 5., 8.])})


In [18]:
print(g)  # Prints the basic info of the graph, such as number of nodes and edges per type

# Print number of nodes for each type
print("Number of users:", g.number_of_nodes('user'))
print("Number of books:", g.number_of_nodes('book'))

# Print number of edges
print("Number of ratings:", g.number_of_edges('rating'))

Graph(num_nodes={'book': 185973, 'user': 77805},
      num_edges={('user', 'rating', 'book'): 433671},
      metagraph=[('user', 'book', 'rating')])
Number of users: 77805
Number of books: 185973
Number of ratings: 433671


### Node and Edge feature inspection

In [19]:
# Print user node features
print("User features:", g.nodes['user'].data.keys())

# Print book node features, if any
print("Book features:", g.nodes['book'].data.keys())

# Print edge features
print("Edge features:", g.edges['rating'].data.keys())

# Example to print specific feature details:
print("Sample user ages:", g.nodes['user'].data['age'][:5])  # prints first 5 user ages
print("Sample ratings:", g.edges['rating'].data['rating'][:5])  # prints first 5 ratings

User features: dict_keys(['age', 'country'])
Book features: dict_keys(['AVG_Rating'])
Edge features: dict_keys(['rating'])
Sample user ages: tensor([35.8179, 16.0000, 35.8179, 14.0000, 35.8179])
Sample ratings: tensor([5., 3., 6., 8., 6.])


### Eliminate isolated nodes if any

In [20]:
compact_g = dgl.compact_graphs(g)

### Create synthetic features for book based on degree of the node

In [21]:
book_in_degrees = compact_g.in_degrees(etype=('user', 'rating', 'book')).float().unsqueeze(1)

In [22]:
compact_g.nodes['book'].data['in_degree'] = book_in_degrees

In [23]:
g = compact_g

In [24]:
CIAO

NameError: name 'CIAO' is not defined

# Creating the GNN

## Architecture

In [32]:
g.nodes['user']

NodeSpace(data={'age': tensor([35.8179, 16.0000, 35.8179,  ..., 38.0000, 14.0000, 12.0000]), 'country': tensor([0., 1., 2.,  ..., 1., 8., 1.]), '_ID': tensor([    0,     1,     2,  ..., 77802, 77803, 77804])})

In [58]:
class GNNRecommender(nn.Module):
    def __init__(self, in_feats, hidden_feats, out_feats):
        super(GNNRecommender, self).__init__()
        self.user_embedding = nn.Embedding(num_users, in_feats)
        self.book_embedding = nn.Embedding(num_books, in_feats)
        self.user_conv1 = GraphConv(in_feats, hidden_feats, allow_zero_in_degree=True)
        self.user_conv2 = GraphConv(hidden_feats, out_feats, allow_zero_in_degree=True)
        self.book_conv1 = GraphConv(in_feats, hidden_feats, allow_zero_in_degree=True)
        self.book_conv2 = GraphConv(hidden_feats, out_feats, allow_zero_in_degree=True)
        self.fc = nn.Linear(out_feats * 2, 1)

    def forward(self, g, user_ids, book_ids):
        # Get initial node features
        user_feats = self.user_embedding(torch.arange(num_users))
        book_feats = self.book_embedding(torch.arange(num_books))

        # Assign initial features to graph
        g.nodes['user'].data['h'] = user_feats
        g.nodes['book'].data['h'] = book_feats

        # Apply graph convolution separately for users and books
        g.nodes['user'].data['h'] = torch.relu(self.user_conv1(g, g.nodes['user'].data['h']))
        g.nodes['user'].data['h'] = torch.relu(self.user_conv2(g, g.nodes['user'].data['h']))

        g.nodes['book'].data['h'] = torch.relu(self.book_conv1(g, g.nodes['book'].data['h']))
        g.nodes['book'].data['h'] = torch.relu(self.book_conv2(g, g.nodes['book'].data['h']))

        # Get the updated node features
        user_feats = g.nodes['user'].data['h']
        book_feats = g.nodes['book'].data['h']

        # Concatenate user and book features for final prediction
        final_feats = torch.cat([user_feats[user_ids], book_feats[book_ids]], dim=1)

        # Predict rating
        rating_pred = self.fc(final_feats)
        return rating_pred.squeeze()

# Hyperparameters
in_feats = 64
hidden_feats = 128
out_feats = 64

model = GNNRecommender(in_feats, hidden_feats, out_feats)

In [56]:
model

GNNRecommender(
  (user_embedding): Embedding(77805, 64)
  (book_embedding): Embedding(185973, 64)
  (conv1): GraphConv(in=64, out=128, normalization=both, activation=None)
  (conv2): GraphConv(in=128, out=64, normalization=both, activation=None)
  (fc): Linear(in_features=128, out_features=1, bias=True)
)

In [50]:
import dgl
import torch
import torch.nn as nn
import torch.optim as optim
from dgl.nn import GraphConv

In [59]:
# Loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
num_epochs = 1
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    
    # Forward pass
    predictions = model(g, src_tensor, dst_tensor)
    
    # Compute loss
    loss = criterion(predictions, ratings)
    
    # Backward pass and optimization
    loss.backward()
    optimizer.step()
    
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

DGLError: Expect number of features to match number of nodes (len(u)). Got 185973 and 77805 instead.

In [None]:
# Define node features (for simplicity, we use random features here, replace with actual features if available)
user_features = torch.randn(num_users, 10)  # 10-dimensional features for users
book_features = torch.randn(num_books, 10)  # 10-dimensional features for books

# Combine features into a single tensor, while keeping track of user and book indices
features = torch.cat([user_features, book_features], dim=0)

# Create and train the model
model = LinkPredictor(in_feats=10, hidden_feats=16, out_feats=8)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.MSELoss()

In [None]:
# Training loop
for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    
    scores = model(g, features)
    
    # Assuming we have ground truth scores
    ground_truth = ratings_values
    
    loss = loss_fn(scores, ground_truth)
    loss.backward()
    optimizer.step()
    
    print(f'Epoch {epoch}, Loss: {loss.item()}')

In [None]:
# Combine user and book features into tensors
user_features = torch.cat([g.nodes['user'].data['age'].unsqueeze(1), g.nodes['user'].data['country'].unsqueeze(1)], dim=1)
book_features = torch.cat([g.nodes['book'].data['AVG_Rating'].unsqueeze(1), g.nodes['book'].data['in_degree']], dim=1)

# Edge features (ratings)
edge_weights = g.edges['rating'].data['rating']

# Define input feature dimensions and number of classes
user_input_dim = user_features.shape[1]
book_input_dim = book_features.shape[1]
h_feats = 16  # Number of hidden features
out_feats = 1  # Regression task (predicting ratings)

# Initialize the model
model = HeteroGNN(user_input_dim, book_input_dim, h_feats, out_feats)

# Print model architecture
print(model)


In [None]:
# Training settings
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.MSELoss()  # Mean Squared Error for regression tasks

# Training loop
num_epochs = 2
for epoch in range(num_epochs):
    model.train()
    user_logits, book_logits = model(g, user_features, book_features, edge_weights)  # Forward pass

    # Combine user and book embeddings to predict ratings
    user_emb = user_logits[g.edges(etype='rating')[0]]
    book_emb = book_logits[g.edges(etype='rating')[1]]
    predicted_ratings = (user_emb + book_emb).mean(dim=1)  # Simplistic approach for combining embeddings
    loss = loss_fn(predicted_ratings, g.edges['rating'].data['rating'])
    
    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if epoch % 10 == 0:
        print(f"Epoch {epoch}/{num_epochs}, Loss: {loss.item()}")

## Setting up model

In [None]:
# CHANGED g. to compact_g.

# Create USER Features Tensor
age_tensor = compact_g.nodes['user'].data['age'].unsqueeze(1)
country_tensor = compact_g.nodes['user'].data['country'].unsqueeze(1)
user_feats = torch.cat([age_tensor, country_tensor], dim=1)

# Create BOOK Features Tensor
book_feats = compact_g.nodes['book'].data['in_degree']

user_feat_dim = user_feats.shape[1]  # the size of user feature
book_feat_dim = book_feats.shape[1]  # the size of book feature

print(f"User feature dimension {user_feat_dim}")
print(f"Book feature dimension {book_feat_dim}")

In [None]:
hidden_size = 32
num_classes = 1  # predicting a single rating value
model = GNNRecommender(user_feat_dim, book_feat_dim, hidden_size, num_classes)

## Split graph for training and validation set

In [None]:
def split_graph(g, proportion=0.8):
    # Split edges randomly for training and validation
    num_edges = g.number_of_edges('rating')
    all_edges = np.arange(num_edges)
    np.random.shuffle(all_edges)
    
    train_size = int(num_edges * proportion)
    train_edges = all_edges[:train_size]
    val_edges = all_edges[train_size:]
    
    # # Create subgraphs based on the edges (change to True)
    g_train = dgl.edge_subgraph(g, train_edges, relabel_nodes=True)
    g_val = dgl.edge_subgraph(g, val_edges, relabel_nodes=True)
    
    # Create subgraphs based on the edges
    # g_train = dgl.edge_subgraph(g, {'rating': train_edges}, relabel_nodes=False)
    # g_val = dgl.edge_subgraph(g, {'rating': val_edges}, relabel_nodes=False)
    
    return g_train, g_val

In [None]:
g_train, g_val = split_graph(compact_g, proportion=0.8)

# Verify subgraphs
print("Number of users in training graph:", g_train.number_of_nodes('user'))
print("Number of books in training graph:", g_train.number_of_nodes('book'))
print("Number of ratings in training graph:", g_train.number_of_edges('rating'), "\n")

print("Number of users in validation graph:", g_val.number_of_nodes('user'))
print("Number of books in validation graph:", g_val.number_of_nodes('book'))
print("Number of ratings in validation graph:", g_val.number_of_edges('rating'))

In [None]:
# Inspect the structure of the edge data
print(g_train.edges['rating'].data)
print(g_val.edges['rating'].data)

In [None]:
# Get the features and ratings for the Training Set
age_tensor_train = g_train.nodes['user'].data['age'].unsqueeze(1) # (N, 1)
country_tensor_train = g_train.nodes['user'].data['country'].unsqueeze(1) # (N, 1)

assert age_tensor_train.shape[0] == country_tensor_train.shape[0], "Mismatch in number of users"


# Training Set
user_features_train = torch.cat([age_tensor_train, country_tensor_train], dim=1) # (N, 2)
book_features_train = g_train.nodes['book'].data['in_degree'] # (M, 1)

ratings_train = g_train.edges['rating'].data['rating']

# Add user and book features
g_train.nodes['user'].data['features'] = user_features_train
g_train.nodes['book'].data['features'] = book_features_train


# Get the features and ratings for the Validation Set
age_tensor_val = g_val.nodes['user'].data['age'].unsqueeze(1) # (N_val, 1)
country_tensor_val = g_val.nodes['user'].data['country'].unsqueeze(1) # (N_val, 1)

assert age_tensor_val.shape[0] == country_tensor_val.shape[0], "Mismatch in number of validation users"

# Validation Set
user_features_val = torch.cat([age_tensor_val, country_tensor_val], dim=1) # (N_val, 2)
book_features_val = g_val.nodes['book'].data['in_degree'] # (M_val, 1)
ratings_val = g_val.edges['rating'].data['rating']

# Verify feature dimensions
print("User features train shape:", user_features_train.shape)
print("Book features train shape:", book_features_train.shape)
print("Ratings train shape:", ratings_train.shape, "\n")

print("User features val shape:", user_features_val.shape)
print("Book features val shape:", book_features_val.shape)
print("Ratings val shape:", ratings_val.shape)


In [None]:
g_train.edges['rating'].data.keys()

## Train

In [None]:
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [None]:
def train(model, g, user_features, book_features, labels, optimizer, criterion):
    model.train()
    optimizer.zero_grad()
    outputs = model(g, user_features, book_features)
    # Added .squeeze()
    loss = criterion(outputs.squeeze(), labels)
    loss.backward()
    optimizer.step()
    return loss.item()

In [None]:
g_train

In [None]:
num_epochs = 2  # or however many epochs you deem necessary

for epoch in range(num_epochs):
    loss = train(model, g_train, user_features_train, book_features_train, ratings_train, optimizer, criterion)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss}')

## Evaluate model

In [None]:
def evaluate(model, g, user_features, book_features, labels, criterion):
    model.eval()
    with torch.no_grad():
        outputs = model(g, user_features, book_features)
        # Added .squeeze()
        loss = criterion(outputs.squeeze(), labels)
    return loss.item()

In [None]:
# Validation loss
validation_loss = evaluate(model, g_val, user_features_val, book_features_val, ratings_val, criterion)
print(f'Validation Loss: {validation_loss}')