# Book Recommender System

In [30]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
from dgl.nn import GraphConv
import pandas as pd
import numpy as np

In [31]:
ratings = pd.read_csv("Data/Ratings.csv")
print(f"Ratings Data Shape: {ratings.shape} \n")
print(f"{ratings.dtypes}\n")
print(ratings.head())

Ratings Data Shape: (1149780, 3) 

User-ID         int64
ISBN           object
Book-Rating     int64
dtype: object

   User-ID        ISBN  Book-Rating
0   276725  034545104X            0
1   276726  0155061224            5
2   276727  0446520802            0
3   276729  052165615X            3
4   276729  0521795028            6


In [32]:
books = pd.read_csv("Data/Books.csv")
print(f"Books Data Shape: {books.shape} \n")
print(f"{books.dtypes}\n")
print(books.head(3))

  books = pd.read_csv("Data/Books.csv")


Books Data Shape: (271360, 8) 

ISBN                   object
Book-Title             object
Book-Author            object
Year-Of-Publication    object
Publisher              object
Image-URL-S            object
Image-URL-M            object
Image-URL-L            object
dtype: object

         ISBN            Book-Title           Book-Author Year-Of-Publication  \
0  0195153448   Classical Mythology    Mark P. O. Morford                2002   
1  0002005018          Clara Callan  Richard Bruce Wright                2001   
2  0060973129  Decision in Normandy          Carlo D'Este                1991   

                 Publisher                                        Image-URL-S  \
0  Oxford University Press  http://images.amazon.com/images/P/0195153448.0...   
1    HarperFlamingo Canada  http://images.amazon.com/images/P/0002005018.0...   
2          HarperPerennial  http://images.amazon.com/images/P/0060973129.0...   

                                         Image-URL-M  \
0  http

In [33]:
print('Nan-values by column')
print(books.isna().sum())

Nan-values by column
ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64


In [34]:
users = pd.read_csv("Data/Users.csv")
print(f"Users Data Shape: {users.shape} \n")
print(f"{users.dtypes}\n")
print(users.head())

print('Nan-values by column')
print(users.isna().sum())

Users Data Shape: (278858, 3) 

User-ID       int64
Location     object
Age         float64
dtype: object

   User-ID                            Location   Age
0        1                  nyc, new york, usa   NaN
1        2           stockton, california, usa  18.0
2        3     moscow, yukon territory, russia   NaN
3        4           porto, v.n.gaia, portugal  17.0
4        5  farnborough, hants, united kingdom   NaN
Nan-values by column
User-ID          0
Location         0
Age         110762
dtype: int64


## Preprocessing

In [35]:
# We will only use users and books present in the ratings dataset 
lessen_user_ids = {userid: idx for idx, userid in enumerate(ratings['User-ID'].unique())} #renumber IDs to reduce inactive users
ratings['New-User-ID'] = ratings['User-ID'].map(lessen_user_ids)
user_ids = list(ratings['New-User-ID'].unique())
num_users = len(set(user_ids))

# Map book identifiers (ISBN) to a unique integer identifier for datatype compatibility of dgl
isbn_to_id = {isbn: idx for idx, isbn in enumerate(ratings['ISBN'].unique())}
ratings['Book-ID'] = ratings['ISBN'].map(isbn_to_id)
book_ids = list(ratings['Book-ID'].unique())
num_books = len(set(book_ids))

print(f'There are {len(user_ids)} unique users, and {len(book_ids)} unique books in the ratings dataset.')
 
# Remove users and books not included in the ratings dataset
books['Book-ID'] = books['ISBN'].map(isbn_to_id)
books_clean = books[books['Book-ID'].isin(book_ids)]
books_clean_ids = books_clean['Book-ID'].unique()
percent_books_missing = (num_books-len(books_clean_ids))/num_books*100

print(f'There are {percent_books_missing}% of books in the graph missing in the books data')

users['New-User-ID'] = users['User-ID'].map(lessen_user_ids)
users_clean = users[users['New-User-ID'].isin(user_ids)]

There are 105283 unique users, and 340556 unique books in the ratings dataset.
There are 20.673545613643572% of books in the graph missing in the books data


1/5 of the books that have rating information do not have further information on the books dataset. However, as our objective is to investivate an user-based recommender system, this is irrelevant. We are able to embed the age and location data of users. As the age data is sparse, location data will be our main source of information.

In [36]:
# Make graph
src = torch.tensor(ratings['New-User-ID'].values)
dst = torch.tensor(ratings['Book-ID'].values)

edges = {
    ('user', 'rating', 'book'): (src, dst)
}

g = dgl.heterograph(edges, num_nodes_dict={'user': num_users, 'book': num_books})
print(g)

Graph(num_nodes={'book': 340556, 'user': 105283},
      num_edges={('user', 'rating', 'book'): 1149780},
      metagraph=[('user', 'book', 'rating')])


In [37]:
# Weigh the edges by ratings
rating_data = ratings['Book-Rating'].values
ratings_tensor = torch.tensor(rating_data, dtype=torch.float32)
g.edges['rating'].data['rating'] = ratings_tensor 

In [38]:
# Add age to user feature
ages = users_clean['Age'].values
ages_tensor = torch.tensor(ages, dtype=torch.float32)
g.nodes['user'].data['age'] = ages_tensor

We extract the country from the location by obtaining the expression after the last comma in e.g. nyc, new york, usa

In [39]:
users_clean['Country'] = users_clean['Location'].str.rsplit(',', n=1).str[-1].str.strip()
country_counts = users_clean['Country'].value_counts(normalize=True) 

# We see that less frequent locations do not always contain country names, so we remove values of locations representing less than 1%
rare_countries = country_counts[country_counts < 0.01].index
users_clean.loc[users_clean['Country'].isin(rare_countries), 'Country'] = np.nan

country_ids = {country: idx for idx, country in enumerate(users_clean['Country'].unique())}  # map country to a unique integer
users_clean['CountryId'] = users_clean['Country'].map(country_ids)
countries = users_clean['CountryId'].values
countries_tensor = torch.tensor(countries, dtype=torch.float32)
g.nodes['user'].data['country'] = countries_tensor

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_clean['Country'] = users_clean['Location'].str.rsplit(',', n=1).str[-1].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_clean['CountryId'] = users_clean['Country'].map(country_ids)


In [40]:
print(g.nodes['user'])
print(g.nodes['book'])

NodeSpace(data={'age': tensor([18., nan, nan,  ..., 33., 32., nan]), 'country': tensor([0., 0., 1.,  ..., 0., 8., 0.])})
NodeSpace(data={})


### Basic graph info

In [41]:
print(g)  # Prints the basic info of the graph, such as number of nodes and edges per type

# Print number of nodes for each type
print("Number of users:", g.number_of_nodes('user'))
print("Number of books:", g.number_of_nodes('book'))

# Print number of edges
print("Number of ratings:", g.number_of_edges('rating'))

Graph(num_nodes={'book': 340556, 'user': 105283},
      num_edges={('user', 'rating', 'book'): 1149780},
      metagraph=[('user', 'book', 'rating')])
Number of users: 105283
Number of books: 340556
Number of ratings: 1149780


### Node and Edge feature inspection

In [42]:
# Print user node features
print("User features:", g.nodes['user'].data.keys())

# Print book node features, if any
print("Book features:", g.nodes['book'].data.keys())

# Print edge features
print("Edge features:", g.edges['rating'].data.keys())

# Example to print specific feature details:
print("Sample user ages:", g.nodes['user'].data['age'][:5])  # prints first 5 user ages
print("Sample ratings:", g.edges['rating'].data['rating'][:5])  # prints first 5 ratings

User features: dict_keys(['age', 'country'])
Book features: dict_keys([])
Edge features: dict_keys(['rating'])
Sample user ages: tensor([18., nan, nan, nan, 26.])
Sample ratings: tensor([0., 5., 0., 3., 6.])


### Eliminate isolated nodes if any

In [43]:
compact_g = dgl.compact_graphs(g)

### Create synthetic features for book based on degree of the node

In [58]:
book_in_degrees = g.in_degrees(etype=('user', 'rating', 'book')).float().unsqueeze(1)

In [59]:
g.nodes['book'].data['in_degree'] = book_in_degrees

# Creating the GNN

## Architecture

In [46]:
class GNNRecommender(nn.Module):
    def __init__(self, user_feats, book_feats, hidden_size, num_classes):
        super(GNNRecommender, self).__init__()
        self.user_conv = GraphConv(user_feats, hidden_size)
        self.book_conv = GraphConv(book_feats, hidden_size)
        self.fc = nn.Linear(hidden_size, num_classes)
        
    def forward(self, g, user_features, book_features):
        user_h = F.relu(self.user_conv(g, user_features))
        book_h = F.relu(self.book_conv(g, book_features))
        user_book_h = user_h + book_h
        return self.fc(user_book_h)

## Setting up model

In [60]:
age_tensor = g.nodes['user'].data['age'].unsqueeze(1)
country_tensor = g.nodes['user'].data['country'].unsqueeze(1)
user_feats = torch.cat([age_tensor, country_tensor], dim=1)
user_feat_dim = user_feats.shape[1]  # the size of user feature
book_feats = g.nodes['book'].data['in_degree'].shape[1]  # the size of book features

hidden_size = 32
num_classes = 1  # predicting a single rating value

model = GNNRecommender(user_feat_dim, book_feats, hidden_size, num_classes)

## Split graph for training and validation set

In [63]:
def split_graph(g, proportion=0.8):
    # Split edges randomly for training and validation
    num_edges = g.number_of_edges('rating')
    all_edges = np.arange(num_edges)
    np.random.shuffle(all_edges)
    
    train_size = int(num_edges * proportion)
    train_edges = all_edges[:train_size]
    val_edges = all_edges[train_size:]
    
    # Create subgraphs based on the edges
    g_train = dgl.edge_subgraph(g, train_edges, relabel_nodes=False)
    g_val = dgl.edge_subgraph(g, val_edges, relabel_nodes=False)
    
    return g_train, g_val

In [67]:
g_train, g_val = split_graph(g, proportion=0.8)

# Get the features and ratings for each set
age_tensor_train = g_train.nodes['user'].data['age'].unsqueeze(1)
country_tensor_train = g_train.nodes['user'].data['country'].unsqueeze(1)
user_features_train = torch.cat([age_tensor_train, country_tensor_train], dim=1)
book_features_train = g_train.nodes['book'].data['in_degree']
ratings_train = g_train.edges['rating'].data['rating']

age_tensor_val = g_val.nodes['user'].data['age'].unsqueeze(1)
country_tensor_val = g_val.nodes['user'].data['country'].unsqueeze(1)
user_features_val = torch.cat([age_tensor_train, country_tensor_train], dim=1)
book_features_val = g_val.nodes['book'].data['in_degree']
ratings_val = g_val.edges['rating'].data['rating']

## Train

In [68]:
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [69]:
def train(model, g, user_features, book_features, labels, optimizer, criterion):
    model.train()
    optimizer.zero_grad()
    outputs = model(g, user_features, book_features)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    return loss.item()

In [70]:
num_epochs = 50  # or however many epochs you deem necessary

for epoch in range(num_epochs):
    loss = train(model, g_train, user_features_train, book_features_train, ratings_train, optimizer, criterion)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss}')

DGLError: There are 0-in-degree nodes in the graph, output for those nodes will be invalid. This is harmful for some applications, causing silent performance regression. Adding self-loop on the input graph by calling `g = dgl.add_self_loop(g)` will resolve the issue. Setting ``allow_zero_in_degree`` to be `True` when constructing this module will suppress the check and let the code run.

## Evaluate model

In [None]:
def evaluate(model, g, user_features, book_features, labels, criterion):
    model.eval()
    with torch.no_grad():
        outputs = model(g, user_features, book_features)
        loss = criterion(outputs, labels)
    return loss.item()

In [None]:
# Validation loss
validation_loss = evaluate(model, g_val, user_features_val, book_features_val, ratings_val, criterion)
print(f'Validation Loss: {validation_loss}')a