# GNN Book Recommender PyG implementation (attempt)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.nn import HeteroConv, SAGEConv, Linear
from sklearn.preprocessing import LabelEncoder
import torch_geometric.transforms as T
from torch.optim import Adam
from torch.nn import BCELoss
from torch_geometric.loader import NeighborLoader

## Data Preprocessing

In [None]:
# Read the three raw CSV tables from the local Data/ directory.
# ratings: user-book ratings.
ratings = pd.read_csv(filepath_or_buffer="Data/Ratings.csv")
# books: book attributes. The column at position 3 is forced to str
# (presumably Year-Of-Publication, which is compared to a string later -- TODO confirm).
books = pd.read_csv(filepath_or_buffer="Data/Books.csv", dtype={3: str})
# users: user profiles.
users = pd.read_csv(filepath_or_buffer="Data/Users.csv")

In [None]:
# Restrict everything to the users and books that occur in the ratings table.

# Renumber raw User-IDs as consecutive integers (0, 1, 2, ...) in order of first
# appearance, so users without any rating leave no gaps in the ID range.
lessen_user_ids = dict(
    (userid, idx) for idx, userid in enumerate(ratings['User-ID'].unique())
)
ratings['New-User-ID'] = ratings['User-ID'].map(lessen_user_ids)
user_ids = list(ratings['New-User-ID'].unique())
num_users = len(set(user_ids))

# ISBNs are strings; give every rated ISBN a consecutive integer ID as well so
# books can later be addressed by integer index in the graph.
isbn_to_id = dict(
    (isbn, idx) for idx, isbn in enumerate(ratings['ISBN'].unique())
)
ratings['Book-ID'] = ratings['ISBN'].map(isbn_to_id)
book_ids = list(ratings['Book-ID'].unique())
num_books = len(set(book_ids))
print(f'There are {len(user_ids)} unique users, and {len(book_ids)} unique books in the ratings dataset.')

# Keep only the book rows whose ISBN was rated at least once, and report what
# share of the rated books has no entry in the books table.
books['Book-ID'] = books['ISBN'].map(isbn_to_id)
books_clean = books.loc[books['Book-ID'].isin(book_ids)]
books_clean_ids = books_clean['Book-ID'].unique()
num_books_without_attributes = num_books - len(books_clean_ids)
percent_books_missing = round(num_books_without_attributes / num_books * 100, 0)
print(f'There are around {percent_books_missing}% of books in the graph missing in the books data')

# Keep only the user rows that have at least one rating.
users['New-User-ID'] = users['User-ID'].map(lessen_user_ids)
users_clean = users.loc[users['New-User-ID'].isin(user_ids)]
print(f"There are: {len(users_clean['New-User-ID'])}, who have rated at least one book")

## Merging Datasets

In [None]:
# Attach book attributes to every rating (pandas' default inner join on ISBN),
# then discard the ISBN and the small/medium cover-image URL columns.
ratings_with_book_titles = ratings.merge(books, on='ISBN')
ratings_with_book_titles = ratings_with_book_titles.drop(
    columns=["ISBN", "Image-URL-S", "Image-URL-M"]
)
# Attach the user profile columns; Age is left out because it has too many missing values.
users_without_age = users.drop(columns="Age")
complete_df = ratings_with_book_titles.merge(users_without_age, on="User-ID")

In [None]:
# Reduce each Location string to its last comma-separated component, with
# surrounding whitespace removed (presumably the country -- TODO confirm).
location_parts = complete_df['Location'].str.split(',')
complete_df['Location'] = location_parts.str[-1].str.strip()
print(complete_df.columns)

In [None]:
# Keep only rows with a non-zero Book-Rating; a rating of 0 is treated here as
# carrying no rating information.
has_rating = complete_df['Book-Rating'].ne(0)
df = complete_df[has_rating]
print(len(df))

In [None]:
# Remove rows whose Year-Of-Publication is not a number. Data entry errors put
# text such as 'DK Publishing Inc' into this column, and any such value would
# make the later integer conversion of the year fail. Filtering on "is numeric"
# removes that known case and any other non-numeric or missing year as well.
publication_year = pd.to_numeric(df['Year-Of-Publication'], errors='coerce')
# .copy() makes df an independent frame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
df = df[publication_year.notna()].copy()

In [None]:
# Preview the first rows of the filtered ratings table.
df.head()

In [None]:
# Show the (rows, columns) size of the filtered ratings table.
df.shape

In [None]:
# Report how many distinct users and book titles remain after filtering.
# The f-strings use double quotes outside so the single-quoted column names can
# be nested inside them; reusing the same quote type is a SyntaxError before
# Python 3.12.
print(f"There are {len(df['User-ID'].unique())} users in the dataset.\n")
print(f"There are {len(df['Book-Title'].unique())} books in the dataset.")

## Start with graph structure

### Create nodes

In [None]:
# Create the empty heterogeneous graph container that the node and edge data are assigned to.
data = HeteroData() #initialize heterogeneous graph

User nodes

In [None]:
# Encode the raw User-IDs as consecutive integers 0..N-1. The encoded ID is used
# as the node index of each user when the edges are built, so it must be a
# valid row index into the user feature matrix.
user_id_encoder = LabelEncoder()
df['encoded-user-ID'] = user_id_encoder.fit_transform(df['User-ID'].astype(str))

# Build exactly one row per user, ordered by encoded ID, so that row i of the
# feature matrix describes node i. Without the sort the rows would be in order
# of first appearance and the features would be attached to the wrong nodes.
unique_users = (
    df[['encoded-user-ID', 'Location']]
    .drop_duplicates(subset='encoded-user-ID')
    .sort_values('encoded-user-ID')
    .rename(columns={'encoded-user-ID': 'User-ID'})
    .reset_index(drop=True)
)

# Encode the Location feature as integer category codes.
location_encoder = LabelEncoder()
unique_users['Location'] = location_encoder.fit_transform(unique_users['Location'])

# User feature matrix of shape (num_users, 2): [encoded user ID, encoded location].
user_features_tensor = torch.tensor(unique_users[['User-ID', 'Location']].values, dtype=torch.float)

In [None]:
# Store the user feature matrix as the node features of the 'users' node type.
data['users'].x = user_features_tensor

In [None]:
# Show the shape of the user feature matrix.
data['users'].x.shape

Book nodes

In [None]:
# Create a composite key that uniquely identifies each book. Missing text
# fields are replaced by '' first; otherwise a single missing author or
# publisher would turn the whole key into NaN and merge unrelated books.
df['Book-Key'] = (
    df['Book-Title'].fillna('') + '|'
    + df['Book-Author'].fillna('') + '|'
    + df['Publisher'].fillna('') + '|'
    + df['Year-Of-Publication'].astype(str)
)

# Exactly one row per composite key.
unique_books = df[
    ['Book-Key', 'Book-Title', 'Book-Author', 'Publisher', 'Year-Of-Publication']
].drop_duplicates(subset='Book-Key').copy()

# Encode the key as consecutive integers 0..N-1; the encoded key is the node
# index of the book when the edges are built.
book_key_encoder = LabelEncoder()
unique_books['Book-Key'] = book_key_encoder.fit_transform(unique_books['Book-Key'])

# Order the rows by encoded key so that row i of the feature matrix describes
# node i. Without the sort the rows would be in order of first appearance and
# the features would be attached to the wrong nodes.
unique_books = unique_books.sort_values('Book-Key').reset_index(drop=True)

# Encode categorical features as integer category codes.
title_encoder = LabelEncoder()
author_encoder = LabelEncoder()
publisher_encoder = LabelEncoder()

unique_books['Book-Title'] = title_encoder.fit_transform(unique_books['Book-Title'])
unique_books['Book-Author'] = author_encoder.fit_transform(unique_books['Book-Author'])
unique_books['Publisher'] = publisher_encoder.fit_transform(unique_books['Publisher'])

# Min-max normalize the year of publication to [0, 1].
unique_books['Year-Of-Publication'] = unique_books['Year-Of-Publication'].astype(int)
min_year = unique_books['Year-Of-Publication'].min()
max_year = unique_books['Year-Of-Publication'].max()
year_span = max_year - min_year
if year_span:
    unique_books['Year-Of-Publication'] = (unique_books['Year-Of-Publication'] - min_year) / year_span
else:
    # All books share one year: avoid dividing by zero and use a constant feature.
    unique_books['Year-Of-Publication'] = 0.0

# Book feature matrix of shape (num_books, 4): [title, author, publisher, normalized year].
book_features_tensor = torch.tensor(unique_books[['Book-Title', 'Book-Author', 'Publisher', 'Year-Of-Publication']].values, dtype=torch.float)

In [None]:
# Store the book feature matrix as the node features of the 'books' node type.
data['books'].x = book_features_tensor

In [None]:
# Show the shape of the book feature matrix.
data['books'].x.shape

### Create edges

In [None]:
# One edge per rating row: source = encoded user ID, target = encoded book key.
user_indices = df['encoded-user-ID'].values
book_indices = book_key_encoder.transform(df['Book-Key'])

# Convert endpoints to integer tensors and the rating values to a float tensor.
user_indices_tensor = torch.tensor(user_indices, dtype=torch.long)
book_indices_tensor = torch.tensor(book_indices, dtype=torch.long)
ratings_tensor = torch.tensor(df['Book-Rating'].to_numpy(), dtype=torch.float)

# Register the ('users', 'rated', 'books') relation: edge_index has one column
# per rating (row 0 = user index, row 1 = book index) and edge_attr holds the
# rating of each edge.
rated_edges = data['users', 'rated', 'books']
rated_edges.edge_index = torch.stack((user_indices_tensor, book_indices_tensor))
rated_edges.edge_attr = ratings_tensor

In [None]:
# Show how many 'rated' edges (user -> book) the graph contains.
data['users', 'rated', 'books'].num_edges

In [None]:
# Get indices range for users and books
edge_index = data['users', 'rated', 'books'].edge_index

print("User indices range:", edge_index[0].min().item(), edge_index[0].max().item())
print("Book indices range:", edge_index[1].min().item(), edge_index[1].max().item())

# GNN Construction

## Model

In [None]:
class HeteroGNN(torch.nn.Module):
    """Heterogeneous GraphSAGE network that scores the nodes of one node type.

    Each layer is a ``HeteroConv`` holding one ``SAGEConv`` per edge type; the
    per-edge-type results arriving at a node type are summed. A final linear
    layer maps the hidden representation of ``target_node_type`` to
    ``out_channels`` values.

    Args:
        hidden_channels: Width of every SAGEConv layer.
        out_channels: Size of the final per-node output.
        num_layers: Number of HeteroConv layers.
        edge_types: Edge types to convolve over. The default matches the graph
            built in this notebook after ``T.ToUndirected()`` has added the
            reverse relation. Both directions are needed: a node type only
            receives an output from a layer if some edge type points *to* it,
            so with user->book edges alone there would be no user
            representation left for the second layer.
        target_node_type: Node type whose representation is returned.

    Returns (from ``forward``):
        Tensor of shape (num_target_nodes, out_channels).
    """

    def __init__(
        self,
        hidden_channels,
        out_channels,
        num_layers,
        edge_types=(('users', 'rated', 'books'), ('books', 'rev_rated', 'users')),
        target_node_type='books',
    ):
        super().__init__()
        self.target_node_type = target_node_type

        # One HeteroConv per layer; (-1, -1) lets SAGEConv infer the source and
        # destination feature sizes lazily on the first forward pass.
        self.convs = torch.nn.ModuleList()
        for _ in range(num_layers):
            conv = HeteroConv({
                tuple(edge_type): SAGEConv((-1, -1), hidden_channels)
                for edge_type in edge_types
            }, aggr='sum')  # sum the messages of all edge types reaching a node type
            self.convs.append(conv)

        self.lin = Linear(hidden_channels, out_channels)  # output head

    def forward(self, x_dict, edge_index_dict):
        for conv in self.convs:
            x_dict = conv(x_dict, edge_index_dict)  # message passing per edge type
            x_dict = {key: F.relu(x) for key, x in x_dict.items()}  # non-linearity per node type
        return self.lin(x_dict[self.target_node_type])

# Base model
model = HeteroGNN(hidden_channels=64, out_channels=1, num_layers=2)

## Training

In [None]:
# Number of book nodes = number of rows in the 'books' feature matrix.
num_books = data['books'].x.shape[0]

# Random training split: each book is independently marked as a training node
# with probability 0.8.
train_mask = torch.rand(num_books).lt(0.8)

# Store the boolean mask on the 'books' node type.
data['books'].train_mask = train_mask

In [None]:
# Use Adam optimizer
optimizer = Adam(model.parameters(), lr=0.01)

def train(batch):
    """Run one optimization step on a sampled mini-batch and return its loss.

    Args:
        batch: A heterogeneous mini-batch exposing ``x_dict``,
            ``edge_index_dict`` and a 'books' node store with ``train_mask``
            and ``y``.

    Returns:
        The loss of this step as a Python float.

    NOTE(review): no ``y`` is assigned to the 'books' nodes in the visible part
    of this notebook; labels must be attached before this function can run.
    NOTE(review): the model above is built with ``out_channels=1``; with a
    single output channel cross-entropy has only one class to choose from --
    confirm the intended objective.
    """
    model.train()
    optimizer.zero_grad()
    out = model(batch.x_dict, batch.edge_index_dict)  # forward pass
    # The node type is named 'books' everywhere the graph is built; the former
    # key 'book' does not exist in the data.
    books = batch['books']
    mask = books.train_mask  # only training books contribute to the loss
    loss = F.cross_entropy(out[mask], books.y[mask])
    loss.backward()  # backward pass
    optimizer.step()  # update model parameters
    return float(loss)


## Training loop

In [None]:
# Add a reverse relation for every edge type so messages can flow in both
# directions between users and books.
transform = T.ToUndirected()
data = transform(data)

# Mini-batch loader seeded on the training books: 128 seed nodes per batch,
# sampling up to 15 neighbors per node and edge type in each of 2 hops.
train_loader = NeighborLoader(
    data,
    input_nodes=('books', data['books'].train_mask),
    num_neighbors=[15, 15],
    batch_size=128,
)

# One pass over all mini-batches, printing the loss of each step.
for batch in train_loader:
    loss = train(batch)
    print(f'Loss: {loss:.4f}')