# Attempt at a GNN

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch

## Preprocessing of our data

In [2]:
ratings = pd.read_csv("Data/Ratings.csv")
books = pd.read_csv("Data/Books.csv", dtype={3: str})
users = pd.read_csv("Data/Users.csv")

In [3]:
# We will only use users and books present in the ratings dataset 
lessen_user_ids = {userid: idx for idx, userid in enumerate(ratings['User-ID'].unique())} #renumber IDs to reduce inactive users
ratings['New-User-ID'] = ratings['User-ID'].map(lessen_user_ids)
user_ids = list(ratings['New-User-ID'].unique())
num_users = len(set(user_ids))

# Map book identifiers (ISBN) to a unique integer identifier for datatype compatibility of dgl
isbn_to_id = {isbn: idx for idx, isbn in enumerate(ratings['ISBN'].unique())}
ratings['Book-ID'] = ratings['ISBN'].map(isbn_to_id)
book_ids = list(ratings['Book-ID'].unique())
num_books = len(set(book_ids))

print(f'There are {len(user_ids)} unique users, and {len(book_ids)} unique books in the ratings dataset.')
 
# Remove users and books not included in the ratings dataset
books['Book-ID'] = books['ISBN'].map(isbn_to_id)
books_clean = books[books['Book-ID'].isin(book_ids)]
books_clean_ids = books_clean['Book-ID'].unique()
percent_books_missing = round((num_books-len(books_clean_ids))/num_books*100, 0)

print(f'There are around {percent_books_missing}% of books in the graph missing in the books data')

users['New-User-ID'] = users['User-ID'].map(lessen_user_ids)
users_clean = users[users['New-User-ID'].isin(user_ids)]
print(f"There are: {len(users_clean['New-User-ID'])}, who have rated at least one book")

There are 105283 unique users, and 340556 unique books in the ratings dataset.
There are around 21.0% of books in the graph missing in the books data
There are: 105283, who have rated at least one book


In [4]:
ratings_with_book_titles = ratings.merge(books,on='ISBN')
ratings_with_book_titles.drop(columns=["ISBN","Image-URL-S","Image-URL-M"],axis=1,inplace=True)
# Drop Age because tooo many missing values
complete_df = ratings_with_book_titles.merge(users.drop("Age", axis=1), on="User-ID")

In [5]:
complete_df['Location'] = complete_df['Location'].str.split(',').str[-1].str.strip()
print(complete_df.columns)

Index(['User-ID', 'Book-Rating', 'New-User-ID_x', 'Book-ID_x', 'Book-Title',
       'Book-Author', 'Year-Of-Publication', 'Publisher', 'Image-URL-L',
       'Book-ID_y', 'Location', 'New-User-ID_y'],
      dtype='object')


In [6]:
print((complete_df['Book-Rating'] == 0).sum())
print(len(complete_df))

647294
1031136


In [7]:
df = complete_df.loc[complete_df['Book-Rating'] != 0]
print(len(df))

383842


In [8]:
print((df['Year-Of-Publication'] == 'DK Publishing Inc').sum())
df = df[df['Year-Of-Publication'] != 'DK Publishing Inc']
print((df['Year-Of-Publication'] == 'DK Publishing Inc').sum())

1
0


In [9]:
df.head()

Unnamed: 0,User-ID,Book-Rating,New-User-ID_x,Book-ID_x,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L,Book-ID_y,Location,New-User-ID_y
1,276726,5,1,1,Rites of Passage,Judith Rae,2001,Heinle,http://images.amazon.com/images/P/0155061224.0...,1.0,usa,1.0
3,276729,3,3,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,http://images.amazon.com/images/P/052165615X.0...,3.0,croatia,3.0
4,276729,6,3,4,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001,Cambridge University Press,http://images.amazon.com/images/P/0521795028.0...,4.0,croatia,3.0
6,276744,7,7,8,A Painted House,JOHN GRISHAM,2001,Doubleday,http://images.amazon.com/images/P/038550120X.0...,8.0,usa,7.0
13,276747,9,10,16,Little Altars Everywhere,Rebecca Wells,2003,HarperTorch,http://images.amazon.com/images/P/0060517794.0...,16.0,usa,10.0


In [10]:
df.shape

(383841, 12)

In [11]:
print(len(df['User-ID'].unique()))
print(len(df['Book-Title'].unique()))

68091
135566


## Start with graph structure

In [12]:
from torch_geometric.data import HeteroData
from sklearn.preprocessing import LabelEncoder

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



### Create nodes

In [73]:
data = HeteroData()

User nodes

In [74]:
# Get unique users
unique_users = df[['User-ID', 'Location']].drop_duplicates()

# Take care of the User-IDs
user_id_encoder = LabelEncoder()
df['encoded-user-ID'] = user_id_encoder.fit_transform(df['User-ID'].astype(str))    # Do so otherwise it is going to create a problem with tensor indexing later
unique_users['User-ID'] = df['encoded-user-ID']

# Encode the Location feature
location_encoder = LabelEncoder()
unique_users['Location'] = location_encoder.fit_transform(unique_users['Location'])

# Get tensor
user_features_tensor = torch.tensor(unique_users[['User-ID', 'Location']].values, dtype=torch.float)
data['users'].x = user_features_tensor

In [75]:
data['users'].x.shape

torch.Size([68091, 2])

Book nodes

In [76]:
# Create a composite key that uniquely identifies each book
df['Book-Key'] = df['Book-Title'] + '|' + df['Book-Author'] + '|' + df['Publisher'] + '|' + df['Year-Of-Publication'].astype(str)

# Now, get unique books using the new composite key
unique_books = df[['Book-Key', 'Book-Title', 'Book-Author', 'Publisher', 'Year-Of-Publication']].drop_duplicates()

# Encode categorical features
book_key_encoder = LabelEncoder()
unique_books['Book-Key'] = book_key_encoder.fit_transform(unique_books['Book-Key'])

title_encoder = LabelEncoder()
author_encoder = LabelEncoder()
publisher_encoder = LabelEncoder()

unique_books['Book-Title'] = title_encoder.fit_transform(unique_books['Book-Title'])
unique_books['Book-Author'] = author_encoder.fit_transform(unique_books['Book-Author'])
unique_books['Publisher'] = publisher_encoder.fit_transform(unique_books['Publisher'])

# Normalize year of publication
unique_books['Year-Of-Publication'] = unique_books['Year-Of-Publication'].astype(int)
min_year = unique_books['Year-Of-Publication'].min()
max_year = unique_books['Year-Of-Publication'].max()
unique_books['Year-Of-Publication'] = (unique_books['Year-Of-Publication'] - min_year) / (max_year - min_year)

# Convert to tensor
book_features_tensor = torch.tensor(unique_books[['Book-Title', 'Book-Author', 'Publisher', 'Year-Of-Publication']].values, dtype=torch.float)
data['books'].x = book_features_tensor

In [77]:
data['books'].x.shape

torch.Size([149243, 4])

### Create edges

In [78]:
user_indices = df['encoded-user-ID'].to_numpy()
book_indices = book_key_encoder.transform(df['Book-Key'])

# Create tensors for user indices, book indices, and ratings
user_indices_tensor = torch.tensor(user_indices, dtype=torch.long)
book_indices_tensor = torch.tensor(book_indices, dtype=torch.long)
ratings_tensor = torch.tensor(df['Book-Rating'].values, dtype=torch.float)

# Adding edge data (edges from 'user' to 'book' with a relationship 'rated')
data['users', 'rated', 'books'].edge_index = torch.stack([user_indices_tensor, book_indices_tensor], dim=0)
data['users', 'rated', 'books'].edge_attr = ratings_tensor

In [79]:
data['users', 'rated', 'books'].num_edges

383841

In [80]:
print(data)
print(data.edge_index_dict)
print(data['users', 'rated', 'books'].edge_attr)

HeteroData(
  users={ x=[68091, 2] },
  books={ x=[149243, 4] },
  (users, rated, books)={
    edge_index=[2, 383841],
    edge_attr=[383841],
  }
)
{('users', 'rated', 'books'): tensor([[ 47760,  47761,  47761,  ...,  47756,  47757,  47759],
        [ 92341,  50886, 107531,  ...,  45852, 128517,  43549]])}
tensor([ 5.,  3.,  6.,  ...,  9., 10., 10.])


In [81]:
edge_index = data['users', 'rated', 'books'].edge_index

print("User indices range:", edge_index[0].min().item(), edge_index[0].max().item())
print("Book indices range:", edge_index[1].min().item(), edge_index[1].max().item())

User indices range: 0 68090
Book indices range: 0 149239


In [82]:
print(len(data.x_dict))

2


## Split train and test

In [83]:
def create_masks(num_nodes, train_frac=0.7, val_frac=0.15):
    indices = np.random.permutation(num_nodes)
    train_size = int(num_nodes * train_frac)
    val_size = int(num_nodes * val_frac)

    train_mask = np.zeros(num_nodes, dtype=bool)
    val_mask = np.zeros(num_nodes, dtype=bool)
    test_mask = np.zeros(num_nodes, dtype=bool)

    train_mask[indices[:train_size]] = True
    val_mask[indices[train_size:train_size + val_size]] = True
    test_mask[indices[train_size + val_size:]] = True
    
    return torch.from_numpy(train_mask), torch.from_numpy(val_mask), torch.from_numpy(test_mask)

# Create masks for users and books
user_train_mask, user_val_mask, user_test_mask = create_masks(68091)
book_train_mask, book_val_mask, book_test_mask = create_masks(149243)

In [84]:
data['users'].train_mask = user_train_mask
data['users'].val_mask = user_val_mask
data['users'].test_mask = user_test_mask

data['books'].train_mask = book_train_mask
data['books'].val_mask = book_val_mask
data['books'].test_mask = book_test_mask

In [85]:
print(data)

HeteroData(
  users={
    x=[68091, 2],
    train_mask=[68091],
    val_mask=[68091],
    test_mask=[68091],
  },
  books={
    x=[149243, 4],
    train_mask=[149243],
    val_mask=[149243],
    test_mask=[149243],
  },
  (users, rated, books)={
    edge_index=[2, 383841],
    edge_attr=[383841],
  }
)


# Create GNN

In [86]:
import torch.nn.functional as F
from torch_geometric.nn import HeteroConv, SAGEConv, GATConv, Linear

In [87]:
import torch_geometric.transforms as T

In [88]:
data = T.ToUndirected(merge=True)(data)

In [89]:
print(data)

HeteroData(
  users={
    x=[68091, 2],
    train_mask=[68091],
    val_mask=[68091],
    test_mask=[68091],
  },
  books={
    x=[149243, 4],
    train_mask=[149243],
    val_mask=[149243],
    test_mask=[149243],
  },
  (users, rated, books)={
    edge_index=[2, 383841],
    edge_attr=[383841],
  },
  (books, rev_rated, users)={
    edge_index=[2, 383841],
    edge_attr=[383841],
  }
)


## Model

In [90]:

class HeteroGNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels, num_layers):
        super().__init__()
        self.convs = torch.nn.ModuleList()
        for _ in range(num_layers):
            conv = HeteroConv({
                ('users', 'rated', 'books'): SAGEConv((-1, -1), hidden_channels),
                ('books', 'rev_rated', 'users'): GATConv((-1, -1), hidden_channels, add_self_loops=False),
            }, aggr='sum')
            self.convs.append(conv)

        self.user_lin = Linear(hidden_channels, out_channels)
        self.book_lin = Linear(hidden_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        for conv in self.convs:
            x_dict = conv(x_dict, edge_index_dict)
            x_dict = {key: F.relu(x) for key, x in x_dict.items()}
        return {
            'users': self.user_lin(x_dict['users']),
            'books': self.book_lin(x_dict['books'])
        }

model = HeteroGNN(hidden_channels=64, out_channels=1, num_layers=2)

with torch.no_grad():  # Initialize lazy modules.
    out = model(data.x_dict, data.edge_index_dict)
    print(out)

{'users': tensor([[-1.2565e+01],
        [-1.2689e+04],
        [-8.5426e+03],
        ...,
        [-1.4228e+04],
        [-1.1571e+04],
        [-8.0344e+03]]), 'books': tensor([[-308.0213],
        [-501.9356],
        [ -93.6838],
        ...,
        [-341.3477],
        [ 486.0679],
        [ 273.0971]])}


## Training

In [91]:
from torch.optim import Adam
from torch_geometric.loader import NeighborLoader

optimizer = Adam(model.parameters(), lr=0.01)

In [92]:
def train(num_epochs, data):
    model.train()
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        out = model(data.x_dict, data.edge_index_dict)
        
        # Calculate interaction scores
        interaction_scores = torch.sigmoid((out['users'][data['users', 'rated', 'books'].edge_index[0]] * out['books'][data['users', 'rated', 'books'].edge_index[1]]).sum(dim=1))
        
        # Convert boolean tensor to float tensor for BCE loss
        target_scores = (data['users', 'rated', 'books'].edge_attr > 0).float()
        
        # Compute the loss
        loss = F.binary_cross_entropy(interaction_scores, target_scores)
        loss.backward()
        optimizer.step()
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}')

num_epochs = 20
train(num_epochs, data)

Epoch 1/20, Loss: 30.586101531982422
Epoch 2/20, Loss: 30.586101531982422
Epoch 3/20, Loss: 30.586101531982422
Epoch 4/20, Loss: 30.586101531982422
Epoch 5/20, Loss: 30.586101531982422
Epoch 6/20, Loss: 30.586101531982422
Epoch 7/20, Loss: 30.586101531982422
Epoch 8/20, Loss: 30.586101531982422
Epoch 9/20, Loss: 30.586101531982422
Epoch 10/20, Loss: 30.586101531982422
Epoch 11/20, Loss: 30.586101531982422
Epoch 12/20, Loss: 30.586101531982422
Epoch 13/20, Loss: 30.586101531982422
Epoch 14/20, Loss: 30.586101531982422
Epoch 15/20, Loss: 30.586101531982422
Epoch 16/20, Loss: 30.586101531982422
Epoch 17/20, Loss: 30.586101531982422
Epoch 18/20, Loss: 30.586101531982422
Epoch 19/20, Loss: 30.586101531982422
Epoch 20/20, Loss: 30.586101531982422


## Eval

In [93]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [96]:
def assign_edge_masks(data, relation, num_edges):
    # This is a simple random split for illustration; adjust based on your needs
    indices = torch.randperm(num_edges)
    train_size = int(num_edges * 0.7)
    val_size = int(num_edges * 0.15)

    # Assuming you don't have these masks
    if not hasattr(data[relation], 'train_mask'):
        data[relation].train_mask = torch.zeros(num_edges, dtype=torch.bool)
        data[relation].val_mask = torch.zeros(num_edges, dtype=torch.bool)
        data[relation].test_mask = torch.zeros(num_edges, dtype=torch.bool)

        data[relation].train_mask[indices[:train_size]] = True
        data[relation].val_mask[indices[train_size:train_size + val_size]] = True
        data[relation].test_mask[indices[train_size + val_size:]] = True

# Example usage:
assign_edge_masks(data, ('users', 'rated', 'books'), data[('users', 'rated', 'books')].edge_index.size(1))
assign_edge_masks(data, ('books', 'rev_rated', 'users'), data[('books', 'rev_rated', 'users')].edge_index.size(1))

In [99]:
def evaluate(model, data, relation=('users', 'rated', 'books')):
    model.eval()
    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict)

        # Ensure the mask is for edges, not nodes
        if 'val_mask' not in data[relation]:
            raise ValueError("Validation mask for edges not found in data!")
        
        val_mask = data[relation].val_mask
        
        # Use the validation mask to filter the edge indices for users and books
        test_user_indices = data[relation].edge_index[0][val_mask]
        test_book_indices = data[relation].edge_index[1][val_mask]
        
        # Calculate interaction scores based on the filtered indices
        user_scores = out['users'][test_user_indices]
        book_scores = out['books'][test_book_indices]
        interaction_scores = torch.sigmoid((user_scores * book_scores).sum(dim=1))
        
        # True labels for the test data
        true_labels = (data[relation].edge_attr[val_mask] > 0).float()
        
        # Calculate metrics
        predicted_labels = interaction_scores > 0.5  # Thresholding at 0.5
        accuracy = accuracy_score(true_labels.cpu().numpy(), predicted_labels.cpu().numpy())
        precision, recall, f1, _ = precision_recall_fscore_support(true_labels.cpu().numpy(), predicted_labels.cpu().numpy(), average='binary')
        
        return accuracy, precision, recall, f1

# Run evaluation
accuracy, precision, recall, f1 = evaluate(model, data)
print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")


Accuracy: 0.6960539113519522, Precision: 1.0, Recall: 0.6960539113519522, F1 Score: 0.8207922008765821
