# Attempt at a GNN

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch

## Preprocessing of our data

In [2]:
ratings = pd.read_csv("Data/Ratings.csv")
books = pd.read_csv("Data/Books.csv", dtype={3: str})
users = pd.read_csv("Data/Users.csv")

In [3]:
# We will only use users and books present in the ratings dataset 
lessen_user_ids = {userid: idx for idx, userid in enumerate(ratings['User-ID'].unique())} #renumber IDs to reduce inactive users
ratings['New-User-ID'] = ratings['User-ID'].map(lessen_user_ids)
user_ids = list(ratings['New-User-ID'].unique())
num_users = len(set(user_ids))

# Map book identifiers (ISBN) to a unique integer identifier for datatype compatibility of dgl
isbn_to_id = {isbn: idx for idx, isbn in enumerate(ratings['ISBN'].unique())}
ratings['Book-ID'] = ratings['ISBN'].map(isbn_to_id)
book_ids = list(ratings['Book-ID'].unique())
num_books = len(set(book_ids))

print(f'There are {len(user_ids)} unique users, and {len(book_ids)} unique books in the ratings dataset.')
 
# Remove users and books not included in the ratings dataset
books['Book-ID'] = books['ISBN'].map(isbn_to_id)
books_clean = books[books['Book-ID'].isin(book_ids)]
books_clean_ids = books_clean['Book-ID'].unique()
percent_books_missing = round((num_books-len(books_clean_ids))/num_books*100, 0)

print(f'There are around {percent_books_missing}% of books in the graph missing in the books data')

users['New-User-ID'] = users['User-ID'].map(lessen_user_ids)
users_clean = users[users['New-User-ID'].isin(user_ids)]
print(f"There are: {len(users_clean['New-User-ID'])}, who have rated at least one book")

There are 105283 unique users, and 340556 unique books in the ratings dataset.
There are around 21.0% of books in the graph missing in the books data
There are: 105283, who have rated at least one book


In [4]:
ratings_with_book_titles = ratings.merge(books,on='ISBN')
ratings_with_book_titles.drop(columns=["ISBN","Image-URL-S","Image-URL-M"],axis=1,inplace=True)
# Drop Age because tooo many missing values
complete_df = ratings_with_book_titles.merge(users.drop("Age", axis=1), on="User-ID")

In [5]:
complete_df['Location'] = complete_df['Location'].str.split(',').str[-1].str.strip()
print(complete_df.columns)

Index(['User-ID', 'Book-Rating', 'New-User-ID_x', 'Book-ID_x', 'Book-Title',
       'Book-Author', 'Year-Of-Publication', 'Publisher', 'Image-URL-L',
       'Book-ID_y', 'Location', 'New-User-ID_y'],
      dtype='object')


In [6]:
print((complete_df['Book-Rating'] == 0).sum())
print(len(complete_df))

647294
1031136


In [7]:
df = complete_df.loc[complete_df['Book-Rating'] != 0]
print(len(df))

383842


In [8]:
print((df['Year-Of-Publication'] == 'DK Publishing Inc').sum())
df = df[df['Year-Of-Publication'] != 'DK Publishing Inc']
print((df['Year-Of-Publication'] == 'DK Publishing Inc').sum())

1
0


In [9]:
df.head()

Unnamed: 0,User-ID,Book-Rating,New-User-ID_x,Book-ID_x,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L,Book-ID_y,Location,New-User-ID_y
1,276726,5,1,1,Rites of Passage,Judith Rae,2001,Heinle,http://images.amazon.com/images/P/0155061224.0...,1.0,usa,1.0
3,276729,3,3,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,http://images.amazon.com/images/P/052165615X.0...,3.0,croatia,3.0
4,276729,6,3,4,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001,Cambridge University Press,http://images.amazon.com/images/P/0521795028.0...,4.0,croatia,3.0
6,276744,7,7,8,A Painted House,JOHN GRISHAM,2001,Doubleday,http://images.amazon.com/images/P/038550120X.0...,8.0,usa,7.0
13,276747,9,10,16,Little Altars Everywhere,Rebecca Wells,2003,HarperTorch,http://images.amazon.com/images/P/0060517794.0...,16.0,usa,10.0


In [10]:
df.shape

(383841, 12)

In [11]:
print(len(df['User-ID'].unique()))
print(len(df['Book-Title'].unique()))

68091
135566


## Start with graph structure

In [12]:
from torch_geometric.data import HeteroData
from sklearn.preprocessing import LabelEncoder

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



### Create nodes

In [13]:
data = HeteroData()

User nodes

In [14]:
# Get unique users
unique_users = df[['User-ID', 'Location']].drop_duplicates()

# Take care of the User-IDs
user_id_encoder = LabelEncoder()
df['encoded-user-ID'] = user_id_encoder.fit_transform(df['User-ID'].astype(str))    # Do so otherwise it is going to create a problem with tensor indexing later
unique_users['User-ID'] = df['encoded-user-ID']

# Encode the Location feature
location_encoder = LabelEncoder()
unique_users['Location'] = location_encoder.fit_transform(unique_users['Location'])

# Get tensor
user_features_tensor = torch.tensor(unique_users[['User-ID', 'Location']].values, dtype=torch.float)

In [15]:
data['users'].x = user_features_tensor

In [16]:
data['users'].x.shape

torch.Size([68091, 2])

Book nodes

In [17]:
# Create a composite key that uniquely identifies each book
df['Book-Key'] = df['Book-Title'] + '|' + df['Book-Author'] + '|' + df['Publisher'] + '|' + df['Year-Of-Publication'].astype(str)

# Now, get unique books using the new composite key
unique_books = df[['Book-Key', 'Book-Title', 'Book-Author', 'Publisher', 'Year-Of-Publication']].drop_duplicates()

# Encode categorical features
book_key_encoder = LabelEncoder()
unique_books['Book-Key'] = book_key_encoder.fit_transform(unique_books['Book-Key'])

title_encoder = LabelEncoder()
author_encoder = LabelEncoder()
publisher_encoder = LabelEncoder()

unique_books['Book-Title'] = title_encoder.fit_transform(unique_books['Book-Title'])
unique_books['Book-Author'] = author_encoder.fit_transform(unique_books['Book-Author'])
unique_books['Publisher'] = publisher_encoder.fit_transform(unique_books['Publisher'])

# Normalize year of publication
unique_books['Year-Of-Publication'] = unique_books['Year-Of-Publication'].astype(int)
min_year = unique_books['Year-Of-Publication'].min()
max_year = unique_books['Year-Of-Publication'].max()
unique_books['Year-Of-Publication'] = (unique_books['Year-Of-Publication'] - min_year) / (max_year - min_year)

# Convert to tensor
book_features_tensor = torch.tensor(unique_books[['Book-Title', 'Book-Author', 'Publisher', 'Year-Of-Publication']].values, dtype=torch.float)

In [18]:
data['books'].x = book_features_tensor

In [19]:
data['books'].x.shape

torch.Size([149243, 4])

### Create edges

In [20]:
user_indices = df['encoded-user-ID'].to_numpy()
book_indices = book_key_encoder.transform(df['Book-Key'])

# Create tensors for user indices, book indices, and ratings
user_indices_tensor = torch.tensor(user_indices, dtype=torch.long)
book_indices_tensor = torch.tensor(book_indices, dtype=torch.long)
ratings_tensor = torch.tensor(df['Book-Rating'].values, dtype=torch.float)

# Adding edge data (edges from 'user' to 'book' with a relationship 'rated')
data['users', 'rated', 'books'].edge_index = torch.stack([user_indices_tensor, book_indices_tensor], dim=0)
data['users', 'rated', 'books'].edge_attr = ratings_tensor

In [21]:
data['users', 'rated', 'books'].num_edges

383841

In [22]:
print(data)
print(data.edge_index_dict)
print(data['users', 'rated', 'books'].edge_attr)

HeteroData(
  users={ x=[68091, 2] },
  books={ x=[149243, 4] },
  (users, rated, books)={
    edge_index=[2, 383841],
    edge_attr=[383841],
  }
)
{('users', 'rated', 'books'): tensor([[ 47760,  47761,  47761,  ...,  47756,  47757,  47759],
        [ 92341,  50886, 107531,  ...,  45852, 128517,  43549]])}
tensor([ 5.,  3.,  6.,  ...,  9., 10., 10.])


In [23]:
edge_index = data['users', 'rated', 'books'].edge_index

print("User indices range:", edge_index[0].min().item(), edge_index[0].max().item())
print("Book indices range:", edge_index[1].min().item(), edge_index[1].max().item())

User indices range: 0 68090
Book indices range: 0 149239


In [24]:
print(len(data.x_dict))

2


# Create GNN

In [25]:
import torch.nn.functional as F
from torch_geometric.nn import HeteroConv, SAGEConv, Linear

In [26]:
import torch_geometric.transforms as T

In [27]:
# data = T.ToUndirected(merge=True)(data)

In [28]:
print(data)

HeteroData(
  users={ x=[68091, 2] },
  books={ x=[149243, 4] },
  (users, rated, books)={
    edge_index=[2, 383841],
    edge_attr=[383841],
  }
)


## Model

In [29]:
# class LinkPredictionModel(torch.nn.Module):
#     def __init__(self, hidden_channels):
#         super().__init__()
#         self.user_conv = SAGEConv(-1, hidden_channels)  # Automatically infer input channels
#         self.book_conv = SAGEConv(-1, hidden_channels)
#         self.fc = Linear(2 * hidden_channels, 1)

#     def forward(self, x_dict, edge_index_dict):
#         # Update node features
#         x_dict['users'] = F.relu(self.user_conv(x_dict['users'], edge_index_dict[('users', 'rated', 'books')]))
#         x_dict['books'] = F.relu(self.book_conv(x_dict['books'], edge_index_dict[('books', 'rev_rated', 'users')]))

#         # Concatenate user and book features for each edge
#         user_features = x_dict['users'][edge_index_dict[('users', 'rated', 'books')][0]]
#         book_features = x_dict['books'][edge_index_dict[('users', 'rated', 'books')][1]]
#         edge_features = torch.cat([user_features, book_features], dim=1)

#         # Predict the existence of a link
#         return torch.sigmoid(self.fc(edge_features))

# model = LinkPredictionModel(hidden_channels=32)

class HeteroGNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels, num_layers):
        super().__init__()

        self.convs = torch.nn.ModuleList()
        for _ in range(num_layers):
            conv = HeteroConv({
                ('user', 'rates', 'book'): SAGEConv((-1, -1), hidden_channels),
            }, aggr='sum')
            self.convs.append(conv)

        self.lin = Linear(hidden_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        for conv in self.convs:
            x_dict = conv(x_dict, edge_index_dict)
            x_dict = {key: F.relu(x) for key, x in x_dict.items()}
        return self.lin(x_dict['user'])

model = HeteroGNN(hidden_channels=64, out_channels=1, num_layers=2)



## Training

In [30]:
from torch.optim import Adam
from torch.nn import BCELoss

In [35]:
# Assuming 'data' is your HeteroData object and it has a 'book' key
num_books = data['books'].x.size(0)  # Total number of books

# Creating a mask with 80% of the data for training
train_mask = torch.rand(num_books) < 0.8

# Assigning the mask to your data object
data['books'].train_mask = train_mask


In [36]:
optimizer = Adam(model.parameters(), lr=0.01)
# criterion = BCELoss()

# def train(model, data, optimizer, criterion):
#     model.train()
#     optimizer.zero_grad()

#     predictions = model(data.x_dict, data.edge_index_dict)
#     actuals = (data['users', 'rated', 'books'].edge_attr > 7).float()  # Assuming 7+ is positive

#     loss = criterion(predictions.squeeze(), actuals)
#     loss.backward()
#     optimizer.step()
#     return loss.item()

# # Run the training loop
# for epoch in range(1, 11):
#     loss = train(model, data, optimizer, criterion)
#     print(f'Epoch {epoch}, Loss: {loss:.4f}')

def train(batch):
    model.train()
    optimizer.zero_grad()
    out = model(batch.x_dict, batch.edge_index_dict)
    mask = batch['book'].train_mask  # Assuming you have a train mask for books
    loss = F.cross_entropy(out[mask], batch['book'].y[mask])  # Adjust according to your label setup
    loss.backward()
    optimizer.step()
    return float(loss)


## Eval

In [38]:
from torch_geometric.loader import NeighborLoader
import torch_geometric.transforms as T

transform = T.ToUndirected()  # Make sure edges are bidirectional.

data = transform(data)  # Apply transformation to your graph data.

train_loader = NeighborLoader(
    data,
    # Sample 15 neighbors for each node and each edge type for 2 iterations:
    num_neighbors=[15] * 2,
    # Use a batch size of 128 for sampling training nodes of type "book":
    batch_size=128,
    input_nodes=('books', data['books'].train_mask),  # Adjust this to your setup
)

# Example usage
for batch in train_loader:
    loss = train(batch)
    print(f'Loss: {loss:.4f}')



AttributeError: 'EdgeStorage' object has no attribute 'edge_index'