In [None]:
import pandas as pd
import torch
import torch_geometric as pyg

from torch import nn
import torch.optim as optim
import torch.nn.functional as F
from torch_geometric.loader import NeighborLoader

import pandas as pd
import networkx as nx
import numpy as np
from sklearn.preprocessing import MinMaxScaler

from tqdm import tqdm

from sentence_transformers import SentenceTransformer

In [None]:
# Choose the compute device: CUDA first, then Apple MPS, otherwise the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
print(device)

In [None]:
# NOTE(review): this unconditionally overrides the device detected in the
# previous cell, so everything below runs on the CPU — presumably deliberate; confirm.
device = "cpu"

## Load data

In [None]:
# Book metadata: keep only the id and the two text fields that get embedded.
# TODO: consider also using the other columns of books.csv.
book_columns = ['book_id', 'title', 'authors']
df_books = pd.read_csv('../data/books.csv')[book_columns]

# Ratings: the full file is used. For quick experiments a sample can be taken instead:
# df_ratings = pd.read_csv('../data/ratings.csv').sample(500000)  # TODO: remove the sampling on the final run
df_ratings = pd.read_csv('../data/ratings.csv')

print(df_books.columns)

In [None]:
# Load the pre-trained sentence encoder ('all-MiniLM-L6-v2') on `device`;
# it is used to embed the book text in the next cell.
# Note: the name `model` is reused later in this notebook for the GNN.
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

In [None]:
# Books features: one sentence embedding per row of df_books, built from
# "Title: <title> Authors: <authors>".
# Missing titles/authors are replaced by empty strings first: in pandas
# `str + NaN` is NaN, and a NaN entry is not a valid sentence for the encoder.
df_books["text_to_embed"] = (
    "Title: " + df_books["title"].fillna("").astype(str)
    + " Authors: " + df_books["authors"].fillna("").astype(str)
)
with torch.no_grad():
    titles_emb = model.encode(
        df_books["text_to_embed"].tolist(),  # plain list of str
        device=device,
        show_progress_bar=True,
        batch_size=32,
    )

# The sentence encoder is not needed any more: drop it and release cached GPU memory.
del model
torch.cuda.empty_cache()

# Row i of the embeddings corresponds to row i of df_books.
books_features = torch.tensor(titles_emb)
print("Books features shape:", books_features.shape)

# Users features: as we don't have any information we will use random features
# users_features = torch.rand(df_ratings['user_id'].nunique(), 768, device=device)
# print("Users features shape:", users_features.shape)

In [None]:
# User features derived from the structure of the user-book rating graph:
# degree centrality, rating-weighted PageRank and the user's mean rating.

rating_user_ids = df_ratings['user_id'].unique()
rating_book_ids = df_ratings['book_id'].unique()

# Create a bipartite graph.
# Nodes are namespaced as ("user", id) / ("book", id): user_id and book_id are
# both plain integers, so using the raw ids as node keys would merge a user and
# a book that happen to share a number into one node and distort the metrics.
B = nx.Graph()
B.add_nodes_from((("user", uid) for uid in rating_user_ids.tolist()), bipartite=0)  # Users
B.add_nodes_from((("book", bid) for bid in rating_book_ids.tolist()), bipartite=1)  # Books

# Add all user-book edges in one bulk call (weight = rating) instead of
# iterating over the DataFrame row by row.
rating_triples = zip(
    df_ratings['user_id'].tolist(),
    df_ratings['book_id'].tolist(),
    df_ratings['rating'].tolist(),
)
B.add_weighted_edges_from(
    (("user", uid), ("book", bid), rating)
    for uid, bid, rating in tqdm(rating_triples, total=len(df_ratings), desc="Adding edges")
)

# Compute metrics
centrality = nx.degree_centrality(B)
print('degree centrality computed')
pagerank = nx.pagerank(B, weight='weight')
print('pagerank computed')
average_rating = df_ratings.groupby('user_id')['rating'].mean()
print('all metrics computed')

# Prepare feature vectors for users (one row per user, indexed by user_id,
# in order of first appearance in df_ratings).
features = pd.DataFrame(index=rating_user_ids)
user_nodes = [("user", uid) for uid in rating_user_ids.tolist()]
features['degree'] = [centrality[node] for node in user_nodes]
features['pagerank'] = [pagerank[node] for node in user_nodes]
# Users without any rating would get 0, as before.
features['average_rating'] = average_rating.reindex(features.index).fillna(0).to_numpy()

# Normalize every feature column to [0, 1]
scaler = MinMaxScaler()
features_scaled = pd.DataFrame(scaler.fit_transform(features), index=features.index, columns=features.columns)

# Row order of users_features == row order of features_scaled
users_features = features_scaled.to_numpy(dtype=np.float32)

# Display the normalized features
features_scaled.head()

# Runtime note from the original author: approx. 2 minutes (measured with row-by-row edge insertion)

In [None]:
# Attach the df_books columns to every rating (inner join on book_id).
df_ratings = df_ratings.merge(df_books, on='book_id')
df_ratings.head()

In [None]:
# Map every user_id to a consecutive index in [0, num_users),
# numbered in order of first appearance in df_ratings.
seen_user_ids = df_ratings['user_id'].unique()
unique_user_id = pd.DataFrame({
    'user_id': seen_user_ids,
    'mapped_user_id': pd.RangeIndex(len(seen_user_ids)),
})
print("Mapping of user IDs to consecutive values:")
print("==========================================")
print(unique_user_id.head())
print()

# Same for the books: book_id -> consecutive index in [0, num_books).
seen_book_ids = df_ratings['book_id'].unique()
unique_book_id = pd.DataFrame({
    'book_id': seen_book_ids,
    'mapped_book_id': pd.RangeIndex(len(seen_book_ids)),
})
print("Mapping of book IDs to consecutive values:")
print("===========================================")
print(unique_book_id.head())
print()

In [None]:
# Add the consecutive user / book indices to every rating row.
df_ratings = (
    df_ratings
    .merge(unique_user_id, on='user_id')
    .merge(unique_book_id, on='book_id')
)

# Edge list in COO format as expected by PyTorch Geometric:
# row 0 holds the mapped user index, row 1 the mapped book index of each rating.
user_index = torch.tensor(df_ratings['mapped_user_id'].values)
book_index = torch.tensor(df_ratings['mapped_book_id'].values)
edge_index = torch.stack((user_index, book_index), dim=0)

print(edge_index[:, :10])

In [None]:
import torch_geometric.transforms as T
from torch_geometric.data import HeteroData

# Create the heterogeneous graph data object:
data = HeteroData()

# `edge_index` addresses nodes by mapped_user_id / mapped_book_id, so row i of
# each feature matrix must describe the node whose mapped id is i.
# The raw feature arrays are in a different order (users: order of
# features_scaled; books: row order of df_books), hence the explicit reindexing.
# get_indexer requires unique ids on the lookup side and returns -1 for misses.
user_rows = features_scaled.index.get_indexer(unique_user_id['user_id'])
assert (user_rows >= 0).all(), "a rated user has no row in the user features"

book_rows = pd.Index(df_books['book_id']).get_indexer(unique_book_id['book_id'])
assert (book_rows >= 0).all(), "a rated book has no row in the book embeddings"

# Add the user nodes:
data['user'].x = torch.tensor(users_features[user_rows])  # (num_users, num_users_features)

# Add the book nodes:
data['book'].x = torch.tensor(titles_emb[book_rows])  # (num_books, num_books_features)

# Add the rating edges:
data['user', 'rates', 'book'].edge_index = edge_index  # (2, num_ratings)

# Add the rating labels:
rating = torch.from_numpy(df_ratings['rating'].values)
data['user', 'rates', 'book'].edge_label = rating  # [num_ratings]

# We also need to make sure to add the reverse edges from books to users
# in order to let a GNN be able to pass messages in both directions.
# We can leverage the `T.ToUndirected()` transform for this from PyG:
data = T.ToUndirected()(data)

# With the above transformation we also got reversed labels for the edges.
# We remove them
del data['book', 'rev_rates', 'user'].edge_label

# Sanity checks: node / edge counts must match the id mappings and the ratings table.
print(data['user'].num_nodes,len(unique_user_id))
assert data['user'].num_nodes == len(unique_user_id)
assert data['book'].num_nodes == len(unique_book_id)
assert data['user', 'rates', 'book'].num_edges == len(df_ratings)

data

In [None]:
# Display the summary of the graph object again.
data

In [None]:
# Split the ('user', 'rates', 'book') edges: 15% validation, 15% test, the rest
# for training. The reverse edge type is passed so both directions are split together.
# NOTE(review): neg_sampling_ratio is left at its default, so negative
# (non-existing) edges are sampled as well; per the PyG documentation existing
# edge labels are then shifted by one and negatives get label 0 — confirm this
# is intended for rating regression.
link_split = T.RandomLinkSplit(
    num_val=0.15,
    num_test=0.15,
    add_negative_train_samples=True,
    edge_types=[('user', 'rates', 'book')],
    rev_edge_types=[('book', 'rev_rates', 'user')],
)
train_data, val_data, test_data = link_split(data)
train_data, val_data, test_data

In [None]:
from torch_geometric.nn import SAGEConv, to_hetero

class GNNEncoder(torch.nn.Module):
    """Two stacked SAGEConv layers with a ReLU in between.

    Both layers are built with lazy input sizes ``(-1, -1)``, so the input
    feature dimensions are not fixed at construction time.

    Args:
        hidden_channels: output size of the first layer.
        out_channels: output size of the second layer.
    """

    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        lazy_in = (-1, -1)
        self.conv1 = SAGEConv(lazy_in, hidden_channels)
        self.conv2 = SAGEConv(lazy_in, out_channels)

    def forward(self, x, edge_index):
        """Return the node embeddings produced by the two convolutions."""
        hidden = self.conv1(x, edge_index).relu()
        return self.conv2(hidden, edge_index)


class EdgeDecoder(torch.nn.Module):
    """MLP that scores (user, book) pairs from their node embeddings.

    Args:
        hidden_channels: size of each node embedding; the two embeddings of a
            pair are concatenated, so the first linear layer takes twice that.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = torch.nn.Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = torch.nn.Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        """Return one score per column of ``edge_label_index`` as a 1-D tensor.

        ``edge_label_index[0]`` indexes ``z_dict['user']`` and
        ``edge_label_index[1]`` indexes ``z_dict['book']``.
        """
        user_idx, book_idx = edge_label_index
        pair = torch.cat((z_dict['user'][user_idx], z_dict['book'][book_idx]), dim=-1)
        hidden = torch.relu(self.lin1(pair))
        return self.lin2(hidden).view(-1)


class Model(torch.nn.Module):
    """Heterogeneous GNN encoder followed by an edge decoder.

    Args:
        hidden_channels: embedding size used by the encoder and the decoder.
        metadata: ``(node_types, edge_types)`` of the graph, as returned by
            ``HeteroData.metadata()``. When omitted, the notebook-level
            ``data`` object is queried (the previous behaviour). Pass it
            explicitly whenever ``data`` may have been rebound to something
            else — this notebook later reuses that name for a plain dict.
    """

    def __init__(self, hidden_channels, metadata=None):
        super().__init__()
        if metadata is None:
            metadata = data.metadata()
        # Convert the homogeneous encoder into one that handles every node/edge type.
        self.encoder = to_hetero(
            GNNEncoder(hidden_channels, hidden_channels), metadata, aggr='sum'
        )
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        """Encode all nodes, then score the pairs listed in ``edge_label_index``."""
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)


# Instantiate the model with 10 hidden channels and move it to `device`.
# NOTE(review): the encoder's SAGEConv layers use lazy (-1, -1) input sizes;
# PyG's examples run one forward pass before creating the optimizer so the
# parameter shapes are known — confirm whether that is needed here.
model = Model(hidden_channels=10).to(device)
print(model)

In [None]:
# Inspect the validation split.
val_data

In [None]:
from torch_geometric.loader import HGTLoader
# Mini-batch loaders seeded with user nodes: every user node is a seed
# (all-True masks), 128 seeds per batch.
# NOTE(review): HGTLoader samples node neighbourhoods; confirm that the
# edge-level `edge_label_index` / `edge_label` of each batch refer to the
# sampled sub-graph — PyG's LinkNeighborLoader is the loader built for
# edge-level supervision.
train_mask = torch.ones(train_data["user"].x.shape[0], dtype=torch.bool)
val_mask = torch.ones(val_data["user"].x.shape[0], dtype=torch.bool)

train_loader = HGTLoader(
    train_data,
    input_nodes=("user", train_mask),
    num_samples=[1024] * 4,
    batch_size=128,
    shuffle=True,
)
val_loader = HGTLoader(
    val_data,
    input_nodes=("user", val_mask),
    num_samples=[1024] * 4,
    batch_size=128,
    shuffle=False,
)

In [None]:
# Training Loop
def train(model, data_loader, optimizer, criterion):
    """Train `model` for one pass over `data_loader`.

    Each batch is moved to the notebook-level `device`, scored on its
    ('user', 'rates', 'book') label edges and used for one optimizer step.

    Returns:
        The mean of the per-batch losses as a float (0.0 for an empty loader).
        The mean is taken over the batches actually processed rather than over
        an estimate derived from a hard-coded batch size.
    """
    model.train()
    edge_type = ('user', 'rates', 'book')
    total_loss = 0.0
    num_batches = 0
    for batch in tqdm(data_loader):
        batch = batch.to(device)
        optimizer.zero_grad()
        pred = model(batch.x_dict, batch.edge_index_dict, batch[edge_type].edge_label_index)
        # Labels are cast to float for the regression loss.
        loss = criterion(pred, batch[edge_type].edge_label.float())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1
    return total_loss / num_batches if num_batches else 0.0

# Testing Loop
def test(model, data_loader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            batch = batch.to(device)
            pred = model(batch.x_dict, batch.edge_index_dict, batch['user', 'rates', 'book'].edge_label_index)
            loss = criterion(pred, batch['user', 'rates', 'book'].edge_label.float())
            total_loss += loss.item()
    return total_loss / (len(data_loader.dataset) / 128)

# Main training and testing routines (mini-batch version):
# Adam with lr=0.01 and a mean-squared-error loss on the ratings.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# One entry per epoch.
train_losses, val_losses = [], []

num_epochs = 10
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, criterion)
    val_loss = test(model, val_loader, criterion)
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    print(f'Epoch: {epoch+1}, Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')

# Optionally, after training, you can evaluate your model on the test dataset
# test_loss = test(model, test_data, criterion)
# print(f'Test Loss: {test_loss:.4f}')


In [None]:
import matplotlib.pyplot as plt
# Loss curves of the mini-batch training run (one point per epoch).
fig, ax = plt.subplots()
ax.plot(train_losses, label='Training Loss')
ax.plot(val_losses, label='Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

In [None]:
# Training Loop
def train(model, data, optimizer, criterion):
    """Run one full-batch optimisation step on `data`.

    `data` is moved to the notebook-level `device` unless that is the CPU.
    The model scores the ('user', 'rates', 'book') label edges and the loss
    against their labels (cast to float) is back-propagated.

    Returns:
        The loss of this step as a Python float.
    """
    model.train()
    if device != 'cpu':
        data = data.to(device)
    label_store = data['user', 'rates', 'book']

    optimizer.zero_grad()
    prediction = model(data.x_dict, data.edge_index_dict, label_store.edge_label_index)
    loss = criterion(prediction, label_store.edge_label.float())
    loss.backward()
    optimizer.step()
    return loss.item()

# Testing Loop
def test(model, data, criterion):
    model.eval()
    with torch.no_grad():
        if device != 'cpu':
            data = data.to(device)
        pred = model(data.x_dict, data.edge_index_dict, data['user', 'rates', 'book'].edge_label_index)
        loss = criterion(pred, data['user', 'rates', 'book'].edge_label.float())
        total_loss = loss.item()
    return total_loss

# Main training and testing routines (full-batch version):
# Adam with lr=0.005 and a mean-squared-error loss on the ratings.
optimizer = optim.Adam(model.parameters(), lr=0.005)
criterion = nn.MSELoss()

# One entry per optimisation step (not per epoch).
train_losses = []
valid_losses = []

num_epochs = 100
# Every inner iteration is one full-batch step on `train_data`. The mini-batches
# of `train_loader` were never used, only their count, so the count is taken
# directly: same number of optimisation steps, no neighbourhood sampling.
steps_per_epoch = len(train_loader)
for epoch in range(num_epochs):
    for _ in range(steps_per_epoch):
        train_loss = train(model, train_data, optimizer, criterion)
        val_loss = test(model, val_data, criterion)
        print(f'Epoch: {epoch+1}, Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')
        train_losses.append(train_loss)
        valid_losses.append(val_loss)

# After training, evaluate the model on the held-out test split
test_loss = test(model, test_data, criterion)
print(f'Test Loss: {test_loss:.4f}')

In [None]:
# Evaluate on the held-out test split (same call as at the end of the training cell).
test_loss = test(model, test_data, criterion)

In [None]:
# Report the test loss with four decimals.
print(f'Test Loss: {test_loss:.4f}')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot the training and validation losses.
# The full-batch training loop appends one value per optimisation step
# (several per epoch), so the x-axis counts steps rather than epochs.
plt.figure(figsize=(10, 5))
plt.plot(train_losses, label='Training Loss')
plt.plot(valid_losses, label='Validation Loss')
plt.xlabel('Training step (full batch)')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Persist the trained weights (state_dict only) to model.pt in the working directory.
torch.save(model.state_dict(), "model.pt")

## Validation & metrics

In [None]:
# Rebuild the architecture (same hidden size as used for training) and restore
# the weights saved in model.pt.
model = Model(hidden_channels=10).to(device)
# The restored model is only used for inference below.
model.eval()
# map_location makes the checkpoint loadable regardless of the device it was saved from.
model.load_state_dict(torch.load("model.pt", map_location=device))

In [None]:
# Predict a rating for every supervision edge of the test split.
# Inference only: disabling autograd avoids building a graph over the whole split.
with torch.no_grad():
    pred_review = model(test_data.x_dict, test_data.edge_index_dict, test_data['user', 'rates', 'book'].edge_label_index)

In [None]:
import seaborn as sns
# Box plot of the predicted values for the test edges.
sns.boxplot(pred_review.cpu().detach().numpy())

In [None]:
# Inspect the index pairs of the test supervision edges
# (row 0 indexes users, row 1 indexes books — see EdgeDecoder.forward).
test_data['user', 'rates', 'book'].edge_label_index

In [None]:
# Inspect the target labels of the test supervision edges.
test_data['user', 'rates', 'book'].edge_label

In [None]:
# Convert tensors to numpy arrays (moved to the CPU first, as for the box plot).
test_label_store = test_data['user', 'rates', 'book']
user_ids_np = test_label_store.edge_label_index[0].cpu().numpy()
book_ids_np = test_label_store.edge_label_index[1].cpu().numpy()
ratings_np = test_label_store.edge_label.cpu().numpy()
ratings_pred_np = pred_review.detach().cpu().numpy()

# Column data for the evaluation table.
# 'user_id' / 'book_id' hold the node indices used in the graph (the mapped ids),
# not the ids of the CSV files.
# Kept under its own name: `data` is the HeteroData object that `Model` reads
# its metadata from and must not be overwritten.
prediction_columns = {
    'user_id': user_ids_np,
    'book_id': book_ids_np,
    'rating': ratings_np,
    'predicted_rating': ratings_pred_np
}

# Create a pandas DataFrame (replaces the ratings table from here on)
df_ratings = pd.DataFrame(prediction_columns)

In [None]:
import sys
sys.path.append('../src')
from evaluation_metrics import *

# Ranking quality of the predictions at cut-off k, using the helpers imported
# from evaluation_metrics.
k = 10
top_k_recommendations = get_top_k_recommendations(df_ratings, k)
actual_items = get_actual_items(df_ratings)  # ground truth

# Evaluate the recommendations
scores = evaluate_recommendations(top_k_recommendations, actual_items, k)
mean_precision, mean_recall, mean_f1 = scores
print(f"Mean Precision@{k}: {mean_precision}")
print(f"Mean Recall@{k}: {mean_recall}")
print(f"Mean F1 Score@{k}: {mean_f1}")

Matrix factorization baseline (for comparison):

Mean Precision@10: 0.7722234424908242
Mean Recall@10: 0.5475533441372822
Mean F1 Score@10: 0.6128487333956821

In [None]:
# TODO
# - visualization of what the data looks like
#
# Report outline:
# - objective and motivation
# - analysis of the data
# - method: improving over the matrix factorization baseline
# - results
# - future study: possibly an idea about how to use diversity