In [1]:
import pandas as pd
import torch
import torch_geometric as pyg

from torch import nn
import torch.optim as optim
import torch.nn.functional as F
from torch_geometric.loader import NeighborLoader

import pandas as pd
import networkx as nx
import numpy as np
from sklearn.preprocessing import MinMaxScaler

from tqdm import tqdm

from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
print(device)

cuda


In [3]:
# device = "cpu"

## Load data

In [4]:
df_books = pd.read_csv('../data/books.csv')[['book_id', 'title', 'authors']]    # TODO: think about using also the columns

# df_ratings = pd.read_csv('../data/ratings.csv').sample(500000)  # TODO: remove the sampling on the final run
df_ratings = pd.read_csv('../data/ratings.csv')

print(df_books.columns)

Index(['book_id', 'title', 'authors'], dtype='object')


In [5]:
len(df_ratings)

5976479

In [6]:
# Create features
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)



In [7]:
# Books features
df_books["text_to_embed"] = "Title: " + df_books["title"] + " Authors: " + df_books["authors"]
with torch.no_grad():
    titles_emb = model.encode(df_books['text_to_embed'].values, device=device, show_progress_bar=True, batch_size=32)
    
del model
torch.cuda.empty_cache()    

books_features = torch.tensor(titles_emb)
print("Books features shape:", books_features.shape)

# Users features: as we don't have any information we will use random features
# users_features = torch.rand(df_ratings['user_id'].nunique(), 768, device=device)
# print("Users features shape:", users_features.shape)

Batches: 100%|██████████| 313/313 [00:04<00:00, 64.24it/s]


Books features shape: torch.Size([10000, 384])


In [8]:
# embedding users

# # Create a bipartite graph
B = nx.Graph()
# Add nodes with the node attribute "bipartite"
B.add_nodes_from(df_ratings['user_id'].unique(), bipartite=0)  # Users
B.add_nodes_from(df_ratings['book_id'].unique(), bipartite=1)  # Books

# Add edges between users and books
for _, row in tqdm(df_ratings.iterrows(), total=df_ratings.shape[0], desc="Adding edges"):
    B.add_edge(row['user_id'], row['book_id'], weight=row['rating'])

# Compute metrics
centrality = nx.degree_centrality(B)
print('degree centrality computed')
pagerank = nx.pagerank(B, weight='weight')
print('pagerank computed')
average_rating = df_ratings.groupby('user_id')['rating'].mean()
print('all metrics computed')

# # Prepare feature vectors for users
features = pd.DataFrame(index=df_ratings['user_id'].unique())
features['degree'] = [centrality[node] for node in features.index]
features['pagerank'] = [pagerank[node] for node in features.index]
features['average_rating'] = [average_rating.get(node, 0) for node in features.index]  # Add average ratings

# # Normalize features
scaler = MinMaxScaler()
features_scaled = pd.DataFrame(scaler.fit_transform(features), index=features.index, columns=features.columns)

# # Display the normalized features
users_features = features_scaled.to_numpy(dtype=np.float32)

features_scaled.head() 

# aprox 2 minutes

Adding edges: 100%|██████████| 5976479/5976479 [03:59<00:00, 24939.01it/s]


degree centrality computed
pagerank computed
all metrics computed


Unnamed: 0,degree,pagerank,average_rating
1,1.0,1.0,0.647436
2,0.957285,0.975597,0.853846
4,0.838199,0.838433,0.692164
6,0.495383,0.49506,0.830556
8,0.635039,0.541783,0.642857


In [9]:
# Merge the two dataframes keeping user_id, book_id, rating, title, authors
df_ratings = pd.merge(df_ratings, df_books, on='book_id')
df_ratings.head()

Unnamed: 0,user_id,book_id,rating,title,authors,text_to_embed
0,1,258,5,The Shadow of the Wind (The Cemetery of Forgot...,"Carlos Ruiz Zafón, Lucia Graves",Title: The Shadow of the Wind (The Cemetery of...
1,2,4081,4,I am Charlotte Simmons,Tom Wolfe,Title: I am Charlotte Simmons Authors: Tom Wolfe
2,2,260,5,How to Win Friends and Influence People,Dale Carnegie,Title: How to Win Friends and Influence People...
3,2,9296,5,The Drama of the Gifted Child: The Search for ...,"Alice Miller, Ruth Ward",Title: The Drama of the Gifted Child: The Sear...
4,2,2318,3,The Millionaire Next Door: The Surprising Secr...,"Thomas J. Stanley, William D. Danko",Title: The Millionaire Next Door: The Surprisi...


In [10]:
# Create a mapping from the user_id to a unique consecutive value in the range [0, num_users]:
unique_user_id = df_ratings['user_id'].unique()
unique_user_id = pd.DataFrame(data={
    'user_id': unique_user_id, 
    'mapped_user_id': pd.RangeIndex(len(unique_user_id))
    })
print("Mapping of user IDs to consecutive values:")
print("==========================================")
print(unique_user_id.head())
print()

# Create a mapping from the book_id to a unique consecutive value in the range [0, num_books]:
unique_book_id = df_ratings['book_id'].unique()
unique_book_id = pd.DataFrame(data={
    'book_id': unique_book_id,
    'mapped_book_id': pd.RangeIndex(len(unique_book_id))
    })
print("Mapping of book IDs to consecutive values:")
print("===========================================")
print(unique_book_id.head())
print()

Mapping of user IDs to consecutive values:
   user_id  mapped_user_id
0        1               0
1        2               1
2        4               2
3        6               3
4        8               4

Mapping of book IDs to consecutive values:
   book_id  mapped_book_id
0      258               0
1     4081               1
2      260               2
3     9296               3
4     2318               4



In [11]:
df_ratings = df_ratings.merge(unique_user_id, on='user_id')
df_ratings = df_ratings.merge(unique_book_id, on='book_id')

# With this, we are ready to create the edge_index representation in COO format
# following the PyTorch Geometric semantics:
edge_index = torch.stack([
    torch.tensor(df_ratings['mapped_user_id'].values), 
    torch.tensor(df_ratings['mapped_book_id'].values)]
    , dim=0)

print(edge_index[:, :10])

tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])


In [12]:
import torch_geometric.transforms as T
from torch_geometric.data import HeteroData

# Create the heterogeneous graph data object:
data = HeteroData()

# Add the user nodes:
data['user'].x = torch.tensor(users_features,)  # (num_users, num_users_features)

# Add the book nodes:
data['book'].x = torch.tensor(titles_emb,)  # (num_books, num_books_features)

# Add the rating edges:
data['user', 'rates', 'book'].edge_index = edge_index  # (2, num_ratings)

# Add the rating labels:
rating = torch.from_numpy(df_ratings['rating'].values)
data['user', 'rates', 'book'].edge_label = rating  # [num_ratings]

# We also need to make sure to add the reverse edges from books to users
# in order to let a GNN be able to pass messages in both directions.
# We can leverage the `T.ToUndirected()` transform for this from PyG:
data = T.ToUndirected()(data)

# With the above transformation we also got reversed labels for the edges.
# We remove them
del data['book', 'rev_rates', 'user'].edge_label

print(data['user'].num_nodes,len(unique_user_id))
assert data['user'].num_nodes == len(unique_user_id)
assert data['user', 'rates', 'book'].num_edges == len(df_ratings)

data

53424 53424


HeteroData(
  user={ x=[53424, 3] },
  book={ x=[10000, 384] },
  (user, rates, book)={
    edge_index=[2, 5976479],
    edge_label=[5976479],
  },
  (book, rev_rates, user)={ edge_index=[2, 5976479] }
)

In [13]:
data

HeteroData(
  user={ x=[53424, 3] },
  book={ x=[10000, 384] },
  (user, rates, book)={
    edge_index=[2, 5976479],
    edge_label=[5976479],
  },
  (book, rev_rates, user)={ edge_index=[2, 5976479] }
)

In [14]:
train_data, val_data, test_data = T.RandomLinkSplit(
    add_negative_train_samples=False,
    num_val=0.15,
    num_test=0.15,
    edge_types=[('user', 'rates', 'book')],
    rev_edge_types=[('book', 'rev_rates', 'user')],
)(data)
train_data, val_data, test_data 

(HeteroData(
   user={ x=[53424, 3] },
   book={ x=[10000, 384] },
   (user, rates, book)={
     edge_index=[2, 4183537],
     edge_label=[4183537],
     edge_label_index=[2, 4183537],
   },
   (book, rev_rates, user)={ edge_index=[2, 4183537] }
 ),
 HeteroData(
   user={ x=[53424, 3] },
   book={ x=[10000, 384] },
   (user, rates, book)={
     edge_index=[2, 4183537],
     edge_label=[1792942],
     edge_label_index=[2, 1792942],
   },
   (book, rev_rates, user)={ edge_index=[2, 4183537] }
 ),
 HeteroData(
   user={ x=[53424, 3] },
   book={ x=[10000, 384] },
   (user, rates, book)={
     edge_index=[2, 5080008],
     edge_label=[1792942],
     edge_label_index=[2, 1792942],
   },
   (book, rev_rates, user)={ edge_index=[2, 5080008] }
 ))

#### **NOTE:**
To be more specific:
* At **training** time:
  * $\text{training supervision edges} = \text{training message passing edges}$ <br>
* At **validation** time:
  * $\text{validation message passing edges} = \text{training message passing edges} = \text{training supervision edges}$ <br>
  * $\text{validation supervision edges} \notin \text{training supervision edges}$: disjoint with training supervision edges <br>

* At **test** time:
  * $\text{test message passing edges} = \text{validation supervison edges} + \text{training supervision edges}$ <br>
  * $\text{test supervision edges} \notin \lbrace \text{training supervision edges}, \text{valid supervision edges} \rbrace$: disjoint with training supervision edges and validation supervision edges. <br>

In [15]:
from torch_geometric.nn import SAGEConv, to_hetero
from torch import Tensor

class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        # Takes the edge_index (not the edge_label_index) as input, and performs
        # message passing on the graph.
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x


class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()

    def forward(self, x_user: Tensor, x_book: Tensor, edge_label_index):     
        # NEW VERSION (simplified)
        row, col = edge_label_index

        # Apply dot-product to get a prediction per supervision edge:
        return (x_user[row] * x_book[col]).sum(dim=-1)

class Model(torch.nn.Module):
    def __init__(self, hidden_channels: int, data: HeteroData):
        super().__init__()
        
        self.book_lin = torch.nn.Linear(384, hidden_channels)
        self.user_emb = torch.nn.Embedding(data["user"].num_nodes, hidden_channels)
        self.book_emb = torch.nn.Embedding(data["book"].num_nodes, hidden_channels)
        
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, data: HeteroData):
        # This is completely ignoring the user features and only caring about the book features.
        x_dict = {
          "user": self.user_emb(data["user"].n_id),
          "book": self.book_lin(data["book"].x) + self.book_emb(data["book"].n_id),
        }
                
        # `x_dict` holds feature matrices of all node types
        # `edge_index_dict` holds all edge indices of all edge types
        x_dict = self.encoder(x_dict, data.edge_index_dict)
        return self.decoder(
            x_dict["user"],
            x_dict["book"],
            data["user", "rates", "book"].edge_label_index,
        )

In [16]:
model = Model(hidden_channels=64, data=data).to(device)
print(model)

Model(
  (book_lin): Linear(in_features=384, out_features=64, bias=True)
  (user_emb): Embedding(53424, 64)
  (book_emb): Embedding(10000, 64)
  (encoder): GraphModule(
    (conv1): ModuleDict(
      (user__rates__book): SAGEConv((-1, -1), 64, aggr=mean)
      (book__rev_rates__user): SAGEConv((-1, -1), 64, aggr=mean)
    )
    (conv2): ModuleDict(
      (user__rates__book): SAGEConv((-1, -1), 64, aggr=mean)
      (book__rev_rates__user): SAGEConv((-1, -1), 64, aggr=mean)
    )
  )
  (decoder): EdgeDecoder()
)


In [17]:
from torch_geometric.loader import LinkNeighborLoader

# Define seed edges:
edge_label_index = train_data["user", "rates", "book"].edge_label_index
edge_label = train_data["user", "rates", "book"].edge_label

train_loader = LinkNeighborLoader(
    data=train_data,  # TODO
    num_neighbors=[25, 25],  # TODO
    neg_sampling_ratio=2,  # TODO
    edge_label_index=(("user", "rates", "book"), edge_label_index),
    edge_label=edge_label,
    batch_size=4096,
    shuffle=True,
)

val_loader = LinkNeighborLoader(
    data=val_data,  # TODO
    num_neighbors=[25, 25],  # TODO
    neg_sampling_ratio=2,  # TODO
    edge_label_index=(("user", "rates", "book"), edge_label_index),
    edge_label=edge_label,
    batch_size=4096,
    shuffle=True,
)


In [18]:
from tqdm import tqdm
import torch.nn.functional as F

logging_steps = 15
train_losses = []
valid_losses = []

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: '{device}'")
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


for epoch in range(1, 10):
    total_loss = total_examples = 0
    
    est_loss = float("inf")
    pbar = tqdm(enumerate(train_loader), desc=f"Epoch {epoch:03d} - Estimated loss {est_loss}", total=len(train_loader))
    
    for i, sampled_data in pbar:
        optimizer.zero_grad()
        batch = sampled_data.to(device)
        pred = model.forward(batch)
        loss = F.mse_loss(pred, batch["user", "rates", "book"].edge_label.to(torch.float32))
        
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * pred.numel()
        total_examples += pred.numel()
        
        # Update progress bar
        if i % logging_steps == 0:
            est_loss = total_loss / total_examples
            pbar.set_description(f"Epoch {epoch:03d} - Estimated loss {est_loss}")
            train_losses.append(est_loss)
            
    # Run Validation
    with torch.no_grad():
        model.eval()
        total_valid_loss = total_valid_examples = 0
        for batch in tqdm(val_loader, desc=f"Validation {epoch:03d}"):
            batch = batch.to(device)
            pred = model.forward(batch)
            loss = F.mse_loss(pred, batch["user", "rates", "book"].edge_label.to(torch.float32))
            
            total_valid_loss += float(loss) * pred.numel()
            total_valid_examples += pred.numel()
        valid_loss = total_valid_loss / total_valid_examples
        valid_losses.append(valid_loss)
        print(f"Validation loss: {valid_loss}")
        model.train()

Device: 'cuda'


Epoch 001 - Estimated loss 2.6434264808864016: 100%|██████████| 1022/1022 [03:42<00:00,  4.59it/s]
Validation 001: 100%|██████████| 1022/1022 [03:17<00:00,  5.17it/s]


Validation loss: 2.1951655055545634


Epoch 002 - Estimated loss 2.0523059635507956: 100%|██████████| 1022/1022 [03:59<00:00,  4.27it/s]
Validation 002: 100%|██████████| 1022/1022 [03:16<00:00,  5.20it/s]


Validation loss: 1.9584298187481408


Epoch 003 - Estimated loss 1.9093362033425536: 100%|██████████| 1022/1022 [03:50<00:00,  4.44it/s]
Validation 003:  30%|██▉       | 306/1022 [00:54<02:19,  5.15it/s]

In [None]:
import matplotlib.pyplot as plt

plt.plot(train_losses, label="Train loss")
plt.plot(valid_losses, label="Validation loss")
plt.legend()
plt.show()

In [None]:
# Save model
torch.save(model.state_dict(), "new_model.pt")