In [14]:
import torch
torch.manual_seed(500)
from torch import Tensor, nn
import csv
import pandas as pd
import numpy as np
from torch_geometric.nn import to_hetero, SAGEConv
import torch.nn.functional as F
from torch.optim import Adam
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
from typing import List, Tuple
from datetime import datetime

In [2]:
def read_tmdb_movies():
    columns_of_interest = ['budget', 'revenue', 'vote_average', 'vote_count', 'production_companies', 'id']
    movies_df = pd.read_csv("./data/tmdb_5000_movies.csv", usecols=columns_of_interest)
    movies_df.fillna({
        'budget': 0,
        'revenue': 0,
        'vote_count': 0,
        'production_companies': ''
        })

    links_df = pd.read_csv("./data/links.csv", usecols=['movieId', 'tmdbId'])
    movies_linked_df = pd.merge(movies_df, links_df, left_on='id', right_on='tmdbId', how='inner')

    movies_info_df = pd.read_csv("./data/movies.csv")
    movies_df = pd.merge(movies_linked_df, movies_info_df, on='movieId', how='inner')

    ratings_df = pd.read_csv("./data/ratings.csv")
    ratings_df = ratings_df[ratings_df['movieId'].isin(movies_df['movieId'])]
    return movies_df, ratings_df

movies_df, ratings_df = read_tmdb_movies()

In [3]:
def standardize(x):
    return (x - x.mean()) / x.std()

# Filter movies based on ratings
movies_df = movies_df[movies_df['movieId'].isin(ratings_df['movieId'])]

# Process genres
genres = movies_df['genres'].str.get_dummies('|')

# Process numerical features
numerical_features = ['vote_average', 'revenue', 'budget', 'vote_count']
movies_df['vote_average'] = movies_df['vote_average'].fillna(5.0)

# Convert to numpy, standardize, and create tensors
num_features = movies_df[numerical_features].values.astype(float)
num_features_standardized = standardize(num_features)
num_tensors = torch.from_numpy(num_features_standardized).float()

# Combine all features
movie_feat = torch.cat([torch.from_numpy(genres.values).float(), num_tensors], dim=1)

In [4]:
# construct a compact representation of the data
unique_user_id = ratings_df['userId'].unique()
unique_user_id = pd.DataFrame(data={
    'userId': unique_user_id,
    'mappedId': pd.RangeIndex(len(unique_user_id)),
})
unique_movie_id = ratings_df['movieId'].unique()
unique_movie_id = pd.DataFrame(data={
    'movieId': unique_movie_id,
    'mappedId': pd.RangeIndex(len(unique_movie_id)),
})

ratings_user_id = pd.merge(ratings_df['userId'], unique_user_id, on='userId', how='left')
ratings_user_id = torch.from_numpy(ratings_user_id['mappedId'].values)
ratings_movie_id = pd.merge(ratings_df['movieId'], unique_movie_id, on='movieId', how='left')
ratings_movie_id = torch.from_numpy(ratings_movie_id['mappedId'].values)

edge_index_user_to_movie = torch.stack([ratings_user_id, ratings_movie_id], dim=0)

In [5]:
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

data = HeteroData()
# save node indices
data['user'].node_id = torch.arange(len(unique_user_id))
data['movie'].node_id = torch.arange(len(unique_movie_id))

# add node features
data['movie'].x = movie_feat

data['user', 'rates', 'movie'].edge_index = edge_index_user_to_movie
data['user', 'rates', 'movie'].edge_label = torch.from_numpy(ratings_df['rating'].values).to(torch.long) # TODO: this cuts off the .5 steps
mask = data['user', 'rates', 'movie'].edge_label >= 4
del data['user', 'rates', 'movie'].edge_label 
data['user', 'rates', 'movie'].edge_index = data['user', 'rates', 'movie'].edge_index[:, mask]

data = T.ToUndirected()(data)

print(data)

  from .autonotebook import tqdm as notebook_tqdm


HeteroData(
  user={ node_id=[327536] },
  movie={
    node_id=[4632],
    x=[4632, 24],
  },
  (user, rates, movie)={ edge_index=[2, 11821435] },
  (movie, rev_rates, user)={ edge_index=[2, 11821435] }
)


In [6]:
transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=2.0,
    add_negative_train_samples=False,
    edge_types=[("user", "rates", "movie")],
    rev_edge_types=[("movie", "rev_rates", "user")],
)
train_data, val_data, test_data = transform(data)

from torch_geometric.loader import LinkNeighborLoader
# Example configuration with correct tuple formatting for edge_label_index:
train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[10,10,10],
    edge_label_index=(('user', 'rates', 'movie'), train_data['user', 'rates', 'movie'].edge_label_index),  # Correct tuple format
    edge_label=train_data['user', 'rates', 'movie'].edge_label,  # Labels for the edges
    batch_size=128,
    shuffle=True,
    neg_sampling_ratio=2.0  # Example negative sampling ratio
)

# Define the validation seed edges:
edge_label_index = val_data["user", "rates", "movie"].edge_label_index
edge_label = val_data["user", "rates", "movie"].edge_label
val_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors=[10,10,10],
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=128*3,
    shuffle=False,
)

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 

In [7]:
config = {
    'movie_feature_dim': 24,
    'hidden_dim': 64,
    'num_layers': 2
}

In [10]:
class GNN(nn.Module):
    """Graph Neural Network using SAGEConv layers."""
    def __init__(self, in_channels: int, hidden_channels: int, num_layers: int):
        super().__init__()
        self.convs = nn.ModuleList()
        self.batch_norms = nn.ModuleList()
        
        for i in range(num_layers):
            in_channels = in_channels if i == 0 else hidden_channels
            self.convs.append(SAGEConv(in_channels, hidden_channels))
            self.batch_norms.append(nn.BatchNorm1d(hidden_channels))
    
    def forward(self, x: Tensor, edge_index: Tensor) -> Tensor:
        for conv, bn in zip(self.convs, self.batch_norms):
            x = conv(x, edge_index)
            x = bn(x)
            x = torch.relu(x)
        return x

class Classifier(nn.Module):
    """Classifier for link prediction."""
    def __init__(self, hidden_dim: int):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(2*hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )
    
    def forward(self, user_features: Tensor, movie_features: Tensor, edge_index: Tensor) -> Tensor:
        edge_features = torch.cat([user_features[edge_index[0]], movie_features[edge_index[1]]], dim=1)
        return self.layers(edge_features).view(-1)

class RecommendationModel(nn.Module):
    """Main model for movie recommendations."""
    def __init__(self, config: dict, data: HeteroData):
        super().__init__()
        self.movie_lin = nn.Linear(config['movie_feature_dim'], config['hidden_dim'])
        self.user_emb = nn.Embedding(data['user'].num_nodes, config['hidden_dim'])
        self.movie_emb = nn.Embedding(data['movie'].num_nodes, config['hidden_dim'])
        
        self.gnn = GNN(config['hidden_dim'], config['hidden_dim'], config['num_layers'])
        self.gnn = to_hetero(self.gnn, metadata=data.metadata(), aggr='sum')
        
        self.classifier = Classifier(config['hidden_dim'])
    
    def forward(self, data: HeteroData) -> Tensor:
        x_dict = {
            "user": self.user_emb(data['user'].node_id),
            "movie": self.movie_lin(data['movie'].x) + self.movie_emb(data['movie'].node_id),
        }
        x_dict = self.gnn(x_dict, data.edge_index_dict)
        
        return self.classifier(
            x_dict["user"],
            x_dict["movie"],
            data["user", "rates", "movie"].edge_label_index,
        )

model = RecommendationModel(config, data)
print(model)

RecommendationModel(
  (movie_lin): Linear(in_features=24, out_features=64, bias=True)
  (user_emb): Embedding(327536, 64)
  (movie_emb): Embedding(4632, 64)
  (gnn): GraphModule(
    (convs): ModuleList(
      (0-1): 2 x ModuleDict(
        (user__rates__movie): SAGEConv(64, 64, aggr=mean)
        (movie__rev_rates__user): SAGEConv(64, 64, aggr=mean)
      )
    )
    (batch_norms): ModuleList(
      (0-1): 2 x ModuleDict(
        (user): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (movie): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
  )
  (classifier): Classifier(
    (layers): Sequential(
      (0): Linear(in_features=128, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=1, bias=True)
    )
  )
)


In [15]:
def generate_simple_filename(auc: float) -> str:
    """
    Generate a simple filename for saving the model.
    
    Args:
    auc (float): The AUC score of the model.
    
    Returns:
    str: A formatted filename string.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    auc_str = f"{auc:.4f}".replace('.', '')
    
    return f"model_{timestamp}_auc{auc_str}.pt"

In [16]:
def train_epoch(model: torch.nn.Module, 
                loader: torch.utils.data.DataLoader, 
                optimizer: torch.optim.Optimizer, 
                device: torch.device) -> float:
    model.train()
    total_loss = 0
    total_examples = 0
    
    for batch in tqdm(loader, desc="Training"):
        batch = batch.to(device)
        optimizer.zero_grad()
        pred = model(batch)
        label = batch['user', 'rates', 'movie'].edge_label
        loss = F.binary_cross_entropy_with_logits(pred, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * label.numel()
        total_examples += label.numel()
    
    return total_loss / total_examples

def validate(model: torch.nn.Module, 
             loader: torch.utils.data.DataLoader, 
             device: torch.device) -> Tuple[float, List[float], List[float]]:
    model.eval()
    preds = []
    ground_truths = []
    
    with torch.no_grad():
        for batch in tqdm(loader, desc="Validating"):
            batch = batch.to(device)
            pred = model(batch)
            preds.append(pred)
            ground_truths.append(batch["user", "rates", "movie"].edge_label)
    
    pred = torch.cat(preds, dim=0).cpu().numpy()
    ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
    auc = roc_auc_score(ground_truth, pred)
    
    return auc, pred.tolist(), ground_truth.tolist()

def train_model(model: torch.nn.Module, 
                train_loader: torch.utils.data.DataLoader, 
                val_loader: torch.utils.data.DataLoader, 
                num_epochs: int = 150, 
                lr: float = 0.001, 
                device: str = "cuda" if torch.cuda.is_available() else "cpu", 
                save_path: str = "./model") -> Tuple[List[float], List[float]]:
    print(f'Device: {device}')
    device = torch.device(device)
    model = model.to(device)
    optimizer = Adam(model.parameters(), lr=lr)
    
    train_losses = []
    val_aucs = []
    
    for epoch in range(num_epochs):
        train_loss = train_epoch(model, train_loader, optimizer, device)
        val_auc, _, _ = validate(model, val_loader, device)
        
        train_losses.append(train_loss)
        val_aucs.append(val_auc)
        
        print(f'Epoch {epoch + 1:03d}, Loss: {train_loss:.4f}, Validation AUC: {val_auc:.4f}')
        
        if val_auc == max(val_aucs):
            filename = generate_simple_filename(val_auc)
            torch.save(model.state_dict(), f"{save_path}/{filename}")

    
    return train_losses, val_aucs

# Usage
num_epochs = 5
learning_rate = 0.001

train_losses, val_aucs = train_model(model, train_loader, val_loader, 
                                     num_epochs=num_epochs, 
                                     lr=learning_rate)

Device: cuda


Training:   0%|          | 1/22166 [00:00<37:34,  9.83it/s]

Training: 100%|██████████| 22166/22166 [05:16<00:00, 69.94it/s]
Validating: 100%|██████████| 9236/9236 [01:28<00:00, 103.97it/s]


Epoch 001, Loss: 0.2096, Validation AUC: 0.9724


Training: 100%|██████████| 22166/22166 [05:18<00:00, 69.65it/s]
Validating: 100%|██████████| 9236/9236 [01:26<00:00, 106.57it/s]


Epoch 002, Loss: 0.2004, Validation AUC: 0.9738


Training: 100%|██████████| 22166/22166 [05:18<00:00, 69.65it/s]
Validating: 100%|██████████| 9236/9236 [01:35<00:00, 96.81it/s] 


Epoch 003, Loss: 0.1938, Validation AUC: 0.9750


Training: 100%|██████████| 22166/22166 [05:18<00:00, 69.61it/s]
Validating: 100%|██████████| 9236/9236 [01:09<00:00, 133.75it/s]


Epoch 004, Loss: 0.1877, Validation AUC: 0.9756


Training: 100%|██████████| 22166/22166 [05:09<00:00, 71.55it/s]
Validating: 100%|██████████| 9236/9236 [01:39<00:00, 92.65it/s] 


Epoch 005, Loss: 0.1831, Validation AUC: 0.9760


There is now a GNN for link prediciton. We should also do joint learning for some graph embedding method.
This can be used for several usecases:
- interpretability of predicitons
- cluster group of similar users
- connect similar users with each other

After that, logical reasoning can be done to infer further information about the knowledge graph
Goal is to create a knowledge graph that can be used for further link prediction and analyzing of the data we have