In [1]:
import torch
import torch.nn.functional as F
import numpy as np
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

import lovely_tensors as lt
lt.monkey_patch()

In [27]:
# Step 1: Load and prepare the graph data
print("Loading graph data...")
graph_data = torch.load('graph_data_new_topology.pt')
print("Graph data loaded successfully!")

Loading graph data...


  graph_data = torch.load('graph_data_new_topology.pt')


Graph data loaded successfully!


In [29]:
graph_data.edge_features = torch.cat([graph_data.edge_features, graph_data.edge_features], dim=0)
edges = torch.cat([graph_data.edge_index[0], graph_data.edge_index[1]])
edges_reversed = torch.stack([reversed(e) for e in edges.reshape(-1, 2)]).flatten()
graph_data.edge_index = torch.stack([edges, edges_reversed])

In [30]:
loc1 = F.one_hot(graph_data.y_location_1.long())[:, 1:]
loc2 = F.one_hot(graph_data.y_location_2.long())[:, 1:]
y = (loc1 + loc2).T
y

tensor[32, 17520000] i64 n=560640000 (4.2Gb) x∈[0, 1] μ=0.008 σ=0.087

In [31]:
graph_data = Data(
    edge_index=graph_data.edge_index,
    num_nodes=graph_data.num_nodes,
    node_features=graph_data.node_features,
    edge_features=graph_data.edge_features,
    y=y
)

In [32]:
graph_data

Data(edge_index=[2, 68], y=[32, 17520000], num_nodes=32, node_features=[32, 17520000], edge_features=[68, 17520000])

In [33]:
torch.save(graph_data, "graph_data_new_topology_preprocessed.pt")

In [9]:
graph_data = torch.load("graph_data_new_topology_preprocessed.pt")

  graph_data = torch.load("graph_data_new_topology_preprocessed.pt")


In [34]:
# Step 2: Clip the graph data to the first million data points
num_data_points = 1_000_000  # Define the limit for the data points
graph_data.node_features = graph_data.node_features[:, :num_data_points]
graph_data.edge_features = graph_data.edge_features[:, :num_data_points]
graph_data.y = graph_data.y[:, :num_data_points]

# Verify the updated data shape
print("Updated graph data structure:")
print(f"y: {graph_data.y}")
print(f"node_features: {graph_data.node_features}")
print(f"edge_features: {graph_data.edge_features}")

# Step 3: Reshape node and edge features for GCN
def reshape_for_gcn(data, window_size=20, stride=10):
    """
    Reshape time-series data for GCN format. 
    Returns reshaped data with shape [num_elements, num_windows].
    """
    num_elements, num_timesteps = data.shape
    num_windows = (num_timesteps - window_size) // stride + 1
    
    reshaped_data = torch.zeros((num_elements, num_windows))
    for i in range(num_windows):
        start_idx = i * stride
        end_idx = start_idx + window_size
        reshaped_data[:, i] = data[:, start_idx:end_idx].mean(dim=1)
    
    return reshaped_data

# Reshape node and edge features
print("\nReshaping data for GCN...")
graph_data.node_features = reshape_for_gcn(graph_data.node_features, window_size=20, stride=10)
graph_data.edge_features = reshape_for_gcn(graph_data.edge_features, window_size=20, stride=10)
print(f"Reshaped node features: {graph_data.node_features}")
print(f"Reshaped edge features: {graph_data.edge_features}")

Updated graph data structure:
y: tensor[32, 1000000] i64 n=32000000 (0.2Gb) x∈[0, 1] μ=0.009 σ=0.092
node_features: tensor[32, 1000000] n=32000000 (0.1Gb) x∈[0., 69.984] μ=61.334 σ=11.887
edge_features: tensor[68, 1000000] n=68000000 (0.3Gb) x∈[-2.045e+03, 1.263e+04] μ=841.751 σ=1.522e+03

Reshaping data for GCN...
Reshaped node features: tensor[32, 99999] n=3199968 (12Mb) x∈[0., 69.967] μ=61.334 σ=11.615
Reshaped edge features: tensor[68, 99999] n=6799932 (26Mb) x∈[-1.758e+03, 1.103e+04] μ=841.755 σ=1.479e+03


In [35]:
def aggregate_labels(data, window_size=20, stride=10):
    num_elements, num_timesteps = data.shape
    num_windows = (num_timesteps - window_size) // stride + 1
    
    reshaped_data = torch.zeros((num_elements, num_windows))
    for i in range(num_windows):
        start_idx = i * stride
        end_idx = start_idx + window_size
        reshaped_data[:, i] = data[:, start_idx:end_idx].mean(dim=1) > 0.5
    
    return reshaped_data

    
y = aggregate_labels(graph_data.y.float()).long()
graph_data.y = y

In [36]:
num_graphs = graph_data.y.shape[1]
graphs = []
for i in tqdm(range(num_graphs)):
    graph = Data(
        x=graph_data.node_features[:, i].unsqueeze(1),
        edge_index=graph_data.edge_index,
        edge_attr=graph_data.edge_features[:, i].unsqueeze(1),
        y=graph_data.y[:, i]
    )
    graphs.append(graph)

100%|██████████| 99999/99999 [00:10<00:00, 9949.43it/s] 


## Model training

In [37]:
import torch.nn as nn

from torch_geometric.data import DataLoader
from torch_geometric.nn import GATConv

In [38]:
NUM_EDGE_FEATURES = 1
NUM_NODE_FEATURES = 1

In [39]:
# Basically the same as the baseline except we pass edge features 
class GDPModel(torch.nn.Module):
    def __init__(self, num_features=3, hidden_size=32, target_size=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_features = num_features
        self.target_size = target_size
        self.convs = [GATConv(self.num_features, self.hidden_size, edge_dim = NUM_EDGE_FEATURES),
                      GATConv(self.hidden_size, self.hidden_size, edge_dim = NUM_EDGE_FEATURES)]
        self.linear = nn.Linear(self.hidden_size, self.target_size)

    def forward(self, data):
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        for conv in self.convs[:-1]:
            x = conv(x, edge_index, edge_attr=edge_attr) # adding edge features here!
            x = F.relu(x)
            x = F.dropout(x, training=self.training)
        x = self.convs[-1](x, edge_index, edge_attr=edge_attr) # edge features here as well
        x = self.linear(x)

        return F.relu(x)

In [40]:
graphs[:10]

[Data(x=[32, 1], edge_index=[2, 68], edge_attr=[68, 1], y=[32]),
 Data(x=[32, 1], edge_index=[2, 68], edge_attr=[68, 1], y=[32]),
 Data(x=[32, 1], edge_index=[2, 68], edge_attr=[68, 1], y=[32]),
 Data(x=[32, 1], edge_index=[2, 68], edge_attr=[68, 1], y=[32]),
 Data(x=[32, 1], edge_index=[2, 68], edge_attr=[68, 1], y=[32]),
 Data(x=[32, 1], edge_index=[2, 68], edge_attr=[68, 1], y=[32]),
 Data(x=[32, 1], edge_index=[2, 68], edge_attr=[68, 1], y=[32]),
 Data(x=[32, 1], edge_index=[2, 68], edge_attr=[68, 1], y=[32]),
 Data(x=[32, 1], edge_index=[2, 68], edge_attr=[68, 1], y=[32]),
 Data(x=[32, 1], edge_index=[2, 68], edge_attr=[68, 1], y=[32])]

In [41]:
TEST_SIZE = 0.2
train_graphs, test_graphs = train_test_split(
    graphs, 
    test_size=TEST_SIZE
    )
print("Train Graph Data")
print(len(train_graphs))
print("Test Graph Data")
print(len(test_graphs))

Train Graph Data
79999
Test Graph Data
20000


In [47]:
from sklearn.metrics import classification_report

In [63]:
def evaluate_model(model, val_data):
    loader = DataLoader(val_data, batch_size=len(val_data), shuffle=True)
    data = next(iter(loader))
    with torch.no_grad():
        y_pred = model(data)
    y_pred_leak = (y_pred.sum(dim=1) > 0).int()
    # y_true_leak = (data.y.sum(dim=1) > 0).int()
    print(classification_report(y_pred_leak, data.y))
    loss = F.cross_entropy(y_pred, data.y)
    return loss
            

In [60]:
def train(model, name_prefix, hyperparams):
    ''' 
    Train model with given hyperparams dict.
    Saves the following CSVs over the course of training:
    1. the loss trajectory: the val and train loss every save_loss_interval epochs at
       filename 'results/{name_prefix}_{learning_rate}_train.csv' e.g. 'results/baseline_0.05_train.csv'
    2. every save_model_interval save both the model at e.g. 'models/baseline_0.05_0_out_of_1000.pt`
       and the predicted values vs actual values in `results/baseline_0.05_0_out_of_1000_prediction.csv' on the test data.
    '''
    learning_rate = hyperparams['learning_rate']
    batch_size = hyperparams['batch_size']
    n_epochs = hyperparams['n_epochs']
    save_loss_interval = hyperparams['save_loss_interval']
    print_interval = hyperparams['print_interval']
    save_model_interval = hyperparams['save_model_interval']

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    loader = DataLoader(train_graphs, batch_size=batch_size, shuffle=True)
    losses = []
    for epoch in range(n_epochs):
        print(f"Epoch {epoch}")
        epoch_loss = 0
        model.train()
        for data in tqdm(loader):
            optimizer.zero_grad()
            out = model(data)
            loss = F.cross_entropy(out, data.y)
            epoch_loss += loss.item() 
            loss.backward()
            optimizer.step()
        if epoch % save_loss_interval == 0:
            val_loss = evaluate_model(model, test_graphs) / len(test_graphs)
            train_loss = epoch_loss / len(train_graphs) * batch_size
            if epoch % print_interval == 0:
                print("Epoch: {} Train loss: {:.2e} Validation loss: {:.2e}".format(epoch, train_loss, val_loss))
            losses.append((epoch, train_loss, val_loss))
        if epoch % save_model_interval == 0:
            # save predictions for plotting
            model.eval()
        print(loss)

    return losses

In [50]:
model = GDPModel(num_features=NUM_NODE_FEATURES, hidden_size=64, target_size=graph_data.num_nodes)
model

GDPModel(
  (linear): Linear(in_features=64, out_features=32, bias=True)
)

In [56]:
hyperparameters = dict(
    learning_rate=0.001,
    n_epochs=50,
    batch_size=64,
    #
    save_loss_interval=1,
    print_interval=1,
    save_model_interval=1,
)

In [64]:
train(model, "", hyperparameters)

Epoch 0


100%|██████████| 1250/1250 [00:18<00:00, 68.21it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.01      0.02    640000

    accuracy                           0.01    640000
   macro avg       0.50      0.00      0.01    640000
weighted avg       1.00      0.01      0.02    640000

Epoch: 0 Train loss: 4.96e-02 Validation loss: 2.44e-06
tensor grad NllLossBackward0 0.045
Epoch 1


100%|██████████| 1250/1250 [00:17<00:00, 70.42it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.01      0.02    640000

    accuracy                           0.01    640000
   macro avg       0.50      0.00      0.01    640000
weighted avg       1.00      0.01      0.02    640000

Epoch: 1 Train loss: 4.96e-02 Validation loss: 2.44e-06
tensor grad NllLossBackward0 0.047
Epoch 2


100%|██████████| 1250/1250 [00:17<00:00, 71.11it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


KeyboardInterrupt: 