### Extracting dataset 

In [192]:
from pathlib import Path
import tarfile
import torch
from torch.utils.data import Dataset
from torch_geometric.data import Data


class CustomDataset(Dataset):
    def __init__(self, root_dir):
        self.root_dir = Path(root_dir)
        self.graphs = []

        # Iterate through each .tar.gz file
        for i in range(0, 10):
            file_path = self.root_dir / f"batch_1_{i}.tar.gz"

            # Extract all .pt files from the .tar.gz file
            with tarfile.open(file_path, "r:gz") as tar:
                for member in tar.getmembers():
                    if member.name.endswith(".pt"):
                        f = tar.extractfile(member)
                        self.graphs.append(torch.load(f))

    def __len__(self):
        return len(self.graphs)

    def __getitem__(self, idx):
        graph = self.graphs[idx]
        return graph

    def display_graph_attributes(graph):
        print(f"Number of nodes: {graph.num_nodes}")
        print(f"Number of edges: {graph.num_edges}")
        print(f"Edge index: {graph.edge_index}")
        print(f"Edge attributes: {graph.edge_attr}")
        print(f"Node attributes: {graph.x}")

In [193]:
# Dataset created from the tar files
dataset = CustomDataset("tar_data")
print(f"Number of graphs: {len(dataset)}")

AttributeError: 'ExFileObject' object has no attribute 'read'

In [61]:
# Iterate through the dataset and print the first graph
for i in range(len(dataset)):
    print(f"Graph {i+1}:")
    print((dataset[i]))
    break

Graph 1:
Data(x=[419, 6], edge_index=[2, 4882], edge_attr=[4882, 4], y=[4882])


### Model Creation

In [104]:
from torch import Tensor

import torch.nn as nn
import torch.nn.functional as F
import torch_geometric.transforms as T
from torch_geometric.nn import MessagePassing
from torch.nn import Sequential as Seq, Linear, ReLU, Sigmoid


class RelationalModel(nn.Module):
    def __init__(self, input_size, output_size, hidden_size):
        super(RelationalModel, self).__init__()

        self.layers = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
        )

    def forward(self, m):
        return self.layers(m)


class ObjectModel(nn.Module):
    def __init__(self, input_size, output_size, hidden_size):
        super(ObjectModel, self).__init__()

        self.layers = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
        )

    def forward(self, C):
        return self.layers(C)


class InteractionNetwork(MessagePassing):
    def __init__(self, hidden_size):
        super(InteractionNetwork, self).__init__(aggr='add',
                                                 flow='source_to_target')
        self.R1 = RelationalModel(16, 4, hidden_size)
        self.O = ObjectModel(10, 3, hidden_size)
        self.R2 = RelationalModel(10, 1, hidden_size)
        self.E: Tensor = Tensor()

    def forward(self, x: Tensor, edge_index: Tensor, edge_attr: Tensor) -> Tensor:

        # propagate_type: (x: Tensor, edge_attr: Tensor)
        x_tilde = self.propagate(
            edge_index, x=x, edge_attr=edge_attr, size=None)

        m2 = torch.cat([x_tilde[edge_index[1]],
                        x_tilde[edge_index[0]],
                        self.E], dim=1)
        return torch.sigmoid(self.R2(m2))

    def message(self, x_i, x_j, edge_attr):
        # x_i --> incoming
        # x_j --> outgoing
        m1 = torch.cat([x_i, x_j, edge_attr], dim=1)
        self.E = self.R1(m1)
        return self.E

    def update(self, aggr_out, x):
        c = torch.cat([x, aggr_out], dim=1)
        return self.O(c)


### Data loader

In [None]:
import random
from torch.utils.data import random_split

# Shuffle dataset
random.shuffle(dataset.graphs)

# Split dataset into train and eval
train_size = int(0.8 * len(dataset))
eval_size = len(dataset) - train_size
train_dataset, eval_dataset = random_split(dataset, [train_size, eval_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=32)


## Train data

In [176]:
def train_model(model, train_loader, eval_loader, criterion, optimizer, n_epochs):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for epoch in range(n_epochs):
        # Train loop
        train_loss = 0
        correct = 0
        total = 0
        for data in train_loader:
            data = data.to(device)
            optimizer.zero_grad()
            out = model(data.x, data.edge_index, data.edge_attr).squeeze(1)
            # print(out.shape)
            # print(data.y.shape)
            # break
            loss = criterion(out, data.y)
            loss.backward()
            optimizer.step()

            # Calculate accuracy
            predicted = torch.round(torch.sigmoid(out))
            correct += (predicted == data.y).sum().item()
            total += data.y.size(0)

            train_loss += loss.item()
        # break
        train_acc = 100 * correct / total
        train_loss /= len(train_loader)

        # Evaluation loop
        eval_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for data in eval_loader:
                data = data.to(device)
                out = model(data.x, data.edge_index, data.edge_attr).squeeze(1)
                loss = criterion(out, data.y)

                # Calculate accuracy
                predicted = torch.round(torch.sigmoid(out))
                correct += (predicted == data.y).sum().item()
                total += data.y.size(0)

                eval_loss += loss.item()

        eval_acc = 100 * correct / total
        eval_loss /= len(eval_loader)

        # Print epoch stats
        print(f"Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, Eval Loss: {eval_loss:.4f}, Eval Acc: {eval_acc:.2f}%")

    print("Training finished!")



In [186]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = InteractionNetwork(10).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [181]:
# Train with binary cross-entropy loss
bce_loss = torch.nn.BCEWithLogitsLoss()
train_model(model, train_loader, eval_loader, bce_loss, optimizer, 50)


Epoch 0: Train Loss: 0.6598, Train Acc: 82.27%, Eval Loss: 0.6596, Eval Acc: 82.34%
Epoch 1: Train Loss: 0.6581, Train Acc: 83.47%, Eval Loss: 0.6587, Eval Acc: 83.15%
Epoch 2: Train Loss: 0.6576, Train Acc: 84.19%, Eval Loss: 0.6584, Eval Acc: 83.86%
Epoch 3: Train Loss: 0.6573, Train Acc: 84.85%, Eval Loss: 0.6581, Eval Acc: 84.55%
Epoch 4: Train Loss: 0.6571, Train Acc: 85.42%, Eval Loss: 0.6578, Eval Acc: 85.29%
Epoch 5: Train Loss: 0.6569, Train Acc: 86.07%, Eval Loss: 0.6576, Eval Acc: 86.00%
Epoch 6: Train Loss: 0.6567, Train Acc: 86.64%, Eval Loss: 0.6574, Eval Acc: 86.53%
Epoch 7: Train Loss: 0.6565, Train Acc: 87.06%, Eval Loss: 0.6573, Eval Acc: 86.85%
Epoch 8: Train Loss: 0.6564, Train Acc: 87.38%, Eval Loss: 0.6571, Eval Acc: 87.17%
Epoch 9: Train Loss: 0.6562, Train Acc: 87.66%, Eval Loss: 0.6570, Eval Acc: 87.44%
Epoch 10: Train Loss: 0.6561, Train Acc: 87.92%, Eval Loss: 0.6569, Eval Acc: 87.68%
Epoch 11: Train Loss: 0.6560, Train Acc: 88.14%, Eval Loss: 0.6568, Eval Ac

## Class Imbalance Problem

In [190]:
label_0_count = sum([dataset[i].y.eq(0).sum().item()
                    for i in range(len(dataset))])
label_1_count = sum([dataset[i].y.eq(1).sum().item()
                    for i in range(len(dataset))])

print("Number of label 0 samples:", label_0_count)
print("Number of label 1 samples:", label_1_count)
print("Ratio of label 0 samples to label 1 samples:",
      label_0_count / label_1_count)
print("Imbalance ratio:", label_1_count / (label_0_count + label_1_count))


Number of label 0 samples: 3182276
Number of label 1 samples: 697606
Ratio of label 0 samples to label 1 samples: 4.561709618323237
Imbalance ratio: 0.17980082899428385


Class imbalance is a comman problem is ML which can be make model biased. There are several approaches to deal with it: 
1. Resampling data: undersample the majority class by removing some of the samples
2. Class weighting: assign higher weights to the minority class and lower weights to the majority class during training
3. Data augmentation: new samples by applying transformations to the existing samples
4. Ensemble methods: bagging or boosting to train multiple models on different samples of the data and combine their predictions

We are using class weighting during training where the minority class is assigned a higher weight

In [182]:
# Train with class-balanced binary cross-entropy loss
n_positives =  sum([dataset[i].y.eq(1).sum().item() for i in range(len(dataset))])
n_negatives = sum([dataset[i].y.eq(0).sum().item() for i in range(len(dataset))])
pos_weight = torch.tensor(n_negatives / n_positives, dtype=torch.float32)
bce_loss_class_balanced = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# bce_loss_class_balanced = torch.nn.BCEWithLogitsLoss(pos_weight=class_weights)
train_model(model, train_loader, eval_loader, bce_loss_class_balanced, optimizer, 50)


Epoch 0: Train Loss: 0.9433, Train Acc: 88.52%, Eval Loss: 0.9401, Eval Acc: 85.66%
Epoch 1: Train Loss: 0.9392, Train Acc: 84.73%, Eval Loss: 0.9353, Eval Acc: 82.52%
Epoch 2: Train Loss: 0.9356, Train Acc: 83.66%, Eval Loss: 0.9336, Eval Acc: 83.32%
Epoch 3: Train Loss: 0.9342, Train Acc: 84.06%, Eval Loss: 0.9326, Eval Acc: 83.34%
Epoch 4: Train Loss: 0.9332, Train Acc: 83.59%, Eval Loss: 0.9317, Eval Acc: 82.20%
Epoch 5: Train Loss: 0.9319, Train Acc: 83.55%, Eval Loss: 0.9303, Eval Acc: 83.34%
Epoch 6: Train Loss: 0.9310, Train Acc: 83.91%, Eval Loss: 0.9297, Eval Acc: 83.49%
Epoch 7: Train Loss: 0.9304, Train Acc: 84.16%, Eval Loss: 0.9291, Eval Acc: 83.82%
Epoch 8: Train Loss: 0.9298, Train Acc: 84.20%, Eval Loss: 0.9288, Eval Acc: 83.93%
Epoch 9: Train Loss: 0.9292, Train Acc: 84.29%, Eval Loss: 0.9284, Eval Acc: 84.21%
Epoch 10: Train Loss: 0.9288, Train Acc: 84.41%, Eval Loss: 0.9281, Eval Acc: 84.56%
Epoch 11: Train Loss: 0.9284, Train Acc: 84.48%, Eval Loss: 0.9278, Eval Ac