In [1]:
from torch_geometric.data import Dataset
import json
import torch
from torch_geometric.data import Data
import pytorch_lightning as pl
from torch.utils.data import Subset
from torch_geometric.loader import DataLoader
import random

def add_zeros(data):
    """Give every node of ``data`` the same placeholder feature: index 0.

    ``data.x`` is overwritten with a 1-D ``torch.long`` tensor containing one
    zero per node (``data.num_nodes`` entries). The object is modified in
    place and returned, so the function can be passed as a dataset transform.
    """
    node_count = data.num_nodes
    placeholder_ids = torch.zeros(node_count, dtype=torch.long)
    data.x = placeholder_ids
    return data

class GraphJSONDataset(Dataset):
    """Lazily-read dataset of graphs stored as one JSON object per line.

    At construction the file is scanned once to record the byte offset of
    every non-blank line; individual graphs are then read on demand by seeking
    to the stored offset, so the whole file never has to be held in memory.

    Args:
        path: Path of the JSON-lines file.
        transform: Forwarded to the base ``Dataset``.
        pre_transform: Forwarded to the base ``Dataset``.
        has_labels: When True, each record must contain a ``'y'`` entry whose
            first element is used as the label; when False ``y`` is ``None``.
    """

    def __init__(self, path, transform=None, pre_transform=None, has_labels=True):
        self.path = path
        self.has_labels = has_labels
        self.offsets = []
        # The file is indexed in binary mode so that the recorded offsets are
        # real byte positions. Counting bytes by re-encoding text-mode lines
        # is off by one per line on files with CRLF line endings (newline
        # translation drops the '\r'), and seeking a text-mode file to an
        # arbitrary number is not supported by the io module.
        with open(self.path, 'rb') as f:
            offset = 0
            for raw_line in f:
                # Blank lines (e.g. a trailing newline at end of file) are not
                # records; indexing them would make json.loads fail later.
                if raw_line.strip():
                    self.offsets.append(offset)
                offset += len(raw_line)
        super().__init__(None, transform, pre_transform)

    def len(self):
        """Return the number of graphs (non-blank lines) in the file."""
        return len(self.offsets)

    def get(self, idx):
        """Read and parse the graph stored at position ``idx``."""
        with open(self.path, 'rb') as f:
            f.seek(self.offsets[idx])
            raw_line = f.readline()
        return self.parse_graph(raw_line.decode('utf-8'))

    def parse_graph(self, line):
        """Turn one JSON line into a ``Data`` object.

        Reads ``edge_index`` and ``num_nodes`` (required), ``edge_attr``
        (optional, ``None`` when the key is absent) and, when
        ``self.has_labels`` is set, the first element of ``y``.
        """
        item = json.loads(line)
        edge_index = torch.tensor(item['edge_index'], dtype=torch.long)
        edge_attr = torch.tensor(item["edge_attr"], dtype=torch.float) if "edge_attr" in item else None
        num_nodes = item['num_nodes']
        y = torch.tensor(item['y'][0], dtype=torch.long) if self.has_labels else None
        return Data(edge_index=edge_index, edge_attr=edge_attr, num_nodes=num_nodes, y=y)


class GraphDataModule(pl.LightningDataModule):
    """Lightning data module for one labelled train file and one unlabelled test file.

    The training file is split into train/validation subsets with a seeded
    shuffle; the test file is loaded without labels. All datasets use the
    ``add_zeros`` transform.

    Args:
        train_path: JSON-lines file with labelled graphs.
        test_path: JSON-lines file with unlabelled graphs.
        batch_size: Batch size used by all three dataloaders.
        val_split: Fraction of the training file held out for validation.
        seed: Seed of the shuffle that decides the train/validation split.
    """

    def __init__(self, train_path, test_path, batch_size=32, val_split=0.2, seed=42):
        super().__init__()
        self.train_path = train_path
        self.test_path = test_path
        self.batch_size = batch_size
        self.val_split = val_split
        self.seed = seed
        self._datasets_ready = False
        # Datasets are built eagerly so they are available right after construction.
        self.setup()

    def setup(self, stage=None):
        """Build the train/validation/test datasets once.

        ``setup`` is invoked here from ``__init__`` and may be invoked again
        by the trainer; later calls are no-ops so the files are not re-indexed
        and the size messages are not printed a second time.
        """
        if getattr(self, "_datasets_ready", False):
            return

        full_train_dataset = GraphJSONDataset(self.train_path, transform = add_zeros, has_labels=True)
        total_size = len(full_train_dataset)
        print(f"Dataset size: {total_size}")
        indices = list(range(total_size))
        # A private generator yields the same permutation as seeding the
        # module-level RNG, without resetting the global random state for the
        # rest of the program.
        random.Random(self.seed).shuffle(indices)

        val_size = int(self.val_split * total_size)
        self.train_dataset = Subset(full_train_dataset, indices[val_size:])
        print(f"Train dataset size: {len(self.train_dataset)}")
        self.val_dataset = Subset(full_train_dataset, indices[:val_size])
        print(f"Validation dataset size: {len(self.val_dataset)}")
        self.test_dataset = GraphJSONDataset(self.test_path, transform = add_zeros, has_labels=False)
        print(f"Test dataset size: {len(self.test_dataset)}")

        self._datasets_ready = True

    def train_dataloader(self):
        """Shuffled loader over the training subset."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        """Ordered loader over the validation subset."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False)

    def test_dataloader(self):
        """Loader over the unlabelled test set (default, unshuffled order)."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size)

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Ours (our own implementation)
class SCELoss(nn.Module):
    """Weighted sum of cross entropy and a reverse cross-entropy term.

    ``loss = alpha * CE(pred, labels) + beta * RCE`` where ``RCE`` is the
    batch mean of ``-sum_k softmax(pred)_k * log(one_hot(labels)_k + 1e-7)``.

    Args:
        alpha: Weight of the cross-entropy term.
        beta: Weight of the reverse term.
        num_classes: Width of the one-hot label encoding.
    """

    def __init__(self, alpha=0.1, beta=1.0, num_classes=6):
        super().__init__()
        self.alpha = alpha
        self.beta = beta
        self.ce = nn.CrossEntropyLoss()
        self.num_classes = num_classes

    def forward(self, pred, labels):
        """Return the scalar loss for logits ``pred`` and integer ``labels``."""
        forward_term = self.ce(pred, labels)

        # Probabilities are clamped away from zero before being multiplied
        # by the log of the smoothed one-hot targets.
        probabilities = torch.clamp(F.softmax(pred, dim=1), min=1e-7, max=1.0)
        targets = F.one_hot(labels, self.num_classes).float()
        log_targets = torch.log(targets + 1e-7)
        per_sample = torch.sum(probabilities * log_targets, dim=1)
        reverse_term = -per_sample.mean()

        return self.alpha * forward_term + self.beta * reverse_term
#Prof
class NoisyCrossEntropyLoss(torch.nn.Module):
    """Cross entropy with a per-sample weight derived from ``p_noisy``.

    Each sample's loss is multiplied by
    ``(1 - p) + p * (1 - sum(one_hot(target)))`` before averaging.
    NOTE(review): a one-hot row always sums to 1, so as written the weight is
    the constant ``1 - p`` for every sample — confirm this is intended.

    Args:
        p_noisy: Value stored as ``self.p`` and used in the weight formula.
    """

    def __init__(self, p_noisy):
        super().__init__()
        self.p = p_noisy
        # Unreduced so that the weights can be applied per sample.
        self.ce = torch.nn.CrossEntropyLoss(reduction='none')

    def forward(self, logits, targets):
        """Return the mean of the weighted per-sample cross-entropy losses."""
        per_sample_loss = self.ce(logits, targets)
        class_count = logits.size(1)
        encoded = torch.nn.functional.one_hot(targets, num_classes=class_count).float()
        row_mass = encoded.sum(dim=1)
        weights = (1 - self.p) + self.p * (1 - row_mass)
        weighted = per_sample_loss * weights
        return weighted.mean()

In [None]:
import torch
from torch_geometric.nn import MessagePassing
from torch_geometric.nn import global_add_pool, global_mean_pool, global_max_pool, GlobalAttention, Set2Set
import torch.nn.functional as F
from torch_geometric.utils import degree
import torchmetrics
import os
import pandas as pd

### GIN convolution along the graph structure (PROF)
class GINConv(MessagePassing):
    """GIN-style message-passing layer with edge features and sum aggregation.

    Edge attributes with 7 input features are projected to ``emb_dim`` and
    added to the neighbour embedding inside each message. The aggregated
    messages are combined with ``(1 + eps) * x`` (``eps`` is learnable and
    starts at 0) and passed through a two-layer MLP.
    """

    def __init__(self, emb_dim):
        '''
            emb_dim (int): node embedding dimensionality
        '''
        super().__init__(aggr="add")

        hidden_dim = 2 * emb_dim
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(emb_dim, hidden_dim),
            torch.nn.BatchNorm1d(hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, emb_dim),
        )
        self.eps = torch.nn.Parameter(torch.Tensor([0]))

        # Edge attributes are expected to have 7 features per edge.
        self.edge_encoder = torch.nn.Linear(7, emb_dim)

    def forward(self, x, edge_index, edge_attr):
        """Return updated node embeddings, same width as ``x``."""
        edge_embedding = self.edge_encoder(edge_attr)
        self_term = (1 + self.eps) * x
        neighbour_term = self.propagate(edge_index, x=x, edge_attr=edge_embedding)
        return self.mlp(self_term + neighbour_term)

    def message(self, x_j, edge_attr):
        """Message of one edge: ReLU of neighbour embedding plus edge embedding."""
        combined = x_j + edge_attr
        return F.relu(combined)

    def update(self, aggr_out):
        """Pass the aggregated messages through unchanged."""
        return aggr_out

### GCN convolution along the graph structure (PROF)
class GCNConv(MessagePassing):
    """GCN-style message-passing layer with edge features and sum aggregation.

    Node features go through a linear layer; edge attributes with 7 input
    features are projected to ``emb_dim``. Messages are scaled by a
    degree-based normalisation, and a self term built from a learned root
    embedding is added to the aggregated result.
    """

    def __init__(self, emb_dim):
        super().__init__(aggr='add')

        self.linear = torch.nn.Linear(emb_dim, emb_dim)
        self.root_emb = torch.nn.Embedding(1, emb_dim)
        # Edge attributes are expected to have 7 features per edge.
        self.edge_encoder = torch.nn.Linear(7, emb_dim)

    def forward(self, x, edge_index, edge_attr):
        """Return updated node embeddings for the given graph batch."""
        x = self.linear(x)
        edge_embedding = self.edge_encoder(edge_attr)

        first_row, second_row = edge_index

        # Degree taken over the first row of edge_index, plus one.
        deg = degree(first_row, x.size(0), dtype=x.dtype) + 1
        deg_inv_sqrt = deg.pow(-0.5)
        # Any infinite entry is replaced by zero.
        deg_inv_sqrt = deg_inv_sqrt.masked_fill(deg_inv_sqrt == float('inf'), 0)

        norm = deg_inv_sqrt[first_row] * deg_inv_sqrt[second_row]

        neighbour_term = self.propagate(edge_index, x=x, edge_attr=edge_embedding, norm=norm)
        self_term = F.relu(x + self.root_emb.weight) * 1. / deg.view(-1, 1)
        return neighbour_term + self_term

    def message(self, x_j, edge_attr, norm):
        """Message of one edge: normalised ReLU of neighbour plus edge embedding."""
        activated = F.relu(x_j + edge_attr)
        return norm.view(-1, 1) * activated

    def update(self, aggr_out):
        """Pass the aggregated messages through unchanged."""
        return aggr_out


### GNN to generate node embedding (PROF)
class GNN_node(torch.nn.Module):
    """Stack of message-passing layers that produces node embeddings.

    Every node starts from the same learned embedding (``Embedding(1, emb_dim)``,
    so ``batched_data.x`` must contain index 0 only). Each layer applies a
    convolution, batch norm, ReLU (skipped on the last layer) and dropout,
    optionally followed by a residual connection.

    Output:
        node representations
    """
    def __init__(self, num_layer, emb_dim, drop_ratio = 0.5, JK = "last", residual = False, gnn_type = 'gin'):
        '''
            emb_dim (int): node embedding dimensionality
            num_layer (int): number of GNN message passing layers
            drop_ratio (float): dropout probability applied after every layer
            JK (str): "last" returns the final layer's embeddings, "sum"
                returns the sum over the input embedding and all layers
            residual (bool): add each layer's input to its output
            gnn_type (str): 'gin' or 'gcn'

            Raises ValueError for num_layer < 2, an unknown JK mode or an
            unknown gnn_type.
        '''

        super(GNN_node, self).__init__()
        self.num_layer = num_layer
        self.drop_ratio = drop_ratio
        self.JK = JK
        ### add residual connection or not
        self.residual = residual

        if self.num_layer < 2:
            raise ValueError("Number of GNN layers must be greater than 1.")

        # Fail at construction time instead of with an UnboundLocalError at
        # the end of the first forward pass.
        if self.JK not in ("last", "sum"):
            raise ValueError('Undefined JK mode called {}'.format(JK))

        self.node_encoder = torch.nn.Embedding(1, emb_dim) # uniform input node embedding

        ###List of GNNs
        self.convs = torch.nn.ModuleList()
        self.batch_norms = torch.nn.ModuleList()

        for layer in range(num_layer):
            if gnn_type == 'gin':
                self.convs.append(GINConv(emb_dim))
            elif gnn_type == 'gcn':
                self.convs.append(GCNConv(emb_dim))
            else:
                raise ValueError('Undefined GNN type called {}'.format(gnn_type))

            self.batch_norms.append(torch.nn.BatchNorm1d(emb_dim))

    def forward(self, batched_data):
        """Return one embedding per node of ``batched_data``.

        Reads ``x``, ``edge_index`` and ``edge_attr`` from ``batched_data``.
        """
        x, edge_index, edge_attr = batched_data.x, batched_data.edge_index, batched_data.edge_attr

        ### computing input node embedding
        h_list = [self.node_encoder(x)]
        for layer in range(self.num_layer):

            h = self.convs[layer](h_list[layer], edge_index, edge_attr)
            h = self.batch_norms[layer](h)

            if layer == self.num_layer - 1:
                #remove relu for the last layer
                h = F.dropout(h, self.drop_ratio, training = self.training)
            else:
                h = F.dropout(F.relu(h), self.drop_ratio, training = self.training)

            if self.residual:
                # Out-of-place, matching GNN_node_Virtualnode: an in-place add
                # could overwrite a tensor that autograd still needs when the
                # dropout call hands back its input unchanged.
                h = h + h_list[layer]

            h_list.append(h)

        ### Different implementations of Jk-concat
        if self.JK == "last":
            node_representation = h_list[-1]
        elif self.JK == "sum":
            node_representation = h_list[0]
            for h in h_list[1:]:
                node_representation = node_representation + h
        else:
            # Reached only if self.JK was changed after construction.
            raise ValueError('Undefined JK mode called {}'.format(self.JK))

        return node_representation


### Virtual GNN to generate node embedding (PROF)
class GNN_node_Virtualnode(torch.nn.Module):
    """Node-embedding GNN with one extra "virtual node" per graph.

    Before every layer the graph's virtual-node embedding is added to all of
    its nodes; after every layer except the last, the virtual node is updated
    from the sum of the graph's node embeddings through a per-layer MLP.

    Output:
        node representations
    """
    def __init__(self, num_layer, emb_dim, drop_ratio = 0.5, JK = "last", residual = False, gnn_type = 'gin'):
        '''
            num_layer (int): number of GNN message passing layers
            emb_dim (int): node embedding dimensionality
            drop_ratio (float): dropout probability
            JK (str): "last" returns the final layer's embeddings, "sum"
                returns the sum over the input embedding and all layers
            residual (bool): add residual connections to node and virtual-node updates
            gnn_type (str): 'gin' or 'gcn'

            Raises ValueError for num_layer < 2, an unknown JK mode or an
            unknown gnn_type.
        '''

        super(GNN_node_Virtualnode, self).__init__()
        self.num_layer = num_layer
        self.drop_ratio = drop_ratio
        self.JK = JK
        ### add residual connection or not
        self.residual = residual

        if self.num_layer < 2:
            raise ValueError("Number of GNN layers must be greater than 1.")

        # Fail at construction time instead of with an UnboundLocalError at
        # the end of the first forward pass.
        if self.JK not in ("last", "sum"):
            raise ValueError('Undefined JK mode called {}'.format(JK))

        self.node_encoder = torch.nn.Embedding(1, emb_dim) # uniform input node embedding

        ### set the initial virtual node embedding to 0.
        self.virtualnode_embedding = torch.nn.Embedding(1, emb_dim)
        torch.nn.init.constant_(self.virtualnode_embedding.weight.data, 0)

        ### List of GNNs
        self.convs = torch.nn.ModuleList()
        ### batch norms applied to node embeddings
        self.batch_norms = torch.nn.ModuleList()

        ### List of MLPs to transform virtual node at every layer
        self.mlp_virtualnode_list = torch.nn.ModuleList()

        for layer in range(num_layer):
            if gnn_type == 'gin':
                self.convs.append(GINConv(emb_dim))
            elif gnn_type == 'gcn':
                self.convs.append(GCNConv(emb_dim))
            else:
                raise ValueError('Undefined GNN type called {}'.format(gnn_type))

            self.batch_norms.append(torch.nn.BatchNorm1d(emb_dim))

        for layer in range(num_layer - 1):
            self.mlp_virtualnode_list.append(torch.nn.Sequential(torch.nn.Linear(emb_dim, 2*emb_dim), torch.nn.BatchNorm1d(2*emb_dim), torch.nn.ReLU(), \
                                                    torch.nn.Linear(2*emb_dim, emb_dim), torch.nn.BatchNorm1d(emb_dim), torch.nn.ReLU()))


    def forward(self, batched_data):
        """Return one embedding per node of ``batched_data``.

        Reads ``x``, ``edge_index``, ``edge_attr`` and ``batch`` from
        ``batched_data``.
        """

        x, edge_index, edge_attr, batch = batched_data.x, batched_data.edge_index, batched_data.edge_attr, batched_data.batch

        ### virtual node embeddings for graphs
        # One index-0 lookup per graph. The graph count is taken from the last
        # entry of ``batch`` — assumes ``batch`` is non-empty and that its last
        # entry is the largest graph index; TODO confirm against the loader.
        num_graphs = batch[-1].item() + 1
        # Created directly with the target dtype/device instead of allocating
        # a default tensor and then casting and moving it.
        virtualnode_index = torch.zeros(num_graphs, dtype=edge_index.dtype, device=edge_index.device)
        virtualnode_embedding = self.virtualnode_embedding(virtualnode_index)

        h_list = [self.node_encoder(x)]
        for layer in range(self.num_layer):
            ### add message from virtual nodes to graph nodes
            h_list[layer] = h_list[layer] + virtualnode_embedding[batch]

            ### Message passing among graph nodes
            h = self.convs[layer](h_list[layer], edge_index, edge_attr)

            h = self.batch_norms[layer](h)
            if layer == self.num_layer - 1:
                #remove relu for the last layer
                h = F.dropout(h, self.drop_ratio, training = self.training)
            else:
                h = F.dropout(F.relu(h), self.drop_ratio, training = self.training)

            if self.residual:
                h = h + h_list[layer]

            h_list.append(h)

            ### update the virtual nodes
            if layer < self.num_layer - 1:
                ### add message from graph nodes to virtual nodes
                virtualnode_embedding_temp = global_add_pool(h_list[layer], batch) + virtualnode_embedding
                ### transform virtual nodes using MLP

                if self.residual:
                    virtualnode_embedding = virtualnode_embedding + F.dropout(self.mlp_virtualnode_list[layer](virtualnode_embedding_temp), self.drop_ratio, training = self.training)
                else:
                    virtualnode_embedding = F.dropout(self.mlp_virtualnode_list[layer](virtualnode_embedding_temp), self.drop_ratio, training = self.training)

        ### Different implementations of Jk-concat
        if self.JK == "last":
            node_representation = h_list[-1]
        elif self.JK == "sum":
            node_representation = h_list[0]
            for h in h_list[1:]:
                node_representation = node_representation + h
        else:
            # Reached only if self.JK was changed after construction.
            raise ValueError('Undefined JK mode called {}'.format(self.JK))

        return node_representation

#(PROF)
class GNN(torch.nn.Module):
    """Graph classifier: node-embedding GNN, graph pooling, linear head."""

    def __init__(self, num_class, num_layer = 5, emb_dim = 300,
                    gnn_type = 'gin', virtual_node = True, residual = False, drop_ratio = 0.5, JK = "last", graph_pooling = "mean"):
        '''
            num_class (int): size of the output layer (number of classes)
            num_layer (int): number of message passing layers (at least 2)
            emb_dim (int): node embedding dimensionality
            gnn_type (str): convolution type forwarded to the node GNN
            virtual_node (bool): whether to add virtual node or not
            residual (bool): forwarded to the node GNN
            drop_ratio (float): forwarded to the node GNN
            JK (str): forwarded to the node GNN
            graph_pooling (str): "sum", "mean", "max", "attention" or "set2set"
        '''
        super().__init__()

        self.num_layer = num_layer
        self.drop_ratio = drop_ratio
        self.JK = JK
        self.emb_dim = emb_dim
        self.num_class = num_class
        self.graph_pooling = graph_pooling

        if self.num_layer < 2:
            raise ValueError("Number of GNN layers must be greater than 1.")

        ### GNN to generate node embeddings
        node_gnn_cls = GNN_node_Virtualnode if virtual_node else GNN_node
        self.gnn_node = node_gnn_cls(num_layer, emb_dim, JK = JK, drop_ratio = drop_ratio, residual = residual, gnn_type = gnn_type)

        ### Pooling function to generate whole-graph embeddings
        pooling = self.graph_pooling
        if pooling == "sum":
            self.pool = global_add_pool
        elif pooling == "mean":
            self.pool = global_mean_pool
        elif pooling == "max":
            self.pool = global_max_pool
        elif pooling == "attention":
            gate_nn = torch.nn.Sequential(
                torch.nn.Linear(emb_dim, 2*emb_dim),
                torch.nn.BatchNorm1d(2*emb_dim),
                torch.nn.ReLU(),
                torch.nn.Linear(2*emb_dim, 1),
            )
            self.pool = GlobalAttention(gate_nn = gate_nn)
        elif pooling == "set2set":
            self.pool = Set2Set(emb_dim, processing_steps = 2)
        else:
            raise ValueError("Invalid graph pooling type.")

        # The head takes twice the embedding width for set2set pooling.
        head_in_dim = 2*self.emb_dim if graph_pooling == "set2set" else self.emb_dim
        self.graph_pred_linear = torch.nn.Linear(head_in_dim, self.num_class)

    def forward(self, batched_data):
        """Return one row of ``num_class`` scores per graph in the batch."""
        node_embeddings = self.gnn_node(batched_data)
        graph_embeddings = self.pool(node_embeddings, batched_data.batch)
        return self.graph_pred_linear(graph_embeddings)
    
class GNNLightning(pl.LightningModule):
    """Lightning wrapper that trains and tests a 6-class ``GNN`` graph classifier.

    Besides the usual step hooks it keeps per-batch loss/accuracy histories,
    appends one summary line per epoch to ``logs/<dataset_name>/train.log`` and
    ``logs/<dataset_name>/val.log``, and writes the test predictions to
    ``submission/testset_<dataset_name>.csv``.

    Args:
        gnn: One of ``'gin'``, ``'gin-virtual'``, ``'gcn'``, ``'gcn-virtual'``.
        dataset_name: Name used in the log and submission paths.
        num_layer: Number of message-passing layers.
        emb_dim: Embedding dimensionality.
        drop_ratio: Dropout probability.

    Raises:
        ValueError: If ``gnn`` is not one of the supported names.
    """

    # Maps the public ``gnn`` name to the (gnn_type, virtual_node) pair of GNN.
    _GNN_VARIANTS = {
        'gin': ('gin', False),
        'gin-virtual': ('gin', True),
        'gcn': ('gcn', False),
        'gcn-virtual': ('gcn', True),
    }

    def __init__(self, gnn, dataset_name, num_layer = 5, emb_dim = 300, drop_ratio = 0.5):
        super(GNNLightning, self).__init__()
        try:
            gnn_type, virtual_node = self._GNN_VARIANTS[gnn]
        except (KeyError, TypeError):
            raise ValueError('Invalid GNN type')
        self.model = GNN(gnn_type = gnn_type, num_class = 6, num_layer = num_layer, emb_dim = emb_dim, drop_ratio = drop_ratio, virtual_node = virtual_node)
        #self.loss_fn = NoisyCrossEntropyLoss(p_noisy=0.2)
        #self.loss_fn = SCELoss(alpha=0.1, beta=1.0, num_classes=6)
        self.loss_fn = torch.nn.CrossEntropyLoss()

        self.eval_metric = torchmetrics.Accuracy(task="multiclass", num_classes=6)

        self.dataset_name = dataset_name

        self.log_train = f"logs/{dataset_name}/train.log"
        os.makedirs(os.path.dirname(self.log_train), exist_ok=True)
        self.log_val = f"logs/{dataset_name}/val.log"
        os.makedirs(os.path.dirname(self.log_val), exist_ok=True)

        # Per-batch histories of the current epoch. The *_batch_sizes lists run
        # parallel to the loss/acc lists and hold the number of graphs per batch.
        self.train_loss_list = []
        self.train_acc_list = []
        self.train_batch_sizes = []
        self.val_loss_list = []
        self.val_acc_list = []
        self.val_batch_sizes = []
        self.test_predictions = []

    @staticmethod
    def _weighted_mean(values, weights):
        """Return the weighted mean of ``values``, or None if the total weight is zero."""
        total = sum(weights)
        if total == 0:
            return None
        return sum(value * weight for value, weight in zip(values, weights)) / total

    def forward(self, batched_data):
        """Return the class scores of the wrapped model for a batch of graphs."""
        output = self.model(batched_data)
        return output

    def training_step(self, batch, batch_idx):
        """Compute the loss, log step-level loss/accuracy and record them for the epoch summary."""
        output = self.forward(batch)
        loss = self.loss_fn(output, batch.y)
        self.log('train_loss_step', loss, on_step=True, on_epoch = False, prog_bar=True)
        self.train_loss_list.append(loss.item())
        preds = torch.argmax(output, dim=1)
        acc = self.eval_metric(preds, batch.y)
        self.log('train_acc_step', acc, on_step=True, on_epoch = False, prog_bar=True)
        self.train_acc_list.append(acc.item())
        # One output row per graph, so this is the batch size.
        self.train_batch_sizes.append(output.size(0))
        return loss

    def validation_step(self, batch, batch_idx):
        """Log step-level validation loss/accuracy and record them for the epoch summary."""
        output = self.forward(batch)
        loss = self.loss_fn(output, batch.y)
        self.log('val_loss_step', loss, on_step=True, on_epoch = False, prog_bar=True)
        preds = torch.argmax(output, dim=1)
        acc = self.eval_metric(preds, batch.y)
        self.log('val_acc_step', acc, on_step=True, on_epoch = False, prog_bar=True)
        self.val_loss_list.append(loss.item())
        self.val_acc_list.append(acc.item())
        self.val_batch_sizes.append(output.size(0))

    def on_train_epoch_end(self):
        """Log the epoch's mean training loss/accuracy and append them to the train log file."""
        # Weighted by batch size so that a smaller final batch does not count
        # as much as a full one.
        avg_loss = self._weighted_mean(self.train_loss_list, self.train_batch_sizes)
        avg_acc = self._weighted_mean(self.train_acc_list, self.train_batch_sizes)
        self.train_acc_list = []
        self.train_loss_list = []
        self.train_batch_sizes = []
        if avg_loss is None:
            # No training batch was recorded; nothing to report.
            return

        self.log('train_loss', avg_loss, on_step=False, on_epoch = True, prog_bar=True)
        self.log('train_acc', avg_acc, on_step=False, on_epoch = True, prog_bar=True)

        with open(self.log_train, 'a') as f:
            f.write(f"Epoch {self.current_epoch}: train_loss: {avg_loss}, train_acc: {avg_acc}\n")

    def on_validation_epoch_end(self):
        """Log the epoch's mean validation loss/accuracy and append them to the val log file."""
        avg_val_loss = self._weighted_mean(self.val_loss_list, self.val_batch_sizes)
        avg_val_acc = self._weighted_mean(self.val_acc_list, self.val_batch_sizes)
        self.val_loss_list = []
        self.val_acc_list = []
        self.val_batch_sizes = []
        if avg_val_loss is None:
            # No validation batch was recorded; nothing to report.
            return

        self.log('val_loss', avg_val_loss, on_step=False, on_epoch = True, prog_bar=True)
        self.log('val_acc', avg_val_acc, on_step=False, on_epoch = True, prog_bar=True)
        # The sanity-check pass is logged to the progress bar but not to the file.
        if not self.trainer.sanity_checking:
            with open(self.log_val, 'a') as f:
                f.write(f"Epoch {self.current_epoch}: val_loss: {avg_val_loss}, val_acc: {avg_val_acc}\n")

    def test_step(self, batch, batch_idx):
        """Predict the class of every graph in the batch and collect the predictions."""
        output = self.forward(batch)
        preds = torch.argmax(output, dim=1)
        preds = preds.cpu().tolist()
        self.test_predictions.extend(preds)

        return preds

    def on_test_epoch_end(self):
        """Write all collected test predictions to the submission CSV and reset the buffer."""
        # Ids are the positions of the graphs in the order they were predicted.
        test_graph_ids = list(range(len(self.test_predictions)))

        output_df = pd.DataFrame({
            "id": test_graph_ids,
            "pred": self.test_predictions
        })
        output_csv_path = f"submission/testset_{self.dataset_name}.csv"
        os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
        output_df.to_csv(output_csv_path, index=False)
        self.test_predictions = []

    def configure_optimizers(self):
        """Use Adam with a learning rate of 0.001 over all parameters."""
        optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
        return optimizer

In [4]:
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import RichProgressBar
from pytorch_lightning.loggers import CSVLogger
import os

# 1. Dataset paths
train_path = "/home/palu001/Github/datasets_jsonl/data/A/train.jsonl"
test_path = "/home/palu001/Github/datasets_jsonl/data/A/test.jsonl"

# 2. The dataset name is the directory that contains the training file (e.g. "A")
dataset_name = os.path.basename(os.path.dirname(train_path))

# 3. Checkpoints are stored per dataset under <project>/checkpoints/<dataset_name>
checkpoint_dir = os.path.join(
    "/home/palu001/Github/Deep-Learning-Hackaton/our_code",
    "checkpoints",
    dataset_name,
)

# 4. Data module (builds the train/validation split and the test set)
dm = GraphDataModule(train_path=train_path, test_path=test_path, batch_size=32)

# 5. Checkpointing: keep the 5 epochs with the highest validation accuracy, plus the last one
checkpoint_callback = ModelCheckpoint(
    dirpath=checkpoint_dir,
    filename=f"model_{dataset_name}_epoch_{{epoch}}",
    auto_insert_metric_name=False,
    monitor='val_acc',
    mode='max',
    save_top_k=5,
    save_last=True,
    save_on_train_epoch_end=True,
    verbose=True,
)

# 6. Model
model = GNNLightning(
    gnn='gin',
    dataset_name=dataset_name,
    num_layer=5,
    emb_dim=300,
    drop_ratio=0.0,
)

# 7. Trainer: single CUDA device, no logger attached
trainer = pl.Trainer(
    accelerator="cuda",
    devices=1,
    max_epochs=10,
    callbacks=[checkpoint_callback],
    logger=False,
)

# 8. Run training
trainer.fit(model, dm)

Dataset size: 11280
Train dataset size: 9024
Validation dataset size: 2256


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3080 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision


Test dataset size: 2340



Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

In [15]:
# Checkpoint to evaluate. The file name follows the pattern used when saving
# ("model_<dataset_name>_epoch_<epoch>"), so it is built from dataset_name
# instead of hard-coding one dataset; change the epoch as needed.
checkpoint_epoch = 0
checkpoint_path = os.path.join(checkpoint_dir, f"model_{dataset_name}_epoch_{checkpoint_epoch}.ckpt")
print(f"Loading checkpoint from: {checkpoint_path}")

# Load the model from the checkpoint. The constructor arguments are passed
# explicitly and must match the ones used for training.
model_test = GNNLightning.load_from_checkpoint(checkpoint_path, gnn='gin', num_layer=5, emb_dim=300, drop_ratio=0.0, dataset_name=dataset_name)

# Run the test loop; predictions are written by the model's test hooks.
trainer.test(model_test, dm)

Loading checkpoint from: /home/palu001/Github/Deep-Learning-Hackaton/our_code/checkpoints/A/model_A_epoch_0.ckpt
Dataset size: 11280
Train dataset size: 9024
Validation dataset size: 2256


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Test dataset size: 2340


Testing: |          | 0/? [00:00<?, ?it/s]

Sono qui 2


[{}]