### Install Libraries

In [1]:
!pip install datasets
!pip install torch

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

### Import Libraries

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report


### Load the PROTEINS Dataset

In [3]:
dataset = load_dataset("graphs-datasets/PROTEINS")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/4.86k [00:00<?, ?B/s]

full.jsonl:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1113 [00:00<?, ? examples/s]

### Explore the Dataset Structure

In [4]:
print("Dataset Features:", dataset['train'].features)
print("Number of Graphs:", len(dataset['train']))

Dataset Features: {'edge_index': Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None), 'node_feat': Sequence(feature=Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), length=-1, id=None), 'y': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'num_nodes': Value(dtype='int64', id=None)}
Number of Graphs: 1113


### Check Class Distribution

In [5]:
labels = [graph['y'] for graph in dataset['train']]
unique_labels, counts = np.unique(labels, return_counts=True)
print("Class Distribution:")
for label, count in zip(unique_labels, counts):
    print(f"Class {label}: {count} graphs")

Class Distribution:
Class 0: 663 graphs
Class 1: 450 graphs


### Compute Class Weights

In [6]:
# Extract labels correctly
labels = [graph['y'][0] for graph in dataset['train']]
labels = labels[600:800]
unique_labels, counts = np.unique(labels, return_counts=True)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=unique_labels, y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float)
print("Class Weights:", class_weights)


Class Weights: tensor([1.5873, 0.7299])


### Custom Dataset Class

In [7]:
class PROTEINSDataset(Dataset):
    def __init__(self, data_list):
        self.data_list = data_list

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        graph = self.data_list[idx]

        # Convert data to tensors
        edge_index = torch.tensor(graph['edge_index'], dtype=torch.long)
        node_features = torch.tensor(graph['node_feat'], dtype=torch.float)
        num_nodes = graph['num_nodes']
        graph_label = torch.tensor(graph['y'][0], dtype=torch.long)  # Corrected label extraction

        # Create adjacency matrix
        adj = torch.zeros((num_nodes, num_nodes), dtype=torch.float)
        adj[edge_index[0], edge_index[1]] = 1.0
        adj[edge_index[1], edge_index[0]] = 1.0  # Assuming undirected graph

        return node_features, adj, graph_label


### Split Dataset into Training and Testing Sets

In [8]:
# Convert the dataset to a list
data_list = dataset['train']
print(len(data_list))
# Generate labels for stratification
#labels = [graph['y'][0] for graph in data_list]

# Generate indices for splitting
indices = list(range(200))
train_indices, test_indices = train_test_split(indices, test_size=0.25)

# Create subsets
train_data_list = [data_list[i+600] for i in train_indices]
test_data_list = [data_list[i+600] for i in test_indices]


# Create dataset instances
train_dataset = PROTEINSDataset(train_data_list)
test_dataset = PROTEINSDataset(test_data_list)
print(len(train_dataset))
print(len(test_dataset))

1113
150
50


### Create Data Loaders

In [9]:

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1)

### Graph Attention Layer

In [10]:
class GraphAttentionLayer(nn.Module):
    def __init__(self, in_features, out_features, dropout, concat=True):
        super(GraphAttentionLayer, self).__init__()
        self.dropout = dropout
        self.in_features = in_features
        self.out_features = out_features
        self.concat = concat

        self.W = nn.Parameter(torch.empty(size=(in_features, out_features)))
        # Changed initialization
        nn.init.kaiming_normal_(self.W.data)

        self.a = nn.Parameter(torch.empty(size=(2*out_features, 1)))
        nn.init.kaiming_normal_(self.a.data)

        # Add skip connection
        self.skip = nn.Linear(in_features, out_features) if in_features != out_features else nn.Identity()

        # Add layer normalization
        self.layer_norm = nn.LayerNorm(out_features)

        self.relu = nn.ReLU()

    def forward(self, h, adj):
        # Store identity for skip connection
        identity = self.skip(h)

        Wh = torch.mm(h, self.W)
        a_input = self._prepare_attentional_mechanism_input(Wh)
        e = self.relu(torch.matmul(a_input, self.a).squeeze(2))

        zero_vec = -9e15*torch.ones_like(e)
        attention = torch.where(adj > 0, e, zero_vec)
        attention = F.dropout(attention, self.dropout, training=self.training)
        attention = torch.softmax(attention, dim=1)

        h_prime = torch.matmul(attention, Wh)

        # Add skip connection and normalize
        out = self.layer_norm(h_prime + identity)

        if self.concat:
            return F.elu(out)
        else:
            return out

    def _prepare_attentional_mechanism_input(self, Wh):
        N = Wh.size()[0]  # Number of nodes

        Wh_repeated_in_chunks = Wh.repeat_interleave(N, dim=0)
        Wh_repeated_alternating = Wh.repeat(N, 1)

        all_combinations_matrix = torch.cat([Wh_repeated_in_chunks, Wh_repeated_alternating], dim=1)
        return all_combinations_matrix.view(N, N, 2 * self.out_features)


### Global Attention Pooling Layer

In [11]:
class GlobalAttentionPooling(nn.Module):
    def __init__(self, in_features):
        super(GlobalAttentionPooling, self).__init__()
        hidden_dim = in_features // 2
        self.gate_nn = nn.Sequential(
            nn.Linear(in_features, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, x):
        gate_scores = self.gate_nn(x)
        attention_weights = torch.softmax(gate_scores, dim=0)
        weighted_x = x * attention_weights
        return torch.sum(weighted_x, dim=0)


### Enhanced GAT Model


In [12]:
class GAT(nn.Module):
    def __init__(self, nfeat, nhid, nclass, dropout, nheads, n_layers=3):
        super(GAT, self).__init__()
        self.dropout = dropout
        self.n_layers = n_layers

        # Add input normalization
        self.input_norm = nn.BatchNorm1d(nfeat)

        # Increase width in first layer
        first_layer_hidden = nhid * 2
        self.attentions = nn.ModuleList([
            GraphAttentionLayer(nfeat, first_layer_hidden, dropout, concat=True)
            for _ in range(nheads)
        ])

        # Hidden layers
        self.hidden_layers = nn.ModuleList()
        current_dim = first_layer_hidden * nheads
        for _ in range(n_layers - 2):
            layer = nn.ModuleList([
                GraphAttentionLayer(current_dim, nhid, dropout, concat=True)
                for _ in range(nheads)
            ])
            self.hidden_layers.append(layer)
            current_dim = nhid * nheads

        # Final attention layer
        self.final_attentions = nn.ModuleList([
            GraphAttentionLayer(current_dim, nhid, dropout, concat=False)
            for _ in range(nheads)
        ])

        # Improved global pooling
        self.global_pool = GlobalAttentionPooling(nhid * nheads)

        # Multi-layer classifier
        self.classifier = nn.Sequential(
            nn.Linear(nhid * nheads, nhid),
            nn.LayerNorm(nhid),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(nhid, nclass)
        )

    def forward(self, x, adj):
        # Normalize input features
        x = self.input_norm(x)

        # First GAT Layer
        x = torch.cat([att(x, adj) for att in self.attentions], dim=1)
        x = F.dropout(x, self.dropout, training=self.training)

        # Hidden layers with residual
        for layer in self.hidden_layers:
            x_res = x
            x = torch.cat([att(x, adj) for att in layer], dim=1)
            x = F.dropout(x, self.dropout, training=self.training)
            if x.shape == x_res.shape:
                x = x + x_res

        # Final attention layer
        x = torch.cat([att(x, adj) for att in self.final_attentions], dim=1)

        # Global pooling
        x = self.global_pool(x)

        # Classification
        x = self.classifier(x)
        return F.log_softmax(x, dim=0)

### Set Hyperparameters and Initialize Model

In [13]:
# Hyperparameters
nfeat = train_dataset[0][0].shape[1]
nhid = 256                  # Increased from 256
nclass = len(unique_labels)
dropout = 0.2                 # Reduced from 0.5
nheads = 16                   # Adjusted from 16
lr = 0.0001              # Changed from 0.0001
epochs = 200


# Initialize the model
model = GAT(nfeat=nfeat,
            nhid=nhid,
            nclass=nclass,
            dropout=dropout,
            nheads=nheads).to('cuda')

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss(weight=class_weights).to('cuda')


### Training Loop with Early Stopping

In [14]:
# Initialize variables to track the best model
best_accuracy = 0.0
best_model_weights = None

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct = 0

    # Training loop
    for node_features, adj, label in train_loader:
        optimizer.zero_grad()
        node_features = node_features.squeeze(0).to('cuda')
        adj = adj.squeeze(0).to('cuda')
        label = label.squeeze(0).to('cuda')

        # Forward pass
        output = model(node_features, adj)
        loss = criterion(output.unsqueeze(0), label.unsqueeze(0))

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        pred = output.argmax(dim=0)
        correct += (pred == label).sum().item()

    # Calculate training accuracy for the epoch
    train_accuracy = correct / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}, Train Acc: {train_accuracy:.4f}")

    # Testing every 10 epochs
    if (epoch + 1) % 5 == 0:
        model.eval()
        test_loss = 0
        correct = 0

        with torch.no_grad():
            for node_features, adj, label in test_loader:
                # Move data to GPU
                node_features = node_features.squeeze(0).to('cuda')
                adj = adj.squeeze(0).to('cuda')
                label = label.squeeze(0).to('cuda')

                # Forward pass for test data
                output = model(node_features, adj)
                loss = criterion(output.unsqueeze(0), label.unsqueeze(0))

                test_loss += loss.item()
                pred = output.argmax(dim=0)
                correct += (pred == label).sum().item()

        # Calculate test accuracy for this epoch
        test_accuracy = correct / len(test_loader)
        print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

        # Check if the current model has the best test accuracy
        if test_accuracy > best_accuracy:
            best_accuracy = test_accuracy
            best_model_weights = model.state_dict().copy()  # Save the best model weights

# Load the best model weights after training
model.load_state_dict(best_model_weights)
print(f"Best Test Accuracy: {best_accuracy:.4f}")


Epoch 1/200, Loss: 97.0671, Train Acc: 0.7000
Epoch 2/200, Loss: 89.1253, Train Acc: 0.6667
Epoch 3/200, Loss: 85.7614, Train Acc: 0.6667
Epoch 4/200, Loss: 86.4857, Train Acc: 0.7067
Epoch 5/200, Loss: 85.1855, Train Acc: 0.6867
Test Loss: 31.8641, Test Accuracy: 0.5600
Epoch 6/200, Loss: 84.7327, Train Acc: 0.6867
Epoch 7/200, Loss: 82.2685, Train Acc: 0.7000
Epoch 8/200, Loss: 83.4484, Train Acc: 0.6933
Epoch 9/200, Loss: 84.3490, Train Acc: 0.7200
Epoch 10/200, Loss: 82.6951, Train Acc: 0.7200
Test Loss: 38.5437, Test Accuracy: 0.5200
Epoch 11/200, Loss: 82.4437, Train Acc: 0.6933
Epoch 12/200, Loss: 80.5714, Train Acc: 0.7333
Epoch 13/200, Loss: 82.9797, Train Acc: 0.7067
Epoch 14/200, Loss: 79.7050, Train Acc: 0.7000
Epoch 15/200, Loss: 81.3768, Train Acc: 0.6933
Test Loss: 37.5952, Test Accuracy: 0.5600
Epoch 16/200, Loss: 79.1950, Train Acc: 0.7067
Epoch 17/200, Loss: 80.5221, Train Acc: 0.7267
Epoch 18/200, Loss: 80.6316, Train Acc: 0.6733
Epoch 19/200, Loss: 82.0595, Train Ac