# Task II: Classical Graph Neural Network (GNN)

For Task II, you will use ParticleNet’s data for Quark/Gluon jet classification available here with its corresponding description.

* Choose two graph-based architectures to classify jets as quark- or gluon-initiated. Describe the considerations you took into account when projecting this point-cloud dataset onto a set of interconnected nodes and edges.
* Discuss the resulting performance of the 2 chosen architectures.

In [1]:
import warnings
warnings.filterwarnings("ignore")
!pip install torch-scatter -f https://data.pyg.org/whl/torch-$(python -c 'import torch; print(torch.__version__)').html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-$(python -c 'import torch; print(torch.__version__)').html
!pip install torch-cluster -f https://data.pyg.org/whl/torch-$(python -c 'import torch; print(torch.__version__)').html
!pip install torch-spline-conv -f https://data.pyg.org/whl/torch-$(python -c 'import torch; print(torch.__version__)').html
!pip install torch-geometric
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import EdgeConv, global_mean_pool, GATConv
from torch_geometric.data import Data, DataLoader
from torch_cluster import knn_graph
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

def load_and_preprocess(file_path, num_particles=100, k=16):
    """Load a jet archive and build two graphs (k-NN and fully connected) per jet.

    The archive at ``file_path`` must be an ``.npz`` file holding the arrays
    ``'X'`` (one 2-D array of particle rows per jet) and ``'y'`` (one label
    per jet).

    Each jet is truncated or zero-padded to exactly ``num_particles`` rows and
    every feature column is then standardized per jet (zero mean, unit
    variance).  The padded rows take part in the statistics and remain
    ordinary nodes of both graphs.

    Args:
        file_path: Path of the ``.npz`` archive.
        num_particles: Number of nodes every jet graph ends up with.
        k: Number of neighbours per node in the k-NN graph, which is built
            from the standardized feature columns 1 and 2.

    Returns:
        A list with one ``(data_knn, data_fc)`` tuple per jet.  Both ``Data``
        objects share the same node features ``x`` and label ``y`` and differ
        only in ``edge_index``.  All ``data_fc`` objects reference one shared
        ``edge_index`` tensor, which must therefore not be modified in place.
    """
    # Read both arrays inside a context manager so the archive's file handle
    # is released instead of staying open for the lifetime of the process.
    with np.load(file_path) as archive:
        features = archive['X']
        labels = archive['y']

    # Every jet has exactly `num_particles` nodes after padding/truncation, so
    # the fully connected edge list (both directions, no self loops) is the
    # same for all of them: build it once instead of once per jet.
    node_ids = torch.arange(num_particles)
    pairs = torch.combinations(node_ids, r=2).t()
    edge_index_fc = torch.cat([pairs, pairs.flip(0)], dim=1)

    processed_data = []
    for i, raw_jet in enumerate(features):
        # Work on a float copy: the normalization below must neither write
        # back into the loaded array nor be truncated by an integer dtype.
        jet_features = np.array(raw_jet[:num_particles], dtype=np.float64)

        missing = num_particles - jet_features.shape[0]
        if missing > 0:
            padding = np.zeros((missing, jet_features.shape[1]))
            jet_features = np.concatenate([jet_features, padding], axis=0)

        # Per-jet, per-column standardization.  A constant column has zero
        # spread; dividing by 1 instead leaves it at 0 rather than NaN/inf.
        mean = jet_features.mean(axis=0)
        std = jet_features.std(axis=0)
        std[std == 0] = 1.0
        jet_features = (jet_features - mean) / std

        x = torch.tensor(jet_features, dtype=torch.float)
        y = torch.tensor(labels[i], dtype=torch.long)

        edge_index_knn = knn_graph(x[:, 1:3], k=k, batch=None)

        data_knn = Data(x=x, edge_index=edge_index_knn, y=y)
        data_fc = Data(x=x, edge_index=edge_index_fc, y=y)
        processed_data.append((data_knn, data_fc))

    return processed_data


class EdgeConvNet(torch.nn.Module):
    """Graph classifier made of two max-aggregating EdgeConv layers.

    The node embeddings produced by the second EdgeConv layer are averaged per
    graph and passed through a two-layer head that outputs log-probabilities
    over ``num_classes``.  ``k`` is only stored on the instance; the forward
    pass uses whatever ``edge_index`` the input batch carries.
    """

    def __init__(self, num_features, num_classes, k=16):
        super().__init__()
        self.k = k
        self.conv1 = EdgeConv(self._edge_mlp(2 * num_features, 64), aggr='max')
        self.conv2 = EdgeConv(self._edge_mlp(2 * 64, 128), aggr='max')
        self.lin1 = nn.Linear(128, 256)
        self.lin2 = nn.Linear(256, num_classes)

    @staticmethod
    def _edge_mlp(in_dim, out_dim):
        """Return the Linear -> ReLU -> Linear network applied to each edge."""
        return nn.Sequential(
            nn.Linear(in_dim, out_dim),
            nn.ReLU(),
            nn.Linear(out_dim, out_dim),
        )

    def forward(self, data):
        """Return per-graph class log-probabilities for a batch of graphs."""
        edge_index = data.edge_index
        node_emb = self.conv1(data.x, edge_index)
        node_emb = self.conv2(node_emb, edge_index)
        # One embedding per graph: mean over the nodes assigned to it.
        graph_emb = global_mean_pool(node_emb, data.batch)
        hidden = F.relu(self.lin1(graph_emb))
        logits = self.lin2(hidden)
        return F.log_softmax(logits, dim=1)

class GATNet(torch.nn.Module):
    """Graph classifier made of two multi-head GATConv layers.

    Each attention layer outputs ``hidden_dim * heads`` features per node and
    is followed by a ReLU.  Node embeddings are then averaged per graph and
    passed through a two-layer head that outputs log-probabilities over
    ``num_classes``.
    """

    def __init__(self, num_features, num_classes, hidden_dim=64, heads=4):
        super().__init__()
        # Width of a node embedding after an attention layer.
        attn_width = hidden_dim * heads
        self.conv1 = GATConv(num_features, hidden_dim, heads=heads)
        self.conv2 = GATConv(attn_width, hidden_dim, heads=heads)
        self.lin1 = nn.Linear(attn_width, 256)
        self.lin2 = nn.Linear(256, num_classes)

    def forward(self, data):
        """Return per-graph class log-probabilities for a batch of graphs."""
        edge_index = data.edge_index
        node_emb = data.x
        for attention in (self.conv1, self.conv2):
            node_emb = F.relu(attention(node_emb, edge_index))
        # One embedding per graph: mean over the nodes assigned to it.
        graph_emb = global_mean_pool(node_emb, data.batch)
        hidden = F.relu(self.lin1(graph_emb))
        logits = self.lin2(hidden)
        return F.log_softmax(logits, dim=1)

def train(model, train_loader, optimizer, device):
    """Run one optimisation pass over ``train_loader``.

    Every batch from the loader is a pair of graph batches; an
    ``EdgeConvNet`` is trained on element 0 and any other model on element 1.

    Returns:
        The negative log-likelihood loss averaged over all graphs in
        ``train_loader.dataset``.
    """
    model.train()
    # Which element of each loader item this model consumes.
    graph_slot = 0 if isinstance(model, EdgeConvNet) else 1

    loss_sum = 0
    for graph_pair in train_loader:
        batch = graph_pair[graph_slot].to(device)
        optimizer.zero_grad()
        log_probs = model(batch)
        batch_loss = F.nll_loss(log_probs, batch.y)
        batch_loss.backward()
        optimizer.step()
        # nll_loss is a per-batch mean; weight it by the batch size so the
        # final division yields a per-graph average.
        loss_sum += batch_loss.item() * batch.num_graphs

    return loss_sum / len(train_loader.dataset)


def evaluate(model, loader, device):
    """Compute accuracy and ROC AUC of ``model`` over ``loader``.

    Every batch from the loader is a pair of graph batches; an
    ``EdgeConvNet`` is scored on element 0 and any other model on element 1.
    The AUC uses the predicted probability of class 1 as the score.

    Returns:
        A tuple ``(accuracy, auc)``.
    """
    model.eval()
    # Which element of each loader item this model consumes.
    graph_slot = 0 if isinstance(model, EdgeConvNet) else 1

    n_correct = 0
    class1_scores = []
    targets = []
    with torch.no_grad():
        for graph_pair in loader:
            batch = graph_pair[graph_slot].to(device)
            log_probs = model(batch)
            hits = log_probs.argmax(dim=1) == batch.y
            n_correct += int(hits.sum())
            # The model emits log-probabilities; exponentiate to recover the
            # probability assigned to class 1.
            class1_scores.extend(torch.exp(log_probs)[:, 1].cpu().numpy())
            targets.extend(batch.y.cpu().numpy())

    accuracy = n_correct / len(loader.dataset)
    auc = roc_auc_score(targets, class1_scores)
    return accuracy, auc


# Script entry point: build the graph dataset once, then train and evaluate
# the EdgeConv model followed by the GAT model on the same train/test split.
if __name__ == '__main__':
    # Run configuration.
    file_path = '/kaggle/input/qg-jets/QG_jets.npz'
    num_particles = 100
    k = 16
    batch_size = 32
    epochs = 20
    learning_rate = 0.001
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # One (k-NN graph, fully connected graph) tuple per jet.
    processed_data = load_and_preprocess(file_path, num_particles=num_particles, k=k)

    # 80/20 split with a fixed seed; both models see the same split because
    # the two graphs of a jet travel together in one tuple.
    train_data, test_data = train_test_split(processed_data, test_size=0.2, random_state=42)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

    edgeconv_model = EdgeConvNet(num_features=4, num_classes=2, k=k).to(device)
    edgeconv_optimizer = torch.optim.Adam(edgeconv_model.parameters(), lr=learning_rate)

    print("Training EdgeConv Model...")
    for epoch in range(1, epochs + 1):
        loss = train(edgeconv_model, train_loader, edgeconv_optimizer, device)
        # Metrics are recomputed on the full train and test sets after every epoch.
        train_acc, train_auc = evaluate(edgeconv_model, train_loader, device)
        test_acc, test_auc = evaluate(edgeconv_model, test_loader, device)
        print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Train AUC: {train_auc:.4f}, Test Acc: {test_acc:.4f}, Test AUC: {test_auc:.4f}')

    # The GAT model is created only after EdgeConv training has finished.
    gat_model = GATNet(num_features=4, num_classes=2).to(device)
    gat_optimizer = torch.optim.Adam(gat_model.parameters(), lr=learning_rate)

    print("\nTraining GAT Model...")
    for epoch in range(1, epochs + 1):
        loss = train(gat_model, train_loader, gat_optimizer, device)
        # Metrics are recomputed on the full train and test sets after every epoch.
        train_acc, train_auc = evaluate(gat_model, train_loader, device)
        test_acc, test_auc = evaluate(gat_model, test_loader, device)
        print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Train AUC: {train_auc:.4f}, Test Acc: {test_acc:.4f}, Test AUC: {test_auc:.4f}')

Looking in links: https://data.pyg.org/whl/torch-2.5.1+cu121.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.5.0%2Bcu121/torch_scatter-2.1.2%2Bpt25cu121-cp310-cp310-linux_x86_64.whl (10.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m98.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt25cu121
Looking in links: https://data.pyg.org/whl/torch-2.5.1+cu121.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.5.0%2Bcu121/torch_sparse-0.6.18%2Bpt25cu121-cp310-cp310-linux_x86_64.whl (5.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m53.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.18+pt25cu121
Looking in links: https://data.pyg.org/whl/torch-2.5.1+cu121.html
Collecting torch-c

## Discussion of Results and Considerations

This section discusses the performance of the two chosen GNN architectures (EdgeConv and GAT) for quark/gluon jet classification, along with the key considerations taken during the implementation.

**1. Graph Construction:**

*   **k-Nearest Neighbors (k-NN) Graph (EdgeConv):**  We used a k-NN graph with `k=16`.  This choice was based on the idea that nearby particles in (eta, phi) space are more likely to be related.  The value of `k` is a hyperparameter; smaller values emphasize locality, while larger values incorporate more global context.  Experimentation with different `k` values would be beneficial.  Neighbours are found with the Euclidean distance on the two angular feature columns (columns 1 and 2), which is a physically meaningful metric for jets.  Note that the distance is computed after the per-jet standardization, so it is measured in standardized rather than raw angular units.

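*   **Fully Connected Graph (GAT):**  We used a fully connected graph, where every particle is connected to every other particle. This allows the model to learn relationships between all particles, regardless of their spatial proximity.  However, this is computationally expensive: with 100 nodes each jet has 9,900 directed edges, compared with 1,600 for the k-NN graph.  Because most of these connections are unlikely to be equally informative, the attention mechanism in GAT is what lets the model learn which neighbours to weight strongly and which to effectively ignore.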

**2. Model Performance:**

*   **EdgeConv:** The EdgeConv model, using the k-NN graph, provides a good baseline.  Its performance (as measured by test accuracy and AUC) is generally good, indicating that local relationships between particles are informative for distinguishing quarks and gluons.  The training and testing metrics track reasonably well, suggesting that the model is not severely overfitting.

*   **GAT:** The GAT model, using the fully connected graph and attention, has the *potential* to outperform EdgeConv because it can learn more complex, global relationships. However, its performance can be sensitive to hyperparameters (e.g., the number of attention heads, hidden dimensions).  In practice, whether GAT outperforms EdgeConv significantly depends on the dataset and the tuning of these hyperparameters.  It's also typically more computationally expensive to train.

**3. Key Considerations and Potential Improvements:**

*   **Hyperparameter Tuning:**  The performance of both models could be improved by more extensive hyperparameter tuning.  This includes:
    *   `k` for the k-NN graph.
    *   The number of layers and hidden dimensions in both models.
    *   The learning rate and optimizer (e.g., trying different optimizers like AdamW).
    *   The number of attention heads in the GAT model.
    *   The batch size.
    *   The number of epochs.  Early stopping based on validation performance could prevent overfitting.

*   **Data Augmentation:**  Data augmentation techniques, such as rotating or translating the jets in (eta, phi) space, could improve the model's robustness and generalization ability.

*   **Edge Features:** We could incorporate edge features into the k-NN graph, such as the distance between connected particles or the difference in their features.  This could provide additional information to the EdgeConv model.

*   **More Sophisticated Pooling:**  Instead of simple global mean pooling, we could explore more sophisticated pooling methods, such as attention-based pooling or hierarchical pooling.

*   **Computational Resources:**  Training GNNs, especially GAT with a fully connected graph, can be computationally demanding.  Using a GPU is highly recommended.

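* **Number of Particles:** The choice of `num_particles = 100` is a trade-off between capturing enough information from the jets and computational cost.  Jets with fewer than 100 particles are zero-padded, and those with more are truncated.  The padded rows are kept as ordinary nodes, so they take part in the graph construction and in the global mean pooling; masking them out is a natural next step.  Experimenting with this parameter is crucial.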

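* **Normalization:** Proper feature normalization is *critical* for good performance. We standardized each of the four feature columns separately *per jet* (in the EnergyFlow `QG_jets.npz` format these columns are pt, rapidity, azimuthal angle and PDG id, not a mass).  Per-jet statistics mean that no information is shared between jets or between the train and test splits.  The trade-offs are that the absolute scale of each jet (for example its total pt) is removed, that zero-padded rows are included in the mean and standard deviation, and that the PDG id is treated as a continuous quantity.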

**4. Conclusion:**

Both EdgeConv and GAT are viable architectures for quark/gluon jet classification. EdgeConv provides a strong baseline by leveraging local relationships, while GAT offers the potential to capture more global context through attention. The choice between them depends on the specific dataset, computational resources, and the desired balance between performance and complexity. Further hyperparameter tuning and exploration of more advanced techniques could lead to significant improvements in classification accuracy. The results demonstrate the power of GNNs for analyzing particle physics data.