In [None]:
# !pip install torch#==2.2
# !pip install torch_geometric#==2.4.0
# !pip install seaborn#==0.12.2
# !pip install networkx#==2.8.5
# !pip install scikit-learn#==1.3.2
# !pip install matplotlib#==3.5.2
# !pip install pandas#==1.4.3

In [None]:
import os
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.metrics import f1_score
from copy import deepcopy  

import torch
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import GATConv
from torch_geometric.utils import to_networkx
from torch_geometric.datasets import Planetoid

In [None]:
def visualize(data, labels):
    """Scatter-plot a 2-D t-SNE embedding of ``data``, coloured by ``labels``.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Feature matrix handed to ``TSNE.fit_transform``.
    labels : array-like of length n_samples
        One class label per row of ``data``; used as the hue of the scatter plot.

    Returns
    -------
    None
        The plot is drawn on a new 12x12 inch matplotlib figure.
    """
    tsne = TSNE(n_components=2, init='pca', random_state=7)
    tsne_res = tsne.fit_transform(data)

    # Only the two embedding coordinates and the hue column are plotted, so the
    # feature matrix itself is not copied into the frame. The index of a
    # DataFrame input is kept so that Series labels still align on it.
    frame_index = data.index if isinstance(data, pd.DataFrame) else None
    v = pd.DataFrame(tsne_res, columns=["dim1", "dim2"], index=frame_index)
    v['color'] = labels

    # Give seaborn exactly one colour per class: a palette list whose length
    # differs from the number of hue levels triggers a warning (and an error on
    # older seaborn releases). Colours are reused cyclically if there are more
    # classes than base colours.
    class_colors = ["#52D1DC", "#8D0004", "#845218", "#563EAA",
                    "#E44658", "#63C100", "#FF7800"]
    n_classes = v['color'].nunique()
    palette = [class_colors[i % len(class_colors)] for i in range(n_classes)]

    plt.figure(figsize=(12,12))

    sns.scatterplot(
        x="dim1", y="dim2",
        hue="color",
        palette=sns.color_palette(palette),
        legend=False,
        data=v,
    )

In [None]:
def visualize_graph(G, color):
    """Draw graph ``G`` on a 75x75 inch figure using the ARF layout.

    ``color`` is forwarded as ``node_color`` and mapped through the "rainbow"
    colormap. Axis ticks are hidden and node labels are not drawn.

    NOTE: a later cell of this notebook defines another function with the
    same name, which replaces this one once that cell has run.
    """
    plt.figure(figsize=(75, 75))
    for hide_ticks in (plt.xticks, plt.yticks):
        hide_ticks([])
    node_positions = nx.arf_layout(G)
    nx.draw_networkx(
        G,
        pos=node_positions,
        node_color=color,
        cmap="rainbow",
        with_labels=False,
    )
    plt.show()

In [None]:
from torch_geometric.datasets import EllipticBitcoinDataset

In [None]:
# Load the Elliptic Bitcoin dataset, using data/EllipticBitcoinDataset as its root directory.
dataset = EllipticBitcoinDataset(root='data/EllipticBitcoinDataset')

In [None]:
# Run the two helper scripts that accompany this notebook.
# NOTE(review): later cells use visualize_elliptic_dataset_overview, ModelTrainer and
# ModelComparison, none of which are defined in the notebook itself -- presumably they
# are provided by these scripts; confirm.
%run elliptic_data_visual_schema.py
%run model_trainer_compare.py

In [None]:
# Dataset overview figure; the helper is not defined in this notebook
# (presumably it comes from one of the %run scripts -- confirm).
# save_path suggests the figure is also written to 'elliptic_overview.png' -- TODO confirm.
visualize_elliptic_dataset_overview(dataset[0], dataset, save_path='elliptic_overview.png')

In [None]:
# Take the first (index 0) graph of the dataset; every later cell works on this `data` object.
data = dataset[0]
# Show how many nodes carry each value of the label tensor `y`.
pd.Series(data.y.numpy()).value_counts()

In [None]:
# Number of nodes covered by the train and test masks together.
print(data.train_mask.sum() + data.test_mask.sum())
# Sum of the label values over those nodes.
# NOTE(review): this equals the number of positive nodes only if labels there are 0/1 -- confirm the encoding.
print(data.y[data.train_mask].sum() + data.y[data.test_mask].sum())

In [None]:
# Reproducible split of the labelled training nodes into train / validation.
#
# The pool is rebuilt from the union of the current train mask and any existing
# validation mask, and the permutation uses a dedicated seeded generator.
# Re-running this cell therefore reproduces the same split instead of carving a
# new validation set out of an already-reduced train mask (which would silently
# drop labelled nodes).
SPLIT_SEED = 20251120   # seed for the train/validation permutation
VAL_FRACTION = 0.2      # share of the labelled training pool used for validation

# Ensure `train_mask` is a Boolean tensor
train_mask = data.train_mask.bool()

# Fold a previously created validation mask back into the pool
existing_val_mask = getattr(data, "val_mask", None)
if existing_val_mask is not None:
    train_mask = train_mask | existing_val_mask.bool()

# Indices of all nodes available for training / validation
train_indices = train_mask.nonzero(as_tuple=True)[0]

# Shuffle with a private generator so the global RNG state is left untouched
split_rng = torch.Generator().manual_seed(SPLIT_SEED)
train_indices = train_indices[torch.randperm(train_indices.size(0), generator=split_rng)]

# Size of the validation set
val_size = int(VAL_FRACTION * len(train_indices))

# Split the shuffled indices into validation and new training sets
val_indices = train_indices[:val_size]
new_train_indices = train_indices[val_size:]

# Create new masks for training and validation
new_train_mask = torch.zeros_like(train_mask)
new_val_mask = torch.zeros_like(train_mask)

new_train_mask[new_train_indices] = True
new_val_mask[val_indices] = True

# The two masks must partition the pool: no overlap, nothing lost
assert not (new_train_mask & new_val_mask).any()
assert torch.equal(new_train_mask | new_val_mask, train_mask)

# Update the data object with the new masks
data.train_mask = new_train_mask
data.val_mask = new_val_mask

# Verify the sizes of the new masks
print("Training nodes:", new_train_mask.sum().item())
print("Validation nodes:", new_val_mask.sum().item())
print("Testing nodes:", data.test_mask.sum().item())

In [None]:
# Display the subgraph of `data` restricted to the validation node indices.
data.subgraph(subset=val_indices)

In [None]:
def visualize_graph(G, color, figsize=(60, 60), save_path='foo.svg', seed=None):
    """Draw graph ``G`` with a random layout, optionally save it, then show it.

    Parameters
    ----------
    G : networkx graph
        Graph to draw; edges are rendered with arrows.
    color : array-like
        Per-node values forwarded as ``node_color`` and mapped through the
        "winter_r" colormap.
    figsize : tuple of float, optional
        Figure size in inches. Defaults to the original (60, 60).
    save_path : str or None, optional
        File the figure is written to before it is shown. Defaults to the
        original 'foo.svg'; pass ``None`` to skip saving.
    seed : int or None, optional
        Seed forwarded to ``nx.random_layout``. ``None`` (the default) keeps the
        original behaviour of a different layout on every call.
    """
    plt.figure(figsize=figsize)
    plt.xticks([])
    plt.yticks([])
    nx.draw_networkx(G, pos=nx.random_layout(G, seed=seed), with_labels=False,
                     node_color=color, cmap="winter_r", arrows=True)
    # Save before show(): showing may release the figure, after which savefig
    # would write an empty canvas.
    if save_path is not None:
        plt.savefig(save_path)
    plt.show()

In [None]:
# Extract the validation subgraph once and reuse it for both the graph
# structure and the node labels used as colours.
val_subgraph = data.subgraph(subset=val_indices)
G = to_networkx(val_subgraph)
visualize_graph(G, color=val_subgraph.y)

In [None]:
# Train / validation split (idempotent).
#
# Splitting `data.train_mask` directly would be destructive once a validation
# mask exists: 20% of the *remaining* training nodes would become the new
# validation set and the previous validation nodes would end up in neither
# mask. The pool is therefore rebuilt from train ∪ val before splitting, and the
# permutation is seeded so the result is reproducible.
SPLIT_SEED = 20251120   # seed for the train/validation permutation
VAL_FRACTION = 0.2      # share of the labelled training pool used for validation

# Ensure train_mask is Boolean and fold any existing validation mask back in
train_mask = data.train_mask.bool()
existing_val_mask = getattr(data, "val_mask", None)
if existing_val_mask is not None:
    train_mask = train_mask | existing_val_mask.bool()

train_indices = train_mask.nonzero(as_tuple=True)[0]
# Private generator: reproducible and does not disturb the global RNG state
split_rng = torch.Generator().manual_seed(SPLIT_SEED)
train_indices = train_indices[torch.randperm(train_indices.size(0), generator=split_rng)]

# Define validation size
val_size = int(VAL_FRACTION * len(train_indices))
val_indices = train_indices[:val_size]
new_train_indices = train_indices[val_size:]

# Create new masks
new_train_mask = torch.zeros_like(train_mask)
new_val_mask = torch.zeros_like(train_mask)
new_train_mask[new_train_indices] = True
new_val_mask[val_indices] = True

# The two masks must partition the pool: no overlap, nothing lost
assert not (new_train_mask & new_val_mask).any()
assert torch.equal(new_train_mask | new_val_mask, train_mask)

# Update data object with new masks
data.train_mask = new_train_mask
data.val_mask = new_val_mask  # creates (or overwrites) the val_mask attribute

# Verify sizes
print(f"Training nodes:   {new_train_mask.sum().item():,}")
print(f"Validation nodes: {new_val_mask.sum().item():,}")
print(f"Testing nodes:    {data.test_mask.sum().item():,}")
print("="*60)



In [None]:
# Hyper-parameters shared by the MLP, GCN and GAT experiments below.
HIDDEN_CHANNELS = 8     # hidden_channels passed to each model constructor
NUM_EPOCHS = 1000       # passed to ModelTrainer.train(num_epochs=...)
LR = 0.01               # passed to ModelTrainer(lr=...)
WEIGHT_DECAY = 1e-4     # passed to ModelTrainer(weight_decay=...)
PRINT_EVERY = 100       # passed to ModelTrainer.train(print_every=...)

# Initialize comparison
# Each trainer is registered on this object via add_model() after training.
# NOTE(review): ModelComparison is not defined in this notebook -- presumably provided by a %run script.
comparison = ModelComparison()


# 1) Multi-Layer Perceptron (MLP)

**Setup.** Each node $i$ is described by a feature vector $\mathbf{x}_i \in \mathbb{R}^{165}$; the model sees only these features, never the edges.

**Model.**
$$
\begin{aligned}
\mathbf{h}_1 &= \sigma\!\left(\mathbf{W}_1 \mathbf{x}_i + \mathbf{b}_1\right), \\
\mathbf{z}_i &= \mathbf{W}_2 \mathbf{h}_1 + \mathbf{b}_2, \\
\hat{\mathbf{y}}_i &= \mathrm{softmax}(\mathbf{z}_i) \in \Delta^{1}.
\end{aligned}
$$

**Loss (binary cross-entropy).**
$$
\mathcal{L} \;=\; -\sum_{i}\sum_{c\in\{0,1\}} y_{ic}\,\log \hat{y}_{ic}.
$$

**Notes.** $\sigma$ is typically ReLU. The model uses no graph context: nodes are treated independently.


In [None]:

class MLP(torch.nn.Module):
    """Multi-Layer Perceptron (baseline model).

    Two linear layers with a ReLU and dropout in between. The graph structure
    is ignored; ``forward`` accepts ``edge_index`` only so the model can be
    called the same way as the graph models.

    Parameters
    ----------
    num_features : int
        Size of each input feature vector.
    hidden_channels : int
        Width of the hidden layer.
    num_classes : int
        Number of output logits per node.
    seed : int, optional
        Passed to ``torch.manual_seed`` before the layers are created so that
        the initial weights are reproducible. Note that this reseeds the
        global torch RNG.
    dropout : float, optional
        Dropout probability applied to the hidden activations during training.
        Defaults to 0.5.
    """
    def __init__(self, num_features, hidden_channels, num_classes, seed=20251120, dropout=0.5):
        super().__init__()
        torch.manual_seed(seed)
        self.lin1 = Linear(num_features, hidden_channels)
        self.lin2 = Linear(hidden_channels, num_classes)
        # Plain float attribute: not a parameter/buffer, so state_dict is unchanged.
        self.dropout = dropout

    def forward(self, x, edge_index=None):  # edge_index optional for compatibility
        """Return unnormalised class logits for every row of ``x``."""
        x = self.lin1(x)
        x = x.relu()
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.lin2(x)
        return x


# ============================================================
# Train MLP
# ============================================================
print("\n" + "="*60)
print("TRAINING MLP MODEL")
print("="*60)

mlp_model = MLP(
    num_features=dataset.num_features,
    hidden_channels=HIDDEN_CHANNELS,
    num_classes=dataset.num_classes
)

# Untrained model for baseline: an independent copy of the freshly initialised
# weights, taken before training. Copying the model itself (instead of
# rebuilding it with a literal hidden size) keeps the snapshot valid when
# HIDDEN_CHANNELS changes.
untrained_model = deepcopy(mlp_model)

mlp_trainer = ModelTrainer(
    model=mlp_model,
    data=data,
    model_name="MLP",
    lr=LR,
    weight_decay=WEIGHT_DECAY
)

mlp_trainer.train(num_epochs=NUM_EPOCHS, print_every=PRINT_EVERY)
mlp_trainer.test()
mlp_trainer.plot_learning_curves()
comparison.add_model("MLP", mlp_trainer)



In [None]:
  
    
    # Visualize trained model predictions
    mlp_trainer.visualize_predictions_tsne(data.test_mask, "Test")
    
    # Compare before/after training
    mlp_trainer.visualize_before_after(data.test_mask, untrained_model, "Test")
    
    # Detailed error analysis
    mlp_trainer.analyze_errors(data.test_mask, "Test")

# ========================================

# 2) Graph Convolutional Network (GCN)

**Graph definition**  
Let $G=(V,E)$ have adjacency matrix $\mathbf{A}$ and degree matrix $\mathbf{D}$.  
Add self-loops: $\tilde{\mathbf{A}}=\mathbf{A}+\mathbf{I}$,  
$\tilde{\mathbf{D}}_{ii}=\sum_j \tilde{\mathbf{A}}_{ij}$.

**Layer propagation**
$$
\boxed{
\mathbf{H}^{(\ell+1)} = 
\sigma\!\left(
\tilde{\mathbf{D}}^{-\frac{1}{2}}
\tilde{\mathbf{A}}
\tilde{\mathbf{D}}^{-\frac{1}{2}}
\mathbf{H}^{(\ell)}
\mathbf{W}^{(\ell)}
\right)
}
\quad\text{with}\quad
\mathbf{H}^{(0)}=\mathbf{X}.
$$

**Output layer (logits)**
$$
\mathbf{Z} =
\tilde{\mathbf{D}}^{-\frac{1}{2}}\tilde{\mathbf{A}}\tilde{\mathbf{D}}^{-\frac{1}{2}}
\mathbf{H}^{(L-1)}\mathbf{W}^{(L-1)}.
$$

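**Intuition.** Normalized neighborhood averaging (isotropic).  
Acts as a low-pass filter in the spectral domain of the graph Laplacian: high-frequency components of the node signal (large differences between neighbors) are attenuated.  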
After $L$ layers, each node aggregates information from its $L$-hop neighborhood.


In [None]:



class GCN(torch.nn.Module):
    """Graph Convolutional Network.

    Two ``GCNConv`` layers with a ReLU and dropout in between.

    Parameters
    ----------
    num_features : int
        Size of each input node feature vector.
    hidden_channels : int
        Output width of the first convolution.
    num_classes : int
        Number of output logits per node.
    seed : int, optional
        Passed to ``torch.manual_seed`` before the layers are created so that
        the initial weights are reproducible. Note that this reseeds the
        global torch RNG.
    dropout : float, optional
        Dropout probability applied to the hidden activations during training.
        Defaults to 0.5.
    """
    def __init__(self, num_features, hidden_channels, num_classes, seed = 20251120, dropout=0.5):
        super().__init__()
        torch.manual_seed(seed)
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)
        # Plain float attribute: not a parameter/buffer, so state_dict is unchanged.
        self.dropout = dropout

    def forward(self, x, edge_index):
        """Return unnormalised class logits for every node."""
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return x


    
# ============================================================
# Train GCN
# ============================================================
print("\n" + "="*60)
print("TRAINING GCN MODEL")
print("="*60)

gcn_model = GCN(
    num_features=dataset.num_features,
    hidden_channels=HIDDEN_CHANNELS,
    num_classes=dataset.num_classes
)

# Untrained model for baseline: an independent copy of the freshly initialised
# weights, taken before training. Copying the model itself (instead of
# rebuilding it with a literal hidden size) keeps the snapshot valid when
# HIDDEN_CHANNELS changes.
untrained_model = deepcopy(gcn_model)


gcn_trainer = ModelTrainer(
    model=gcn_model,
    data=data,
    model_name="GCN",
    lr=LR,
    weight_decay=WEIGHT_DECAY
)

gcn_trainer.train(num_epochs=NUM_EPOCHS, print_every=PRINT_EVERY)
gcn_trainer.test()
gcn_trainer.plot_learning_curves()
comparison.add_model("GCN", gcn_trainer)

 

In [None]:

# Visualize trained model predictions
# (all three calls below are restricted to the nodes selected by data.test_mask)
gcn_trainer.visualize_predictions_tsne(data.test_mask, "Test")

# Compare before/after training
# `untrained_model` is expected to be the GCN snapshot here; the same name is also
# assigned in the MLP training cell, so this relies on the cells running in order.
gcn_trainer.visualize_before_after(data.test_mask, untrained_model, "Test")

# Detailed error analysis
gcn_trainer.analyze_errors(data.test_mask, "Test")



# ========================================

# 3) Graph Attention Network (GAT)

**Concept.** Replace uniform neighbor averaging with learned, anisotropic attention weights.

**Per-layer computations (single head)**
$$
\begin{aligned}
\mathbf{h}_i &= \mathbf{W}\mathbf{x}_i, \\[4pt]
e_{ij} &= \mathrm{LeakyReLU}\!\left(\mathbf{a}^{\top}[\mathbf{h}_i \Vert \mathbf{h}_j]\right), \quad j\in\mathcal{N}(i), \\[4pt]
\alpha_{ij} &= \frac{\exp(e_{ij})}{\sum_{k\in\mathcal{N}(i)} \exp(e_{ik})}, \\[6pt]
\mathbf{h}_i' &= \sigma\!\left(\sum_{j\in\mathcal{N}(i)} \alpha_{ij}\,\mathbf{h}_j\right).
\end{aligned}
$$

**Multi-head attention**
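$$
\mathbf{h}_i' =
\big\Vert_{k=1}^{K}
\sigma\!\left(\sum_{j\in\mathcal{N}(i)} \alpha_{ij}^{(k)}\,\mathbf{h}_j^{(k)}\right)
\quad\text{or}\quad
\mathbf{h}_i' =
\sigma\!\left(\frac{1}{K}\sum_{k=1}^{K}
\sum_{j\in\mathcal{N}(i)} \alpha_{ij}^{(k)}\,\mathbf{h}_j^{(k)}\right).
$$

Concatenation is used in hidden layers; averaging (applied before the nonlinearity) is used in the output layer.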

**Intuition.** Attention coefficients $\alpha_{ij}$ emphasize informative neighbors and suppress noisy ones, allowing data-dependent message passing.


In [None]:

class GAT(torch.nn.Module):
    """Graph Attention Network.

    Two ``GATConv`` layers. The first uses ``heads`` attention heads; the second
    takes ``hidden_channels * heads`` input channels and uses a single head.
    Dropout is applied to the input features and again to the hidden
    activations.

    Parameters
    ----------
    num_features : int
        Size of each input node feature vector.
    hidden_channels : int
        Output width per attention head of the first layer.
    num_classes : int
        Number of output logits per node.
    heads : int, optional
        Number of attention heads in the first layer. Defaults to 4.
    seed : int, optional
        Passed to ``torch.manual_seed`` before the layers are created so that
        the initial weights are reproducible. Note that this reseeds the
        global torch RNG.
    dropout : float, optional
        Dropout probability used at both dropout sites during training.
        Defaults to 0.5.
    """
    def __init__(self, num_features, hidden_channels, num_classes, heads=4, seed = 20251120, dropout=0.5):
        super().__init__()
        torch.manual_seed(seed)
        self.conv1 = GATConv(num_features, hidden_channels, heads=heads)
        self.conv2 = GATConv(hidden_channels * heads, num_classes, heads=1)
        # Plain float attribute: not a parameter/buffer, so state_dict is unchanged.
        self.dropout = dropout

    def forward(self, x, edge_index):
        """Return unnormalised class logits for every node."""
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return x


# ============================================================
# Train GAT
# ============================================================
print("\n" + "="*60)
print("TRAINING GAT MODEL")
print("="*60)

gat_model = GAT(
    num_features=dataset.num_features,
    hidden_channels=HIDDEN_CHANNELS,
    num_classes=dataset.num_classes,
    heads=4
)

# Untrained model for baseline: an independent copy of the freshly initialised
# weights, taken before training. Copying the model itself (instead of
# rebuilding it with a literal hidden size and the default head count) keeps
# the snapshot valid when HIDDEN_CHANNELS or `heads` changes.
ungat_model = deepcopy(gat_model)


gat_trainer = ModelTrainer(
    model=gat_model,
    data=data,
    model_name="GAT",
    lr=LR,
    weight_decay=WEIGHT_DECAY
)

gat_trainer.train(num_epochs=NUM_EPOCHS, print_every=PRINT_EVERY)
gat_trainer.test()
gat_trainer.plot_learning_curves()
comparison.add_model("GAT", gat_trainer)


In [None]:

# Visualize trained model predictions
# (all three calls below are restricted to the nodes selected by data.test_mask)
gat_trainer.visualize_predictions_tsne(data.test_mask, "Test")

# Compare before/after training
# `ungat_model` is the untrained GAT snapshot created in the GAT training cell.
gat_trainer.visualize_before_after(data.test_mask, ungat_model, "Test")

# Detailed error analysis
gat_trainer.analyze_errors(data.test_mask, "Test")



# 4) Comparative Summary

| Model | Aggregation | Key operator | Information scope | Behavior |
|:------|:-------------|:--------------|:------------------|:----------|
| **MLP** | None (independent nodes) | Linear layers + nonlinearity | Self only | Ignores graph topology |
| **GCN** | Normalized uniform average | $\tilde{D}^{-1/2}\tilde{A}\tilde{D}^{-1/2}$ | $L$-hop neighbors | Smooth, robust on homophilous graphs |
| **GAT** | Attention-weighted average | $\alpha_{ij}=\mathrm{softmax}(e_{ij})$ | $L$-hop neighbors | Adaptive, handles heterophily or noise |

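**Bias–variance intuition**  
- **GCN:** stronger bias (fixed smoothing weights) and lower variance; more stable on regular structures.  
- **GAT:** lower bias, since it learns the relevance of each edge dynamically, at the cost of higher variance and more parameters to fit.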


In [None]:

# ============================================================
# Compare All Models
# ============================================================
# `comparison` holds the trainers registered via add_model() in the MLP, GCN and
# GAT training cells, so those cells must have run before this one.
comparison.print_comparison()
comparison.plot_comparison()
comparison.plot_f1_curves()

print("\n" + "="*60)
print("ALL EXPERIMENTS COMPLETED!")
print("="*60)

# 5) Mathematical Intuition Hierarchy

**Functional perspective**
$$
\begin{aligned}
\text{MLP:} &\quad f:\mathbb{R}^{d}\!\to\!\{0,1\}.\\[4pt]
\text{GCN:} &\quad f(G,\mathbf{X})\ \text{via isotropic Laplacian smoothing.}\\[4pt]
\text{GAT:} &\quad f(G,\mathbf{X},\Theta_{\text{att}})\ \text{via anisotropic, data-driven attention.}
\end{aligned}
$$

**Spectral interpretation**
$$
\text{GCN}\ \approx\ \text{low-pass filter on the graph Laplacian.}
\qquad
\text{GAT: replaces fixed spectral weights with learned, local attention.}
$$

**Receptive field.**  
After $L$ layers, each node embedding incorporates information up to $L$-hop neighbors.
