# Experiment: GNN Failure Risk Audit (Minimal)

Objective:
- Verify that homophily affects GNN accuracy on simple synthetic graphs.


In [None]:
# Colab install (minimal)
# Detect Google Colab by checking whether its package has already been
# imported into this interpreter; only in that case install the extra
# graph dependencies with pip. Local runs just get a reminder.
import sys

IN_COLAB = 'google.colab' in sys.modules
if not IN_COLAB:
    print('Local run detected. Ensure torch-geometric + scipy + networkx are installed.')
else:
    import subprocess

    # Run pip through the current interpreter so packages land in the
    # environment this kernel is actually using.
    subprocess.check_call([
        sys.executable, '-m', 'pip', 'install', '-q',
        'torch-geometric', 'scipy', 'networkx',
    ])


In [None]:
# Imports + seed + device
import random
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn

import networkx as nx
from torch_geometric.utils import from_networkx
from torch_geometric.nn import GCNConv

# Single seed shared by every random number generator used in this notebook.
SEED = 42

# Seed Python's `random`, NumPy's legacy global RNG and PyTorch, in that order.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)

# Use the GPU when PyTorch reports one as available, otherwise the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', DEVICE)


In [None]:
# Synthetic data (SBM)

def make_sbm(num_nodes=200, p_in=0.1, p_out=0.01, num_classes=2, feat_dim=16, seed=0):
    """Build a synthetic stochastic-block-model graph with one block per class.

    Args:
        num_nodes: Total number of nodes, divided across the blocks.
        p_in: Probability entry used for pairs of nodes in the same block.
        p_out: Probability entry used for pairs of nodes in different blocks.
        num_classes: Number of blocks; a node's label is its block index.
        feat_dim: Number of feature columns per node.
        seed: Seed for both the NumPy feature generator and the NetworkX
            graph sampler.

    Returns:
        The object produced by ``from_networkx`` for the sampled graph, with
        ``x`` set to a float tensor of shape ``(num_nodes, feat_dim)`` and
        ``y`` set to a long tensor of block labels.
    """
    rng = np.random.default_rng(seed)

    # Equal-sized blocks; whatever the integer division leaves over is
    # added to block 0 so the sizes always sum to num_nodes.
    block_sizes = [num_nodes // num_classes for _ in range(num_classes)]
    block_sizes[0] += num_nodes - sum(block_sizes)

    # Square probability matrix: p_in on the diagonal, p_out elsewhere.
    edge_probs = []
    for block in range(num_classes):
        row = [p_out] * num_classes
        row[block] = p_in
        edge_probs.append(row)

    graph = nx.stochastic_block_model(block_sizes, edge_probs, seed=seed)

    # Labels follow block order: block_sizes[0] zeros, then block_sizes[1] ones, ...
    labels = np.array(
        [cls for cls, size in enumerate(block_sizes) for _ in range(size)]
    )

    # Base features are standard-normal noise. A second, independent noise
    # draw is scaled by 0.1 and by the integer label before being added, so
    # rows with label 0 receive no perturbation at all.
    # NOTE(review): the label only scales noise (no mean shift), so classes
    # differ in feature variance rather than location -- confirm intended.
    features = rng.normal(size=(num_nodes, feat_dim)).astype(np.float32)
    label_noise = rng.normal(size=(num_nodes, feat_dim)).astype(np.float32)
    features += 0.1 * label_noise * labels[:, None]

    data = from_networkx(graph)
    data.x = torch.tensor(features, dtype=torch.float)
    data.y = torch.tensor(labels, dtype=torch.long)
    return data


In [None]:
# Models + training helpers
class MLP(nn.Module):
    """Graph-agnostic baseline: Linear -> ReLU -> Linear on node features.

    Args:
        in_dim: Size of each input feature vector.
        hidden_dim: Width of the hidden layer.
        out_dim: Size of the output (one score per class).
    """

    def __init__(self, in_dim, hidden_dim, out_dim):
        super().__init__()
        layers = [
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x, edge_index=None):
        """Return the network output for ``x``.

        ``edge_index`` is accepted only so this model can be called with the
        same arguments as the GCN; it is never read.
        """
        logits = self.net(x)
        return logits

class GCN(nn.Module):
    """Two stacked ``GCNConv`` layers with a ReLU in between.

    Args:
        in_dim: Size of each input feature vector.
        hidden_dim: Output width of the first convolution.
        out_dim: Output width of the second convolution (one score per class).
    """

    def __init__(self, in_dim, hidden_dim, out_dim):
        super().__init__()
        self.conv1 = GCNConv(in_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, out_dim)

    def forward(self, x, edge_index):
        """Return the second convolution's raw output for every node.

        No activation or normalisation is applied after the final layer.
        """
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)


def split_indices(n, train_ratio=0.6, val_ratio=0.2, seed=0):
    """Randomly partition ``range(n)`` into train / validation / test indices.

    Args:
        n: Number of items to split.
        train_ratio: Fraction of ``n`` (truncated to an int) used for training.
        val_ratio: Fraction of ``n`` (truncated to an int) used for validation.
        seed: Seed for the NumPy generator that shuffles the indices.

    Returns:
        A ``(train_idx, val_idx, test_idx)`` tuple of long tensors. The test
        split receives every index left over after the first two.
    """
    shuffled = np.random.default_rng(seed).permutation(n)

    # Cut points into the shuffled order: [0, train_end) | [train_end, val_end) | rest
    train_end = int(train_ratio * n)
    val_end = train_end + int(val_ratio * n)

    pieces = (
        shuffled[:train_end],
        shuffled[train_end:val_end],
        shuffled[val_end:],
    )
    return tuple(torch.tensor(piece, dtype=torch.long) for piece in pieces)


def train_epoch(model, data, train_idx, optimizer):
    """Run one full-batch optimisation step and return its loss.

    Args:
        model: Module called as ``model(data.x, data.edge_index)``.
        data: Object exposing ``x``, ``edge_index`` and ``y``.
        train_idx: Index tensor selecting the nodes that contribute to the loss.
        optimizer: Optimiser holding the model's parameters.

    Returns:
        The cross-entropy loss on the training nodes, as a Python float,
        computed before the parameter update.
    """
    model.train()
    optimizer.zero_grad()

    # The forward pass covers every node; only the training rows are scored.
    logits = model(data.x, data.edge_index)
    targets = data.y[train_idx]
    train_loss = F.cross_entropy(logits[train_idx], targets)

    train_loss.backward()
    optimizer.step()
    return train_loss.item()


def eval_acc(model, data, idx):
    """Return classification accuracy on the nodes selected by ``idx``.

    Args:
        model: Module called as ``model(data.x, data.edge_index)``.
        data: Object exposing ``x``, ``edge_index`` and ``y``.
        idx: Index tensor selecting the nodes to score.

    Returns:
        Fraction of selected nodes whose arg-max prediction equals the
        label, as a Python float.
    """
    model.eval()
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        logits = model(data.x, data.edge_index)

    predicted = logits.argmax(dim=-1)[idx]
    hits = predicted == data.y[idx]
    return hits.float().mean().item()


In [None]:
# Minimal run (2 settings, 2 models)
# Each entry is (p_in, p_out): the within-block and between-block edge
# probabilities passed to make_sbm.
settings = [
    (0.10, 0.01),  # homophilic
    (0.03, 0.05),  # heterophilic
]

for p_in, p_out in settings:
    # One graph and one node split per setting, shared by both models.
    data = make_sbm(p_in=p_in, p_out=p_out, seed=SEED).to(DEVICE)
    train_idx, val_idx, test_idx = split_indices(data.num_nodes, seed=SEED)

    # Both models are built up front (MLP first, then GCN) before either is
    # trained; each has a hidden width of 32 and 2 outputs.
    models = {
        'MLP': MLP(data.num_features, 32, 2),
        'GCN': GCN(data.num_features, 32, 2),
    }

    for model_name, model in models.items():
        model = model.to(DEVICE)
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

        # Fixed budget of 50 full-batch steps; the validation split is not
        # consulted during training.
        for _ in range(50):
            train_epoch(model, data, train_idx, optimizer)

        test_acc = eval_acc(model, data, test_idx)
        print(f"p_in={p_in:.2f}, p_out={p_out:.2f}, {model_name} test_acc={test_acc:.3f}")
