In [None]:
"""
Here's a complete Python script to process Zachary's Karate Club dataset (provided as an edge list) using PyTorch Geometric (PyG)
We will train a SGC model, and evaluate it:

* Zachary's Karate Club dataset is represented as an edge list with a shape of [78, 2].
* 78 edges (rows) and 2 columns (source node, target node)
* Each row denotes a connection between two nodes."
* When loaded into PyTorch Geometric, the original [78, 2] edge list is transposed to [2, 78] to match the expected input format.
"""

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import SGConv
from sklearn.metrics import accuracy_score

In [2]:
# 1. Upload and preprocess data
# We define a function to load the dataset.

def load_my_dataset(file_path):

    # 1. Create edge list according to the PyG Format.
    edges = []
    with open(file_path, 'r') as f:
        for line in f:
            if line.strip():  # Skip empty lines
                u, v = map(int, line.strip().split())
                edges.append((u-1, v-1))  # Convert to 0-based indexing

    edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
    num_nodes = edge_index.max().item() + 1

    # 2.Calculate Degrees

    degrees = torch.zeros(num_nodes, dtype=torch.float)
    for u, v in edges:
        degrees[u] += 1
        degrees[v] += 1
    x = degrees.view(-1, 1)  # Node features: degree

    # 3. Labels: Binary (0 or 1) indicating faction membership.
    # Synthetic labels (community detection - same as original Zachary study)
    # We know this labels
    y = torch.tensor([0 if i in {0,1,2,3,7,11,12,13,17,19,21} else 1 for i in range(num_nodes)])
    
    # Create train/val/test masks (60/20/20 split)
    idx = torch.randperm(num_nodes)
    train_mask = idx[:int(0.6*num_nodes)]
    val_mask = idx[int(0.6*num_nodes):int(0.8*num_nodes)]
    test_mask = idx[int(0.8*num_nodes):]
    
    # Create masks as boolean tensors
    train_mask_bool = torch.zeros(num_nodes, dtype=torch.bool)
    val_mask_bool = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask_bool = torch.zeros(num_nodes, dtype=torch.bool)
    train_mask_bool[train_mask] = True
    val_mask_bool[val_mask] = True
    test_mask_bool[test_mask] = True

    num_classes = len(set(y.cpu().numpy()))
    
    # Create Data object with explicit attributes
    data = Data(
        x=x,
        edge_index=edge_index,
        y=y,
        train_mask=train_mask_bool,
        val_mask=val_mask_bool,
        test_mask=test_mask_bool,
        num_nodes=num_nodes,
        num_edges=edge_index.size(1),
        num_classes=num_classes
    )

    print(f"Dataset loaded:")
    print(f"- Nodes: {data.num_nodes}")
    print(f"- Edges: {data.num_edges}")
    print(f"- Features: {data.num_features}")
    print(f"- Classes: {data.num_classes}")

    return data

In [3]:
# 2. Build Model
class SGCModel(nn.Module):
    def __init__(self, num_features, num_classes, K=1):
        super().__init__()
        self.conv = SGConv(num_features, num_classes, K=K)
        
    def forward(self, x, edge_index):
        return self.conv(x, edge_index)

In [4]:
# 3. Fit Model
def train_model(model, data, epochs=100):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
    criterion = nn.CrossEntropyLoss()
    
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        out = model(data.x, data.edge_index)
        loss = criterion(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()
        
        if epoch % 10 == 0:
            train_acc, val_acc, test_acc = evaluate_model(model, data)
            print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, '
                  f'Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}')

In [5]:
# 4/5. Predict and Evaluate Model
def evaluate_model(model, data):
    model.eval()
    with torch.no_grad():
        out = model(data.x, data.edge_index)
        pred = out.argmax(dim=1)
        
        train_acc = accuracy_score(data.y[data.train_mask].numpy(), 
                                 pred[data.train_mask].numpy())
        val_acc = accuracy_score(data.y[data.val_mask].numpy(), 
                               pred[data.val_mask].numpy())
        test_acc = accuracy_score(data.y[data.test_mask].numpy(), 
                                pred[data.test_mask].numpy())
        
    return train_acc, val_acc, test_acc

In [6]:
def main():
    epoches = 100
    file_path = 'C:/Users/Majid/Downloads/CNN/zachary-34-78'
    data = load_my_dataset(file_path)
    
    # 2. Build model
    model = SGCModel(num_features=data.num_features, 
                    num_classes=data.num_classes,
                    K=2)  # 1-hop propagation
    # 3. Train model
    train_model(model, data, epoches)
    
    # 4/5. Evaluate model
    train_acc, val_acc, test_acc = evaluate_model(model, data)
    print(f'\nFinal Results:')
    print(f'Train Accuracy: {train_acc:.4f}')
    print(f'Validation Accuracy: {val_acc:.4f}')
    print(f'Test Accuracy: {test_acc:.4f}')

In [8]:
main()

Dataset loaded:
- Nodes: 34
- Edges: 78
- Features: 1
- Classes: 2
Epoch: 000, Loss: 8.6271, Train Acc: 0.3000, Val Acc: 0.4286
Epoch: 010, Loss: 2.3483, Train Acc: 0.7000, Val Acc: 0.5714
Epoch: 020, Loss: 1.2243, Train Acc: 0.5500, Val Acc: 0.8571
Epoch: 030, Loss: 0.6802, Train Acc: 0.7000, Val Acc: 0.5714
Epoch: 040, Loss: 0.4616, Train Acc: 0.7000, Val Acc: 0.5714
Epoch: 050, Loss: 0.4696, Train Acc: 0.7500, Val Acc: 1.0000
Epoch: 060, Loss: 0.4414, Train Acc: 0.7500, Val Acc: 1.0000
Epoch: 070, Loss: 0.4258, Train Acc: 0.7000, Val Acc: 1.0000
Epoch: 080, Loss: 0.4224, Train Acc: 0.7500, Val Acc: 1.0000
Epoch: 090, Loss: 0.4189, Train Acc: 0.7500, Val Acc: 1.0000

Final Results:
Train Accuracy: 0.7500
Validation Accuracy: 1.0000
Test Accuracy: 0.8571
