<a href="https://colab.research.google.com/github/nncliff/qwen-32B/blob/main/chapter-1/droppath.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DropPath (Stochastic Depth) Implementation

This notebook implements **DropPath** (also known as Stochastic Depth), a regularization technique often used in deep residual networks (like ResNets, Transformers, and GNNs).

**What is DropPath?**
Unlike Dropout (which drops individual activations), DropPath drops **entire residual paths** during training. This effectively trains a simpler sub-network in each iteration, acting as an implicit ensemble of many shallower networks.

**Key Components:**
1.  `DropPath` Module: Randomly zeros out the input tensor based on a probability `drop_prob`.
2.  `ResidualGATBlock`: A Graph Attention Network block that uses DropPath on its residual connection.
3.  `GATClassifier`: A simple GNN classifier using these blocks.

In [1]:
# Install required library for Graph Neural Networks
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.3/1.3 MB[0m [31m78.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.7.0


In [2]:
import torch
import torch.nn as nn
import random
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GATConv, global_mean_pool

In [3]:
def generated_dummy_graph(label):
    num_nodes = random.randint(10, 20)
    x = torch.randn(num_nodes, 16) + label
    edge_index = torch.tensor([[i, (i+1)%num_nodes] for i in range(num_nodes)], dtype=torch.long).t().contiguous()
    data = Data(x=x, edge_index=edge_index, y=torch.tensor([label]))
    return data

def build_dataset(num_samples=300):
    dataset = []
    for _ in range(num_samples):
        label = random.randint(0, 1)
        graph = generated_dummy_graph(label)
        dataset.append(graph)
    return dataset

In [4]:
class DropPath(nn.Module):
    def __init__(self, drop_prob=0.1):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        if not self.training or self.drop_prob == 0.0:
            return x
        keep_prob = 1 - self.drop_prob
        # Work with any number of dimensions, not just 4D (B, C, H, W)
        shape = (x.size(0),) + (1,) * (x.dim() - 1)
        random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
        binary_tensor = torch.floor(random_tensor)
        output = x.div(keep_prob) * binary_tensor
        return output

In [5]:
class ResidualGATBlock(nn.Module):
    def __init__(self, in_dim, out_dim, heads=2, drop_path_prob=0.1):
        super().__init__()
        self.norm = nn.LayerNorm(in_dim)
        self.gat_conv = GATConv(in_dim, out_dim // heads, heads=heads, concat=True)
        self.linear_proj = nn.Linear(in_dim, out_dim)
        self.drop_path = DropPath(drop_path_prob)
        self.activation = nn.ReLU()

    def forward(self, x, edge_index):
        identity = self.linear_proj(x)
        x = self.norm(x)
        out = self.gat_conv(x, edge_index)
        out = self.drop_path(out)
        out = identity + out  # Residual connection
        out = self.activation(out)
        return out

In [6]:
class GATClassifier(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim):
        super().__init__()
        self.block1 = ResidualGATBlock(in_dim, hidden_dim, drop_path_prob=0.2)
        self.block2 = ResidualGATBlock(hidden_dim, hidden_dim, drop_path_prob=0.2)
        self.classifier = nn.Linear(hidden_dim, out_dim)

    def forward(self, x, edge_index, batch):
        x = self.block1(x, edge_index)
        x = self.block2(x, edge_index)
        x = global_mean_pool(x, batch)
        out = self.classifier(x)
        return out

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = GATClassifier(in_dim=16, hidden_dim=64, out_dim=2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

dataset = build_dataset()
loader = DataLoader(dataset, batch_size=16, shuffle=True)

Using device: cpu


  loader = DataLoader(dataset, batch_size=16, shuffle=True)


In [8]:
model.train()
for epoch in range(10):
    total_loss = 0.0
    correct = 0

    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        out = model(batch.x, batch.edge_index, batch.batch)
        loss = criterion(out, batch.y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = out.argmax(dim=1)
        correct += (preds == batch.y).sum().item()

    avg_loss = total_loss / len(loader)
    accuracy = correct / len(dataset)
    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

Epoch 1, Loss: 0.4003, Accuracy: 0.8900
Epoch 2, Loss: 0.0839, Accuracy: 1.0000
Epoch 3, Loss: 0.0148, Accuracy: 1.0000
Epoch 4, Loss: 0.0055, Accuracy: 1.0000
Epoch 5, Loss: 0.0031, Accuracy: 1.0000
Epoch 6, Loss: 0.0020, Accuracy: 1.0000
Epoch 7, Loss: 0.0018, Accuracy: 1.0000
Epoch 8, Loss: 0.0014, Accuracy: 1.0000
Epoch 9, Loss: 0.0011, Accuracy: 1.0000
Epoch 10, Loss: 0.0009, Accuracy: 1.0000
