# ⚙️ Training Simulation Lab – Module 12
Simulate a multi-GPU setup, monitor memory, log checkpoints, and practice resuming from failure.

## 🔧 Step 1: Create a Dummy Dataset and Model

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

# Synthetic binary-classification problem: 1000 samples with 10 random
# features each, and a random class label (0 or 1) per sample.
X = torch.randn((1000, 10))
y = torch.randint(low=0, high=2, size=(1000,))

# Serve the (features, label) pairs in mini-batches of 32.
data = DataLoader(
    dataset=TensorDataset(X, y),
    batch_size=32,
)

# Small MLP: 10 inputs -> 64 hidden units (ReLU) -> 2 class logits.
model = nn.Sequential(
    nn.Linear(in_features=10, out_features=64),
    nn.ReLU(),
    nn.Linear(in_features=64, out_features=2),
)

# Adam over all model parameters; cross-entropy on the raw logits.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

## 🧪 Step 2: Simulate Training + Interrupt

In [None]:
from pathlib import Path
import json, time

# Train for up to 5 epochs, checkpointing after every epoch. If a checkpoint
# already exists, restore model/optimizer state and continue from the epoch
# after the one that was saved. A crash is simulated after epoch 2 so that
# re-running this cell exercises the resume path.
ckpt_file = Path("checkpoints/sim.ckpt")
# Make sure the checkpoint folder exists before the first save; otherwise
# saving fails on a clean working directory.
ckpt_file.parent.mkdir(parents=True, exist_ok=True)

start_epoch = 0
# Defined up front so later cells can read them even when the loop below has
# nothing left to do (i.e. every epoch is already checkpointed).
running_loss = None

if ckpt_file.exists():
    state = torch.load(ckpt_file)
    model.load_state_dict(state['model'])
    optimizer.load_state_dict(state['optimizer'])
    start_epoch = state['epoch'] + 1
    # Checkpoints written before the 'loss' key was added will not have it.
    running_loss = state.get('loss')
    print(f"✅ Resuming from epoch {start_epoch}")

# Zero-based index of the last finished epoch (-1 when nothing has run yet).
epoch = start_epoch - 1

for epoch in range(start_epoch, 5):
    # Sum of the per-batch losses over this epoch (not an average).
    running_loss = 0.0
    for xb, yb in data:
        optimizer.zero_grad()
        pred = model(xb)
        loss = loss_fn(pred, yb)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f"Epoch {epoch}: loss={running_loss:.2f}")
    torch.save({
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'epoch': epoch,
        'loss': running_loss,
        'timestamp': time.time()
    }, ckpt_file)

    # Raised only after the checkpoint for epoch 2 is on disk, so a re-run
    # resumes at epoch 3 and does not hit this branch again.
    if epoch == 2:
        raise KeyboardInterrupt("💥 Simulated crash after epoch 2")

## 🧾 Step 3: Log Training Metadata

In [None]:
# Write a small JSON summary of the run next to the checkpoint.
import socket

log_file = Path("checkpoints/log.json")
# Make sure the output folder exists before writing the log.
log_file.parent.mkdir(parents=True, exist_ok=True)

with open(log_file, "w") as f:
    json.dump({
        'model': 'dummy-multigpu',
        # `epoch` is the zero-based index of the last finished epoch, so the
        # number of completed epochs is one more than that.
        'epochs_completed': epoch + 1,
        'final_loss': running_loss,
        'resume_ready': ckpt_file.exists(),
        # Portable replacement for reading /etc/hostname, which is missing
        # on macOS and Windows.
        'hostname': socket.gethostname()
    }, f, indent=2)
print("✅ Metadata saved")