# 🎯 MOUAADNET-ULTRA: Human Detection Training
## Lightning AI Studio + COCO 2017

**Lead Architect:** MOUAAD IDOUFKIR

### ⚡ Lightning AI Benefits:
- Persistent storage (no re-download)
- Better GPUs (A10G, L4, etc.)
- Longer runtime

---

## 1️⃣ Setup

In [None]:
import subprocess
import sys

# Check GPU
!nvidia-smi

# Install dependencies
!pip install -q torch torchvision tqdm pycocotools

In [None]:
# Clone repo (or use Lightning AI's Git integration)
import os

REPO_DIR = '/teamspace/studios/this_studio/MouaadNet-Ultra'

if not os.path.exists(REPO_DIR):
    !git clone https://github.com/mouuuuaad/MouaadNet-Ultra.git {REPO_DIR}
else:
    print(f"‚úÖ Repo already exists at {REPO_DIR}")
    !cd {REPO_DIR} && git pull

os.chdir(REPO_DIR)
print(f"Working directory: {os.getcwd()}")

## 2️⃣ Download COCO 2017 (Persistent Storage)

In [None]:
# Dataset root inside the studio directory, which (per the notebook intro)
# is persistent storage, so a finished download is reused in later sessions.
DATA_DIR = '/teamspace/studios/this_studio/data/coco'

import os
os.makedirs(DATA_DIR, exist_ok=True)

# One existence flag per extracted COCO folder; the download cell reads them.
train_exists, val_exists, anno_exists = (
    os.path.exists(folder)
    for folder in (
        f'{DATA_DIR}/train2017',
        f'{DATA_DIR}/val2017',
        f'{DATA_DIR}/annotations',
    )
)

print(f"Train images: {'‚úÖ exists' if train_exists else '‚ùå missing'}")
print(f"Val images: {'‚úÖ exists' if val_exists else '‚ùå missing'}")
print(f"Annotations: {'‚úÖ exists' if anno_exists else '‚ùå missing'}")

In [None]:
# Download only if needed (persistent storage saves time!)
if not train_exists:
    print("üì• Downloading train2017 (~18GB)...")
    !wget -q --show-progress http://images.cocodataset.org/zips/train2017.zip -O {DATA_DIR}/train2017.zip
    !cd {DATA_DIR} && unzip -q train2017.zip && rm train2017.zip

if not val_exists:
    print("üì• Downloading val2017 (~1GB)...")
    !wget -q --show-progress http://images.cocodataset.org/zips/val2017.zip -O {DATA_DIR}/val2017.zip
    !cd {DATA_DIR} && unzip -q val2017.zip && rm val2017.zip

if not anno_exists:
    print("üì• Downloading annotations...")
    !wget -q --show-progress http://images.cocodataset.org/annotations/annotations_trainval2017.zip -O {DATA_DIR}/annotations.zip
    !cd {DATA_DIR} && unzip -q annotations.zip && rm annotations.zip

print("\n‚úÖ COCO 2017 ready!")
!ls -la {DATA_DIR}

## 3️⃣ Dataset

In [None]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from pycocotools.coco import COCO
from tqdm import tqdm
import cv2


def gaussian2D(shape, sigma=1):
    """Build an un-normalised 2D Gaussian kernel centred in a grid of `shape`.

    Args:
        shape: (rows, cols) of the kernel.
        sigma: Standard deviation of the Gaussian, in cells.

    Returns:
        A float array of the given shape whose centre value is 1 (for odd
        extents). Entries smaller than machine epsilon times the peak are
        set to exactly 0.
    """
    half_rows, half_cols = ((extent - 1.) / 2. for extent in shape)
    # Open grids: a column vector of row offsets and a row vector of column
    # offsets, which broadcast to the full kernel below.
    rows, cols = np.ogrid[-half_rows:half_rows + 1, -half_cols:half_cols + 1]

    kernel = np.exp(-(cols * cols + rows * rows) / (2 * sigma * sigma))

    # Flush numerically negligible tails to zero.
    cutoff = np.finfo(kernel.dtype).eps * kernel.max()
    kernel[kernel < cutoff] = 0
    return kernel


def draw_gaussian(heatmap, center, radius, k=1):
    """Stamp a Gaussian peak onto `heatmap` in place.

    The kernel is merged with the existing values by element-wise maximum,
    so overlapping peaks never lower each other.

    Args:
        heatmap: 2D array that is modified in place.
        center: (x, y) peak position in heatmap cells; truncated to int.
        radius: Integer kernel radius (it is used in slice arithmetic);
            the kernel spans ``2 * radius + 1`` cells per side.
        k: Scale factor applied to the kernel before merging.

    Returns:
        The same `heatmap` object.
    """
    side = 2 * radius + 1
    kernel = gaussian2D((side, side), sigma=side / 6)

    cx, cy = int(center[0]), int(center[1])
    map_h, map_w = heatmap.shape[0:2]

    # How far the kernel may extend in each direction before leaving the map.
    reach_left = min(cx, radius)
    reach_right = min(map_w - cx, radius + 1)
    reach_up = min(cy, radius)
    reach_down = min(map_h - cy, radius + 1)

    target = heatmap[cy - reach_up:cy + reach_down, cx - reach_left:cx + reach_right]
    patch = kernel[radius - reach_up:radius + reach_down,
                   radius - reach_left:radius + reach_right]

    # Skip degenerate (empty) windows; otherwise write through the view.
    if min(patch.shape) > 0 and min(target.shape) > 0:
        np.maximum(target, patch * k, out=target)
    return heatmap


def gaussian_radius(det_size, min_overlap=0.7):
    """Return the largest corner shift that keeps IoU >= `min_overlap`.

    A box whose corners are each moved by at most the returned radius still
    overlaps the original ``height x width`` box with IoU of at least
    `min_overlap`. Three placements are considered and the most
    restrictive one wins:

      1. one corner moves inward and the other outward (box is translated),
      2. both corners move inward (box shrinks),
      3. both corners move outward (box grows).

    Each case is a quadratic inequality in the radius; the radius bound is
    the root of that quadratic that goes to 0 as `min_overlap` goes to 1.
    The previous implementation evaluated only case 1 and took the other
    root, which gives a radius of roughly ``height + width``.

    Args:
        det_size: (height, width) of the box, in heatmap cells.
        min_overlap: Required IoU, expected in [0, 1].

    Returns:
        Non-negative integer radius (truncated toward zero).
    """
    height, width = det_size
    side_sum = height + width
    area = width * height

    # Case 1: r^2 - (h + w) r + wh (1 - o) / (1 + o) >= 0
    b1 = side_sum
    c1 = area * (1 - min_overlap) / (1 + min_overlap)
    r1 = (b1 - np.sqrt(b1 ** 2 - 4 * c1)) / 2

    # Case 2: 4 r^2 - 2 (h + w) r + (1 - o) wh >= 0
    a2 = 4
    b2 = 2 * side_sum
    c2 = (1 - min_overlap) * area
    r2 = (b2 - np.sqrt(b2 ** 2 - 4 * a2 * c2)) / (2 * a2)

    candidates = [r1, r2]

    # Case 3: 4 o r^2 + 2 o (h + w) r + (o - 1) wh <= 0
    # With o == 0 every radius satisfies it, so it adds no constraint.
    a3 = 4 * min_overlap
    if a3 > 0:
        b3 = -2 * min_overlap * side_sum
        c3 = (min_overlap - 1) * area
        candidates.append((b3 + np.sqrt(b3 ** 2 - 4 * a3 * c3)) / (2 * a3))

    return max(0, int(min(candidates)))


class COCOPersonDataset(Dataset):
    """Person-only view of COCO 2017 yielding letterboxed images and dense targets.

    Each item is a dict with (S = img_size // down_ratio):
      'image'    - normalised tensor built from the padded
                   img_size x img_size RGB image
      'heatmap'  - (1, S, S) Gaussian centre heatmap
      'size'     - (2, S, S) box width / height divided by img_size,
                   written at centre cells only
      'offset'   - (2, S, S) fractional part of the centre coordinates,
                   written at centre cells only
      'reg_mask' - (S, S) 1.0 at centre cells, 0 elsewhere
    """

    # Category id passed to every COCO query (the "person" class).
    PERSON_CAT_ID = 1

    def __init__(self, root_dir, split='train', img_size=416, down_ratio=4):
        """Index the images of `split` that contain the person category.

        Args:
            root_dir: Folder holding ``annotations/`` and ``{split}2017/``.
            split: Split name used to build both the annotation file name
                and the image folder name.
            img_size: Side of the square network input, in pixels.
            down_ratio: Ratio between input and target-map resolution.
        """
        self.root = root_dir
        self.split = split
        self.img_size = img_size
        self.down_ratio = down_ratio
        self.output_size = img_size // down_ratio

        anno_file = os.path.join(root_dir, 'annotations', f'instances_{split}2017.json')
        self.coco = COCO(anno_file)
        self.img_ids = self.coco.getImgIds(catIds=[self.PERSON_CAT_ID])
        print(f"‚úÖ {split}: {len(self.img_ids)} images with persons")

        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ])

    def __len__(self):
        return len(self.img_ids)

    def _load_rgb(self, idx):
        """Return ``(img_id, rgb_image)`` for `idx`, skipping unreadable files.

        Starting at `idx`, positions are tried in order (wrapping around)
        until one image decodes. The scan is bounded to one pass over the
        dataset so that a missing or wrong image folder produces a clear
        error instead of exhausting the recursion limit.

        Raises:
            IndexError: If `idx` is out of range.
            FileNotFoundError: If no image in the dataset can be read.
        """
        total = len(self.img_ids)
        img_dir = os.path.join(self.root, f'{self.split}2017')

        # Plain indexing first, so an out-of-range idx still raises IndexError.
        img_id = self.img_ids[idx]
        for step in range(total):
            if step:
                img_id = self.img_ids[(idx + step) % total]
            img_info = self.coco.loadImgs(img_id)[0]
            img = cv2.imread(os.path.join(img_dir, img_info['file_name']))
            if img is not None:  # cv2.imread signals failure by returning None
                return img_id, cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        raise FileNotFoundError(
            f"None of the {total} indexed images could be read from {img_dir}"
        )

    def __getitem__(self, idx):
        """Load one image, letterbox it and encode its person boxes.

        If the image at `idx` cannot be read, the next readable image is
        returned in its place.
        """
        img_id, img = self._load_rgb(idx)
        orig_h, orig_w = img.shape[:2]

        # Resize so the image fits inside img_size, keeping the aspect ratio
        scale = min(self.img_size / orig_h, self.img_size / orig_w)
        new_h, new_w = int(orig_h * scale), int(orig_w * scale)
        img_resized = cv2.resize(img, (new_w, new_h))

        # Pad: centre the resized image on a grey (114) square canvas
        pad_h, pad_w = self.img_size - new_h, self.img_size - new_w
        pad_top, pad_left = pad_h // 2, pad_w // 2
        img_padded = np.full((self.img_size, self.img_size, 3), 114, dtype=np.uint8)
        img_padded[pad_top:pad_top + new_h, pad_left:pad_left + new_w] = img_resized

        # Annotations: non-crowd person boxes of this image
        ann_ids = self.coco.getAnnIds(imgIds=img_id, catIds=[self.PERSON_CAT_ID], iscrowd=False)
        anns = self.coco.loadAnns(ann_ids)

        # Targets
        heatmap = np.zeros((self.output_size, self.output_size), dtype=np.float32)
        size_map = np.zeros((2, self.output_size, self.output_size), dtype=np.float32)
        offset_map = np.zeros((2, self.output_size, self.output_size), dtype=np.float32)
        reg_mask = np.zeros((self.output_size, self.output_size), dtype=np.float32)

        for ann in anns:
            bbox = ann['bbox']  # indexed below as [x, y, width, height]
            # Ignore boxes narrower or shorter than 5 px in the original image
            if bbox[2] < 5 or bbox[3] < 5:
                continue

            # Map the box into the padded-image coordinate frame
            x = bbox[0] * scale + pad_left
            y = bbox[1] * scale + pad_top
            w, h = bbox[2] * scale, bbox[3] * scale

            # Box centre on the target grid, clipped to stay inside it
            cx = np.clip((x + w / 2) / self.down_ratio, 0, self.output_size - 1)
            cy = np.clip((y + h / 2) / self.down_ratio, 0, self.output_size - 1)
            cx_int, cy_int = int(cx), int(cy)

            radius = max(1, gaussian_radius((h / self.down_ratio, w / self.down_ratio)))
            draw_gaussian(heatmap, (cx_int, cy_int), radius)

            # Regression targets live only at the integer centre cell; a later
            # box landing on the same cell overwrites an earlier one.
            size_map[0, cy_int, cx_int] = w / self.img_size
            size_map[1, cy_int, cx_int] = h / self.img_size
            offset_map[0, cy_int, cx_int] = cx - cx_int
            offset_map[1, cy_int, cx_int] = cy - cy_int
            reg_mask[cy_int, cx_int] = 1

        img_tensor = self.transform(Image.fromarray(img_padded))

        return {
            'image': img_tensor,
            'heatmap': torch.from_numpy(heatmap).unsqueeze(0),
            'size': torch.from_numpy(size_map),
            'offset': torch.from_numpy(offset_map),
            'reg_mask': torch.from_numpy(reg_mask),
        }

print("‚úÖ Dataset class ready")

In [None]:
BATCH_SIZE = 16
IMG_SIZE = 416

# Person-only datasets for both splits, at the same input resolution.
train_dataset = COCOPersonDataset(DATA_DIR, split='train', img_size=IMG_SIZE)
val_dataset = COCOPersonDataset(DATA_DIR, split='val', img_size=IMG_SIZE)

# Only the training loader shuffles; both pin host memory.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=8,
    pin_memory=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

print(f"\nTrain: {len(train_loader)} batches | Val: {len(val_loader)} batches")

In [None]:
# Visualize: first four training samples (top row) above their target
# centre heatmaps (bottom row).
import matplotlib.pyplot as plt

# Same per-channel statistics the dataset's Normalize transform uses;
# needed here to undo the normalisation for display.
norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]

batch = next(iter(train_loader))
fig, axes = plt.subplots(2, 4, figsize=(16, 8))
image_row, heatmap_row = axes

for col in range(4):
    # Channels-first tensor -> channels-last array, then de-normalise.
    rgb = batch['image'][col].permute(1, 2, 0).numpy()
    rgb = np.clip(rgb * norm_std + norm_mean, 0, 1)
    image_row[col].imshow(rgb)
    image_row[col].axis('off')

    centre_map = batch['heatmap'][col, 0].numpy()
    heatmap_row[col].imshow(centre_map, cmap='hot')
    heatmap_row[col].axis('off')

plt.tight_layout()
plt.show()

## 4️⃣ Model & Loss

In [None]:
import sys
# Make the current directory importable so the mouaadnet_ultra package
# can be found from the working directory.
sys.path.insert(0, '.')
from mouaadnet_ultra.model import MouaadNetUltra

device = torch.device('cuda')

# Instantiate with default arguments, then move the weights to the GPU.
model = MouaadNetUltra()
model = model.to(device)

print(f"Device: {device}")
print(f"Parameters: {model.count_parameters():,}")

In [None]:
class DetectionLoss(nn.Module):
    """Weighted sum of a heatmap focal loss and two masked L1 regression losses.

    All terms are evaluated in float32, whatever dtype the predictions
    arrive in. Under autocast the logits are float16, where ``1 - 1e-6``
    rounds to exactly 1.0; the clamp then no longer keeps probabilities
    away from 1, ``log(1 - p)`` becomes ``-inf`` and the masked product
    ``inf * 0`` turns the loss into NaN.
    """

    def __init__(self, hm_weight=1.0, size_weight=0.1, offset_weight=1.0):
        """Store the weights applied to the heatmap, size and offset terms."""
        super().__init__()
        self.hm_weight = hm_weight
        self.size_weight = size_weight
        self.offset_weight = offset_weight

    def focal_loss(self, pred, target):
        """Penalty-reduced focal loss on heatmap logits.

        Args:
            pred: Raw heatmap logits (sigmoid is applied here).
            target: Heatmap of the same shape; cells equal to 1 are
                positives, all others are negatives down-weighted by
                ``(1 - target) ** 4``.

        Returns:
            Scalar float32 loss, normalised by the number of positives
            (at least 1).
        """
        # float32 so that the upper clamp bound is strictly below 1.
        prob = torch.sigmoid(pred.float()).clamp(1e-6, 1 - 1e-6)
        target = target.float()

        pos_mask = target.eq(1).float()
        neg_mask = target.lt(1).float()

        pos_loss = -torch.log(prob) * torch.pow(1 - prob, 2) * pos_mask
        neg_loss = (
            -torch.log(1 - prob) * torch.pow(prob, 2)
            * torch.pow(1 - target, 4) * neg_mask
        )
        return (pos_loss.sum() + neg_loss.sum()) / pos_mask.sum().clamp(min=1)

    def reg_loss(self, pred, target, mask):
        """L1 loss restricted to cells where `mask` is non-zero.

        Args:
            pred: Predictions; `mask` gains a dimension at index 1 and is
                expanded to this shape.
            target: Regression targets, same shape as `pred`.
            mask: Per-cell weights without the channel dimension.

        Returns:
            Scalar float32 loss: summed absolute error divided by the sum
            of the expanded mask (at least 1).
        """
        pred = pred.float()
        target = target.float()
        mask = mask.float().unsqueeze(1).expand_as(pred)
        return F.l1_loss(pred * mask, target * mask, reduction='sum') / mask.sum().clamp(min=1)

    def forward(self, pred_hm, pred_size, pred_offset, target_hm, target_size, target_offset, reg_mask):
        """Return a dict with the 'total' loss and its 'hm', 'size', 'offset' parts."""
        hm_loss = self.focal_loss(pred_hm, target_hm)
        size_loss = self.reg_loss(pred_size, target_size, reg_mask)
        offset_loss = self.reg_loss(pred_offset, target_offset, reg_mask)
        total = self.hm_weight * hm_loss + self.size_weight * size_loss + self.offset_weight * offset_loss
        return {'total': total, 'hm': hm_loss, 'size': size_loss, 'offset': offset_loss}

criterion = DetectionLoss()
print("‚úÖ Loss ready")

In [None]:
EPOCHS = 50
LR = 1e-3

# Checkpoint dir
CKPT_DIR = '/teamspace/studios/this_studio/checkpoints'
os.makedirs(CKPT_DIR, exist_ok=True)

# AdamW, with the learning rate cosine-annealed from LR down to 1e-6 over
# EPOCHS scheduler steps.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LR,
    weight_decay=1e-4,
)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=EPOCHS,
    eta_min=1e-6,
)
# Gradient scaler for the mixed-precision training step.
scaler = torch.amp.GradScaler('cuda')

print(f"‚úÖ Training: {EPOCHS} epochs, checkpoints in {CKPT_DIR}")

## 5️⃣ Training

In [None]:
def train_epoch(model, loader, optimizer, criterion, scaler, device):
    """Run one mixed-precision training pass over `loader`.

    Batches whose loss is not finite (NaN or +/-inf) are skipped: no
    backward pass and no optimizer step. The returned averages are taken
    over the batches that actually contributed, so skipped batches do not
    bias the reported loss downward.

    Args:
        model: Network returning a dict with 'heatmaps', 'sizes' and
            'offsets'; the first entry of each is fed to the loss.
        loader: Iterable of dict batches with keys 'image', 'heatmap',
            'size', 'offset' and 'reg_mask'.
        optimizer: Optimizer stepped through `scaler`.
        criterion: Callable returning a dict with at least 'total' and 'hm'.
        scaler: Gradient scaler used for the backward pass and the step.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_total_loss, mean_heatmap_loss)``; both are 0.0 when no
        batch produced a finite loss.
    """
    model.train()
    total_loss, total_hm = 0.0, 0.0
    used_batches = 0

    pbar = tqdm(loader, desc='Training')
    for batch in pbar:
        images = batch['image'].to(device, non_blocking=True)
        heatmaps = batch['heatmap'].to(device, non_blocking=True)
        sizes = batch['size'].to(device, non_blocking=True)
        offsets = batch['offset'].to(device, non_blocking=True)
        reg_mask = batch['reg_mask'].to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)

        with torch.amp.autocast('cuda'):
            outputs = model(images)
            losses = criterion(
                outputs['heatmaps'][0], outputs['sizes'][0], outputs['offsets'][0],
                heatmaps, sizes, offsets, reg_mask
            )
            loss = losses['total']

        # isfinite rejects +/-inf as well as NaN; backpropagating an infinite
        # loss would only yield non-finite gradients.
        if torch.isfinite(loss):
            scaler.scale(loss).backward()
            # Unscale first so the clipping threshold applies to true gradients.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()
            total_hm += losses['hm'].item()
            used_batches += 1

        pbar.set_postfix({'loss': f"{loss.item():.4f}"})

    # Guard against an empty loader or an epoch where every batch was skipped.
    n = max(used_batches, 1)
    return total_loss / n, total_hm / n


@torch.no_grad()
def validate(model, loader, criterion, device):
    """Return the mean total loss of `model` over `loader`, without gradients.

    Args:
        model: Network returning a dict with 'heatmaps', 'sizes' and
            'offsets'; the first entry of each is fed to the loss.
        loader: Iterable of dict batches with keys 'image', 'heatmap',
            'size', 'offset' and 'reg_mask'.
        criterion: Callable returning a dict with a 'total' entry.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch total losses divided by ``len(loader)``.
    """
    model.eval()
    running = 0
    batch_keys = ('image', 'heatmap', 'size', 'offset', 'reg_mask')

    for batch in tqdm(loader, desc='Validating'):
        on_device = {key: batch[key].to(device) for key in batch_keys}

        with torch.amp.autocast('cuda'):
            outputs = model(on_device['image'])
            losses = criterion(
                outputs['heatmaps'][0],
                outputs['sizes'][0],
                outputs['offsets'][0],
                on_device['heatmap'],
                on_device['size'],
                on_device['offset'],
                on_device['reg_mask'],
            )
        running += losses['total'].item()

    return running / len(loader)

print("‚úÖ Training functions ready")

In [None]:
# 🚀 TRAIN!
best_loss = float('inf')
history = {'loss': [], 'val_loss': []}

print("="*60)
print("üéØ Training MOUAADNET-ULTRA on Lightning AI")
print("="*60)

for epoch in range(EPOCHS):
    print(f"\nüìç Epoch {epoch+1}/{EPOCHS}")

    train_loss, hm_loss = train_epoch(model, train_loader, optimizer, criterion, scaler, device)
    val_loss = validate(model, val_loader, criterion, device)
    scheduler.step()

    history['loss'].append(train_loss)
    history['val_loss'].append(val_loss)

    print(f"   Train: {train_loss:.4f} | Val: {val_loss:.4f}")

    # Save a per-epoch checkpoint. Scheduler and scaler state are stored
    # next to the optimizer state so an interrupted run can be resumed with
    # the same learning-rate schedule position and loss scale; the extra
    # keys do not affect code that reads only the existing ones.
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'scaler_state_dict': scaler.state_dict(),
        'val_loss': val_loss,
    }, f'{CKPT_DIR}/epoch_{epoch+1}.pt')

    # Keep a separate copy of the weights with the lowest validation loss;
    # this is the file the export cell loads.
    if val_loss < best_loss:
        best_loss = val_loss
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'best_loss': best_loss,
        }, f'{CKPT_DIR}/best_detection.pt')
        print("   ‚≠ê Best model!")

print(f"\n‚úÖ Done! Best loss: {best_loss:.4f}")

In [None]:
# Plot the per-epoch training and validation loss, and save the figure
# next to the checkpoints.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 4))
for series_key, series_label in (('loss', 'Train'), ('val_loss', 'Val')):
    ax.plot(history[series_key], label=series_label)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
ax.set_title('Detection Training')
ax.grid(True)
plt.savefig(f'{CKPT_DIR}/training.png', dpi=150)
plt.show()

## 6️⃣ Export

In [None]:
# Load best and export: restore the best-validation weights, switch to
# eval mode, fuse for inference and move to the CPU before tracing.
ckpt = torch.load(f'{CKPT_DIR}/best_detection.pt')
best_weights = ckpt['model_state_dict']
model.load_state_dict(best_weights)
model.eval()
model.fuse_for_inference()
model.cpu()

# Export ONNX, traced with one random 416x416 RGB image as example input.
example_input = torch.randn(1, 3, 416, 416)
torch.onnx.export(
    model,
    example_input,
    f'{CKPT_DIR}/detection.onnx',
    input_names=['image'],
    opset_version=12,
)

print(f"‚úÖ Exported to {CKPT_DIR}/detection.onnx")
print(f"   Best loss: {ckpt['best_loss']:.4f}")
print(f"\nüì• Download from: {CKPT_DIR}/best_detection.pt")