# 🎯 MOUAADNET-ULTRA: Human Detection Training
## Using COCO Person Dataset (Real Bounding Boxes)

**Lead Architect:** MOUAAD IDOUFKIR

### ⚠️ Why PA-100k Failed for Detection:
PA-100k contains **cropped pedestrian images** without scene context.
For proper detection, we need **full scenes with bounding boxes**.

### ✅ This Notebook Uses:
- **COCO 2017** - Real bounding box annotations
- **CenterNet-style** heatmap generation
- Proper detection training

---

## 1️⃣ Setup

In [None]:
# Show the GPU (if any) attached to this runtime.
!nvidia-smi
# Fetch the project sources and make the repository root the working directory.
!git clone https://github.com/mouuuuaad/MouaadNet-Ultra.git
%cd MouaadNet-Ultra
# Install the Python packages listed below (quiet mode).
# NOTE(review): cv2 and matplotlib are imported by later cells but are not
# installed here — presumably they ship with the runtime; confirm.
!pip install -q torch torchvision tqdm pycocotools

## 2️⃣ Download COCO 2017 (Person Only)

In [None]:
# Download the COCO 2017 train images, val images and train/val annotations
# into data/coco, unpack them there, then delete the archives.
!mkdir -p data/coco
!wget -q http://images.cocodataset.org/zips/train2017.zip -O data/coco/train2017.zip
!wget -q http://images.cocodataset.org/zips/val2017.zip -O data/coco/val2017.zip
!wget -q http://images.cocodataset.org/annotations/annotations_trainval2017.zip -O data/coco/annotations.zip

# The three unzip commands are chained with &&, so a failure stops the chain.
!cd data/coco && unzip -q train2017.zip && unzip -q val2017.zip && unzip -q annotations.zip
# NOTE(review): the archive removal and the success message below run
# unconditionally, i.e. even when a download or unzip step above failed.
!rm data/coco/*.zip
print("‚úÖ COCO 2017 downloaded!")

In [None]:
# Alternative: Use smaller subset for faster training
# Skip this cell if you downloaded full COCO above

# !pip install -q fiftyone
# import fiftyone.zoo as foz
# dataset = foz.load_zoo_dataset(
#     "coco-2017",
#     split="train",
#     label_types=["detections"],
#     classes=["person"],
#     max_samples=10000,  # Smaller subset
# )

## 3️⃣ COCO Person Dataset with Heatmap Generation

In [None]:
import os
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from pycocotools.coco import COCO
from tqdm import tqdm
import cv2


def gaussian2D(shape, sigma=1):
    """Build an unnormalised 2D Gaussian kernel centred on a grid.

    Args:
        shape: (rows, cols) of the kernel.
        sigma: Standard deviation of the Gaussian, in grid cells.

    Returns:
        Array of the requested shape holding exp(-(x^2 + y^2) / (2 sigma^2)),
        with entries below machine epsilon times the maximum set to zero.
    """
    half_rows, half_cols = ((dim - 1.) / 2. for dim in shape)
    y, x = np.ogrid[-half_rows:half_rows + 1, -half_cols:half_cols + 1]

    kernel = np.exp(-(x * x + y * y) / (2 * sigma * sigma))

    # Flush numerically negligible tails to exactly zero.
    cutoff = np.finfo(kernel.dtype).eps * kernel.max()
    kernel[kernel < cutoff] = 0
    return kernel


def draw_gaussian(heatmap, center, radius, k=1):
    """Splat a Gaussian blob onto ``heatmap`` in place, keeping the element-wise maximum.

    Args:
        heatmap: 2D array that is modified in place.
        center: (x, y) position of the peak, truncated to integers.
        radius: Integer half-width of the kernel; its side is 2 * radius + 1.
        k: Scale factor applied to the kernel before merging.

    Returns:
        The same ``heatmap`` object that was passed in.
    """
    side = 2 * radius + 1
    kernel = gaussian2D((side, side), sigma=side / 6)

    cx = int(center[0])
    cy = int(center[1])
    rows, cols = heatmap.shape[0:2]

    # How far the kernel may reach in each direction before leaving the map.
    reach_left = min(cx, radius)
    reach_right = min(cols - cx, radius + 1)
    reach_up = min(cy, radius)
    reach_down = min(rows - cy, radius + 1)

    # Matching windows: a view into the heatmap and the corresponding kernel crop.
    target = heatmap[cy - reach_up:cy + reach_down, cx - reach_left:cx + reach_right]
    patch = kernel[radius - reach_up:radius + reach_down, radius - reach_left:radius + reach_right]

    if min(patch.shape) > 0 and min(target.shape) > 0:
        # Writing through the view keeps the strongest response where blobs overlap.
        np.maximum(target, patch * k, out=target)

    return heatmap


def gaussian_radius(det_size, min_overlap=0.7):
    """Compute the Gaussian radius for an object of the given size.

    The radius is the largest distance r by which both corners of a box may
    be displaced while the displaced box still has IoU >= ``min_overlap``
    with the original one. Three displacement patterns are considered and
    the most restrictive (smallest) radius is returned:

    1. one corner moves inward and the other outward (shifted box),
    2. both corners move inward (shrunken box),
    3. both corners move outward (enlarged box).

    Each pattern yields a quadratic in r. The admissible bound is the smaller
    root for patterns 1 and 2 and the positive root for pattern 3. The previous
    implementation evaluated only pattern 1 and took its *larger* root, which
    gives a radius of roughly ``height + width`` — larger than the object.

    Args:
        det_size: (height, width) of the box, in the units of the heatmap.
        min_overlap: Required IoU, expected in (0, 1].

    Returns:
        Non-negative integer radius (truncated toward zero).
    """
    height, width = det_size
    side_sum = height + width
    area = width * height

    # Pattern 1: r^2 - (h + w) r + hw (1 - o) / (1 + o) >= 0
    b1 = side_sum
    c1 = area * (1 - min_overlap) / (1 + min_overlap)
    r1 = (b1 - np.sqrt(b1 ** 2 - 4 * c1)) / 2

    # Pattern 2: 4 r^2 - 2 (h + w) r + (1 - o) hw >= 0
    a2 = 4
    b2 = 2 * side_sum
    c2 = (1 - min_overlap) * area
    r2 = (b2 - np.sqrt(b2 ** 2 - 4 * a2 * c2)) / (2 * a2)

    candidates = [r1, r2]

    # Pattern 3: 4 o r^2 + 2 o (h + w) r + (o - 1) hw <= 0
    a3 = 4 * min_overlap
    if a3 > 0:  # with min_overlap == 0 an enlarged box is never constrained
        b3 = 2 * min_overlap * side_sum
        c3 = (min_overlap - 1) * area
        r3 = (-b3 + np.sqrt(b3 ** 2 - 4 * a3 * c3)) / (2 * a3)
        candidates.append(r3)

    return max(0, int(min(candidates)))


class COCOPersonDataset(Dataset):
    """
    COCO dataset restricted to the person category, for detection training.

    Every sample is a letterboxed (aspect-preserving resize + grey padding)
    RGB image together with CenterNet-style targets rendered on a grid that
    is ``down_ratio`` times smaller than the input: a centre heatmap, a box
    size map, a sub-cell offset map and a mask marking the cells that carry
    regression targets.
    """
    PERSON_CAT_ID = 1  # Person category ID in COCO

    def __init__(self, root_dir, split='train', img_size=416, down_ratio=4):
        """
        Args:
            root_dir: COCO root directory; must contain
                ``annotations/instances_{split}2017.json`` and ``{split}2017/``.
            split: 'train' or 'val'
            img_size: Side length of the square network input, in pixels.
            down_ratio: Heatmap downsampling ratio relative to the input.
        """
        self.root = root_dir
        self.split = split
        self.img_size = img_size
        self.down_ratio = down_ratio
        self.output_size = img_size // down_ratio

        # Load COCO annotations
        anno_file = os.path.join(root_dir, 'annotations', f'instances_{split}2017.json')
        self.coco = COCO(anno_file)

        # Keep only the ids of images annotated with the person category
        self.img_ids = self.coco.getImgIds(catIds=[self.PERSON_CAT_ID])
        print(f"‚úÖ Loaded {len(self.img_ids)} images with persons ({split})")

        # Convert to a float tensor and normalise with the ImageNet mean/std
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ])

    def __len__(self):
        """Return the number of image ids selected in ``__init__``."""
        return len(self.img_ids)

    def __getitem__(self, idx):
        """Load one image and build its detection targets.

        Args:
            idx: Position in ``self.img_ids``.

        Returns:
            Dict with:
                'image': normalised tensor built from the
                    (img_size, img_size, 3) letterboxed image,
                'heatmap': (1, output_size, output_size) centre heatmap,
                'size': (2, output_size, output_size) box width/height
                    divided by ``img_size``, set at centre cells only,
                'offset': (2, output_size, output_size) fractional part of
                    the centre coordinates, set at centre cells only,
                'reg_mask': (output_size, output_size), 1 at centre cells,
                'num_persons': number of boxes that were rendered.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        img_id = self.img_ids[idx]
        img_info = self.coco.loadImgs(img_id)[0]
        img_path = os.path.join(self.root, f'{self.split}2017', img_info['file_name'])

        # Load image. cv2.imread signals a missing/unreadable file by
        # returning None instead of raising, so check it explicitly.
        img = cv2.imread(img_path)
        if img is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        orig_h, orig_w = img.shape[:2]

        # Resize so the longer side fits img_size, keeping the aspect ratio
        scale = min(self.img_size / orig_h, self.img_size / orig_w)
        new_h, new_w = int(orig_h * scale), int(orig_w * scale)
        img_resized = cv2.resize(img, (new_w, new_h))

        # Pad to square, centring the resized image on a grey (114) canvas
        pad_h = self.img_size - new_h
        pad_w = self.img_size - new_w
        pad_top = pad_h // 2
        pad_left = pad_w // 2

        img_padded = np.full((self.img_size, self.img_size, 3), 114, dtype=np.uint8)
        img_padded[pad_top:pad_top+new_h, pad_left:pad_left+new_w] = img_resized

        # Person annotations for this image, excluding crowd regions
        ann_ids = self.coco.getAnnIds(imgIds=img_id, catIds=[self.PERSON_CAT_ID], iscrowd=False)
        anns = self.coco.loadAnns(ann_ids)

        # Allocate empty targets on the downsampled grid
        heatmap = np.zeros((self.output_size, self.output_size), dtype=np.float32)
        size_map = np.zeros((2, self.output_size, self.output_size), dtype=np.float32)
        offset_map = np.zeros((2, self.output_size, self.output_size), dtype=np.float32)
        reg_mask = np.zeros((self.output_size, self.output_size), dtype=np.float32)

        num_persons = 0
        for ann in anns:
            bbox = ann['bbox']  # [x, y, w, h]
            if bbox[2] < 5 or bbox[3] < 5:  # Skip tiny boxes (original-image pixels)
                continue

            # Map the box into the letterboxed image
            x = bbox[0] * scale + pad_left
            y = bbox[1] * scale + pad_top
            w = bbox[2] * scale
            h = bbox[3] * scale

            # Box centre in output (heatmap) coordinates
            cx = (x + w / 2) / self.down_ratio
            cy = (y + h / 2) / self.down_ratio

            # Keep the centre on the grid
            cx = np.clip(cx, 0, self.output_size - 1)
            cy = np.clip(cy, 0, self.output_size - 1)

            # Grid cell that owns this object
            cx_int, cy_int = int(cx), int(cy)

            # Gaussian radius from the object size on the grid, at least 1
            radius = gaussian_radius((h / self.down_ratio, w / self.down_ratio))
            radius = max(1, radius)

            # Draw Gaussian on heatmap
            draw_gaussian(heatmap, (cx_int, cy_int), radius)

            # Size (normalised by img_size) and sub-cell offset at the centre cell.
            # A later object landing on the same cell overwrites these values.
            size_map[0, cy_int, cx_int] = w / self.img_size
            size_map[1, cy_int, cx_int] = h / self.img_size
            offset_map[0, cy_int, cx_int] = cx - cx_int
            offset_map[1, cy_int, cx_int] = cy - cy_int
            reg_mask[cy_int, cx_int] = 1

            num_persons += 1

        # Apply transforms
        img_tensor = self.transform(Image.fromarray(img_padded))

        return {
            'image': img_tensor,
            'heatmap': torch.from_numpy(heatmap).unsqueeze(0),
            'size': torch.from_numpy(size_map),
            'offset': torch.from_numpy(offset_map),
            'reg_mask': torch.from_numpy(reg_mask),
            'num_persons': num_persons,
        }


print("‚úÖ Dataset class ready")

In [None]:
# Dataset and loader configuration
BATCH_SIZE = 16  # kept small because the inputs are large
IMG_SIZE = 416
DATA_DIR = 'data/coco'

# One dataset per COCO split
train_dataset = COCOPersonDataset(DATA_DIR, 'train', IMG_SIZE)
val_dataset = COCOPersonDataset(DATA_DIR, 'val', IMG_SIZE)

# Only the training loader shuffles; the other settings are shared
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

print(f"\nüìä Data: {len(train_loader)} train batches, {len(val_loader)} val batches")

In [None]:
# Preview one training batch: inputs on the top row, target heatmaps below
import matplotlib.pyplot as plt

batch = next(iter(train_loader))
fig, axes = plt.subplots(2, 4, figsize=(16, 8))

for i in range(4):
    image_ax, heatmap_ax = axes[0, i], axes[1, i]

    # Undo the normalisation applied by the dataset so colours display correctly
    img = batch['image'][i].permute(1, 2, 0).numpy()
    img = np.clip(img * [0.229, 0.224, 0.225] + [0.485, 0.456, 0.406], 0, 1)
    image_ax.imshow(img)
    image_ax.set_title(f"Persons: {batch['num_persons'][i]}")
    image_ax.axis('off')

    # Target heatmap (single channel) for the same sample
    hm = batch['heatmap'][i, 0].numpy()
    heatmap_ax.imshow(hm, cmap='hot')
    heatmap_ax.set_title('Detection Heatmap')
    heatmap_ax.axis('off')

plt.suptitle('COCO Person Dataset - Real Bounding Boxes!', fontsize=14)
plt.tight_layout()
plt.show()

## 4️⃣ Detection-Focused Loss

In [None]:
# Put the current directory (the cloned repository root) first on the import
# path so the project package can be imported.
import sys
sys.path.insert(0, '.')
from mouaadnet_ultra.model import MouaadNetUltra

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Instantiate the network with its default constructor arguments.
model = MouaadNetUltra().to(device)
print(f"Device: {device}")
print(f"Parameters: {model.count_parameters():,}")

In [None]:
class DetectionLoss(nn.Module):
    """
    CenterNet-style detection loss.

    Components:
    - Focal Loss for heatmap
    - L1 Loss for size regression
    - L1 Loss for offset regression

    All terms are evaluated in float32, even when the predictions arrive in
    half precision from an autocast region.
    """
    def __init__(self, hm_weight=1.0, size_weight=0.1, offset_weight=1.0):
        """
        Args:
            hm_weight: Weight of the heatmap focal loss in the total.
            size_weight: Weight of the size L1 loss in the total.
            offset_weight: Weight of the offset L1 loss in the total.
        """
        super().__init__()
        self.hm_weight = hm_weight
        self.size_weight = size_weight
        self.offset_weight = offset_weight

    def focal_loss(self, pred, target):
        """Penalty-reduced focal loss for the heatmap.

        Args:
            pred: Heatmap logits (sigmoid is applied here).
            target: Target heatmap of the same shape; cells equal to 1 are
                positives, everything below 1 is a down-weighted negative.

        Returns:
            Scalar loss: summed over all cells, divided by the number of
            positive cells (at least 1).
        """
        # Cast to float32 before the sigmoid/clamp. In float16, 1 - 1e-6
        # rounds to 1.0, so the clamp would not keep log(1 - pred) away from
        # log(0) and saturated logits would yield inf/NaN.
        pred = torch.clamp(torch.sigmoid(pred.float()), 1e-6, 1 - 1e-6)
        target = target.float()

        pos_mask = target.eq(1).float()
        neg_mask = target.lt(1).float()

        pos_loss = -torch.log(pred) * torch.pow(1 - pred, 2) * pos_mask
        # (1 - target)^4 softens the penalty near object centres
        neg_loss = -torch.log(1 - pred) * torch.pow(pred, 2) * torch.pow(1 - target, 4) * neg_mask

        num_pos = pos_mask.sum().clamp(min=1)
        return (pos_loss.sum() + neg_loss.sum()) / num_pos

    def reg_loss(self, pred, target, mask):
        """L1 loss for regression, evaluated only where ``mask`` is non-zero.

        Args:
            pred: Predictions of shape (B, C, H, W).
            target: Targets of the same shape as ``pred``.
            mask: (B, H, W) mask; it is broadcast over the channel axis.

        Returns:
            Scalar: summed absolute error divided by the sum of the
            broadcast mask (at least 1).
        """
        pred = pred.float()
        target = target.float()
        mask = mask.float().unsqueeze(1).expand_as(pred)
        loss = F.l1_loss(pred * mask, target * mask, reduction='sum')
        num_pos = mask.sum().clamp(min=1)
        return loss / num_pos

    def forward(self, pred_hm, pred_size, pred_offset, target_hm, target_size, target_offset, reg_mask):
        """Combine the three loss terms.

        Returns:
            Dict with the weighted 'total' and the unweighted 'hm', 'size'
            and 'offset' terms.
        """
        hm_loss = self.focal_loss(pred_hm, target_hm)
        size_loss = self.reg_loss(pred_size, target_size, reg_mask)
        offset_loss = self.reg_loss(pred_offset, target_offset, reg_mask)

        total = self.hm_weight * hm_loss + self.size_weight * size_loss + self.offset_weight * offset_loss

        return {
            'total': total,
            'hm': hm_loss,
            'size': size_loss,
            'offset': offset_loss,
        }

# Loss instance used by the training and validation functions; the size term
# is weighted 0.1 while the heatmap and offset terms are weighted 1.0.
criterion = DetectionLoss(hm_weight=1.0, size_weight=0.1, offset_weight=1.0)
print("‚úÖ Detection loss ready")

In [None]:
# Optimisation hyper-parameters
EPOCHS = 30
LR = 1e-3

# AdamW with weight decay, and a cosine learning-rate schedule whose period
# (T_max) equals the number of epochs and whose floor is eta_min.
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS, eta_min=1e-6)
# Gradient scaler used together with autocast in the training loop.
scaler = torch.amp.GradScaler('cuda')

print(f"‚úÖ Training setup: {EPOCHS} epochs, LR={LR}")

## 5️⃣ Training Loop

In [None]:
def train_epoch(model, loader, optimizer, criterion, scaler, device):
    """Train ``model`` for one pass over ``loader`` using mixed precision.

    Batches whose total loss is not finite (NaN or +/-inf) are skipped
    without a backward pass and are left out of the reported averages.

    Args:
        model: Network returning a dict with 'heatmaps', 'sizes' and
            'offsets' sequences; only element 0 of each is used.
        loader: Iterable of batches with 'image', 'heatmap', 'size',
            'offset' and 'reg_mask' tensors.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Callable returning a dict with 'total', 'hm', 'size'
            and 'offset' losses.
        scaler: Gradient scaler used for the backward pass and the step.
        device: Device the batch tensors are moved to.

    Returns:
        Dict with the mean 'loss', 'hm', 'size' and 'offset' over the
        batches that had a finite loss (all 0.0 if there were none).
    """
    model.train()
    totals = {'loss': 0.0, 'hm': 0.0, 'size': 0.0, 'offset': 0.0}
    num_batches = 0  # batches that actually contributed to the sums

    pbar = tqdm(loader, desc='Training')
    for batch in pbar:
        images = batch['image'].to(device)
        heatmaps = batch['heatmap'].to(device)
        sizes = batch['size'].to(device)
        offsets = batch['offset'].to(device)
        reg_mask = batch['reg_mask'].to(device)

        optimizer.zero_grad()

        with torch.amp.autocast('cuda'):
            outputs = model(images)

            # Use the first output scale (presumably the stride-4 map that
            # matches the dataset targets, 104x104 for 416 inputs)
            pred_hm = outputs['heatmaps'][0]
            pred_size = outputs['sizes'][0]
            pred_offset = outputs['offsets'][0]

            losses = criterion(pred_hm, pred_size, pred_offset, heatmaps, sizes, offsets, reg_mask)
            loss = losses['total']

        # A NaN or infinite loss would corrupt both the gradients and the
        # running statistics, so drop the batch entirely.
        if not torch.isfinite(loss):
            continue

        # Scaled backward, then unscale so clipping sees the true gradient norm
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        scaler.step(optimizer)
        scaler.update()

        loss_value = loss.item()
        hm_value = losses['hm'].item()
        totals['loss'] += loss_value
        totals['hm'] += hm_value
        totals['size'] += losses['size'].item()
        totals['offset'] += losses['offset'].item()
        num_batches += 1

        pbar.set_postfix({'loss': f"{loss_value:.4f}", 'hm': f"{hm_value:.3f}"})

    # Average over the batches that were counted, not over len(loader), so
    # skipped batches do not drag the means toward zero; guard the empty case.
    n = max(num_batches, 1)
    return {name: value / n for name, value in totals.items()}


@torch.no_grad()
def validate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` and return the mean total loss.

    Runs in eval mode with gradients disabled. The sum of the per-batch
    'total' losses is divided by ``len(loader)``.
    """
    model.eval()
    batch_losses = []

    for batch in tqdm(loader, desc='Validating'):
        images = batch['image'].to(device)
        targets = (
            batch['heatmap'].to(device),
            batch['size'].to(device),
            batch['offset'].to(device),
            batch['reg_mask'].to(device),
        )

        with torch.amp.autocast('cuda'):
            outputs = model(images)
            # Only the first output scale is scored
            predictions = (
                outputs['heatmaps'][0],
                outputs['sizes'][0],
                outputs['offsets'][0],
            )
            losses = criterion(*predictions, *targets)

        batch_losses.append(losses['total'].item())

    return sum(batch_losses) / len(loader)

print("‚úÖ Training functions ready")

In [None]:
# Main training run: train, validate, step the LR schedule, record the
# statistics and checkpoint whenever the validation loss improves.
best_loss = float('inf')
history = {'loss': [], 'val_loss': [], 'hm': [], 'size': [], 'offset': []}

print("=" * 60)
print("üéØ Training MOUAADNET-ULTRA for HUMAN DETECTION")
print("=" * 60)

for epoch in range(EPOCHS):
    print(f"\nüìç Epoch {epoch+1}/{EPOCHS}")

    stats = train_epoch(model, train_loader, optimizer, criterion, scaler, device)
    val_loss = validate(model, val_loader, criterion, device)
    scheduler.step()

    # Record this epoch's numbers for the plots
    history['val_loss'].append(val_loss)
    for key in ('loss', 'hm', 'size', 'offset'):
        history[key].append(stats[key])

    print(f"   Loss: {stats['loss']:.4f} (HM:{stats['hm']:.3f} Size:{stats['size']:.3f} Off:{stats['offset']:.3f})")
    print(f"   Val Loss: {val_loss:.4f}")

    # Keep only the checkpoint with the lowest validation loss so far
    if val_loss < best_loss:
        best_loss = val_loss
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'best_loss': best_loss,
        }
        torch.save(checkpoint, 'detection_model.pt')
        print("   ‚≠ê Best model saved!")

print(f"\n‚úÖ Training complete! Best loss: {best_loss:.4f}")

In [None]:
# Training curves: total loss, heatmap loss and the two regression losses
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
total_ax, heatmap_ax, regression_ax = axes

# Draw the curves for each panel
total_ax.plot(history['loss'], label='Train')
total_ax.plot(history['val_loss'], label='Val')
total_ax.legend()

heatmap_ax.plot(history['hm'], label='Heatmap')

regression_ax.plot(history['size'], label='Size')
regression_ax.plot(history['offset'], label='Offset')
regression_ax.legend()

# Shared decoration: axis labels, title and grid
for panel, title in zip(axes, ('Total Loss', 'Heatmap Loss', 'Regression Losses')):
    panel.set_xlabel('Epoch')
    panel.set_ylabel('Loss')
    panel.set_title(title)
    panel.grid(True)

plt.suptitle('MOUAADNET-ULTRA Detection Training', fontsize=14)
plt.tight_layout()
plt.savefig('detection_training.png', dpi=150)
plt.show()

## 6️⃣ Test Detection

In [None]:
# Qualitative check: run the model on the first validation images
model.eval()

batch = next(iter(val_loader))
images = batch['image'].to(device)

with torch.no_grad():
    outputs = model(images[:4])

# Top row: de-normalised inputs. Bottom row: predicted heatmap probabilities.
fig, axes = plt.subplots(2, 4, figsize=(16, 8))

for i in range(4):
    input_ax, pred_ax = axes[0, i], axes[1, i]

    # Undo the dataset normalisation for display
    img = batch['image'][i].permute(1, 2, 0).cpu().numpy()
    img = np.clip(img * [0.229, 0.224, 0.225] + [0.485, 0.456, 0.406], 0, 1)
    input_ax.imshow(img)
    input_ax.set_title('Input Image')
    input_ax.axis('off')

    # First-scale heatmap logits turned into probabilities
    hm = torch.sigmoid(outputs['heatmaps'][0][i, 0]).cpu().numpy()
    pred_ax.imshow(hm, cmap='hot')
    pred_ax.set_title(f'Predicted (max: {hm.max():.2f})')
    pred_ax.axis('off')

plt.suptitle('Detection Results', fontsize=14)
plt.tight_layout()
plt.show()

## 7️⃣ Export

In [None]:
# Load the best checkpoint and export the model to ONNX.
# map_location='cpu' lets the checkpoint load even when it was saved from a
# GPU and none is available now; the model is moved to the CPU below anyway.
ckpt = torch.load('detection_model.pt', map_location='cpu')
model.load_state_dict(ckpt['model_state_dict'])
model.eval()
# Project-specific preparation step defined on the model class
# (presumably fuses layers for deployment — see the repository).
model.fuse_for_inference()
model.cpu()

# Trace with the resolution used for training (IMG_SIZE) rather than a
# hard-coded 416, so the export follows the notebook configuration.
dummy_input = torch.randn(1, 3, IMG_SIZE, IMG_SIZE)
torch.onnx.export(model, dummy_input, 'detection_model.onnx', input_names=['image'], opset_version=12)
print(f"‚úÖ Exported! Best loss: {ckpt['best_loss']:.4f}")

In [None]:
# Download the files produced above (checkpoint, ONNX export, training plot)
# through the browser. google.colab is only importable inside Google Colab.
from google.colab import files
files.download('detection_model.pt')
files.download('detection_model.onnx')
files.download('detection_training.png')
print("üéâ Done! Now test with webcam_demo.py --weights detection_model.pt")