# 🚀 MOUAADNET-ULTRA Training
## Human Detection & Gender Classification with PA-100k

**Lead Architect:** MOUAAD IDOUFKIR

### 🔧 Features:
- **Dynamic Loss Balancing** using Uncertainty Weighting (Kendall et al.)
- **GradNorm** for gradient magnitude normalization
- Automatic task weight learning

---

## 1️⃣ Setup

In [None]:
# Show the GPU attached to this runtime (if any).
!nvidia-smi
# Fetch the project sources and make the repository the working directory,
# so that the `mouaadnet_ultra` package can be imported from '.' later on.
!git clone https://github.com/mouuuuaad/MouaadNet-Ultra.git
%cd MouaadNet-Ultra
# Install the third-party packages used by the cells below.
!pip install -q torch torchvision tqdm scipy kagglehub

In [None]:
import kagglehub
# Download the "yuulind/pa-100k" dataset through kagglehub. The returned value
# is kept in DATA_PATH and later passed to create_dataloaders() as the root
# directory that is searched for images and the annotation file.
DATA_PATH = kagglehub.dataset_download("yuulind/pa-100k")
print(f"‚úÖ Dataset: {DATA_PATH}")

## 2️⃣ Dataset

In [None]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from scipy.io import loadmat
from tqdm import tqdm

class PA100kDataset(Dataset):
    """PA-100k images paired with a synthetic centre heatmap and a gender target.

    The dataset root is scanned for a ``.mat`` annotation file and for a
    directory holding more than 100 ``.jpg``/``.png`` files. When the
    annotation file contains ``<split>_images_name`` / ``<split>_label``
    entries they define the split; otherwise the sorted image list is cut
    80/10/10 into train/val/test and no labels are available.

    Each item is a dict with:
        'image':   the transformed RGB image,
        'heatmap': a (1, img_size // 4, img_size // 4) float32 Gaussian
                   centred in the map (it does not depend on the image),
        'gender':  a 1-element float32 tensor, ``1 - labels[idx][0]``, or 0.5
                   when no labels were loaded.

    Args:
        root_dir: Directory searched recursively for images/annotations.
        split: 'train', 'val' or 'test'.
        img_size: Side length used by the default transform and heatmap.
        transform: Callable applied to the PIL image; defaults to
            resize + ToTensor + ImageNet normalisation.
    """

    _IMAGE_EXTENSIONS = ('.jpg', '.png')

    def __init__(self, root_dir, split='train', img_size=416, transform=None):
        self.root_dir = root_dir
        self.img_size = img_size
        self.transform = transform or self._default_transform()
        self.split = split
        self._find_data()
        self._load_data()
        print(f"‚úÖ {split}: {len(self.images)} images")

    def _find_data(self):
        """Locate the annotation file and the image directory under root_dir.

        The last match in traversal order wins, as before; directories and
        files are now visited in sorted order so the outcome no longer
        depends on the file system's listing order.
        """
        self.anno_path = None
        self.img_dir = self.root_dir
        for root, dirs, files in os.walk(self.root_dir):
            dirs.sort()  # in-place sort makes os.walk descend deterministically
            for name in sorted(files):
                if name.endswith('.mat'):
                    self.anno_path = os.path.join(root, name)
            n_images = sum(
                1 for name in files
                if name.lower().endswith(self._IMAGE_EXTENSIONS)
            )
            if n_images > 100:
                self.img_dir = root

    def _load_data(self):
        """Fill ``self.images`` / ``self.labels`` for the requested split."""
        if self.anno_path:
            anno = loadmat(self.anno_path)
            key = f'{self.split}_images_name'
            if key in anno:
                self.images = [str(x[0][0]) for x in anno[key]]
                self.labels = anno[f'{self.split}_label']
                return
        # Fallback: no usable annotation, split the sorted file list 80/10/10.
        all_imgs = sorted(
            f for f in os.listdir(self.img_dir)
            if f.lower().endswith(self._IMAGE_EXTENSIONS)
        )
        n = len(all_imgs)
        splits = {'train': (0, 0.8), 'val': (0.8, 0.9), 'test': (0.9, 1.0)}
        s, e = splits[self.split]
        self.images = all_imgs[int(s * n):int(e * n)]
        self.labels = None

    def _default_transform(self):
        """Resize to a square, convert to tensor, apply ImageNet normalisation."""
        return transforms.Compose([
            transforms.Resize((self.img_size, self.img_size)),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ])

    def _center_heatmap(self):
        """Return the fixed Gaussian heatmap peaked at the map centre."""
        hm_size = self.img_size // 4
        cx, cy, sigma = hm_size // 2, hm_size // 2, hm_size // 6
        x, y = np.meshgrid(np.arange(hm_size), np.arange(hm_size))
        # The 1e-6 keeps the division defined when sigma rounds down to 0.
        return np.exp(
            -((x - cx) ** 2 + (y - cy) ** 2) / (2 * sigma ** 2 + 1e-6)
        ).astype(np.float32)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        """Return the sample at ``idx``, skipping forward past unreadable files.

        Raises:
            IndexError: ``idx`` is out of range (this also lets plain
                iteration over the dataset terminate).
            RuntimeError: none of the listed images could be opened.
        """
        n = len(self.images)
        if not -n <= idx < n:
            raise IndexError(f"index {idx} out of range for {n} images")
        idx %= n

        # Try each image at most once, starting at idx and wrapping around.
        # The previous bare `except` + recursion also swallowed IndexError and
        # KeyboardInterrupt and could recurse without bound.
        last_error = None
        for offset in range(n):
            cur = (idx + offset) % n
            path = os.path.join(self.img_dir, self.images[cur])
            try:
                with Image.open(path) as handle:  # close the file promptly
                    image = handle.convert('RGB')
            except (OSError, ValueError, SyntaxError) as err:
                # Missing, truncated or undecodable file: try the next one.
                last_error = err
                continue
            break
        else:
            raise RuntimeError(
                f"no readable image found in {self.img_dir!r}"
            ) from last_error

        if self.transform:
            image = self.transform(image)

        heatmap = self._center_heatmap()

        # The label belongs to the image actually loaded (`cur`), not to the
        # requested index. NOTE(review): column 0 is presumably the "Female"
        # attribute, making the target 1.0 for male - confirm against the
        # annotation file.
        if self.labels is not None:
            gender = 1.0 - float(self.labels[cur][0])
        else:
            gender = 0.5

        return {
            'image': image,
            'heatmap': torch.from_numpy(heatmap).unsqueeze(0),
            'gender': torch.tensor([gender], dtype=torch.float32),
        }

def create_dataloaders(data_dir, batch_size=32, img_size=416):
    """Build the (train, val) DataLoader pair for PA-100k.

    The training pipeline resizes to ``img_size + 32``, random-crops back to
    ``img_size``, flips horizontally with probability 0.5 and jitters colour;
    the validation pipeline only resizes. Both end with ToTensor and ImageNet
    normalisation.

    Returns:
        Tuple ``(train_loader, val_loader)``; only the training loader shuffles.
    """
    def tensor_tail():
        # Shared final stage: tensor conversion + ImageNet mean/std.
        return [
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]

    oversize = img_size + 32
    augmenting_head = [
        transforms.Resize((oversize, oversize)),
        transforms.RandomCrop(img_size),
        transforms.RandomHorizontalFlip(0.5),
        transforms.ColorJitter(0.2, 0.2, 0.2),
    ]
    plain_head = [transforms.Resize((img_size, img_size))]

    pipelines = {
        'train': transforms.Compose(augmenting_head + tensor_tail()),
        'val': transforms.Compose(plain_head + tensor_tail()),
    }

    # Build both datasets first (each prints its size), then wrap them.
    datasets = {
        split: PA100kDataset(data_dir, split, img_size, pipeline)
        for split, pipeline in pipelines.items()
    }
    loader_options = {'num_workers': 2, 'pin_memory': True}
    train_loader = DataLoader(datasets['train'], batch_size, shuffle=True, **loader_options)
    val_loader = DataLoader(datasets['val'], batch_size, shuffle=False, **loader_options)
    return train_loader, val_loader

# Batch size and square input resolution shared by both loaders.
BATCH_SIZE = 32
IMG_SIZE = 416
# Builds the train/val datasets under DATA_PATH and wraps them in DataLoaders.
train_loader, val_loader = create_dataloaders(DATA_PATH, BATCH_SIZE, IMG_SIZE)
# len(loader) is the number of batches per epoch, not the number of images.
print(f"Train: {len(train_loader)} batches | Val: {len(val_loader)} batches")

## 3️⃣ Dynamic Loss Balancing (SOTA Methods)

### Method 1: **Uncertainty Weighting** (Kendall et al. 2018)
Learns task-specific uncertainty parameters $\sigma_i$ to automatically balance losses:

$$\mathcal{L}_{total} = \frac{1}{2\sigma_1^2}\mathcal{L}_{heatmap} + \frac{1}{2\sigma_2^2}\mathcal{L}_{gender} + \log(\sigma_1\sigma_2)$$

### Method 2: **GradNorm** (Chen et al. 2018)
Normalizes gradient magnitudes across tasks to ensure balanced training.

In [None]:
import sys
# Put the current directory (the cloned repository, after `%cd`) first on the
# import path so the project package below resolves.
sys.path.insert(0, '.')
from mouaadnet_ultra.model import MouaadNetUltra

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

# Instantiate the network with its default constructor arguments and move it
# to the selected device.
model = MouaadNetUltra().to(device)
print(f"Parameters: {model.count_parameters():,}")

In [None]:
# ============================================
# 🔥 SOTA: UNCERTAINTY WEIGHTING LOSS
# ============================================
# Paper: "Multi-Task Learning Using Uncertainty to Weigh Losses"
# Authors: Kendall, Gal, and Cipolla (CVPR 2018)
# 
# Key Idea: Learn task-specific homoscedastic uncertainty (log_var)
# that automatically balances the contribution of each task.
# High uncertainty = lower weight, Low uncertainty = higher weight

class UncertaintyWeightedLoss(nn.Module):
    """Two-task loss balanced by learnable homoscedastic uncertainty.

    One parameter ``s_i = log(σ_i²)`` is learned per task (log-variance is
    used for numerical stability). The value returned under ``'total'`` is

        exp(-s_hm) · L_hm + s_hm  +  exp(-s_gender) · L_gender + s_gender

    This is exactly twice the objective of Kendall et al. (2018),
    ``L_i / (2σ_i²) + log(σ_i)``, so it has the same minimiser. The reported
    task weights are the precisions ``exp(-s_i) = 1/σ_i²`` and equal 1.0 at
    initialisation (``s_i = 0``).
    """

    def __init__(self):
        super().__init__()
        # Learnable log-variance per task; zero means σ = 1 and weight 1.0.
        self.log_var_hm = nn.Parameter(torch.zeros(1))      # Detection
        self.log_var_gender = nn.Parameter(torch.zeros(1))  # Classification

    def focal_loss(self, pred, target, alpha=2.0, beta=4.0):
        """Gaussian focal loss for heatmaps.

        Args:
            pred: Predicted probabilities (already passed through sigmoid).
            target: Target heatmap; cells equal to 1 are positives, all other
                cells are negatives down-weighted by ``(1 - target) ** beta``.
            alpha: Focusing exponent applied to the prediction error.
            beta: Exponent of the negative down-weighting term.

        Returns:
            Scalar loss: summed positive and negative terms divided by the
            number of positives (at least 1).
        """
        # Clamp in float32: in half precision `1 - 1e-6` rounds to 1.0, so a
        # saturated prediction would reach log(0) and produce inf/NaN.
        pred = torch.clamp(pred.float(), 1e-6, 1 - 1e-6)
        target = target.float()

        pos_mask = target.eq(1).float()
        neg_mask = target.lt(1).float()

        pos_loss = -torch.log(pred) * torch.pow(1 - pred, alpha) * pos_mask
        neg_weight = torch.pow(1 - target, beta)
        neg_loss = -torch.log(1 - pred) * torch.pow(pred, alpha) * neg_weight * neg_mask

        num_pos = pos_mask.sum().clamp(min=1)
        return (pos_loss.sum() + neg_loss.sum()) / num_pos

    def forward(self, pred_hm, target_hm, pred_gender, target_gender):
        """Compute both task losses and their uncertainty-weighted sum.

        Args:
            pred_hm: Heatmap logits.
            target_hm: Target heatmap with the same shape as ``pred_hm``.
            pred_gender: Gender logits.
            target_gender: Gender targets with the same shape as ``pred_gender``.

        Returns:
            Dict with the differentiable ``'total'``, ``'loss_hm'`` and
            ``'loss_gender'`` tensors plus Python floats for logging:
            ``'weight_*'`` (precision ``exp(-s)``) and ``'sigma_*'``
            (``exp(s / 2)``).
        """
        # Raw task losses, evaluated in float32 even for half-precision
        # model outputs.
        pred_hm_sig = torch.sigmoid(pred_hm.float())
        loss_hm = self.focal_loss(pred_hm_sig, target_hm)
        loss_gender = F.binary_cross_entropy_with_logits(
            pred_gender.float(), target_gender.float()
        )

        # exp(-log_var) = 1/σ²; the additive log_var term stops σ growing
        # without bound to drive the weighted loss to zero.
        precision_hm = torch.exp(-self.log_var_hm)
        precision_gender = torch.exp(-self.log_var_gender)

        weighted_hm = precision_hm * loss_hm + self.log_var_hm
        weighted_gender = precision_gender * loss_gender + self.log_var_gender

        total = weighted_hm + weighted_gender

        return {
            'total': total,
            'loss_hm': loss_hm,
            'loss_gender': loss_gender,
            'weight_hm': precision_hm.item(),
            'weight_gender': precision_gender.item(),
            'sigma_hm': torch.exp(self.log_var_hm / 2).item(),
            'sigma_gender': torch.exp(self.log_var_gender / 2).item(),
        }

print("‚úÖ UncertaintyWeightedLoss defined")

In [None]:
# ============================================
# 🔥 SOTA: GRADNORM
# ============================================
# Paper: "GradNorm: Gradient Normalization for Adaptive Loss Balancing"
# Authors: Chen et al. (ICML 2018)
#
# Key Idea: Balance gradient magnitudes across tasks by dynamically
# adjusting loss weights based on training rate of each task.

class GradNormLoss(nn.Module):
    """GradNorm (Chen et al., 2018) loss balancing for heatmap + gender.

    Task weights are ``softmax(log_weights) * num_tasks`` so they stay
    positive and always sum to ``num_tasks``. ``forward`` returns the weighted
    sum of the two task losses; ``gradnorm_update`` adjusts ``log_weights`` so
    that the weighted gradient norms of the tasks move towards a common
    target scaled by each task's relative training rate.

    Args:
        num_tasks: Number of tasks (``forward`` computes two losses).
        alpha: Asymmetry hyperparameter (higher = more aggressive balancing).
    """

    def __init__(self, num_tasks=2, alpha=1.5):
        super().__init__()
        self.num_tasks = num_tasks
        self.alpha = alpha

        # Learnable task weights (log scale for stability).
        self.log_weights = nn.Parameter(torch.zeros(num_tasks))

        # Losses seen on the first forward pass; reference for training rates.
        self.register_buffer('initial_losses', torch.ones(num_tasks))
        self.initialized = False

    def focal_loss(self, pred, target):
        """Gaussian focal loss on heatmap logits (sigmoid applied here)."""
        # Sigmoid and clamp in float32: in half precision `1 - 1e-6` rounds
        # to 1.0 and a saturated prediction would reach log(0).
        pred = torch.clamp(torch.sigmoid(pred.float()), 1e-6, 1 - 1e-6)
        target = target.float()
        pos_mask = target.eq(1).float()
        neg_mask = target.lt(1).float()
        pos_loss = -torch.log(pred) * torch.pow(1 - pred, 2) * pos_mask
        neg_loss = -torch.log(1 - pred) * torch.pow(pred, 2) * torch.pow(1 - target, 4) * neg_mask
        return (pos_loss.sum() + neg_loss.sum()) / pos_mask.sum().clamp(min=1)

    def forward(self, pred_hm, target_hm, pred_gender, target_gender):
        """Return the weighted total loss and the pieces needed for logging.

        The ``'losses'`` entry (stacked raw task losses, still attached to the
        graph) is what ``gradnorm_update`` expects as its first argument.
        """
        loss_hm = self.focal_loss(pred_hm, target_hm)
        loss_gender = F.binary_cross_entropy_with_logits(
            pred_gender.float(), target_gender.float()
        )

        losses = torch.stack([loss_hm, loss_gender])

        # Remember the very first losses as the training-rate reference.
        if not self.initialized:
            self.initial_losses = losses.detach().clone()
            self.initialized = True

        # softmax * num_tasks keeps the weights positive and summing to num_tasks.
        weights = F.softmax(self.log_weights, dim=0) * self.num_tasks

        total = (weights * losses).sum()

        return {
            'total': total,
            'loss_hm': loss_hm,
            'loss_gender': loss_gender,
            'weight_hm': weights[0].item(),
            'weight_gender': weights[1].item(),
            'losses': losses,  # For GradNorm update
            'weights': weights,  # For GradNorm update
        }

    def gradnorm_update(self, losses, shared_layer, lr=0.025):
        """Take one gradient step on the task weights using the GradNorm loss.

        Must be called while the autograd graph of ``losses`` is still alive,
        i.e. before ``total.backward()`` or after a backward pass made with
        ``retain_graph=True``. Only ``log_weights`` is modified; no gradients
        are accumulated on the model parameters.

        Args:
            losses: 1-D tensor of raw task losses (``forward()['losses']``).
            shared_layer: Module whose parameters are shared by all tasks.
            lr: Step size for the plain gradient step on ``log_weights``.

        Returns:
            The detached GradNorm loss, for logging.

        Raises:
            ValueError: ``shared_layer`` has no trainable parameters.
        """
        params = [p for p in shared_layer.parameters() if p.requires_grad]
        if not params:
            raise ValueError("shared_layer has no trainable parameters")

        weights = F.softmax(self.log_weights, dim=0) * self.num_tasks

        # L2 norm of each unweighted task gradient w.r.t. the shared
        # parameters. These are constants with respect to the task weights.
        raw_norms = []
        for loss in losses:
            grads = torch.autograd.grad(
                loss, params, retain_graph=True, allow_unused=True
            )
            squares = [g.detach().float().pow(2).sum() for g in grads if g is not None]
            if squares:
                raw_norms.append(torch.stack(squares).sum().sqrt())
            else:
                raw_norms.append(torch.zeros((), device=weights.device))
        raw_norms = torch.stack(raw_norms).to(weights.device)

        # Since w_i > 0, ||∇(w_i · L_i)|| = w_i · ||∇L_i||. This keeps the
        # dependence on the weights differentiable without a second-order
        # graph through the model.
        grad_norms = weights * raw_norms

        # The target is treated as a constant, as in the GradNorm paper.
        with torch.no_grad():
            loss_ratios = losses.detach().float() / (self.initial_losses + 1e-8)
            inverse_train_rate = loss_ratios / loss_ratios.mean()
            target_grads = grad_norms.mean() * inverse_train_rate ** self.alpha

        gradnorm_loss = (grad_norms - target_grads).abs().sum()

        # Differentiate w.r.t. the weights only. The previous `.backward()`
        # call had no graph to follow and raised at runtime.
        (weight_grad,) = torch.autograd.grad(gradnorm_loss, self.log_weights)
        with torch.no_grad():
            self.log_weights -= lr * weight_grad
            if self.log_weights.grad is not None:
                self.log_weights.grad.zero_()

        return gradnorm_loss.detach()

print("‚úÖ GradNormLoss defined")

In [None]:
# ============================================
# SIMPLE BUT EFFECTIVE: LOSS NORMALIZATION
# ============================================
# Normalize each loss by its running average for implicit balancing

class NormalizedMultiTaskLoss(nn.Module):
    """Balance two task losses by dividing each by its running average.

    An exponential moving average (EMA) of every raw loss is kept in a buffer;
    the total is the sum of the losses divided by their EMAs, so both terms
    sit at a scale of about 1.0 without any learnable weights.

    The EMAs are only updated during training passes with finite losses:
    module in ``train()`` mode and gradients enabled. Evaluation under
    ``torch.no_grad()`` therefore leaves them untouched, and a single NaN/inf
    batch cannot poison the running averages.

    Args:
        ema_decay: EMA decay factor; closer to 1 means slower adaptation.
    """

    def __init__(self, ema_decay=0.99):
        super().__init__()
        self.ema_decay = ema_decay
        self.register_buffer('ema_hm', torch.ones(1))
        self.register_buffer('ema_gender', torch.ones(1))

    def focal_loss(self, pred, target):
        """Gaussian focal loss on heatmap logits (sigmoid applied here)."""
        # Sigmoid and clamp in float32: in half precision `1 - 1e-6` rounds
        # to 1.0 and a saturated prediction would reach log(0).
        pred = torch.clamp(torch.sigmoid(pred.float()), 1e-6, 1 - 1e-6)
        target = target.float()
        pos_mask = target.eq(1).float()
        neg_mask = target.lt(1).float()
        pos_loss = -torch.log(pred) * torch.pow(1 - pred, 2) * pos_mask
        neg_loss = -torch.log(1 - pred) * torch.pow(pred, 2) * torch.pow(1 - target, 4) * neg_mask
        return (pos_loss.sum() + neg_loss.sum()) / pos_mask.sum().clamp(min=1)

    def _ema_step(self, ema, loss):
        """Return the EMA advanced by ``loss``, or unchanged if it is not finite."""
        if not torch.isfinite(loss).all():
            return ema
        return self.ema_decay * ema + (1 - self.ema_decay) * loss

    def forward(self, pred_hm, target_hm, pred_gender, target_gender):
        """Return the EMA-normalised total and the raw task losses.

        Returns:
            Dict with the differentiable ``'total'``, ``'loss_hm'`` and
            ``'loss_gender'`` tensors, plus ``'norm_hm'`` / ``'norm_gender'``
            floats for logging.
        """
        loss_hm = self.focal_loss(pred_hm, target_hm)
        loss_gender = F.binary_cross_entropy_with_logits(
            pred_gender.float(), target_gender.float()
        )

        # Update the running averages on training passes only. The update
        # comes first, so the current loss is already part of the EMA it is
        # divided by.
        if self.training and torch.is_grad_enabled():
            with torch.no_grad():
                self.ema_hm = self._ema_step(self.ema_hm, loss_hm)
                self.ema_gender = self._ema_step(self.ema_gender, loss_gender)

        # Normalize by EMA (losses become ~1.0 scale).
        norm_hm = loss_hm / (self.ema_hm + 1e-8)
        norm_gender = loss_gender / (self.ema_gender + 1e-8)

        total = norm_hm + norm_gender

        return {
            'total': total,
            'loss_hm': loss_hm,
            'loss_gender': loss_gender,
            'norm_hm': norm_hm.item(),
            'norm_gender': norm_gender.item(),
        }

print("‚úÖ NormalizedMultiTaskLoss defined")

In [None]:
# Choose the multi-task loss used for training:
#   'uncertainty' -> UncertaintyWeightedLoss (learns one log-variance per task)
#   anything else -> NormalizedMultiTaskLoss (divides each loss by its running average)
# NOTE(review): GradNormLoss is defined above but cannot be selected here.

LOSS_TYPE = 'uncertainty'  # 'uncertainty' or 'normalized'

if LOSS_TYPE == 'uncertainty':
    criterion = UncertaintyWeightedLoss().to(device)
    print("üìä Using: Uncertainty Weighting (Kendall et al. 2018)")
else:
    criterion = NormalizedMultiTaskLoss().to(device)
    print("üìä Using: EMA-Normalized Loss")

# Training config: number of epochs and base learning rate for the model.
EPOCHS = 30
LR = 5e-4

# The uncertainty loss has learnable parameters of its own, so they get a
# separate parameter group with a 10x learning rate. The EMA-normalised loss
# has no learnable parameters and only the model is optimised.
# NOTE(review): weight_decay=1e-4 is passed at optimizer level and therefore
# also applies to the criterion's log-variance group - confirm this is intended.
if LOSS_TYPE == 'uncertainty':
    optimizer = torch.optim.AdamW([
        {'params': model.parameters(), 'lr': LR},
        {'params': criterion.parameters(), 'lr': LR * 10},  # Faster learning for weights
    ], weight_decay=1e-4)
else:
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-4)

# Cosine schedule over EPOCHS steps; the training loop calls scheduler.step()
# once per epoch. The gradient scaler is used for the mixed-precision steps in
# train_epoch().
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS, eta_min=1e-6)
scaler = torch.amp.GradScaler('cuda')

print(f"‚úÖ Training: {EPOCHS} epochs, LR={LR}")

## 4️⃣ Training Loop

In [None]:
def train_epoch(model, loader, optimizer, criterion, scaler, device):
    """Train ``model`` for one pass over ``loader`` with mixed precision.

    Args:
        model: Network returning a dict with ``'heatmaps'`` (a sequence whose
            first element is used) and ``'gender'``.
        loader: Iterable of batches with ``'image'``, ``'heatmap'`` and
            ``'gender'`` tensors.
        optimizer: Optimizer stepped once per usable batch.
        criterion: Callable returning a dict with ``'total'``, ``'loss_hm'``,
            ``'loss_gender'`` and optionally ``'weight_hm'`` /
            ``'weight_gender'``.
        scaler: Gradient scaler used for the backward pass and optimizer step.
        device: Device (``torch.device`` or string) the batches are moved to.

    Returns:
        Dict of epoch averages: ``'loss'``, ``'hm'``, ``'gender'``, ``'w_hm'``
        and ``'w_gender'`` (the weights default to 1.0 when the criterion
        reports none). Averages are taken over the batches actually used for
        an update; batches with a non-finite loss are skipped and not counted.
    """
    model.train()
    # Autocast only makes sense for CUDA here; on any other device it is
    # switched off explicitly instead of relying on a warning + fallback.
    use_amp = torch.device(device).type == 'cuda'

    total_loss, total_hm, total_gender = 0.0, 0.0, 0.0
    weight_hm_sum, weight_gender_sum = 0.0, 0.0
    steps = 0

    pbar = tqdm(loader, desc='Training')
    for batch in pbar:
        images = batch['image'].to(device)
        heatmaps = batch['heatmap'].to(device)
        genders = batch['gender'].to(device)

        optimizer.zero_grad()

        with torch.amp.autocast('cuda', enabled=use_amp):
            outputs = model(images)
            pred_hm = outputs['heatmaps'][0]
            pred_gender = outputs['gender']
            losses = criterion(pred_hm, heatmaps, pred_gender, genders)
            loss = losses['total']

        # Skip batches whose loss is NaN/inf rather than corrupting the weights.
        if not torch.isfinite(loss).all():
            continue

        scaler.scale(loss).backward()
        # Unscale before clipping so the threshold applies to true gradients.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()

        steps += 1
        total_loss += loss.item()
        total_hm += losses['loss_hm'].item()
        total_gender += losses['loss_gender'].item()

        # Track task weights when the criterion reports them.
        if 'weight_hm' in losses:
            weight_hm_sum += losses['weight_hm']
            weight_gender_sum += losses['weight_gender']

        pbar.set_postfix({
            'loss': f"{loss.item():.3f}",
            'hm': f"{losses['loss_hm'].item():.1f}",
            'gender': f"{losses['loss_gender'].item():.3f}",
        })

    # Average over the batches that contributed; avoids a bias from skipped
    # batches and a division by zero on an empty loader.
    n = max(steps, 1)
    return {
        'loss': total_loss / n,
        'hm': total_hm / n,
        'gender': total_gender / n,
        'w_hm': weight_hm_sum / n if weight_hm_sum > 0 else 1.0,
        'w_gender': weight_gender_sum / n if weight_gender_sum > 0 else 1.0,
    }

@torch.no_grad()
def validate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Args:
        model: Network returning a dict with ``'heatmaps'`` and ``'gender'``.
        loader: Iterable of batches with ``'image'``, ``'heatmap'`` and
            ``'gender'`` tensors.
        criterion: Callable returning a dict that contains ``'total'``.
        device: Device (``torch.device`` or string) the batches are moved to.

    Returns:
        Tuple ``(mean total loss per batch, gender accuracy in percent)``.
        Both are 0.0 when the loader yields no batches.
    """
    model.eval()
    # Autocast is only enabled when actually running on CUDA.
    use_amp = torch.device(device).type == 'cuda'

    total_loss = 0.0
    correct, total = 0, 0
    n_batches = 0

    for batch in tqdm(loader, desc='Validating'):
        images = batch['image'].to(device)
        heatmaps = batch['heatmap'].to(device)
        genders = batch['gender'].to(device)

        with torch.amp.autocast('cuda', enabled=use_amp):
            outputs = model(images)
            losses = criterion(outputs['heatmaps'][0], heatmaps, outputs['gender'], genders)

        n_batches += 1
        total_loss += losses['total'].item()
        # Threshold the gender probability at 0.5 and compare with the target.
        pred = (torch.sigmoid(outputs['gender']) > 0.5).float()
        correct += (pred == genders).sum().item()
        total += genders.size(0)

    # Guard both divisions so an empty loader does not raise ZeroDivisionError.
    mean_loss = total_loss / max(n_batches, 1)
    accuracy = correct / max(total, 1) * 100
    return mean_loss, accuracy

print("‚úÖ Training functions ready")

In [None]:
# 🚀 TRAIN!
# Best validation gender accuracy seen so far, and per-epoch statistics that
# the plotting cell reads afterwards.
best_acc = 0
history = {'loss': [], 'val_loss': [], 'acc': [], 'hm': [], 'gender': [], 'w_hm': [], 'w_gender': []}

print("="*70)
print("üöÄ Training MOUAADNET-ULTRA with Dynamic Loss Balancing")
print("="*70)

for epoch in range(EPOCHS):
    print(f"\nüìç Epoch {epoch+1}/{EPOCHS}")
    
    # One training pass, one validation pass, then advance the LR schedule
    # (the scheduler is stepped once per epoch).
    train_stats = train_epoch(model, train_loader, optimizer, criterion, scaler, device)
    val_loss, val_acc = validate(model, val_loader, criterion, device)
    scheduler.step()
    
    # Log
    history['loss'].append(train_stats['loss'])
    history['val_loss'].append(val_loss)
    history['acc'].append(val_acc)
    history['hm'].append(train_stats['hm'])
    history['gender'].append(train_stats['gender'])
    history['w_hm'].append(train_stats['w_hm'])
    history['w_gender'].append(train_stats['w_gender'])
    
    print(f"   üìâ Loss: {train_stats['loss']:.4f} | HM: {train_stats['hm']:.1f} | Gender: {train_stats['gender']:.4f}")
    print(f"   ‚öñÔ∏è  Weights: HM={train_stats['w_hm']:.4f}, Gender={train_stats['w_gender']:.4f}")
    print(f"   üìä Val Loss: {val_loss:.4f} | Gender Acc: {val_acc:.2f}%")
    
    # Checkpoint whenever validation gender accuracy improves. Only the model
    # weights, the epoch index and the accuracy are stored; optimizer,
    # scheduler and criterion state are not saved.
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save({'epoch': epoch, 'model_state_dict': model.state_dict(), 'best_acc': best_acc}, 'best_model.pt')
        print("   ‚≠ê Best model saved!")

print(f"\n‚úÖ Training complete! Best accuracy: {best_acc:.2f}%")

In [None]:
# Plot results with weight evolution
import matplotlib.pyplot as plt

# 2x2 grid: total loss, per-task losses, task weights, validation accuracy.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Loss curves: training vs. validation total loss per epoch.
axes[0, 0].plot(history['loss'], label='Train')
axes[0, 0].plot(history['val_loss'], label='Val')
axes[0, 0].set_xlabel('Epoch')
axes[0, 0].set_ylabel('Total Loss')
axes[0, 0].legend()
axes[0, 0].set_title('Total Loss')
axes[0, 0].grid(True)

# Individual losses: the two training losses are drawn against separate
# y-axes (twinx) because their magnitudes differ.
axes[0, 1].plot(history['hm'], label='Heatmap', color='blue')
ax2 = axes[0, 1].twinx()
ax2.plot(history['gender'], label='Gender', color='orange')
axes[0, 1].set_xlabel('Epoch')
axes[0, 1].set_ylabel('Heatmap Loss', color='blue')
ax2.set_ylabel('Gender Loss', color='orange')
axes[0, 1].set_title('Task Losses (Different Scales)')

# Learned weights: epoch averages reported by train_epoch(); these are the
# constant 1.0 when the criterion does not report task weights.
axes[1, 0].plot(history['w_hm'], label='Heatmap Weight', color='blue')
axes[1, 0].plot(history['w_gender'], label='Gender Weight', color='orange')
axes[1, 0].set_xlabel('Epoch')
axes[1, 0].set_ylabel('Weight')
axes[1, 0].legend()
axes[1, 0].set_title('Learned Task Weights (Dynamic Balancing)')
axes[1, 0].grid(True)

# Accuracy: validation gender accuracy per epoch.
axes[1, 1].plot(history['acc'], color='green', linewidth=2)
axes[1, 1].set_xlabel('Epoch')
axes[1, 1].set_ylabel('Accuracy (%)')
axes[1, 1].set_title(f'Gender Classification Accuracy (Best: {best_acc:.2f}%)')
axes[1, 1].grid(True)

# NOTE(review): the figure title is fixed to "Uncertainty Weighting" even when
# LOSS_TYPE selects the EMA-normalised loss.
plt.suptitle('MOUAADNET-ULTRA Training with Uncertainty Weighting', fontsize=14)
plt.tight_layout()
# Save before show() so the file is written regardless of the display backend.
plt.savefig('training_dynamic.png', dpi=150, bbox_inches='tight')
plt.show()

## 5️⃣ Export

In [None]:
# Reload the best checkpoint written by the training loop and restore its weights.
# NOTE(review): torch.load is called without map_location - confirm this also
# works when the export runs on a machine without the GPU used for training.
ckpt = torch.load('best_model.pt')
model.load_state_dict(ckpt['model_state_dict'])
model.eval()
# fuse_for_inference() is defined in the project package; presumably it folds
# layers for deployment - confirm in mouaadnet_ultra.model.
model.fuse_for_inference()
model.cpu()

# Trace the CPU model with a single dummy 3x416x416 image. The 416 is
# hard-coded here and matches the IMG_SIZE used for training above.
torch.onnx.export(model, torch.randn(1, 3, 416, 416), 'mouaadnet_ultra.onnx', input_names=['image'], opset_version=12)
print(f"‚úÖ Exported! Best accuracy: {ckpt['best_acc']:.2f}%")

In [None]:
# Colab-only helper: this import fails outside a Google Colab runtime.
from google.colab import files
# Download the checkpoint, the ONNX export and the training plot to the browser.
files.download('best_model.pt')
files.download('mouaadnet_ultra.onnx')
files.download('training_dynamic.png')
print("üéâ Done!")