# 🚀 MOUAADNET-ULTRA Training
## Human Detection & Gender Classification with PA-100k

**Lead Architect:** MOUAAD IDOUFKIR

[![GitHub](https://img.shields.io/badge/GitHub-MouaadNet--Ultra-blue)](https://github.com/mouuuuaad/MouaadNet-Ultra)

---

## 1️⃣ Environment Setup

In [None]:
# Print the NVIDIA driver/GPU status of this runtime (shell command via IPython `!`).
!nvidia-smi

In [None]:
# Fetch the project sources and make the repository root the working directory,
# so that the `mouaadnet_ultra` package can be imported by later cells.
!git clone https://github.com/mouuuuaad/MouaadNet-Ultra.git
%cd MouaadNet-Ultra
# Install the Python packages used by this notebook (-q: quiet output).
!pip install -q torch torchvision tqdm scipy kagglehub

## 2️⃣ Download PA-100k Dataset

In [None]:
import kagglehub

# Download the "yuulind/pa-100k" Kaggle dataset through kagglehub.
# DATA_PATH is reused by later cells as the dataset root directory
# (presumably the local download directory - confirm against kagglehub docs).
print("üì• Downloading PA-100k dataset...")
DATA_PATH = kagglehub.dataset_download("yuulind/pa-100k")
print(f"‚úÖ Dataset: {DATA_PATH}")

## 3️⃣ Dataset & DataLoader

In [None]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from scipy.io import loadmat
from tqdm import tqdm

class PA100kDataset(Dataset):
    """PA-100k images paired with a synthetic centre heatmap and a gender target.

    The dataset root is searched for a MATLAB ``.mat`` annotation file and for a
    directory holding more than 100 ``.jpg``/``.png`` files. When the annotation
    file provides ``{split}_images_name`` / ``{split}_label`` those are used;
    otherwise the sorted image listing is split 80/10/10 into train/val/test and
    no labels are available.

    Each item is a dict with:
        ``image``   -- the transformed image,
        ``heatmap`` -- a ``(1, img_size // 4, img_size // 4)`` float32 Gaussian
                       centred in the map (peak value exactly 1.0),
        ``gender``  -- a ``(1,)`` float32 tensor: ``1 - label[0]`` when labels
                       exist (column 0 is assumed to be the "Female" attribute -
                       confirm against the annotation's attribute list), else 0.5.

    Args:
        root_dir: Directory that is walked to locate annotations and images.
        split: ``'train'``, ``'val'`` or anything else (treated as test in the
            fallback split).
        img_size: Side length used by the default transform and to size the heatmap.
        transform: Optional callable applied to the PIL image; a resize +
            ToTensor + ImageNet-normalise pipeline is used when omitted.
    """

    IMAGE_EXTENSIONS = ('.jpg', '.png')

    def __init__(self, root_dir, split='train', img_size=416, transform=None):
        self.root_dir = root_dir
        self.img_size = img_size
        self.transform = transform or self._default_transform()
        self.split = split

        self.anno_path = self._find_file('.mat')
        self.img_dir = self._find_images()
        self._load_data()
        # The target heatmap is identical for every sample, so build it once.
        self._heatmap = self._make_heatmap()
        print(f"‚úÖ {split}: {len(self.images)} images")

    def _find_file(self, ext):
        """Return the first file under ``root_dir`` ending in ``ext``, or ``None``."""
        for root, dirs, files in os.walk(self.root_dir):
            dirs.sort()  # make the traversal order independent of the filesystem
            for name in sorted(files):
                if name.endswith(ext):
                    return os.path.join(root, name)
        return None

    def _find_images(self):
        """Return the first directory holding more than 100 images, else ``root_dir``."""
        for root, dirs, files in os.walk(self.root_dir):
            dirs.sort()  # deterministic choice when several directories qualify
            n_images = sum(
                1 for name in files if name.lower().endswith(self.IMAGE_EXTENSIONS)
            )
            if n_images > 100:
                return root
        return self.root_dir

    def _load_data(self):
        """Populate ``self.images`` and ``self.labels`` for the configured split."""
        if self.anno_path:
            anno = loadmat(self.anno_path)
            key = f'{self.split}_images_name'
            if key in anno:
                # Each entry is a nested MATLAB cell; the file name sits at [0][0].
                self.images = [str(x[0][0]) for x in anno[key]]
                self.labels = anno[f'{self.split}_label']
                return

        # Fallback: no usable annotations -> deterministic 80/10/10 split, no labels.
        all_imgs = sorted(
            name for name in os.listdir(self.img_dir)
            if name.lower().endswith(self.IMAGE_EXTENSIONS)
        )
        n = len(all_imgs)
        if self.split == 'train':
            self.images = all_imgs[:int(0.8 * n)]
        elif self.split == 'val':
            self.images = all_imgs[int(0.8 * n):int(0.9 * n)]
        else:
            self.images = all_imgs[int(0.9 * n):]
        self.labels = None

    def _default_transform(self):
        """Resize to ``img_size`` square, convert to tensor, normalise with ImageNet stats."""
        return transforms.Compose([
            transforms.Resize((self.img_size, self.img_size)),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ])

    def _make_heatmap(self):
        """Build the ``(1, H, W)`` centred Gaussian target with ``H = W = img_size // 4``."""
        hm_size = self.img_size // 4
        cx = cy = hm_size // 2
        sigma = hm_size // 6
        coords = np.arange(hm_size)
        xx, yy = np.meshgrid(coords, coords)
        # The 1e-6 keeps the division defined when sigma rounds down to 0.
        heatmap = np.exp(-((xx - cx) ** 2 + (yy - cy) ** 2) / (2 * sigma ** 2 + 1e-6))
        return torch.from_numpy(heatmap.astype(np.float32)).unsqueeze(0)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        """Return the sample at ``idx``, skipping forward over unreadable images.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
            RuntimeError: if no image in the split can be opened.
        """
        n = len(self)
        if not -n <= idx < n:
            # Keep sequence semantics so plain iteration over the dataset terminates.
            raise IndexError(f"index {idx} out of range for dataset of size {n}")

        image = None
        # Try idx first, then each following index once (wrapping around). This is
        # the original best-effort skip, but bounded instead of recursing forever.
        for offset in range(n):
            candidate = (idx + offset) % n
            img_path = os.path.join(self.img_dir, self.images[candidate])
            try:
                with Image.open(img_path) as img:
                    image = img.convert('RGB')
            except (OSError, ValueError):
                # Missing, truncated or undecodable file: move on to the next one.
                continue
            idx = candidate
            break
        if image is None:
            raise RuntimeError(
                f"no readable image found in split '{self.split}' under {self.img_dir}"
            )

        if self.transform:
            image = self.transform(image)

        # Clone so callers can never mutate the cached target in place.
        heatmap = self._heatmap.clone()

        if self.labels is not None:
            gender = 1.0 - float(self.labels[idx][0])  # column 0 flag inverted
        else:
            gender = 0.5  # no annotation available: neutral soft target

        return {
            'image': image,
            'heatmap': heatmap,
            'gender': torch.tensor([gender], dtype=torch.float32),
        }


def create_dataloaders(data_dir, batch_size=32, img_size=416, num_workers=2):
    """Build the training and validation DataLoaders for PA-100k.

    Training images are resized to ``img_size + 32``, randomly cropped back to
    ``img_size``, randomly flipped and colour-jittered; validation images are
    only resized. Both are converted to tensors and normalised with ImageNet
    mean/std.

    Args:
        data_dir: Dataset root passed to ``PA100kDataset``.
        batch_size: Samples per batch for both loaders.
        img_size: Final square image side length.
        num_workers: Worker processes per loader (defaults to the previous
            hard-coded value of 2).

    Returns:
        ``(train_loader, val_loader)``; only the training loader shuffles.
    """
    # Normalize holds no state, so one instance can be shared by both pipelines.
    normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

    train_tf = transforms.Compose([
        transforms.Resize((img_size + 32, img_size + 32)),
        transforms.RandomCrop(img_size),
        transforms.RandomHorizontalFlip(0.5),
        transforms.ColorJitter(0.2, 0.2, 0.2),
        transforms.ToTensor(),
        normalize,
    ])
    val_tf = transforms.Compose([
        transforms.Resize((img_size, img_size)),
        transforms.ToTensor(),
        normalize,
    ])

    train_ds = PA100kDataset(data_dir, 'train', img_size, train_tf)
    val_ds = PA100kDataset(data_dir, 'val', img_size, val_tf)

    loader_kwargs = {
        'batch_size': batch_size,
        'num_workers': num_workers,
        # Pinned host memory only helps host-to-GPU copies; skip it on CPU-only runs.
        'pin_memory': torch.cuda.is_available(),
    }
    train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
    val_loader = DataLoader(val_ds, shuffle=False, **loader_kwargs)

    return train_loader, val_loader

print("‚úÖ Dataset ready")

In [None]:
# Batch size and square input resolution shared by the train and val loaders.
BATCH_SIZE = 32
IMG_SIZE = 416

# Builds both datasets (each prints its image count) and wraps them in DataLoaders.
train_loader, val_loader = create_dataloaders(DATA_PATH, BATCH_SIZE, IMG_SIZE)
# len(loader) is the number of batches per epoch, not the number of images.
print(f"Train: {len(train_loader)} batches | Val: {len(val_loader)} batches")

## 4️⃣ Model & Stable Loss Functions

In [None]:
import sys
# Put the current directory (the cloned repository root after the earlier %cd)
# first on the import path so the project package below resolves.
sys.path.insert(0, '.')

from mouaadnet_ultra.model import MouaadNetUltra

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

# Instantiate the network with its default constructor arguments and move it to `device`.
model = MouaadNetUltra()
model = model.to(device)
# count_parameters() is provided by the project model class.
print(f"Parameters: {model.count_parameters():,}")

In [None]:
# ⚠️ STABLE LOSS FUNCTIONS (No NaN!)

class StableFocalLoss(nn.Module):
    """Numerically stable penalty-reduced focal loss for heatmaps.

    Pixels whose target equals exactly 1 are positives; all others are negatives
    whose contribution is down-weighted by ``(1 - target) ** beta``. The summed
    loss is normalised by the number of positives (at least 1).

    Args:
        alpha: Focusing exponent applied to the prediction error.
        beta: Exponent of the negative down-weighting near positives.
    """

    def __init__(self, alpha=2.0, beta=4.0):
        super().__init__()
        self.alpha = alpha
        self.beta = beta

    def forward(self, pred, target):
        """Return the scalar focal loss.

        Args:
            pred: Predicted probabilities in ``[0, 1]`` (any floating dtype).
            target: Ground-truth heatmap with the same shape as ``pred``.
        """
        # Do all the maths in float32. In float16 (e.g. under autocast) the upper
        # clamp bound 1 - 1e-6 rounds to exactly 1.0, so log(1 - pred) can become
        # -inf and inf * 0 at masked pixels turns the whole loss into NaN.
        pred = pred.float()
        target = target.float()

        # Clamp predictions to avoid log(0)
        pred = torch.clamp(pred, min=1e-6, max=1 - 1e-6)

        pos_mask = target.eq(1).float()
        neg_mask = target.lt(1).float()

        # Positive loss
        pos_loss = -torch.log(pred) * torch.pow(1 - pred, self.alpha) * pos_mask

        # Negative loss with reduced weight near positives
        neg_weight = torch.pow(1 - target, self.beta)
        neg_loss = -torch.log(1 - pred) * torch.pow(pred, self.alpha) * neg_weight * neg_mask

        # Guard against heatmaps without any positive pixel.
        num_pos = pos_mask.sum().clamp(min=1)
        return (pos_loss.sum() + neg_loss.sum()) / num_pos


class StableMultiTaskLoss(nn.Module):
    """Weighted sum of the heatmap focal loss and the gender BCE loss.

    Args:
        hm_weight: Multiplier of the heatmap loss in the total.
        gender_weight: Multiplier of the gender loss in the total.
        gender_pos_weight: ``pos_weight`` given to the gender BCE, i.e. the extra
            weight on targets equal to 1 (defaults to the previous constant 3.0).
    """

    def __init__(self, hm_weight=1.0, gender_weight=1.0, gender_pos_weight=3.0):
        super().__init__()
        self.hm_weight = hm_weight
        self.gender_weight = gender_weight
        self.gender_pos_weight = gender_pos_weight
        self.focal = StableFocalLoss()

    def forward(self, pred_hm, target_hm, pred_gender, target_gender):
        """Compute the losses from raw (pre-sigmoid) predictions.

        Args:
            pred_hm: Heatmap logits.
            target_hm: Target heatmap, same shape as ``pred_hm``.
            pred_gender: Gender logits.
            target_gender: Gender targets, same shape as ``pred_gender``.

        Returns:
            Dict with ``'total'``, ``'hm_loss'`` and ``'gender_loss'`` tensors.
        """
        # Upcast before the sigmoid: in half precision it saturates to exactly
        # 0.0 / 1.0 for moderately large logits, which feeds log(0) downstream.
        pred_hm = torch.sigmoid(pred_hm.float())  # Ensure [0, 1]
        hm_loss = self.focal(pred_hm, target_hm)

        # Gender loss (stable BCE on logits). The weight tensor is created on the
        # logits' device because this module itself is never moved to a device.
        pos_weight = torch.tensor([self.gender_pos_weight], device=pred_gender.device)
        gender_loss = F.binary_cross_entropy_with_logits(
            pred_gender.float(), target_gender.float(),
            pos_weight=pos_weight,
        )

        total = self.hm_weight * hm_loss + self.gender_weight * gender_loss

        return {
            'total': total,
            'hm_loss': hm_loss,
            'gender_loss': gender_loss,
        }

print("‚úÖ Stable loss functions ready")

In [None]:
# Training config
EPOCHS = 30
LR = 5e-4  # Lower LR for stability

# AdamW with decoupled weight decay over all model parameters.
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-4)
# Cosine decay from LR down to eta_min over T_max scheduler steps; the training
# loop steps it once per epoch, so the schedule spans the whole run.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS, eta_min=1e-6)
# Heatmap and gender losses are weighted equally.
criterion = StableMultiTaskLoss(hm_weight=1.0, gender_weight=1.0)
# Gradient scaler for mixed-precision training, created for the 'cuda' device type.
scaler = torch.amp.GradScaler('cuda')

print(f"‚úÖ Training config: {EPOCHS} epochs, LR={LR}")

In [None]:
def train_epoch(model, loader, optimizer, criterion, scaler, device):
    """Run one mixed-precision training epoch.

    Batches whose total loss is NaN or infinite are skipped without an optimizer
    step. Gradients are unscaled and clipped to a global norm of 1.0 before each
    step.

    Args:
        model: Network returning a dict with ``'heatmaps'`` (a sequence whose first
            element is used) and ``'gender'``.
        loader: Iterable of batches with ``'image'``, ``'heatmap'``, ``'gender'``.
        optimizer: Optimizer over the model parameters.
        criterion: Callable returning a dict with ``'total'``, ``'hm_loss'`` and
            ``'gender_loss'``.
        scaler: Gradient scaler used for loss scaling and stepping.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_total, mean_heatmap, mean_gender)`` averaged over the batches that
        were actually used for an update (all 0.0 if there were none).
    """
    model.train()
    total_loss = 0.0
    total_hm = 0.0
    total_gender = 0.0
    used_batches = 0

    pbar = tqdm(loader, desc='Training')
    for batch in pbar:
        # non_blocking only has an effect for pinned host memory; harmless otherwise.
        images = batch['image'].to(device, non_blocking=True)
        heatmaps = batch['heatmap'].to(device, non_blocking=True)
        genders = batch['gender'].to(device, non_blocking=True)

        optimizer.zero_grad()

        with torch.amp.autocast('cuda'):
            outputs = model(images)

            # Use first scale heatmap
            pred_hm = outputs['heatmaps'][0]
            pred_gender = outputs['gender']

            losses = criterion(pred_hm, heatmaps, pred_gender, genders)
            loss = losses['total']

        # Check for NaN / inf. Nothing was back-propagated yet, so the gradients
        # are still cleared and the batch can simply be dropped.
        if not torch.isfinite(loss):
            print("‚ö†Ô∏è NaN detected, skipping batch")
            continue

        scaler.scale(loss).backward()

        # Gradient clipping (on unscaled gradients, hence unscale_ first)
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        scaler.step(optimizer)
        scaler.update()

        # Read each scalar once; every .item() forces a device synchronisation.
        loss_value = loss.item()
        hm_value = losses['hm_loss'].item()
        gender_value = losses['gender_loss'].item()

        total_loss += loss_value
        total_hm += hm_value
        total_gender += gender_value
        used_batches += 1

        pbar.set_postfix({
            'loss': f"{loss_value:.4f}",
            'hm': f"{hm_value:.4f}",
            'gender': f"{gender_value:.4f}"
        })

    # Average over the batches that contributed; dividing by len(loader) would
    # under-report the loss whenever batches were skipped, and fail on an empty loader.
    n = max(used_batches, 1)
    return total_loss / n, total_hm / n, total_gender / n


@torch.no_grad()
def validate(model, loader, criterion, device):
    """Evaluate the model on ``loader`` without tracking gradients.

    Returns:
        ``(mean_loss, accuracy_percent)``: the total loss averaged over the number
        of batches, and the percentage of gender predictions (sigmoid > 0.5) that
        equal the targets.
    """
    model.eval()
    loss_sum = 0
    n_correct = 0
    n_seen = 0

    for sample in tqdm(loader, desc='Validating'):
        # Move the three batch tensors to the target device in one go.
        imgs, target_hm, target_gender = (
            sample[key].to(device) for key in ('image', 'heatmap', 'gender')
        )

        with torch.amp.autocast('cuda'):
            out = model(imgs)
            # Only the first heatmap scale is scored.
            loss_dict = criterion(out['heatmaps'][0], target_hm, out['gender'], target_gender)

        loss_sum += loss_dict['total'].item()

        # Threshold the gender probability at 0.5 and count exact matches.
        predicted = (torch.sigmoid(out['gender']) > 0.5).float()
        n_correct += (predicted == target_gender).sum().item()
        n_seen += target_gender.size(0)

    mean_loss = loss_sum / len(loader)
    accuracy_pct = n_correct / n_seen * 100
    return mean_loss, accuracy_pct

print("‚úÖ Training functions ready")

In [None]:
# Main training loop: train, validate, step the LR schedule and checkpoint the
# best model (by validation gender accuracy) once per epoch.
best_acc = 0
# Per-epoch curves consumed by the plotting cell.
history = {'loss': [], 'val_loss': [], 'acc': []}

print("="*60)
print("üöÄ Training MOUAADNET-ULTRA")
print("="*60)

for epoch in range(EPOCHS):
    print(f"\nüìç Epoch {epoch+1}/{EPOCHS}")
    
    train_loss, hm_loss, gender_loss = train_epoch(model, train_loader, optimizer, criterion, scaler, device)
    val_loss, val_acc = validate(model, val_loader, criterion, device)
    # The scheduler is stepped once per epoch, after the optimizer updates.
    scheduler.step()
    
    history['loss'].append(train_loss)
    history['val_loss'].append(val_loss)
    history['acc'].append(val_acc)
    
    print(f"   Loss: {train_loss:.4f} (HM: {hm_loss:.4f}, Gender: {gender_loss:.4f})")
    print(f"   Val Loss: {val_loss:.4f} | Gender Acc: {val_acc:.2f}%")
    
    # Checkpoint only on a strict improvement. NOTE(review): because best_acc
    # starts at 0, no 'best_model.pt' is written if accuracy never exceeds 0,
    # and the export cell would then fail to find the file.
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save({
            'epoch': epoch,  # zero-based epoch index
            'model_state_dict': model.state_dict(),
            'best_acc': best_acc,
        }, 'best_model.pt')
        print("   ‚≠ê Best model saved!")

print(f"\n‚úÖ Training complete! Best accuracy: {best_acc:.2f}%")

In [None]:
# Plot results: loss curves on the left, validation gender accuracy on the right.
import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# Train vs. validation total loss, one point per epoch.
ax1.plot(history['loss'], label='Train')
ax1.plot(history['val_loss'], label='Val')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.legend()
ax1.set_title('Loss')
ax1.grid(True)

# Validation gender accuracy in percent, one point per epoch.
ax2.plot(history['acc'], color='green')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Accuracy (%)')
ax2.set_title('Gender Accuracy')
ax2.grid(True)

plt.tight_layout()
# Save before show() so the file written to disk contains the rendered figure.
plt.savefig('training.png', dpi=150)
plt.show()

## 5️⃣ Export

In [None]:
# Load the best checkpoint and export the model to ONNX.
# map_location='cpu' loads the tensors straight onto the CPU: the model is moved
# there for export anyway, and the cell then also works without a GPU.
ckpt = torch.load('best_model.pt', map_location='cpu')
model.load_state_dict(ckpt['model_state_dict'])
model.eval()
# Project-specific inference preparation (see MouaadNetUltra.fuse_for_inference).
model.fuse_for_inference()
model.cpu()

# Trace with the same resolution the model was trained at (IMG_SIZE from the
# DataLoader cell) rather than a separate hard-coded 416.
dummy_input = torch.randn(1, 3, IMG_SIZE, IMG_SIZE)

torch.onnx.export(
    model, dummy_input,
    'mouaadnet_ultra.onnx',
    input_names=['image'],
    opset_version=12
)

print(f"‚úÖ Exported: mouaadnet_ultra.onnx")
print(f"   Best accuracy: {ckpt['best_acc']:.2f}%")

In [None]:
# Google Colab only: offer the checkpoint, the ONNX export and the training plot
# as browser downloads. The import fails outside a Colab runtime.
from google.colab import files
files.download('best_model.pt')
files.download('mouaadnet_ultra.onnx')
files.download('training.png')
print("üéâ Done!")