# 🚀 MOUAADNET-ULTRA Training
## Human Detection & Gender Classification with PA-100k

**Lead Architect:** MOUAAD IDOUFKIR

[![GitHub](https://img.shields.io/badge/GitHub-MouaadNet--Ultra-blue)](https://github.com/mouuuuaad/MouaadNet-Ultra)

---

### 📋 Steps:
1. ✅ Set up environment & GPU
2. ✅ Download PA-100k from Kaggle (automatic)
3. ✅ Prepare data loaders
4. ✅ Train model with mixed precision
5. ✅ Export to ONNX

## 1️⃣ Environment Setup

In [None]:
# Check GPU: run `nvidia-smi` to show which GPU (if any) the runtime exposes.
# The leading `!` is IPython shell-escape syntax, so this line only works
# inside a notebook/IPython session, not as plain Python.
!nvidia-smi

In [None]:
# Clone MOUAADNET-ULTRA repository and make the checkout the working
# directory; the model cell later imports `mouaadnet_ultra` relative to '.'.
# NOTE(review): re-running this cell while already inside the checkout clones
# a nested copy and `%cd`s into it -- restart the runtime before re-running.
!git clone https://github.com/mouuuuaad/MouaadNet-Ultra.git
%cd MouaadNet-Ultra

# Install dependencies
# NOTE(review): IPython documents `%pip install` as the way to install into
# the environment of the running kernel; consider it instead of `!pip`.
!pip install -q torch torchvision tqdm scipy kagglehub

## 2️⃣ Download PA-100k Dataset from Kaggle

In [None]:
import kagglehub

# Fetch PA-100k through kagglehub; DATA_PATH is the local directory that
# kagglehub reports for the downloaded copy and is reused by later cells.
KAGGLE_DATASET = "yuulind/pa-100k"

print("üì• Downloading PA-100k dataset from Kaggle...")
DATA_PATH = kagglehub.dataset_download(KAGGLE_DATASET)

print("\n‚úÖ Dataset downloaded to: {}".format(DATA_PATH))

In [None]:
# Print the downloaded dataset as an indented directory tree, listing at most
# MAX_LISTED files per directory so the 100k-image folder stays readable.
import os

MAX_LISTED = 5

print("üìÅ Dataset contents:")
for root, _dirs, files in os.walk(DATA_PATH):
    # Depth below DATA_PATH = number of path separators left after stripping it.
    depth = root.replace(DATA_PATH, '').count(os.sep)
    print('{}{}/'.format(' ' * 2 * depth, os.path.basename(root)))

    file_pad = ' ' * 2 * (depth + 1)
    for name in files[:MAX_LISTED]:
        print(f'{file_pad}{name}')

    hidden = len(files) - MAX_LISTED
    if hidden > 0:
        print(f'{file_pad}... and {hidden} more files')

## 3️⃣ Dataset & DataLoader

In [None]:
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from scipy.io import loadmat
from tqdm import tqdm

class PA100kDataset(Dataset):
    """
    PA-100k dataset for human detection and gender classification.

    PA-100k contains cropped pedestrian images with 26 attributes.
    Attribute indices:
      - 0: Female
      - 1: AgeOver60
      - 2: Age18-60
      - 3: AgeLess18
      ... (more attributes)

    Only attribute 0 is consumed here. The detection targets are synthetic:
    every sample gets the same centred Gaussian heatmap, a fixed size of
    (0.8, 0.9) and a zero offset, since the images are already person crops.

    Args:
        root_dir: Directory searched recursively for a ``.mat`` annotation
            file and for the image folder.
        split: ``'train'`` or ``'val'``; any other value selects the test
            split.
        img_size: Side length used by the default transform; the heatmap is
            ``img_size // 4`` pixels square.
        transform: Callable applied to each PIL image. Defaults to
            resize + ``ToTensor`` + ImageNet normalisation.
    """

    # File extensions (lower-case) that count as images when scanning folders.
    _IMAGE_EXTENSIONS = ('.jpg', '.png', '.jpeg')

    def __init__(self, root_dir, split='train', img_size=416, transform=None):
        self.root_dir = root_dir
        self.img_size = img_size
        self.transform = transform or self._default_transform()
        self.split = split

        # Locate the annotation file (may be absent) and the image folder.
        self.anno_path = self._find_annotation()
        self.img_dir = self._find_images()

        # Prefer the official split from the annotation file; otherwise fall
        # back to an unlabeled split derived from the directory listing.
        if self.anno_path:
            self._load_annotations()
        else:
            self._load_from_directory()

        print(f"‚úÖ Loaded {len(self.images)} images for {split} split")

    def _find_annotation(self):
        """Return the path of the first ``.mat`` file under root_dir, or None."""
        for root, dirs, files in os.walk(self.root_dir):
            for f in files:
                if f.endswith('.mat'):
                    return os.path.join(root, f)
        return None

    def _find_images(self):
        """Guess the image directory below root_dir.

        Walking top-down, returns the first directory that directly holds
        more than 100 image files, or the first sub-directory whose name
        contains 'image' or 'data'. Falls back to root_dir itself.
        """
        for root, dirs, files in os.walk(self.root_dir):
            # A directory that directly holds many images is taken as-is.
            img_files = [f for f in files if f.lower().endswith(self._IMAGE_EXTENSIONS)]
            if len(img_files) > 100:  # PA-100k has 100k images
                return root
            # Otherwise accept a conventionally named sub-directory.
            # NOTE(review): this match is by name only; the sub-directory is
            # not checked for actually containing images.
            for d in dirs:
                if 'image' in d.lower() or 'data' in d.lower():
                    return os.path.join(root, d)
        return self.root_dir

    def _load_annotations(self):
        """Load image names and labels for the split from the ``.mat`` file.

        Expects keys ``<split>_images_name`` and ``<split>_label``. If the
        image-name key is missing, falls back to the directory listing.
        """
        anno = loadmat(self.anno_path)

        # Anything other than 'train'/'val' is treated as the test split.
        prefix = self.split if self.split in ('train', 'val') else 'test'
        key_images = f'{prefix}_images_name'
        key_labels = f'{prefix}_label'

        if key_images in anno:
            # loadmat wraps each file name in nested object arrays.
            self.images = [str(x[0][0]) for x in anno[key_images]]
            self.labels = anno[key_labels]
        else:
            # Alternative format without the expected keys.
            self._load_from_directory()

    def _load_from_directory(self):
        """Fallback: split the sorted image listing 80/10/10, without labels."""
        all_images = sorted(
            f for f in os.listdir(self.img_dir)
            if f.lower().endswith(self._IMAGE_EXTENSIONS)
        )
        n = len(all_images)
        train_end, val_end = int(0.8 * n), int(0.9 * n)

        if self.split == 'train':
            self.images = all_images[:train_end]
        elif self.split == 'val':
            self.images = all_images[train_end:val_end]
        else:
            self.images = all_images[val_end:]

        # No attribute labels are available in this mode.
        self.labels = None

    def _default_transform(self):
        """Return resize + ToTensor + ImageNet normalisation."""
        return transforms.Compose([
            transforms.Resize((self.img_size, self.img_size)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def __len__(self):
        return len(self.images)

    def _generate_heatmap(self, h, w):
        """Return an (h, w) float32 Gaussian heatmap peaking at the centre.

        Sigma is ``min(h, w) // 6``, clamped to at least 1 so that maps
        smaller than 6 pixels do not divide by zero and produce NaNs.
        """
        cx, cy = w // 2, h // 2
        sigma = max(1, min(h, w) // 6)
        x = np.arange(w)
        y = np.arange(h)
        xx, yy = np.meshgrid(x, y)
        heatmap = np.exp(-((xx - cx)**2 + (yy - cy)**2) / (2 * sigma**2))
        return heatmap.astype(np.float32)

    def __getitem__(self, idx):
        """Return the sample dict for ``idx``.

        If the image cannot be opened, the following indices are tried in
        turn (wrapping around), and the label of the image actually loaded
        is used.

        Raises:
            IndexError: ``idx`` is out of range.
            RuntimeError: No image in the dataset could be opened.
        """
        # Plain indexing first, so out-of-range indices still raise IndexError.
        self.images[idx]
        n = len(self)

        image = None
        for attempt in range(n):
            candidate = (idx + attempt) % n
            img_path = os.path.join(self.img_dir, self.images[candidate])
            try:
                with Image.open(img_path) as img:
                    image = img.convert('RGB')
            except (OSError, ValueError):
                # Missing or undecodable file: try the next sample.
                continue
            idx = candidate
            break
        else:
            raise RuntimeError(
                f"None of the {n} images under {self.img_dir!r} could be opened"
            )

        if self.transform:
            image = self.transform(image)

        # Detection targets are synthetic: the person is assumed centred.
        hm_size = self.img_size // 4
        heatmap = self._generate_heatmap(hm_size, hm_size)
        heatmap = torch.from_numpy(heatmap).unsqueeze(0)

        # Size (person fills ~80% of frame)
        size = torch.tensor([0.8, 0.9])

        # Offset
        offset = torch.tensor([0.0, 0.0])

        # Gender target: attribute 0 is "Female", so Female=1 maps to 0.0 and
        # Female=0 (male) maps to 1.0. Without labels the target is 0.5.
        if self.labels is not None:
            label = self.labels[idx]
            gender = 1.0 - float(label[0])
        else:
            gender = 0.5

        gender = torch.tensor([gender], dtype=torch.float32)

        return {
            'image': image,
            'heatmap': heatmap,
            'size': size,
            'offset': offset,
            'gender': gender,
        }


def create_dataloaders(data_dir, batch_size=32, img_size=416, num_workers=2):
    """Build the PA-100k training and validation DataLoaders.

    The training pipeline resizes to ``img_size + 32``, random-crops back to
    ``img_size``, then applies a horizontal flip and colour jitter. The
    validation pipeline only resizes. Both end with ToTensor and ImageNet
    normalisation.

    Returns:
        Tuple ``(train_loader, val_loader)``; only the training loader
        shuffles.
    """

    def tensor_tail():
        # Fresh ToTensor/Normalize instances for each pipeline.
        return [
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ]

    padded = img_size + 32
    augmentation = [
        transforms.Resize((padded, padded)),
        transforms.RandomCrop(img_size),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3),
    ]

    # Insertion order matters: the train dataset is built before val.
    pipelines = {
        'train': transforms.Compose(augmentation + tensor_tail()),
        'val': transforms.Compose([transforms.Resize((img_size, img_size))] + tensor_tail()),
    }
    datasets = {
        split: PA100kDataset(data_dir, split=split, img_size=img_size, transform=pipeline)
        for split, pipeline in pipelines.items()
    }

    shared = dict(batch_size=batch_size, num_workers=num_workers, pin_memory=True)
    train_loader = DataLoader(datasets['train'], shuffle=True, **shared)
    val_loader = DataLoader(datasets['val'], shuffle=False, **shared)

    return train_loader, val_loader

print("‚úÖ Dataset classes defined")

In [None]:
# Build the train/val loaders on top of the dataset downloaded earlier and
# print a short summary of the resulting configuration.
BATCH_SIZE = 32
IMG_SIZE = 416

train_loader, val_loader = create_dataloaders(
    DATA_PATH, batch_size=BATCH_SIZE, img_size=IMG_SIZE, num_workers=2,
)

summary_lines = [
    f"\nüìä Data Summary:",
    f"   Train batches: {len(train_loader)}",
    f"   Val batches: {len(val_loader)}",
    f"   Batch size: {BATCH_SIZE}",
    f"   Image size: {IMG_SIZE}x{IMG_SIZE}",
]
for line in summary_lines:
    print(line)

In [None]:
# Show four training samples: de-normalised image on top, heatmap target below.
import matplotlib.pyplot as plt

# ImageNet statistics used by the Normalize transform; inverted here for display.
channel_std = [0.229, 0.224, 0.225]
channel_mean = [0.485, 0.456, 0.406]

batch = next(iter(train_loader))
fig, axes = plt.subplots(2, 4, figsize=(16, 8))

for col, (image_ax, heatmap_ax) in enumerate(zip(axes[0], axes[1])):
    # CHW tensor -> HWC array, then undo the normalisation and clamp to [0, 1].
    pixels = batch['image'][col].permute(1, 2, 0).numpy()
    pixels = np.clip(pixels * channel_std + channel_mean, 0, 1)

    image_ax.imshow(pixels)
    if batch['gender'][col] > 0.5:
        gender = 'Male'
    else:
        gender = 'Female'
    image_ax.set_title(f'Gender: {gender}')
    image_ax.axis('off')

    heatmap_ax.imshow(batch['heatmap'][col, 0].numpy(), cmap='hot')
    heatmap_ax.set_title('Detection Heatmap')
    heatmap_ax.axis('off')

plt.suptitle('PA-100k Training Samples', fontsize=14)
plt.tight_layout()
plt.show()

## 4️⃣ Model & Training

In [None]:
# Make the cloned repository importable, then build the model on the best
# available device and report its size.
import sys
sys.path.insert(0, '.')

from mouaadnet_ultra.model import MouaadNetUltra
from mouaadnet_ultra.losses import MultiTaskLoss

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"üñ•Ô∏è Device: {device}")

model = MouaadNetUltra().to(device)

print(f"\nüìä Model Info:")
param_count = model.count_parameters()
print(f"   Parameters: {param_count:,}")
fp32_mb = model.get_model_size_mb()
print(f"   FP32 Size: {fp32_mb:.2f} MB")
int8_mb = model.get_model_size_mb('int8')
print(f"   INT8 Size: {int8_mb:.2f} MB")

In [None]:
# Training configuration
EPOCHS = 30
LR = 1e-3
WEIGHT_DECAY = 1e-4

optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

# OneCycleLR takes over the learning rate: it is stepped once per batch, warms
# up to max_lr over the first 30% of steps and anneals afterwards. On
# construction it overwrites the optimizer's lr with its own starting value
# (max_lr / div_factor), so the `lr=LR` above is not the rate used at step 0.
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=LR * 10, epochs=EPOCHS,
    steps_per_epoch=len(train_loader), pct_start=0.3
)

criterion = MultiTaskLoss(det_weight=1.0, gender_weight=1.0, gender_pos_weight=3.0)

# Loss scaling only applies to CUDA mixed precision; on the CPU fallback the
# scaler is created disabled, which makes scale/unscale_/update pass-throughs.
scaler = torch.cuda.amp.GradScaler(enabled=device.type == 'cuda')

print("‚úÖ Training setup complete")
print(f"   Epochs: {EPOCHS}")
# Report the rate the optimizer really starts at, not the overridden LR.
print(f"   Learning rate: {optimizer.param_groups[0]['lr']:g} ‚Üí {LR * 10}")
print(f"   Optimizer: AdamW")

In [None]:
def train_epoch(model, loader, optimizer, scheduler, criterion, scaler, device):
    """Train for one epoch and return the mean per-batch total loss.

    Args:
        model: Network returning a dict with 'heatmaps', 'sizes', 'offsets'
            (sequences; only element 0 is used) and 'gender'.
        loader: Iterable of batch dicts with keys 'image', 'heatmap', 'size',
            'offset' and 'gender'.
        optimizer: Optimizer stepped through ``scaler``.
        scheduler: LR scheduler stepped once per batch.
        criterion: Callable ``(predictions, targets) -> dict`` with a
            'total' entry.
        scaler: Gradient scaler for mixed-precision training.
        device: Device the batches are moved to.

    Returns:
        Mean of the per-batch total loss, or 0.0 if the loader is empty.
    """
    model.train()
    # Mixed precision is only meaningful on CUDA; keep CPU runs in FP32.
    use_amp = torch.device(device).type == 'cuda'
    total_loss = 0.0
    num_batches = 0

    pbar = tqdm(loader, desc='Training')
    for batch in pbar:
        images = batch['image'].to(device)
        B = images.shape[0]
        # Prediction maps are at 1/4 of the input resolution.
        hm_h, hm_w = images.shape[2] // 4, images.shape[3] // 4

        # Per-sample size/offset are broadcast to dense (B, 2, H/4, W/4) maps.
        targets = {
            'heatmaps': [batch['heatmap'].to(device)],
            'sizes': [batch['size'].view(B, 2, 1, 1).expand(B, 2, hm_h, hm_w).to(device)],
            'offsets': [batch['offset'].view(B, 2, 1, 1).expand(B, 2, hm_h, hm_w).to(device)],
            'gender_labels': batch['gender'].to(device),
        }

        optimizer.zero_grad()

        with torch.autocast(device_type='cuda', enabled=use_amp):
            outputs = model(images)
            # Only the first prediction scale is supervised.
            predictions = {
                'heatmaps': [outputs['heatmaps'][0]],
                'sizes': [outputs['sizes'][0]],
                'offsets': [outputs['offsets'][0]],
                'gender': outputs['gender'],
            }
            losses = criterion(predictions, targets)
            loss = losses['total']

        scaler.scale(loss).backward()
        # Unscale before clipping so the norm threshold applies to true gradients.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        # Read the loss back once; each .item() forces a device sync.
        batch_loss = loss.item()
        total_loss += batch_loss
        num_batches += 1
        pbar.set_postfix({'loss': f'{batch_loss:.4f}', 'lr': f'{scheduler.get_last_lr()[0]:.6f}'})

    return total_loss / max(num_batches, 1)


@torch.no_grad()
def validate(model, loader, criterion, device):
    """Evaluate on ``loader`` and return ``(mean_loss, gender_accuracy_percent)``.

    Args:
        model: Network returning a dict with 'heatmaps', 'sizes', 'offsets'
            (sequences; only element 0 is used) and 'gender' logits.
        loader: Iterable of batch dicts with keys 'image', 'heatmap', 'size',
            'offset' and 'gender'.
        criterion: Callable ``(predictions, targets) -> dict`` with a
            'total' entry.
        device: Device the batches are moved to.

    Returns:
        Tuple of the mean per-batch total loss and the gender accuracy in
        percent. Both are 0.0 for an empty loader.
    """
    model.eval()
    # Mixed precision is only meaningful on CUDA; keep CPU runs in FP32.
    use_amp = torch.device(device).type == 'cuda'
    total_loss = 0.0
    num_batches = 0
    correct = 0
    total = 0

    for batch in tqdm(loader, desc='Validating'):
        images = batch['image'].to(device)
        gender_labels = batch['gender'].to(device)
        B = images.shape[0]
        # Prediction maps are at 1/4 of the input resolution.
        hm_h, hm_w = images.shape[2] // 4, images.shape[3] // 4

        targets = {
            'heatmaps': [batch['heatmap'].to(device)],
            'sizes': [batch['size'].view(B, 2, 1, 1).expand(B, 2, hm_h, hm_w).to(device)],
            'offsets': [batch['offset'].view(B, 2, 1, 1).expand(B, 2, hm_h, hm_w).to(device)],
            'gender_labels': gender_labels,
        }

        with torch.autocast(device_type='cuda', enabled=use_amp):
            outputs = model(images)
            # Only the first prediction scale is evaluated.
            predictions = {
                'heatmaps': [outputs['heatmaps'][0]],
                'sizes': [outputs['sizes'][0]],
                'offsets': [outputs['offsets'][0]],
                'gender': outputs['gender'],
            }
            losses = criterion(predictions, targets)

        total_loss += losses['total'].item()
        num_batches += 1

        # Flatten both sides so a (B,) logit tensor cannot broadcast against
        # (B, 1) labels into a (B, B) comparison and inflate the count.
        # NOTE(review): assumes one gender logit per sample -- confirm.
        gender_pred = (torch.sigmoid(outputs['gender']) > 0.5).float()
        correct += (gender_pred.reshape(-1) == gender_labels.reshape(-1)).sum().item()
        total += gender_labels.size(0)

    mean_loss = total_loss / max(num_batches, 1)
    accuracy = correct / total * 100 if total else 0.0
    return mean_loss, accuracy

print("‚úÖ Training functions ready")

In [None]:
# Main training loop: train and validate every epoch, record the metrics and
# checkpoint whenever the validation loss improves.
best_loss = float('inf')
history = {'train_loss': [], 'val_loss': [], 'val_acc': []}
banner = "=" * 60

print(banner)
print("üöÄ Starting Training")
print(banner)

for epoch in range(EPOCHS):
    print(f"\nüìç Epoch {epoch+1}/{EPOCHS}")

    train_loss = train_epoch(model, train_loader, optimizer, scheduler, criterion, scaler, device)
    val_loss, val_acc = validate(model, val_loader, criterion, device)

    for key, value in (('train_loss', train_loss), ('val_loss', val_loss), ('val_acc', val_acc)):
        history[key].append(value)

    print(f"   Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Gender Acc: {val_acc:.2f}%")

    # Nothing to save unless this epoch beats the best validation loss so far.
    if not val_loss < best_loss:
        continue

    best_loss = val_loss
    snapshot = {
        'epoch': epoch,  # zero-based
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'best_loss': best_loss,
        'val_acc': val_acc,
    }
    torch.save(snapshot, 'best_model.pt')
    print("   ‚≠ê New best model saved!")

print("\n" + banner)
print("‚úÖ Training Complete!")
print(f"   Best Val Loss: {best_loss:.4f}")
print(f"   Final Gender Accuracy: {history['val_acc'][-1]:.2f}%")
print(banner)

In [None]:
# Plot the loss curves (left) and the validation gender accuracy (right),
# then save the figure to training_curves.png.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(14, 5))

for history_key, curve_label in (('train_loss', 'Train'), ('val_loss', 'Validation')):
    loss_ax.plot(history[history_key], label=curve_label, linewidth=2)
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training & Validation Loss')
loss_ax.legend()
loss_ax.grid(True, alpha=0.3)

acc_ax.plot(history['val_acc'], color='green', linewidth=2)
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy (%)')
acc_ax.set_title('Gender Classification Accuracy')
acc_ax.grid(True, alpha=0.3)

plt.suptitle('MOUAADNET-ULTRA Training on PA-100k', fontsize=14)
plt.tight_layout()
plt.savefig('training_curves.png', dpi=150, bbox_inches='tight')
plt.show()

## 5️⃣ Export Model

In [None]:
# Load best model
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU still loads when the runtime has no GPU.
checkpoint = torch.load('best_model.pt', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

# The checkpoint stores a zero-based epoch index, hence the +1 for display.
print(f"‚úÖ Loaded best model from epoch {checkpoint['epoch'] + 1}")
print(f"   Val Loss: {checkpoint['best_loss']:.4f}")
print(f"   Gender Accuracy: {checkpoint['val_acc']:.2f}%")

In [None]:
# Fuse for faster inference
# NOTE(review): `fuse_for_inference` comes from the project's model code and
# is not shown in this notebook; presumably it modifies the model in place --
# confirm. It runs on the reloaded best checkpoint, before the ONNX export.
model.fuse_for_inference()
print("‚úÖ Model fused for inference")

In [None]:
# Export to ONNX
import os

ONNX_PATH = 'mouaadnet_ultra_pa100k.onnx'

model.cpu()
# Trace at the training resolution instead of a hard-coded 416, so the export
# stays consistent if IMG_SIZE is changed in the data-loading cell.
dummy_input = torch.randn(1, 3, IMG_SIZE, IMG_SIZE)

# Only the batch dimension of the input is declared dynamic.
# NOTE(review): the model returns a dict whose detection entries are
# sequences; confirm the four output_names line up with the flattened outputs.
torch.onnx.export(
    model, dummy_input,
    ONNX_PATH,
    input_names=['image'],
    output_names=['heatmaps', 'sizes', 'offsets', 'gender'],
    dynamic_axes={'image': {0: 'batch'}},
    opset_version=12,
)

onnx_size = os.path.getsize(ONNX_PATH) / (1024 * 1024)
print(f"‚úÖ Exported to ONNX")
print(f"   File: mouaadnet_ultra_pa100k.onnx")
print(f"   Size: {onnx_size:.2f} MB")

In [None]:
# Send the training artifacts to the browser (Google Colab only).
from google.colab import files

print("üì• Downloading trained files...")
for artifact in ('best_model.pt', 'mouaadnet_ultra_pa100k.onnx', 'training_curves.png'):
    files.download(artifact)

## 🎉 Done!

Your MOUAADNET-ULTRA model is now trained! 

### Downloaded Files:
- `best_model.pt` - PyTorch checkpoint
- `mouaadnet_ultra_pa100k.onnx` - ONNX model for deployment
- `training_curves.png` - Training visualization

### Next Steps:
1. Copy `best_model.pt` to your local project
2. Run webcam demo with trained weights:
   ```bash
   python examples/webcam_demo.py --weights best_model.pt
   ```