# H-MOLQD Kaggle Training

Production-ready training notebook with robust checkpoint management.

**Features:**
- ✅ Automatic checkpoint saving every N epochs
- ✅ Auto-resume from latest checkpoint
- ✅ Manual resume from specific checkpoint
- ✅ Save optimizer + scheduler state
- ✅ Save training metrics history
- ✅ Handle interruptions gracefully
- ✅ Export final models & results

## 1. Installation & Setup

In [None]:
# Install dependencies
# The leading "!" runs the line as a shell command from the notebook cell.
# NOTE(review): presumably the Kaggle image already ships torch/torchvision/numpy;
# confirm that re-installing them here does not replace the preinstalled GPU build.
!pip install -q networkx torch torchvision numpy matplotlib tqdm

# Optional: Uncomment to download dataset
# (the dataset slug below is a placeholder - replace "yourusername" with the real owner)
# !kaggle datasets download -d yourusername/zelda-vglc-data
# !unzip -q zelda-vglc-data.zip -d /kaggle/input/

print("âœ… Installation complete")

In [None]:
import sys
import os
from pathlib import Path
import json
import torch
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from tqdm import tqdm

# Device selection: use the first CUDA GPU when PyTorch can see one, else the CPU
has_cuda = torch.cuda.is_available()
device = torch.device('cuda') if has_cuda else torch.device('cpu')
print(f"Using device: {device}")
if has_cuda:
    # total_memory is reported in bytes; divide by 1e9 to print (decimal) gigabytes
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {gpu_props.total_memory / 1e9:.2f} GB")

## 2. Configuration

In [None]:
# Single source of truth for the run; every value is JSON-serialisable so the
# dict can be dumped next to the outputs and embedded in checkpoints.
CONFIG = {
    # --- Training parameters ---
    "batch_size": 32,
    "num_epochs": 100,
    "learning_rate": 1e-4,
    "weight_decay": 1e-5,
    "checkpoint_every": 5,  # write a checkpoint every N epochs

    # --- Model parameters ---
    "latent_dim": 64,
    "num_embeddings": 512,
    "num_timesteps": 1000,
    "embedding_dim": 256,

    # --- Paths ---
    "data_dir": "/kaggle/input/zelda-vglc-data",
    "checkpoint_dir": "/kaggle/working/checkpoints",
    "output_dir": "/kaggle/working/outputs",

    # --- Resume ---
    "resume_from": None,  # path of a checkpoint to resume from manually
    "auto_resume": True,  # otherwise pick up the latest checkpoint if one exists

    # --- Mixed precision training (only meaningful on a GPU) ---
    "use_amp": torch.cuda.is_available(),

    # --- Logging ---
    "log_every": 10,  # log every N batches
    "val_every": 1,   # validate every N epochs
}

# Make sure both working directories exist before anything writes to them
for dir_key in ("checkpoint_dir", "output_dir"):
    os.makedirs(CONFIG[dir_key], exist_ok=True)

# Persist the configuration alongside the outputs and echo it to the log
config_json = json.dumps(CONFIG, indent=2)
(Path(CONFIG["output_dir"]) / "config.json").write_text(config_json)

print("Configuration:")
print(config_json)

## 3. Checkpoint Management System

**Robust checkpoint manager with:**
- Automatic saving every N epochs
- Resume from latest or specific checkpoint
- Save full training state (model, optimizer, scheduler, metrics)
- Handle interruptions gracefully

In [None]:
class CheckpointManager:
    """
    Manages checkpoint saving and loading for Kaggle training.

    Files written inside ``checkpoint_dir``:
    - ``checkpoint_epoch_<N>.pt``: one file per saved epoch
    - ``checkpoint_latest.pt``: copy of the most recently saved checkpoint
    - ``checkpoint_best.pt``: copy of the last checkpoint saved with ``is_best=True``

    Every file is written to a temporary sibling first and then renamed, so an
    interruption in the middle of a save never leaves a truncated checkpoint
    under one of the names above.
    """

    def __init__(self, checkpoint_dir, save_every=5):
        """
        Args:
            checkpoint_dir: Directory for checkpoint files (created if missing)
            save_every: Save a regular checkpoint every N epochs
        """
        self.checkpoint_dir = Path(checkpoint_dir)
        self.checkpoint_dir.mkdir(exist_ok=True, parents=True)
        self.save_every = save_every
        self.best_metric = float('inf')  # Lower is better

    @staticmethod
    def _collect_state(obj):
        """Return obj.state_dict(), or a name -> state_dict mapping if obj is a dict."""
        if isinstance(obj, dict):
            return {name: item.state_dict() for name, item in obj.items()}
        return obj.state_dict()

    @staticmethod
    def _apply_state(target, saved_state, label):
        """Load saved_state into target (single object or dict of objects) and report it."""
        if isinstance(target, dict):
            for name, item in target.items():
                item.load_state_dict(saved_state[name])
                print(f"   Restored {label}: {name}")
        else:
            target.load_state_dict(saved_state)
            print(f"   Restored {label}")

    @staticmethod
    def _atomic_save(state, path):
        """Serialize state to a temp file next to path, then rename it into place."""
        tmp_path = path.with_name(path.name + '.tmp')
        try:
            torch.save(state, tmp_path)
            tmp_path.replace(path)
        finally:
            # Only still present when the save or the rename failed
            if tmp_path.exists():
                tmp_path.unlink()

    @staticmethod
    def _epoch_from_path(path):
        """Extract N from 'checkpoint_epoch_N.pt'; -1 when the suffix is not a number."""
        try:
            return int(path.stem.rsplit('_', 1)[-1])
        except ValueError:
            return -1

    def save_checkpoint(
        self,
        epoch,
        model,
        optimizer,
        scheduler=None,
        metrics=None,
        extra_state=None,
        is_best=False
    ):
        """
        Save training checkpoint.

        Args:
            epoch: Current epoch number
            model: Model or dict of models
            optimizer: Optimizer or dict of optimizers
            scheduler: LR scheduler or dict of schedulers (optional)
            metrics: Dict of training metrics
            extra_state: Any additional state to save
            is_best: Whether this is the best checkpoint

        Returns:
            Path of the per-epoch checkpoint file that was written
        """
        checkpoint = {
            'epoch': epoch,
            'timestamp': datetime.now().isoformat(),
            'metrics': metrics or {},
            'extra_state': extra_state or {},
            'model_state_dict': self._collect_state(model),
            'optimizer_state_dict': self._collect_state(optimizer),
        }

        # The scheduler key is only present when a scheduler was supplied
        if scheduler is not None:
            checkpoint['scheduler_state_dict'] = self._collect_state(scheduler)

        # Save regular checkpoint
        checkpoint_path = self.checkpoint_dir / f'checkpoint_epoch_{epoch}.pt'
        self._atomic_save(checkpoint, checkpoint_path)

        # Save "latest" pointer
        self._atomic_save(checkpoint, self.checkpoint_dir / 'checkpoint_latest.pt')

        # Save "best" if applicable
        if is_best:
            self._atomic_save(checkpoint, self.checkpoint_dir / 'checkpoint_best.pt')
            print(f"ðŸ’Ž Best checkpoint saved (epoch {epoch})")

        print(f"âœ… Checkpoint saved: epoch {epoch}")

        return checkpoint_path

    def load_checkpoint(self, checkpoint_path=None):
        """
        Load checkpoint.

        Args:
            checkpoint_path: Specific checkpoint to load, or None for latest

        Returns:
            checkpoint dict or None if no checkpoint found
        """
        if checkpoint_path is None:
            # Try to load latest
            latest_path = self.checkpoint_dir / 'checkpoint_latest.pt'
            if latest_path.exists():
                checkpoint_path = latest_path
            else:
                # Find latest numbered checkpoint. Order by the numeric epoch:
                # a plain string sort would rank "epoch_9" above "epoch_14".
                checkpoints = sorted(
                    self.checkpoint_dir.glob('checkpoint_epoch_*.pt'),
                    key=self._epoch_from_path
                )
                if checkpoints:
                    checkpoint_path = checkpoints[-1]
                else:
                    print("No checkpoint found - starting from scratch")
                    return None

        checkpoint_path = Path(checkpoint_path)
        if not checkpoint_path.exists():
            print(f"Checkpoint not found: {checkpoint_path}")
            return None

        # Load onto the CPU; load_state_dict copies values into the live objects
        checkpoint = torch.load(checkpoint_path, map_location='cpu')
        print(f"âœ… Loaded checkpoint from epoch {checkpoint['epoch']}")
        print(f"   Saved at: {checkpoint['timestamp']}")

        return checkpoint

    def restore_training_state(
        self,
        checkpoint,
        model,
        optimizer,
        scheduler=None
    ):
        """
        Restore training state from checkpoint.

        Args:
            checkpoint: Loaded checkpoint dict
            model: Model or dict of models
            optimizer: Optimizer or dict of optimizers
            scheduler: LR scheduler or dict of schedulers (optional)

        Returns:
            start_epoch: Epoch to resume from (saved epoch + 1)
        """
        self._apply_state(model, checkpoint['model_state_dict'], 'model')
        self._apply_state(optimizer, checkpoint['optimizer_state_dict'], 'optimizer')

        # Scheduler state is optional on both sides: skip when either is missing
        if scheduler is not None and 'scheduler_state_dict' in checkpoint:
            self._apply_state(scheduler, checkpoint['scheduler_state_dict'], 'scheduler')

        start_epoch = checkpoint['epoch'] + 1
        print(f"\nðŸš€ Resuming from epoch {start_epoch}")

        return start_epoch

    def should_save(self, epoch):
        """Check if should save checkpoint this epoch (epoch is 0-based)."""
        return (epoch + 1) % self.save_every == 0

print("âœ… CheckpointManager class defined")

## 4. Data Loading

Load VGLC Zelda dungeon data.

In [None]:
def load_vglc_data(data_dir):
    """
    Placeholder loader for the VGLC Zelda dungeon data.

    Currently it only reports ``data_dir`` and returns empty splits.

    TODO: Replace with a real implementation that:
    - reads the level files found under data_dir
    - parses the dungeon structures
    - builds the train/val splits
    - creates the DataLoaders

    Returns:
        dict with the keys 'train' and 'val', each holding a list of samples
    """
    print(f"Loading data from: {data_dir}")

    # Both splits start out empty until real loading code is added, e.g.:
    #   data_path = Path(data_dir)
    #   for level_file in data_path.glob('*.txt'):
    #       level = parse_level(level_file)
    #       splits['train'].append(level)
    splits = {'train': [], 'val': []}

    print(f"âœ… Loaded {len(splits['train'])} training samples")
    print(f"âœ… Loaded {len(splits['val'])} validation samples")

    return splits

# Load data
# data = load_vglc_data(CONFIG['data_dir'])

# Create DataLoaders
# train_loader = torch.utils.data.DataLoader(
#     data['train'],
#     batch_size=CONFIG['batch_size'],
#     shuffle=True,
#     num_workers=2,
#     pin_memory=True
# )
# val_loader = torch.utils.data.DataLoader(
#     data['val'],
#     batch_size=CONFIG['batch_size'],
#     shuffle=False,
#     num_workers=2,
#     pin_memory=True
# )

print("âœ… Data loading configured")

## 5. Model Initialization

Initialize VQ-VAE, Diffusion, and LogicNet models.

In [None]:
# TODO: Import your actual models
# from src.core.vqvae import SemanticVQVAE
# from src.core.latent_diffusion import LatentDiffusionModel
# from src.core.logic_net import LogicNet

# Initialize models
# vqvae = SemanticVQVAE(
#     num_embeddings=CONFIG['num_embeddings'],
#     embedding_dim=CONFIG['embedding_dim'],
#     latent_dim=CONFIG['latent_dim']
# ).to(device)

# diffusion = LatentDiffusionModel(
#     latent_dim=CONFIG['latent_dim'],
#     num_timesteps=CONFIG['num_timesteps']
# ).to(device)

# logic_net = LogicNet(
#     input_dim=CONFIG['latent_dim']
# ).to(device)

# Placeholder model for demonstration
class PlaceholderModel(torch.nn.Module):
    """Stand-in network (one 64 -> 64 linear layer) used until the real models are wired in."""

    def __init__(self):
        super().__init__()
        # Attribute name "fc" determines the state_dict keys (fc.weight / fc.bias)
        self.fc = torch.nn.Linear(in_features=64, out_features=64)

    def forward(self, x):
        """Apply the linear layer to x and return the result."""
        projected = self.fc(x)
        return projected

model = PlaceholderModel().to(device)

# Optimizers
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=CONFIG['learning_rate'],
    weight_decay=CONFIG['weight_decay']
)

# Learning rate scheduler: cosine decay spread over the full planned run
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=CONFIG['num_epochs']
)

# Mixed precision scaler (a no-op when CONFIG['use_amp'] is False).
# torch.cuda.amp.GradScaler is deprecated in recent PyTorch releases in favour
# of torch.amp.GradScaler('cuda', ...); use the new entry point when it exists
# and fall back to the old one on older installations.
if hasattr(torch, 'amp') and hasattr(torch.amp, 'GradScaler'):
    scaler = torch.amp.GradScaler('cuda', enabled=CONFIG['use_amp'])
else:
    scaler = torch.cuda.amp.GradScaler(enabled=CONFIG['use_amp'])

print("âœ… Models initialized")
print(f"   Parameters: {sum(p.numel() for p in model.parameters()):,}")

## 6. Resume from Checkpoint

Automatically resume from latest checkpoint if available.

In [None]:
# Checkpoint manager writing into the configured directory
checkpoint_manager = CheckpointManager(
    CONFIG['checkpoint_dir'], save_every=CONFIG['checkpoint_every']
)

# Values used when no checkpoint is restored
start_epoch, training_history, best_val_loss = 0, [], float('inf')

# An explicit path takes precedence over auto-resume
manual_path = CONFIG['resume_from']
if manual_path is not None:
    checkpoint = checkpoint_manager.load_checkpoint(manual_path)
elif CONFIG['auto_resume']:
    checkpoint = checkpoint_manager.load_checkpoint()
else:
    checkpoint = None

if checkpoint is None:
    print("\nðŸ†• Starting fresh training")
else:
    # Put model / optimizer / scheduler back where the checkpoint left them
    start_epoch = checkpoint_manager.restore_training_state(
        checkpoint, model=model, optimizer=optimizer, scheduler=scheduler
    )

    # Recover the bookkeeping stored under 'metrics' (defaults if it is absent)
    saved_metrics = checkpoint.get('metrics', {})
    training_history = saved_metrics.get('history', [])
    best_val_loss = saved_metrics.get('best_val_loss', float('inf'))

    print(f"\nðŸ“Š Previous metrics:")
    print(f"   Best val loss: {best_val_loss:.4f}")
    print(f"   Training history: {len(training_history)} epochs")

print(f"\nðŸ“… Training plan: epochs {start_epoch} â†’ {CONFIG['num_epochs']}")

## 7. Training Loop

Main training loop with progress tracking and checkpoint saving.

In [None]:
def train_epoch(model, train_loader, optimizer, scheduler, scaler, epoch):
    """
    Run one (simulated) training epoch.

    TODO: Replace with actual training logic. For now train_loader, scheduler
    and scaler are accepted but not used, and every "loss" is a random number.

    Returns:
        dict with 'epoch', 'train_loss' (mean over the batches) and 'lr'
        (learning rate of the optimizer's first parameter group)
    """
    model.train()

    # Placeholder: fixed number of fake batches
    num_batches = 100
    running_total = 0.0

    progress = tqdm(range(num_batches), desc=f"Epoch {epoch}")
    for _ in progress:
        # Stand-in for a real forward/backward pass
        step_loss = torch.rand(1).item()
        running_total += step_loss
        progress.set_postfix({'loss': f'{step_loss:.4f}'})

    current_lr = optimizer.param_groups[0]['lr']
    return {
        'epoch': epoch,
        'train_loss': running_total / num_batches,
        'lr': current_lr
    }

def validate(model, val_loader, epoch):
    """
    Run a (simulated) validation pass.

    TODO: Replace with actual validation logic. val_loader and epoch are
    accepted but not used yet; the reported loss is a random number.

    Returns:
        dict with the single key 'val_loss'
    """
    # Switch the model to evaluation mode
    model.eval()

    # Placeholder metric
    simulated_loss = torch.rand(1).item()
    return {'val_loss': simulated_loss}

print("âœ… Training functions defined")

In [None]:
# Main training loop
# Runs epochs start_epoch .. num_epochs-1 (0-based), validating and checkpointing
# along the way.
print("\n" + "="*60)
print("ðŸš€ STARTING TRAINING")
print("="*60)

for epoch in range(start_epoch, CONFIG['num_epochs']):
    print(f"\n{'='*60}")
    print(f"ðŸ“… Epoch {epoch+1}/{CONFIG['num_epochs']}")
    print(f"{'='*60}")

    # Train
    train_metrics = train_epoch(
        model=model,
        train_loader=None,  # Replace with actual loader
        optimizer=optimizer,
        scheduler=scheduler,
        scaler=scaler,
        epoch=epoch
    )

    # Validate
    if (epoch + 1) % CONFIG['val_every'] == 0:
        val_metrics = validate(
            model=model,
            val_loader=None,  # Replace with actual loader
            epoch=epoch
        )
        train_metrics.update(val_metrics)

    # Log metrics ('lr' is the rate that was in effect during this epoch)
    print(f"\nðŸ“Š Metrics:")
    print(f"   Train Loss: {train_metrics['train_loss']:.4f}")
    if 'val_loss' in train_metrics:
        print(f"   Val Loss:   {train_metrics['val_loss']:.4f}")
    print(f"   LR:         {train_metrics['lr']:.6f}")

    # Update best model
    is_best = False
    if 'val_loss' in train_metrics and train_metrics['val_loss'] < best_val_loss:
        best_val_loss = train_metrics['val_loss']
        is_best = True
        print(f"   ðŸ’Ž New best model! (val_loss: {best_val_loss:.4f})")

    # Save metrics
    training_history.append(train_metrics)

    # Step scheduler BEFORE checkpointing. A resumed run starts at epoch + 1,
    # so the saved scheduler state must already describe that next epoch;
    # saving first would leave every resumed run one LR step behind.
    scheduler.step()

    # Save checkpoint
    if checkpoint_manager.should_save(epoch) or is_best:
        checkpoint_manager.save_checkpoint(
            epoch=epoch,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            metrics={
                'current_train_loss': train_metrics['train_loss'],
                'best_val_loss': best_val_loss,
                'history': training_history
            },
            extra_state={
                'config': CONFIG
            },
            is_best=is_best
        )

print("\n" + "="*60)
print("âœ… TRAINING COMPLETE!")
print("="*60)
print(f"\nBest validation loss: {best_val_loss:.4f}")

## 8. Save Final Models

Export final models and training history.

In [None]:
# Save final checkpoint
# The history can be empty (e.g. num_epochs is 0, or the run resumed from a
# checkpoint without a stored history and had no epochs left), so do not index
# into it unconditionally.
final_train_loss = training_history[-1]['train_loss'] if training_history else None

final_checkpoint_path = checkpoint_manager.save_checkpoint(
    epoch=CONFIG['num_epochs'] - 1,
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    metrics={
        'final_train_loss': final_train_loss,
        'best_val_loss': best_val_loss,
        'history': training_history
    },
    # Keep the config in the final checkpoint too, like the in-loop checkpoints
    extra_state={
        'config': CONFIG
    }
)

print(f"\nâœ… Final checkpoint saved: {final_checkpoint_path}")

# Export final model weights (state_dict only, without optimizer/scheduler)
torch.save(model.state_dict(), f"{CONFIG['output_dir']}/model_final.pt")

# Export training history
with open(f"{CONFIG['output_dir']}/training_history.json", 'w') as f:
    json.dump(training_history, f, indent=2)

print("âœ… Model weights exported")
print("âœ… Training history exported")

## 9. Visualize Training Progress

In [None]:
# Plot training curves
# Pull the per-epoch series out of the recorded history
epochs = [record['epoch'] for record in training_history]
train_losses = [record['train_loss'] for record in training_history]
lrs = [record['lr'] for record in training_history]
# Validation may not run every epoch, so keep only the epochs that have a value
val_points = [
    (record['epoch'], record['val_loss'])
    for record in training_history
    if record.get('val_loss', None) is not None
]

fig, (loss_ax, lr_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: loss curves
loss_ax.plot(epochs, train_losses, label='Train Loss', linewidth=2)
if val_points:
    val_epochs, val_losses_clean = zip(*val_points)
    loss_ax.plot(val_epochs, val_losses_clean, label='Val Loss', linewidth=2)
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training Progress')
loss_ax.legend()
loss_ax.grid(True, alpha=0.3)

# Right panel: learning rate
lr_ax.plot(epochs, lrs, linewidth=2, color='orange')
lr_ax.set_xlabel('Epoch')
lr_ax.set_ylabel('Learning Rate')
lr_ax.set_title('Learning Rate Schedule')
lr_ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig(f"{CONFIG['output_dir']}/training_curves.png", dpi=150, bbox_inches='tight')
plt.show()

print("âœ… Training curves saved")

## 10. Generate Sample Dungeons

Generate and visualize sample outputs.

In [None]:
# TODO: Implement actual dungeon generation
# def generate_dungeons(model, num_samples=5):
#     model.eval()
#     samples = []
#     with torch.no_grad():
#         for i in range(num_samples):
#             sample = model.generate()
#             samples.append(sample)
#     return samples

# samples = generate_dungeons(model, num_samples=5)

# for i, sample in enumerate(samples):
#     plt.figure(figsize=(10, 14))
#     plt.imshow(sample, cmap='viridis')
#     plt.title(f"Generated Dungeon {i+1}")
#     plt.axis('off')
#     plt.savefig(f"{CONFIG['output_dir']}/sample_{i+1}.png", dpi=150, bbox_inches='tight')
#     plt.show()

print("âœ… Sample generation placeholder - implement actual generation logic")

## 11. Create Submission Archive

Package all outputs for easy download.

In [None]:
import zipfile
from pathlib import Path

# Bundle checkpoints and outputs into a single downloadable zip
zip_path = '/kaggle/working/h_molqd_outputs.zip'

# (source directory, glob pattern, folder inside the archive, regular files only?)
archive_plan = (
    (Path(CONFIG['checkpoint_dir']), '*.pt', 'checkpoints', False),
    (Path(CONFIG['output_dir']), '*', 'outputs', True),
)

with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for source_dir, pattern, archive_folder, files_only in archive_plan:
        if not source_dir.exists():
            continue
        for entry in source_dir.glob(pattern):
            # Sub-directories of the output folder are not archived
            if files_only and not entry.is_file():
                continue
            archive_name = f"{archive_folder}/{entry.name}"
            zipf.write(entry, archive_name)
            print(f"   Added: {archive_name}")

# Report the archive size in MiB
zip_size_mb = os.path.getsize(zip_path) / (1024 * 1024)

print(f"\nâœ… Created submission archive: {zip_path}")
print(f"   Size: {zip_size_mb:.2f} MB")
print("\nðŸ“¦ Download this file from Kaggle output panel")

## Training Complete! 🎉

### What's saved:
- ✅ Checkpoints (every 5 epochs + best model)
- ✅ Final model weights
- ✅ Training history & metrics
- ✅ Training curves visualization
- ✅ Complete archive (h_molqd_outputs.zip)

### Next steps:
1. Download `h_molqd_outputs.zip` from Kaggle output
2. Extract checkpoints for inference
3. Analyze training curves
4. Generate dungeons using best checkpoint

### To resume training:
```python
# Automatic resume (loads latest)
CONFIG['auto_resume'] = True

# Or manual resume (loads specific checkpoint)
CONFIG['resume_from'] = '/kaggle/working/checkpoints/checkpoint_epoch_50.pt'
```