<a href="https://colab.research.google.com/github/nahubn1/Hybrid-Robot-Navigation-System/blob/main/notebooks/model_prototyping/U-Net-FiLM-Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive

# Location inside the Colab VM where Google Drive is exposed; the Drive paths
# used further down in this notebook live under this directory.
DRIVE_MOUNT_POINT = '/content/drive'

# Prompts for authorisation on first use, then mounts the user's Drive.
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
import os
COLAB_PROJECT_ROOT = '/content/Thesis_Project'
if not os.path.exists(COLAB_PROJECT_ROOT):
  !git clone https://github.com/nahubn1/Hybrid-Robot-Navigation-System {COLAB_PROJECT_ROOT}
os.chdir(COLAB_PROJECT_ROOT) # Change directory into the project
!git pull # Ensure it's the latest version

Cloning into '/content/Thesis_Project'...
remote: Enumerating objects: 707, done.[K
remote: Counting objects: 100% (115/115), done.[K
remote: Compressing objects: 100% (98/98), done.[K
remote: Total 707 (delta 46), reused 17 (delta 17), pack-reused 592 (from 2)[K
Receiving objects: 100% (707/707), 420.56 KiB | 1.68 MiB/s, done.
Resolving deltas: 100% (385/385), done.
Already up to date.


In [None]:
!pip install -r environment/requirements.txt

Collecting pybullet==3.2.7 (from -r environment/requirements.txt (line 1))
  Downloading pybullet-3.2.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0->-r environment/requirements.txt (line 11))
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0->-r environment/requirements.txt (line 11))
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0->-r environment/requirements.txt (line 11))
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0->-r environment/requirements.txt (line 11))
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nv

In [5]:
import sys
# Add src to path
sys.path.append(os.path.join(os.getcwd(), 'src'))

# Define Drive paths
DRIVE_DATA_PATH = '/content/drive/MyDrive/Thesis_DNN_Planner/data'
DRIVE_MODELS_PATH = '/content/drive/MyDrive/Thesis_DNN_Planner/models'
DRIVE_RESULTS_PATH = '/content/drive/MyDrive/Thesis_DNN_Planner/results'

# Link Drive storage to local cloned directories
if not os.path.islink('data'):
    !ln -s {DRIVE_DATA_PATH} data
if not os.path.islink('models'):
    !ln -s {DRIVE_MODELS_PATH} models
if not os.path.islink('results'):
    !ln -s {DRIVE_RESULTS_PATH} results

print("✅ Environment setup complete.")

✅ Environment setup complete.


### Phase 2, Task 2.5: Full Pipeline Sanity Check

In [6]:
import shutil

import torch
import numpy as np
from pathlib import Path
from torch.utils.data import DataLoader
from dnn_guidance.data_loader import PathfindingDataset
from dnn_guidance.model import UNetFiLM
from dnn_guidance.loss import DiceFocalLoss

# End-to-end smoke test: write one synthetic sample to disk, push it through
# the dataset -> dataloader -> model -> loss chain, and run one backward pass.
# The fixture lives in a throw-away directory that is removed afterwards so it
# does not linger in the repository working tree.
_temp_root = Path('tmp_sanity_data')
_samples = _temp_root/'samples'
_gt = _temp_root/'gt'

try:
    # Create a temporary minimal dataset
    _samples.mkdir(parents=True, exist_ok=True)
    _gt.mkdir(parents=True, exist_ok=True)

    grid = np.zeros((200, 200), dtype=np.uint8)
    grid[0, 0] = 8  # start
    grid[-1, -1] = 9  # goal
    np.savez(_samples/'sample0.npz', map=grid, clearance=2.0, step_size=8.0, config=np.array([]))
    heatmap = np.zeros((200, 200), dtype=np.float32)
    np.savez(_gt/'sample0.npz', heatmap=heatmap)

    # Build dataset and dataloader
    dataset = PathfindingDataset(_samples, _gt)
    loader = DataLoader(dataset, batch_size=1)

    # Prefer the GPU when one is attached, otherwise run the check on CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = UNetFiLM().to(device)
    criterion = DiceFocalLoss()

    # Each batch is ((grid, robot), target); move every tensor to the device.
    (grid_batch, robot_batch), target_batch = next(iter(loader))
    grid_batch = grid_batch.to(device)
    robot_batch = robot_batch.to(device)
    target_batch = target_batch.to(device)

    logits = model(grid_batch, robot_batch)
    print('Logits shape:', logits.shape)
    loss = criterion(logits, target_batch)
    print('Loss:', loss.item())

    # One single-channel 200x200 map per sample, and a scalar loss.
    assert logits.shape == (1, 1, 200, 200)
    assert loss.dim() == 0

    # Confirms gradients can flow through the whole model/loss graph.
    loss.backward()
finally:
    # Remove the synthetic fixture whether or not the check succeeded.
    shutil.rmtree(_temp_root, ignore_errors=True)

print('✅ Phase 2 Sanity Check Passed: Model, data pipeline, and loss function are fully integrated and functional.')

Logits shape: torch.Size([1, 1, 200, 200])
Loss: 0.6198154091835022
✅ Phase 2 Sanity Check Passed: Model, data pipeline, and loss function are fully integrated and functional.


### Phase 3: Experiment Setup and Initialization

In [7]:
import yaml
from pathlib import Path

# Experiment configuration, relative to the project root.
CONFIG_PATH = 'configs/dnn/unet_film_v1_baseline.yaml'

# `safe_load` parses plain YAML data only; the result is kept in `cfg` and read
# by the setup and training cells that follow.
with Path(CONFIG_PATH).open('r') as config_file:
    cfg = yaml.safe_load(config_file)

print(f'Configuration loaded from {CONFIG_PATH}.')

Configuration loaded from configs/dnn/unet_film_v1_baseline.yaml.


In [8]:
import random
import numpy as np
import torch

# Seed taken from the experiment config; falls back to 0 when it is not set.
seed = cfg.get('seed', 0)

# Seed Python's, NumPy's and PyTorch's global generators with the same value.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(seed)

# Also seed every CUDA device's generator when a GPU is present.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

print(f'Random seed set to {seed}.')

Random seed set to 42.


In [2]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from dnn_guidance.data_loader import PathfindingDataset, _pair_files
from pathlib import Path

# Sample and ground-truth folders are configured relative to the Drive data
# root.
samples_dir = Path('/'.join((DRIVE_DATA_PATH, cfg['samples_dir'])))
gt_dir = Path('/'.join((DRIVE_DATA_PATH, cfg['ground_truth_dir'])))

# Pair every sample with its ground truth once, then split the pair list so
# the train and validation sets cannot share a sample. `random_state=seed`
# makes the split repeatable.
all_pairs = _pair_files(samples_dir, gt_dir)
train_pairs, val_pairs = train_test_split(all_pairs, test_size=cfg['val_split'], random_state=seed)


def _build_loader(pairs, augment, shuffle):
    """Return ``(dataset, loader)`` restricted to the given sample/GT pairs."""
    subset = PathfindingDataset(samples_dir, gt_dir, augment=augment)
    # The dataset is built over the full folders; narrow it to this split.
    subset.pairs = pairs
    subset_loader = DataLoader(subset, batch_size=cfg['batch_size'], shuffle=shuffle, num_workers=cfg['num_workers'])
    return subset, subset_loader


# Augmentation and shuffling are enabled for training only.
# NOTE(review): the training run recorded at the end of this notebook failed
# inside PathfindingDataset.__getitem__ with a "negative stride" error from
# torch.from_numpy. Presumably this is triggered by the augment=True path —
# confirm and fix in data_loader.py.
train_dataset, train_loader = _build_loader(train_pairs, augment=True, shuffle=True)
val_dataset, val_loader = _build_loader(val_pairs, augment=False, shuffle=False)
print('DataLoaders created.')

ModuleNotFoundError: No module named 'dnn_guidance'

In [17]:
from dnn_guidance.model import UNetFiLM
from dnn_guidance.config import UNetConfig

# The architecture comes from its own YAML file, separate from the experiment
# config held in `cfg`.
model_cfg = UNetConfig.from_yaml('configs/dnn/unet_film.yaml')
model = UNetFiLM(model_cfg)

# Honour the configured device, but fall back to CPU when CUDA is requested
# and the runtime has no GPU attached. This matches the sanity-check cell and
# avoids a crash in `.to(device)`.
device = torch.device(cfg['device'])
if device.type == 'cuda' and not torch.cuda.is_available():
    print(f"⚠️ Configured device '{device}' is not available; falling back to CPU.")
    device = torch.device('cpu')

model = model.to(device)
print(f'Model loaded onto {device}.')

Model loaded onto cuda.


In [18]:
import torch.optim as optim

# Optimizer: the class is looked up by name in `torch.optim`, so the config
# must use the exact class name.
opt_name = cfg['optimizer']['name']
optimizer_cls = getattr(optim, opt_name)
optimizer = optimizer_cls(model.parameters(), lr=cfg['optimizer']['lr'], weight_decay=cfg['optimizer']['weight_decay'])

# Scheduler: stepped once per epoch by the training loop.
scheduler = None
if cfg['scheduler']['name'] == 'CosineAnnealing':
    warmup_epochs = cfg['scheduler'].get('warmup_epochs', 0)
    # The cosine phase covers the epochs left after warm-up. Clamp to at least
    # 1 because CosineAnnealingLR divides by T_max.
    t_max = max(1, cfg['epochs'] - warmup_epochs)
    cosine = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=t_max)
    if warmup_epochs > 0:
        # Actually run the warm-up that T_max accounts for. Without it the
        # cosine curve starts at epoch 0, reaches its minimum `warmup_epochs`
        # epochs early, and then climbs again for the rest of the run.
        # LinearLR ramps from its library-default start factor to the full LR.
        warmup = optim.lr_scheduler.LinearLR(optimizer, total_iters=warmup_epochs)
        scheduler = optim.lr_scheduler.SequentialLR(
            optimizer, schedulers=[warmup, cosine], milestones=[warmup_epochs]
        )
        # NOTE(review): scheduler state saved from a plain CosineAnnealingLR
        # will presumably not load into this composite scheduler — remove
        # stale `*_latest.pth` checkpoints before resuming.
    else:
        scheduler = cosine
print('Optimizer and Scheduler initialized.')

Optimizer and Scheduler initialized.


In [19]:
from dnn_guidance.loss import DiceFocalLoss

# Loss hyper-parameters come from the `loss` section of the experiment config.
loss_cfg = cfg['loss']

# Forward exactly the three configured values as keyword arguments.
_loss_kwargs = {
    key: loss_cfg[key]
    for key in ('dice_weight', 'focal_weight', 'focal_gamma')
}
loss_fn = DiceFocalLoss(**_loss_kwargs)
print('Loss function initialized.')

Loss function initialized.


### Phase 4: Training Loop with Monitoring and Checkpointing

In [20]:
from datetime import datetime
from pathlib import Path
from torch.utils.tensorboard import SummaryWriter
from dnn_guidance.trainer import train_one_epoch, validate_one_epoch

# Prefer the device-agnostic GradScaler where the installed PyTorch provides
# it; the `torch.cuda.amp` import is deprecated in recent releases. Older
# releases fall back to the CUDA-specific class.
try:
    from torch.amp import GradScaler
    _SCALER_ARGS = ('cuda',)  # the newer class takes the device type first
except ImportError:
    from torch.cuda.amp import GradScaler
    _SCALER_ARGS = ()

# Directories for logging and checkpoints
# Each run gets its own timestamped TensorBoard directory so runs never
# overwrite each other's event files.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_root = Path(cfg['log_dir'])
log_dir = log_root / f"{cfg['run_name']}_{timestamp}"
log_dir.mkdir(parents=True, exist_ok=True)
writer = SummaryWriter(log_dir)

# Checkpoint file names are keyed on the run name only (no timestamp), which is
# what lets a later session find and resume the same run.
ckpt_dir = Path(cfg['checkpoints_dir'])
ckpt_dir.mkdir(parents=True, exist_ok=True)
best_model_path = ckpt_dir / f"{cfg['run_name']}_best_model.pth"
latest_ckpt = ckpt_dir / f"{cfg['run_name']}_latest.pth"

# Mixed-precision loss scaler; AMP is on unless the config disables it.
scaler = GradScaler(*_SCALER_ARGS, enabled=cfg.get('use_amp', True))

# Fresh-run defaults, overwritten below when a checkpoint is found.
best_dice = -1.0          # lower than any real Dice score, so epoch 1 always "improves"
epochs_no_improve = 0     # early-stopping counter
start_epoch = 0           # zero-based index of the first epoch to run

# Resume from checkpoint if available
if latest_ckpt.exists():
    ckpt = torch.load(latest_ckpt, map_location=device)
    model.load_state_dict(ckpt['model'])
    optimizer.load_state_dict(ckpt['optimizer'])
    if scheduler and ckpt.get('scheduler'):
        scheduler.load_state_dict(ckpt['scheduler'])
    # Only restore scaler state that actually exists. A scaler saved while
    # disabled (or a checkpoint without the key) yields an empty dict, and an
    # enabled scaler raises on that.
    scaler_state = ckpt.get('scaler')
    if scaler_state:
        scaler.load_state_dict(scaler_state)
    # The checkpoint stores the last *completed* epoch; continue with the next.
    start_epoch = ckpt.get('epoch', 0) + 1
    best_dice = ckpt.get('best_dice', -1.0)
    epochs_no_improve = ckpt.get('epochs_no_improve', 0)
    print(f"Resumed from epoch {start_epoch}")

  scaler = GradScaler(enabled=cfg.get('use_amp', True))


In [21]:
# Main training loop: train, validate, log, checkpoint, and stop early when the
# validation Dice score has not improved for `early_stop_patience` epochs.
# The try/finally closes the TensorBoard writer even when an epoch raises, so
# buffered events are not lost.
try:
    for epoch in range(start_epoch, cfg['epochs']):
        print(f"--- Epoch {epoch+1}/{cfg['epochs']} ---")

        # Read the learning rate *before* stepping the scheduler, so the value
        # logged next to this epoch's losses is the one used to produce them.
        lr = optimizer.param_groups[0]['lr']

        train_loss = train_one_epoch(model, train_loader, optimizer, loss_fn, device, scaler)
        val_loss, val_dice = validate_one_epoch(model, val_loader, loss_fn, device)
        if scheduler:
            scheduler.step()

        writer.add_scalar('loss/train', train_loss, epoch)
        writer.add_scalar('loss/val', val_loss, epoch)
        writer.add_scalar('dice/val', val_dice, epoch)
        writer.add_scalar('lr', lr, epoch)
        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Dice: {val_dice:.4f} | LR: {lr:.6f}")

        # Model selection is driven by validation Dice (higher is better).
        improved = val_dice > best_dice
        if improved:
            best_dice = val_dice
            # The best-model file holds weights only.
            torch.save(model.state_dict(), best_model_path)
            print("New best model found, saving checkpoint...")
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            print(f"No improvement for {epochs_no_improve} epochs...")

        # Full training state, overwritten every epoch, used for resuming.
        torch.save({
            'epoch': epoch,
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict() if scheduler else None,
            'scaler': scaler.state_dict(),
            'best_dice': best_dice,
            'epochs_no_improve': epochs_no_improve
        }, latest_ckpt)

        if epochs_no_improve >= cfg['early_stop_patience']:
            print('Early stopping triggered.')
            break
finally:
    writer.close()

--- Epoch 1/50 ---




ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/worker.py", line 349, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
           ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
            ~~~~~~~~~~~~^^^^^
  File "/content/Thesis_Project/src/dnn_guidance/data_loader.py", line 83, in __getitem__
    heatmap_tensor = torch.from_numpy(heatmap[None, ...])
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: At least one stride in the given numpy array is negative, and tensors with negative strides are not currently supported. (You can probably work around this by making a copy of your array  with array.copy().) 
