# Train LoRA BDD Night (Mamba-Vision)

Notebook-first pipeline with safety gates:

- dependency checks
- optional FiftyOne export + manifest generation
- one-batch shape/loss sanity check
- mandatory pilot before full run


In [3]:
from __future__ import annotations

import random
import sys
from pathlib import Path

import numpy as np
import torch


def find_repo_root(start: Path) -> Path:
    """Return the closest directory at or above ``start`` that holds a ``.git`` entry.

    The search begins at ``start`` itself and then walks outwards through its
    parents, nearest first. When no candidate contains ``.git``, ``start`` is
    returned unchanged so callers always receive a usable path.
    """
    search_order = [start]
    search_order.extend(start.parents)
    return next(
        (directory for directory in search_order if (directory / '.git').exists()),
        start,
    )


# --- Run switches: edit these before executing the notebook ---------------
RUN_MODE = 'pilot'  # pilot | full
PREPARE_DATA = False  # True -> run the export/manifest generation cell
AUTO_CONFIRM_FULL = False  # True -> skip the interactive prompt before a full run
SEED = 42

# Locate the repository and put it on sys.path so the `pipelines` package
# can be imported no matter which directory the kernel was started in.
REPO_ROOT = find_repo_root(Path.cwd().resolve())
repo_root_entry = str(REPO_ROOT)
if repo_root_entry not in sys.path:
    sys.path.insert(0, repo_root_entry)

# Seed every RNG used in this notebook, in the same order as before.
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_rng(SEED)

print('Repo root:', REPO_ROOT)
print('Run mode:', RUN_MODE)


Repo root: /teamspace/studios/this_studio/Hot-Peppers-Company-Computer-Vision
Run mode: pilot


In [4]:
# Training configuration for this run, addressed relative to the repository root.
CONFIG_PATH = REPO_ROOT.joinpath('configs/training/lora_bdd_night.yaml')
CONFIG_PATH


PosixPath('/teamspace/studios/this_studio/Hot-Peppers-Company-Computer-Vision/configs/training/lora_bdd_night.yaml')

In [5]:
from pipelines.dependencies import check_packages, assert_required_packages

# Package names to probe. `required` and `optional` are kept as separate
# lists so later cells can tell the two groups apart; both are checked here.
required = ['torch', 'torchvision', 'yaml', 'safetensors', 'tqdm', 'einops']
optional = ['fiftyone', 'mambavision', 'wandb']

# Display the per-package result as the cell output.
status = check_packages([*required, *optional])
status


{'torch': True,
 'torchvision': True,
 'yaml': True,
 'safetensors': True,
 'tqdm': True,
 'einops': True,
 'fiftyone': True,
 'mambavision': True,
 'wandb': True}

In [6]:
# Gate on the `required` list defined in the dependency-status cell instead
# of a second hand-written copy. The previous literal had drifted from that
# list and omitted 'tqdm', so one declared requirement was never asserted.
# NOTE(review): presumably assert_required_packages raises when a package is
# missing; confirm against pipelines.dependencies.
assert_required_packages(required)
print('Core dependencies look good')


Core dependencies look good


In [7]:
from pipelines.contracts import TrainConfig

def _abs_in_repo(relative_path):
    """Join ``relative_path`` onto REPO_ROOT, resolve it, and return it as a string."""
    return str((REPO_ROOT / relative_path).resolve())


# Load the run configuration, then rewrite the path-like fields below as
# resolved paths anchored at the repository root.
cfg = TrainConfig.from_yaml(CONFIG_PATH)
cfg.model.model_file = _abs_in_repo(cfg.model.model_file)
cfg.ckpt.output_path = _abs_in_repo(cfg.ckpt.output_path)

# The base checkpoint and the LoRA output path are optional; empty or
# missing values are left exactly as loaded.
if cfg.model.base_checkpoint:
    cfg.model.base_checkpoint = _abs_in_repo(cfg.model.base_checkpoint)

lora_output = cfg.data.get('lora_output_path')
if lora_output:
    cfg.data['lora_output_path'] = _abs_in_repo(lora_output)

cfg


TrainConfig(run_name='lora_bdd_night', data={'source': 'fiftyone_zoo', 'zoo_name': 'bdd100k', 'split_train': 'train', 'split_val': 'validation', 'time_of_day': ['night'], 'manifest_train': 'configs/manifests/bdd_night_train.json', 'manifest_val': 'configs/manifests/bdd_night_val.json', 'export_train_dir': 'data/exports/bdd_night/train', 'export_val_dir': 'data/exports/bdd_night/val', 'max_samples_train': None, 'max_samples_val': None, 'local_dataset_dir': '/teamspace/studios/this_studio/datasets/bdd100k:-images-100k', 'local_dataset_type': 'COCODetectionDataset', 'lora_output_path': '/teamspace/studios/this_studio/Hot-Peppers-Company-Computer-Vision/checkpoints/lora/bdd_night.safetensors'}, model=ModelSection(backbone='mamba_vision_T2', num_classes=8, pretrained=False, checkpoint_path='', base_checkpoint='/teamspace/studios/this_studio/Hot-Peppers-Company-Computer-Vision/checkpoints/base/coco_base.ckpt', model_file='/teamspace/studios/this_studio/Hot-Peppers-Company-Computer-Vision/mam

In [8]:
from pipelines.fiftyone_data import prepare_zoo_split_export

if not PREPARE_DATA:
    print('PREPARE_DATA=False -> expecting existing manifests')
else:
    data_cfg = cfg.data

    # One row per split: (progress message, split key, export-dir key,
    # manifest key, sample-limit key). Train is exported before val.
    export_jobs = (
        ('Preparing train split export...', 'split_train', 'export_train_dir',
         'manifest_train', 'max_samples_train'),
        ('Preparing val split export...', 'split_val', 'export_val_dir',
         'manifest_val', 'max_samples_val'),
    )

    for banner, split_key, export_key, manifest_key, limit_key in export_jobs:
        print(banner)
        prepare_zoo_split_export(
            zoo_name=data_cfg['zoo_name'],
            split=data_cfg[split_key],
            dataset_name=f"{cfg.run_name}_{data_cfg[split_key]}",
            export_dir=str(REPO_ROOT / data_cfg[export_key]),
            manifest_path=str(REPO_ROOT / data_cfg[manifest_key]),
            source=data_cfg.get('source', 'fiftyone_zoo'),
            max_samples=data_cfg.get(limit_key),
            time_of_day=data_cfg.get('time_of_day'),
            local_dataset_dir=data_cfg.get('local_dataset_dir'),
            local_dataset_type=data_cfg.get('local_dataset_type', 'COCODetectionDataset'),
        )


PREPARE_DATA=False -> expecting existing manifests


In [10]:
from pipelines.coco_dataset import build_dataloader
from pipelines.contracts import DatasetManifest

# Read both manifests first so a missing file fails before any loader is built.
train_manifest = DatasetManifest.from_json(REPO_ROOT / cfg.data['manifest_train'])
val_manifest = DatasetManifest.from_json(REPO_ROOT / cfg.data['manifest_val'])

# Settings shared by the train and val loaders; only shuffling and the
# per-split sample cap differ between them.
shared_loader_kwargs = dict(
    image_size=cfg.train.image_size,
    batch_size=cfg.train.batch_size,
    num_workers=cfg.train.num_workers,
)

train_loader = build_dataloader(
    train_manifest,
    shuffle=True,
    max_samples=cfg.data.get('max_samples_train'),
    **shared_loader_kwargs,
)
val_loader = build_dataloader(
    val_manifest,
    shuffle=False,
    max_samples=cfg.data.get('max_samples_val'),
    **shared_loader_kwargs,
)

# Quick size report for both splits.
print('Train images:', train_manifest.num_images, 'instances:', train_manifest.num_instances)
print('Val images:', val_manifest.num_images, 'instances:', val_manifest.num_instances)
print('Train batches:', len(train_loader), 'Val batches:', len(val_loader))


Train images: 27971 instances: 367425
Val images: 3929 instances: 51889
Train batches: 3497 Val batches: 492


In [11]:
from pipelines.lora import (
    collect_trainable_parameter_summary,
    configure_lora_training,
    inject_lora_modules,
)
from pipelines.model_loader import create_model_from_config
from pipelines.training import load_checkpoint, resolve_device


# Build the model on the configured device.
device = resolve_device(cfg.train.device)
model = create_model_from_config(cfg.model, device=str(device))

# Load the base weights (when configured) BEFORE injecting LoRA modules,
# matching the original statement order.
base_checkpoint = cfg.model.base_checkpoint
if base_checkpoint:
    base_checkpoint_path = REPO_ROOT / base_checkpoint
    load_checkpoint(base_checkpoint_path, model)
    print('Loaded base checkpoint:', base_checkpoint_path)

# LoRA is optional: only the backbone receives injected modules, and the
# neck/head freeze flags come from the config.
lora_cfg = cfg.lora
if lora_cfg is not None:
    replaced_layers = inject_lora_modules(
        model.backbone,
        rank=lora_cfg.rank,
        alpha=lora_cfg.alpha,
        dropout=lora_cfg.dropout,
        target_rule=lora_cfg.target_rule,
    )
    configure_lora_training(
        model,
        freeze_neck=cfg.freeze.neck,
        freeze_head=cfg.freeze.head,
    )
    print('LoRA layers injected:', len(replaced_layers))

# Report the parameter summary so the trainable/frozen split can be checked.
summary = collect_trainable_parameter_summary(model)
print(summary)




Loaded base checkpoint: /teamspace/studios/this_studio/Hot-Peppers-Company-Computer-Vision/checkpoints/base/coco_base.ckpt
LoRA layers injected: 76
{'trainable': 756864, 'frozen': 43573319, 'total': 44330183}


In [12]:
from pipelines.yolo_ops import MultiScaleYoloLoss

def _target_to_device(target):
    """Copy one target dict, moving tensor values to `device` and keeping the rest."""
    moved = {}
    for key, value in target.items():
        moved[key] = value.to(device) if torch.is_tensor(value) else value
    return moved


model = model.to(device)
criterion = MultiScaleYoloLoss(num_classes=cfg.model.num_classes).to(device)

# Pull a single batch and place it on the same device as the model.
first_batch = next(iter(train_loader))
images, targets = first_batch
images = images.to(device)
targets = [_target_to_device(target) for target in targets]

# Forward pass only; no gradients are needed for a shape/loss sanity check.
with torch.no_grad():
    outputs = model(images)

losses = criterion(outputs, targets)

scale_shapes = [tuple(scale_out.shape) for scale_out in outputs]
loss_values = {name: float(value.detach().cpu()) for name, value in losses.items()}
print('Scale outputs:', scale_shapes)
print('Sanity loss:', loss_values)


Scale outputs: [(8, 13, 80, 80), (8, 13, 40, 40), (8, 13, 20, 20)]
Sanity loss: {'loss': 3.3725879192352295, 'obj_loss': 1.8322288990020752, 'box_loss': 0.05520438402891159, 'cls_loss': 1.2643370628356934}


In [13]:
# Report current CUDA memory usage in GB (1 GB = 1024 ** 3 bytes here);
# on any other device just name the device.
if device.type != 'cuda':
    print('Non-CUDA device:', device)
else:
    bytes_per_gb = 1024 ** 3
    allocated = torch.cuda.memory_allocated(device) / bytes_per_gb
    reserved = torch.cuda.memory_reserved(device) / bytes_per_gb
    print(f'GPU memory allocated: {allocated:.2f} GB')
    print(f'GPU memory reserved : {reserved:.2f} GB')


GPU memory allocated: 0.22 GB
GPU memory reserved : 1.99 GB


In [14]:
from pipelines.training import fit_model

# The pilot always runs, whatever RUN_MODE says: run_mode is fixed to 'pilot'.
pilot_fit_kwargs = dict(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    train_cfg=cfg.train,
    ckpt_cfg=cfg.ckpt,
    num_classes=cfg.model.num_classes,
    run_mode='pilot',
)
pilot_history = fit_model(**pilot_fit_kwargs)

# Show the loss of the last recorded train and val entries.
print('Pilot train loss:', pilot_history['train'][-1].loss)
print('Pilot val loss  :', pilot_history['val'][-1].loss)


  scaler = torch.cuda.amp.GradScaler(enabled=(device.type == "cuda" and precision.lower() == "fp16"))
                                                                

Pilot train loss: 4.141508388519287
Pilot val loss  : 3.735966205596924


In [15]:
# Safety gate before the full run: requires an explicit 'yes', either typed
# at the prompt or implied by AUTO_CONFIRM_FULL.
if RUN_MODE != 'full':
    print('RUN_MODE is pilot -> full run step is skipped')
else:
    confirm = (
        'yes'
        if AUTO_CONFIRM_FULL
        else input('Pilot complete. Start FULL training? (yes/no): ').strip().lower()
    )
    if confirm != 'yes':
        # Raising stops a "Run All" here, before the full-training cell.
        raise RuntimeError('Cancelled by user before full training')


RUN_MODE is pilot -> full run step is skipped


In [16]:
# Stays None unless a full run actually completes.
full_history = None

if RUN_MODE != 'full':
    print('Pilot-only run completed')
else:
    # Full training reuses the same model object the pilot cell already trained.
    full_fit_kwargs = dict(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        train_cfg=cfg.train,
        ckpt_cfg=cfg.ckpt,
        num_classes=cfg.model.num_classes,
        run_mode='full',
    )
    full_history = fit_model(**full_fit_kwargs)
    print('Full train loss:', full_history['train'][-1].loss)
    print('Full val loss  :', full_history['val'][-1].loss)


Pilot-only run completed


In [17]:
from pipelines.lora import save_lora_adapters

# Persist the LoRA adapter weights (only when LoRA is configured) and report
# where the training checkpoint lives.
if cfg.lora is not None:
    # The config-loading cell treats 'lora_output_path' as optional, so look
    # it up the same way here and fail with an actionable message instead of
    # a bare KeyError after training has already finished.
    lora_output = cfg.data.get('lora_output_path')
    if not lora_output:
        raise ValueError(
            "cfg.lora is set but cfg.data has no 'lora_output_path'; "
            'add it to the training config so the adapters can be saved'
        )

    # If the path was already made absolute earlier, joining it onto
    # REPO_ROOT leaves it unchanged (pathlib keeps the absolute right operand).
    lora_path = REPO_ROOT / lora_output
    lora_path.parent.mkdir(parents=True, exist_ok=True)

    # NOTE(review): this cell has no RUN_MODE check, so a pilot-only run also
    # writes its adapters to this path - confirm that overwrite is intended.
    save_lora_adapters(
        model,
        output_path=str(lora_path),
        metadata={
            'run_name': cfg.run_name,
            'base_checkpoint': cfg.model.base_checkpoint,
        },
    )
    print('Saved LoRA adapters:', lora_path)

print('Train checkpoint path:', REPO_ROOT / cfg.ckpt.output_path)


Saved LoRA adapters: /teamspace/studios/this_studio/Hot-Peppers-Company-Computer-Vision/checkpoints/lora/bdd_night.safetensors
Train checkpoint path: /teamspace/studios/this_studio/Hot-Peppers-Company-Computer-Vision/checkpoints/lora/bdd_night_train_state.ckpt
