## Initializing environment

* Clone my repo to get pyproject.toml, model.py, data.py

In [None]:
# !git clone https://github.com/manavgoel472003/CLIP-VLM.git

In [None]:
# %cd /content/CLIP-VLM

In [None]:
# Mount Google Drive at /content/drive; the repo copy and the default
# checkpoint/data paths used later in this notebook live under it.
from google.colab import drive
drive.mount('/content/drive')
# One-time copy of the freshly cloned repo onto Drive (kept commented out):
# !cp -r /content/CLIP-VLM /content/drive/MyDrive/CLIP-VLM

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Switch the working directory to the repo copy on Drive; the editable
# install further down is run from this directory.
%cd "/content/drive/MyDrive/CLIP-VLM/CLIP-VLM"

/content/drive/MyDrive/CLIP-VLM/CLIP-VLM


In [None]:
!pip install uv



In [None]:
# Editable install of the project found in the current working directory
# (it reads the pyproject.toml that ships with the repo).
!uv pip install -e .

[2mUsing Python 3.12.12 environment at: /usr[0m
[2K[2mResolved [1m75 packages[0m [2min 521ms[0m[0m
[2K[2mPrepared [1m1 package[0m [2min 888ms[0m[0m
[2mUninstalled [1m1 package[0m [2min 0.43ms[0m[0m
[2K[2mInstalled [1m1 package[0m [2min 1ms[0m[0m
 [33m~[39m [1mclip-vlm[0m[2m==0.1.0 (from file:///content/drive/MyDrive/CLIP-VLM/CLIP-VLM)[0m


## Step-by-step Training Flow

Setting hyper-parameters and dir paths

In [None]:
import os
import torch
import logging
from torch.utils.data import DataLoader
from data import DataCfg, build_dataset
from model import ModelCfg, VLMFusionModel
from training import distill_epoch, lm_epoch

# Select matplotlib's Agg backend through the environment and set up INFO-level
# logging with timestamps for the whole notebook.
os.environ["MPLBACKEND"] = "Agg"
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")

# Locations can be overridden with environment variables; the defaults point at
# the Google Drive layout this notebook was written against.
overlap_export_dir = os.environ.get("OVERLAP_EXPORT_DIR", "/content/drive/MyDrive/CLIP-VLM/overlap_front_back_trainval")
checkpoint_dir = os.environ.get("CHECKPOINT_DIR", "/content/drive/MyDrive/CLIP-VLM/checkpoints")
qwen_path = os.environ.get("QWEN_MODEL_DIR", "Qwen/Qwen2.5-VL-3B-Instruct")

# Every knob a reader might want to tune lives in this one dict.
hyperparams = {
    "mode": "lm",  # "distill" selects distill_epoch, anything else lm_epoch
    "epochs": 3,
    "batch_size": 4,
    "lr": 3e-4,
    "wd": 1e-2,
    "prompt": "Describe the scene.",
    "cameras": ["CAM_FRONT", "CAM_BACK"],
    "max_vis_tokens": 512,
    "qwen_quant": "bnb-4bit",
    "lora_r": 8,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "seed": 42,
}

# Seed torch's random number generators so that shuffling and any random
# initialisation are repeatable after a kernel restart.
torch.manual_seed(hyperparams["seed"])

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device={device}, Qwen={qwen_path}, quant={hyperparams['qwen_quant']}")


Using device=cuda, Qwen=Qwen/Qwen2.5-VL-3B-Instruct, quant=bnb-4bit


Initializing the model and data config

In [None]:
# Data configuration: the "overlap_export" dataset rooted at overlap_export_dir,
# restricted to the cameras listed in hyperparams; the nuScenes root and the
# NuInteract directory are left unset.
dcfg = DataCfg(
    dataset="overlap_export",
    overlap_dir=overlap_export_dir,
    cameras=hyperparams["cameras"],
    version="v1.0-mini",
    nusc_root=None,
    nuinteract_dir=None,
    nuinteract_caption_strategy="overall",
    require_files=True,
    # max_samples=hyperparams["max_samples"],
)

print("Building dataset...")
ds = build_dataset(dcfg)


def take_first(batch):
    """Collate function that returns only the first item of the sampled batch."""
    # NOTE(review): with batch_size=4 this keeps one sample out of every four
    # drawn, which matches the 616 steps per epoch reported for 2462 samples.
    # Confirm this is intended rather than batch_size=1 or a real batching collate.
    return batch[0]


loader = DataLoader(
    ds,
    batch_size=hyperparams["batch_size"],
    shuffle=True,
    num_workers=0,
    collate_fn=take_first,
)
print(f"Dataset ready with {len(ds)} samples")
loader


Building dataset...
Dataset ready with 2462 samples


<torch.utils.data.dataloader.DataLoader at 0x7bc69b296f30>

In [None]:
# Model configuration: Qwen backbone (optionally quantised) with LoRA adapters,
# all sizes taken from the hyperparams dict.
mcfg = ModelCfg(
    device=device,
    qwen_id=qwen_path,
    qwen_quant=hyperparams["qwen_quant"],
    max_vis_tokens=hyperparams["max_vis_tokens"],
    use_lora=True,
    lora_r=hyperparams["lora_r"],
    lora_alpha=hyperparams["lora_alpha"],
    lora_dropout=hyperparams["lora_dropout"],
)
model = VLMFusionModel(mcfg).to(device)
print("Model instantiated.")

# Freeze model.qwen and model.ext completely (in that order).
for frozen_module in (model.qwen, model.ext):
    for weight in frozen_module.parameters():
        weight.requires_grad = False

# Inside gen_model only the parameters whose name contains "lora_" stay trainable.
for weight_name, weight in model.gen_model.named_parameters():
    weight.requires_grad = "lora_" in weight_name

# The optimizer sees every connector parameter plus the trainable gen_model ones.
trainable_gen_params = [w for w in model.gen_model.parameters() if w.requires_grad]
train_params = [*model.connector.parameters(), *trainable_gen_params]
opt = torch.optim.AdamW(train_params, lr=hyperparams["lr"], weight_decay=hyperparams["wd"])


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model instantiated.


## Main training loop

Trying out the training for 3 epochs for now

In [None]:
def make_checkpoint():
    """Build the dict written to disk for a checkpoint.

    Contains the connector state dict, the LoRA adapter tensors of
    ``model.gen_model`` (every parameter whose name contains "lora_", i.e. the
    ones the optimizer updates besides the connector) and the metadata fields
    stored alongside the weights.
    """
    lora_weights = {
        name: param.detach().cpu()
        for name, param in model.gen_model.named_parameters()
        if "lora_" in name
    }
    return {
        "connector": model.connector.state_dict(),
        "lora": lora_weights,
        "d_ext": model.ext.d_out,
        "d_model": model.qwen.d_model,
        "version": "v1.0-mini",
        "qwen_id": mcfg.qwen_id,
        "mode": mode,
        "prompt": hyperparams["prompt"],
    }


os.makedirs(checkpoint_dir, exist_ok=True)
mode = hyperparams["mode"]
for epoch in range(1, hyperparams["epochs"] + 1):
    print(f"Starting epoch {epoch}/{hyperparams['epochs']}")
    if mode == "distill":
        loss = distill_epoch(model, loader, opt, device)
    else:
        loss = lm_epoch(model, loader, opt, device, prompt=hyperparams["prompt"])
    print(f"Epoch {epoch} loss: {loss:.4f}")

    # Stop as soon as the loss is nan/inf: saving now would write unusable
    # weights and later overwrite the final checkpoint with them.
    if not bool(torch.isfinite(torch.as_tensor(loss)).all()):
        raise RuntimeError(
            f"Epoch {epoch} produced a non-finite loss ({loss}); "
            "training stopped and no checkpoint was written for this epoch."
        )

    ckpt_path = os.path.join(checkpoint_dir, f"connector_epoch{epoch}.pt")
    torch.save(make_checkpoint(), ckpt_path)
    print(f"Saved {ckpt_path}")

# Only reached when every epoch finished with a finite loss.
final_path = os.path.join(checkpoint_dir, "connector_final.pt")
torch.save(make_checkpoint(), final_path)
print(f"Final checkpoint saved to {final_path}")


Starting epoch 1/3


lm: 100%|██████████| 616/616 [1:55:49<00:00, 11.28s/it]


Epoch 1 loss: 1.1367
Saved /content/drive/MyDrive/CLIP-VLM/checkpoints/connector_epoch1.pt
Starting epoch 2/3


lm: 100%|██████████| 616/616 [1:01:20<00:00,  5.98s/it]


Epoch 2 loss: 1.0668
Saved /content/drive/MyDrive/CLIP-VLM/checkpoints/connector_epoch2.pt
Starting epoch 3/3


lm: 100%|██████████| 616/616 [1:01:32<00:00,  5.99s/it]


Epoch 3 loss: 1.0465
Saved /content/drive/MyDrive/CLIP-VLM/checkpoints/connector_epoch3.pt
Final checkpoint saved to /content/drive/MyDrive/CLIP-VLM/checkpoints/connector_final.pt


## Training (continued)
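Training for 7 more epochs (epochs 4–10), reusing the model and optimizer from the run above.

**Note:** in the recorded output below the loss becomes `nan` from epoch 8 onward, so the epoch 8–10 checkpoints and the overwritten `connector_final.pt` should not be used; epoch 7 is the last checkpoint with a finite loss.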

In [None]:
def make_checkpoint():
    """Build the dict written to disk for a checkpoint.

    Contains the connector state dict, the LoRA adapter tensors of
    ``model.gen_model`` (every parameter whose name contains "lora_") and the
    metadata fields stored alongside the weights. Defined in this cell so the
    cell only needs model, mcfg, mode and hyperparams from the kernel.
    """
    lora_weights = {
        name: param.detach().cpu()
        for name, param in model.gen_model.named_parameters()
        if "lora_" in name
    }
    return {
        "connector": model.connector.state_dict(),
        "lora": lora_weights,
        "d_ext": model.ext.d_out,
        "d_model": model.qwen.d_model,
        "version": "v1.0-mini",
        "qwen_id": mcfg.qwen_id,
        "mode": mode,
        "prompt": hyperparams["prompt"],
    }


total_epochs = 10  # overall target, counting the epochs already completed
first_epoch = hyperparams["epochs"] + 1  # continue right after the initial run

os.makedirs(checkpoint_dir, exist_ok=True)
for epoch in range(first_epoch, total_epochs + 1):
    print(f"Starting epoch {epoch}/{total_epochs}")
    if mode == "distill":
        loss = distill_epoch(model, loader, opt, device)
    else:
        loss = lm_epoch(model, loader, opt, device, prompt=hyperparams["prompt"])
    print(f"Epoch {epoch} loss: {loss:.4f}")

    # Stop as soon as the loss is nan/inf: saving now would write unusable
    # weights and overwrite connector_final.pt with them.
    if not bool(torch.isfinite(torch.as_tensor(loss)).all()):
        raise RuntimeError(
            f"Epoch {epoch} produced a non-finite loss ({loss}); "
            "training stopped and no checkpoint was written for this epoch."
        )

    ckpt_path = os.path.join(checkpoint_dir, f"connector_epoch{epoch}.pt")
    torch.save(make_checkpoint(), ckpt_path)
    print(f"Saved {ckpt_path}")

# Only reached when every epoch finished with a finite loss.
final_path = os.path.join(checkpoint_dir, "connector_final.pt")
torch.save(make_checkpoint(), final_path)
print(f"Final checkpoint saved to {final_path}")


Starting epoch 4/3


lm: 100%|██████████| 616/616 [1:01:23<00:00,  5.98s/it]


Epoch 4 loss: 1.0322
Saved /content/drive/MyDrive/CLIP-VLM/checkpoints/connector_epoch4.pt
Starting epoch 5/3


lm: 100%|██████████| 616/616 [1:01:15<00:00,  5.97s/it]


Epoch 5 loss: 1.0119
Saved /content/drive/MyDrive/CLIP-VLM/checkpoints/connector_epoch5.pt
Starting epoch 6/3


lm: 100%|██████████| 616/616 [1:01:08<00:00,  5.95s/it]


Epoch 6 loss: 0.9743
Saved /content/drive/MyDrive/CLIP-VLM/checkpoints/connector_epoch6.pt
Starting epoch 7/3


lm: 100%|██████████| 616/616 [1:01:00<00:00,  5.94s/it]


Epoch 7 loss: 0.9584
Saved /content/drive/MyDrive/CLIP-VLM/checkpoints/connector_epoch7.pt
Starting epoch 8/3


lm: 100%|██████████| 616/616 [1:01:06<00:00,  5.95s/it]


Epoch 8 loss: nan
Saved /content/drive/MyDrive/CLIP-VLM/checkpoints/connector_epoch8.pt
Starting epoch 9/3


lm: 100%|██████████| 616/616 [1:01:01<00:00,  5.94s/it]


Epoch 9 loss: nan
Saved /content/drive/MyDrive/CLIP-VLM/checkpoints/connector_epoch9.pt
Starting epoch 10/3


lm: 100%|██████████| 616/616 [1:01:04<00:00,  5.95s/it]


Epoch 10 loss: nan
Saved /content/drive/MyDrive/CLIP-VLM/checkpoints/connector_epoch10.pt
Final checkpoint saved to /content/drive/MyDrive/CLIP-VLM/checkpoints/connector_final.pt
