In [None]:
# set up for google colab
# from google.colab import drive
# drive.mount('/content/drive')

# %cd /content/drive/MyDrive/rt-detrv2-fine-tune/RT-DETR/rtdetrv2_pytorch

# !pip install supervision
# !pip install torchmetrics
# !pip install albumentations

Mounted at /content/drive


## Load pretrained model

In [4]:
# load model
import torch
from argparse import Namespace
from src.core import YAMLConfig
import torch.nn as nn
import torch.nn.init as init

args = Namespace(config_path='configs/rtdetrv2/rtdetrv2_r18vd_120e_coco.yml',
                 resume_path='models/rtdetrv2_r18vd_120e_coco_rerun_48.1.pth',
                 tuning=None, device=None, seed=0, use_amp=True, output_dir=None,
                 summary_dir=None, test_only=False, update=None, print_method='builtin',
                 print_rank=0, local_rank=None)


def load_pretrained_model(config_path,resume_path):

    # initialize the raw model
    cfg=YAMLConfig(config_path, resume=resume_path)
    model=cfg.model
    # model state_dict
    state_dict_model=model.state_dict()

    # pretrained state_dict
    checkpoint=torch.load(args.resume_path,map_location="cpu")
    if 'ema' in checkpoint:
        state_dict_pretrained=checkpoint['ema']['module']
    else:
        state_dict_pretrained=checkpoint['model']

    # Create a new state dictionary to store matched weights
    matched_weights = {}

        # Loop through all layers in the model
    for model_key, model_param in state_dict_model.items():
        # Try to find a matching key in the state_dict
        matched_key = None
        for state_key in state_dict_pretrained.keys():
            # Check if the state_dict key is a substring of the model key
            if state_key in model_key:
                matched_key = state_key
                break

        # If a matching key is found and shapes match, load the weight
        if matched_key is not None:
            state_weight = state_dict_pretrained[matched_key]

            # Ensure the shapes match exactly
            if state_weight.shape == model_param.shape:
                matched_weights[model_key] = state_weight
                print(f"Matched and loaded weight for: {model_key}")
            else:
                print(f"Shape mismatch for {model_key}: {state_weight.shape} vs {model_param.shape}")

    # Load the matched weights into the model
    model.load_state_dict(matched_weights, strict=False)
    print(f"\nLoad pretrained weights succesfully | {sum(p.numel() for p in model.parameters())/1e6} million parameters")
    return model, cfg

model, cfg=load_pretrained_model(args.config_path,args.resume_path)


Downloading: "https://github.com/lyuwenyu/storage/releases/download/v0.1/ResNet18_vd_pretrained_from_paddle.pth" to /root/.cache/torch/hub/checkpoints/ResNet18_vd_pretrained_from_paddle.pth
100%|██████████| 42.8M/42.8M [00:00<00:00, 225MB/s]


Load PResNet18 state_dict


  checkpoint=torch.load(args.resume_path,map_location="cpu")


Matched and loaded weight for: backbone.conv1.conv1_1.conv.weight
Matched and loaded weight for: backbone.conv1.conv1_1.norm.weight
Matched and loaded weight for: backbone.conv1.conv1_1.norm.bias
Matched and loaded weight for: backbone.conv1.conv1_1.norm.running_mean
Matched and loaded weight for: backbone.conv1.conv1_1.norm.running_var
Matched and loaded weight for: backbone.conv1.conv1_1.norm.num_batches_tracked
Matched and loaded weight for: backbone.conv1.conv1_2.conv.weight
Matched and loaded weight for: backbone.conv1.conv1_2.norm.weight
Matched and loaded weight for: backbone.conv1.conv1_2.norm.bias
Matched and loaded weight for: backbone.conv1.conv1_2.norm.running_mean
Matched and loaded weight for: backbone.conv1.conv1_2.norm.running_var
Matched and loaded weight for: backbone.conv1.conv1_2.norm.num_batches_tracked
Matched and loaded weight for: backbone.conv1.conv1_3.conv.weight
Matched and loaded weight for: backbone.conv1.conv1_3.norm.weight
Matched and loaded weight for: b

## 1. Dataset

In [5]:
import albumentations as A
from data_visdrone import VisDroneData
from torch.utils.data import DataLoader
import torch



def collate_fn(batch):
    # Extract pixel values and labels
    pixel_values = torch.stack([x["pixel_values"] for x in batch])

    # Prepare labels
    labels = [x["labels"] for x in batch]

    return {"pixel_values": pixel_values, "labels": labels}

# dataloaders
train_transform = A.Compose(
    [
        A.HorizontalFlip(p=0.5),
        A.ShiftScaleRotate(
            shift_limit=0.1,
            scale_limit=0.5,
            rotate_limit=0,
            p=0.5
        ),
        A.HueSaturationValue(
            hue_shift_limit=15, sat_shift_limit=70, val_shift_limit=40, p=0.5
        ),
        A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5),
    ],
    bbox_params=A.BboxParams(
        format="pascal_voc",  # Albumentations expects [xmin, ymin, xmax, ymax]
        label_fields=["category"],
        clip=True,
        min_area=1,
    ),
)

val_transform = A.Compose(
    [A.NoOp()],
    bbox_params=A.BboxParams(
        format="pascal_voc",
        label_fields=["category"],
        clip=True,
        min_area=1,
    ),
)

ds_train = VisDroneData(
        json_path="dataset/visdrone/annotations/train_coco.json",
        split="train",
        transforms=train_transform)
train_loader=DataLoader(ds_train,
                        batch_size=8,
                        collate_fn=collate_fn,
                        num_workers=2,
                        shuffle=True,
                        pin_memory=True)

ds_val = VisDroneData(
        json_path="dataset/visdrone/annotations/val_coco.json",
        split="val",
        transforms=train_transform)
val_loader=DataLoader(ds_val,
                      batch_size=8,
                      collate_fn=collate_fn,
                      num_workers=2,
                      shuffle=False,
                      pin_memory=True)

# take a batch
batch=next(iter(train_loader))
print(batch)

  check_for_updates()


preprocessor_config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

{'pixel_values': tensor([[[[0.3843, 0.4000, 0.4039,  ..., 0.3804, 0.3529, 0.3294],
          [0.3804, 0.3922, 0.3961,  ..., 0.3490, 0.3804, 0.3137],
          [0.3765, 0.3843, 0.3922,  ..., 0.3098, 0.3804, 0.3373],
          ...,
          [0.2431, 0.2784, 0.3176,  ..., 0.3176, 0.3098, 0.3059],
          [0.2980, 0.3412, 0.3373,  ..., 0.3176, 0.3137, 0.3059],
          [0.3451, 0.3451, 0.3176,  ..., 0.3176, 0.3137, 0.3098]],

         [[0.5294, 0.5451, 0.5490,  ..., 0.4235, 0.2902, 0.1137],
          [0.5255, 0.5373, 0.5412,  ..., 0.3961, 0.3451, 0.1451],
          [0.5216, 0.5294, 0.5373,  ..., 0.3686, 0.3882, 0.2431],
          ...,
          [0.2235, 0.2588, 0.2980,  ..., 0.3020, 0.2941, 0.2902],
          [0.2784, 0.3216, 0.3176,  ..., 0.3020, 0.2980, 0.2902],
          [0.3255, 0.3255, 0.2980,  ..., 0.3020, 0.2980, 0.2941]],

         [[0.6431, 0.6588, 0.6627,  ..., 0.4510, 0.3490, 0.2196],
          [0.6392, 0.6510, 0.6549,  ..., 0.4235, 0.3961, 0.2314],
          [0.6353, 0.6431

## 2. Train

In [6]:
import torch
import numpy as np
import supervision as sv
from tqdm import tqdm
from transformers import get_scheduler
from torch.cuda.amp import GradScaler, autocast


from dataclasses import dataclass, replace
from transformers import (
    AutoImageProcessor,
    TrainingArguments,
    Trainer
)
from torchmetrics.detection.mean_ap import MeanAveragePrecision

In [7]:
model=model.to("cuda")
# train 1 epoch
def train_one_epoch(
    model,loader,optimizer,criterion,
    max_norm=0.1,
    device="cuda",
    lr_warmup_scheduler=None, # learning rate warmup: start with small lr till reaching actual lr
    amp=True,
    scaler=None):

  model.train()
  loss_total=0
  num_batches=0

  if scaler is None:
    scaler=GradScaler(enabled=True) # use GradeScaler if scaler is not provided

  # tqdm progress bar
  progress_bar=tqdm(loader,desc="Training",leave =True)
  for batch_idx, batch in enumerate(progress_bar):
    batch_images = batch["pixel_values"].to(device)

    batch_images = batch_images.to(device=device, dtype=torch.float32, non_blocking=True)

    batch_targets = [{k: v.to(device) for k, v in t.items()} for t in batch["labels"]]

    # forward with amp (mixed precision)
    with torch.autocast(device_type=device, cache_enabled=True):
      outputs = model(batch_images, batch_targets)
    with torch.autocast(device_type=device, cache_enabled=False):
      loss_dict = criterion(outputs, batch_targets)

    loss=sum(loss_dict.values())

    scaler.scale(loss).backward()
    # gradient clipping
    if max_norm > 0:
      scaler.unscale_(optimizer)
      torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)

    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad() # zero grad for the next run

    if lr_warmup_scheduler is not None:
        lr_warmup_scheduler.step()

    # track loss
    loss_total += loss.item()
    num_batches += 1

    # Update tqdm bar with the current loss
    progress_bar.set_postfix({"batch_loss": loss.item()})

  # Close tqdm bar
  progress_bar.close()

  # Return average loss
  return loss_total / num_batches if num_batches > 0 else 0


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [None]:
# validation
# model output as a class -> to suit post processing method
@dataclass
class ModelOutput:
    logits: torch.Tensor
    pred_boxes: torch.Tensor


# compute mAP50 and mAP50-100 in validation
def validate(model, loader, processor, threshold, device):
    model.eval()

    # Initialize tqdm progress bar and evaluator
    progress_bar = tqdm(loader, desc="Validating", leave=True)
    evaluator = MeanAveragePrecision(box_format="xyxy", class_metrics=True)
    evaluator.warn_on_many_detections = False

    for batch in progress_bar:
        # Move batch data to the correct device
        images = batch['pixel_values'].to(device)
        batch_targets = batch['labels']

        # (1) Prepare target sizes and targets
        target_sizes = torch.tensor(np.array([x["orig_size"] for x in batch_targets])).to(device)
        batch_targets_processed = []

        # loop through individual targets
        for target, (height,width) in zip(batch_targets,target_sizes):
            boxes=target['boxes'].cpu().numpy()
            # convert to xyxy and compute actual dimensions
            boxes=sv.xcycwh_to_xyxy(boxes)
            boxes=boxes*np.array([width.item(),height.item(),width.item(),height.item()])
            boxes=torch.tensor(boxes, device=device)
            labels=target["labels"].to(device)
            batch_targets_processed.append({
                "boxes": boxes,
                "labels": labels
            })

        # (2) Compute predictions and post-process them
        with torch.no_grad():
            preds = model(images)
            outputs = ModelOutput(
                logits=preds['pred_logits'],
                pred_boxes=preds['pred_boxes']
            )
            batch_preds_processed = processor.post_process_object_detection(
                outputs,
                threshold=threshold,
                target_sizes=target_sizes
            )

        # (3) Update evaluator incrementally
        preds_for_evaluator = [
            {
                "boxes": pred["boxes"].cpu(),
                "scores": pred["scores"].cpu(),
                "labels": pred["labels"].cpu()
            }
            for pred in batch_preds_processed
        ]
        targets_for_evaluator = [
            {
                "boxes": target["boxes"].cpu(),
                "labels": target["labels"].cpu()
            }
            for target in batch_targets_processed
        ]
        evaluator.update(preds=preds_for_evaluator, target=targets_for_evaluator)

    # Compute final metrics
    metrics = evaluator.compute()
    mAP50 = metrics["map_50"].item()
    mAP50_95 = metrics["map"].item()

    #print(f"mAP@50: {mAP50:.4f}, mAP@50-95: {mAP50_95:.4f}")
    return mAP50, mAP50_95



In [None]:
# Main training loop
def train(model, train_loader, val_loader, criterion, processor,device="cuda",num_epochs=100, threshold=0.01):
    # Define optimizer, criterion, and scheduler
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=0.000714, # ultralytics lr
        betas=(0.9, 0.999),
        weight_decay=0.0005
    )


    # Learning rate warmup scheduler
    num_training_steps = len(train_loader) * num_epochs
    lr_warmup_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=500,
        num_training_steps=num_training_steps,
    )

    best_model = None
    best_map50 = 0
    device = "cuda"

    for epoch in range(num_epochs):
        # Apply warmup scheduler only in the first epoch
        current_lr_scheduler = lr_warmup_scheduler if epoch == 0 else None

        # Single training call
        train_loss = train_one_epoch(
            model,
            train_loader,
            optimizer,
            criterion,
            max_norm=0.1,
            device=device,
            lr_warmup_scheduler=lr_warmup_scheduler
        )

        # Validation
        map50, map50_95 = validate(
            model,
            val_loader,
            processor=processor,
            threshold=threshold,
            device=device
        )

        print(f"--------- Epoch {epoch + 1}/{num_epochs} --------- ")
        print(f"train_loss: {train_loss:.4f} | val_map50: {map50:.4f} | val_map50_95: {map50_95:.4f}")

        # Update best model
        if map50 > best_map50:
            best_map50 = map50
            best_model = model.state_dict()  # Save model state dict, not the entire model

    return best_model, best_map50


# processor for evaluator
processor=AutoImageProcessor.from_pretrained(
            "PekingU/rtdetr_r18vd_coco_o365",
            do_resize=True,
            size={"width": 640, "height": 640},)
# Hungarian matching loss
criterion = cfg.criterion
train(model=model, train_loader=train_loader, val_loader=val_loader, criterion=criterion, processor=processor, device="cuda", num_epochs=10, threshold=0.01)
