In [1]:
import os
os.environ['PIP_DISABLE_PIP_VERSION_CHECK'] = '1'

!pip install -q --upgrade pip
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
!pip install -q --upgrade numpy scipy scikit-learn
!pip install -q timm pycocotools faster-coco-eval
!pip install -q --upgrade transformers lightly-train
!pip install -q wandb
!pip install -q -U "numpy<2.1" matplotlib --force-reinstall --no-cache-dir

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
libcugraph-cu12 25.6.0 requires libraft-cu12==25.6.*, but you have libraft-cu12 25.2.0 which is incompatible.
pylibcugraph-cu12 25.6.0 requires pylibraft-cu12==25.6.*, but you have pylibraft-cu12 25.2.0 which is incompatible.
pylibcugraph-cu12 25.6.0 requires rmm-cu12==25.6.*, but you have rmm-cu12 25.2.0 which is incompatible.[0m[31m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.3.4 whic

In [2]:
FINAL_DIR = '/kaggle/working/FINAL'

os.makedirs(os.path.join(FINAL_DIR, 'DISTILL-CONVNEXT'), exist_ok=True)
os.makedirs(os.path.join(FINAL_DIR, 'DISTILL-VIT'), exist_ok=True)
os.makedirs(os.path.join(FINAL_DIR, 'FINETUNE_BASELINE'), exist_ok=True)
os.makedirs(os.path.join(FINAL_DIR, 'FINETUNE_DISTILLED'), exist_ok=True)
os.makedirs(os.path.join(FINAL_DIR, 'YOLO'), exist_ok=True)
os.makedirs(os.path.join(FINAL_DIR, 'CONFIG'), exist_ok=True)

print("Directory structure for 'FINAL' created successfully.")

Directory structure for 'FINAL' created successfully.


In [3]:
%%writefile trainer_convnext.py
import os
import math
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoConfig
from PIL import Image
from pycocotools.coco import COCO
from torchvision import transforms as T
from tqdm import tqdm
import wandb
import datetime
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler

class HuggingFaceTeacherWrapper(nn.Module):
    def __init__(self, model_id: str, token: str = None):
        super().__init__()
        if int(os.environ.get("RANK", 0)) == 0:
            print(f"Loading teacher model '{model_id}' from Hugging Face...")
        
        config = AutoConfig.from_pretrained(model_id, token=token, output_hidden_states=True)
        self._model = AutoModel.from_pretrained(model_id, config=config, token=token)
        
        self.is_vit = "vit" in config.model_type.lower()
        
        all_hidden_sizes = self._model.config.hidden_sizes
        self.feature_layers = [-3, -2, -1]
        self.feature_dims = [all_hidden_sizes[i] for i in [1, 2, 3]]
        
        if int(os.environ.get("RANK", 0)) == 0:
            print(f"Architecture: {'ViT' if self.is_vit else 'ConvNeXT'}.")
            print(f"Extracting features from layers with dimensions: {self.feature_dims}")

    def forward(self, x: Tensor) -> list[Tensor]:
        outputs = self._model(pixel_values=x)
        selected_hidden_states = [outputs.hidden_states[i] for i in [2, 3, 4]]
        return selected_hidden_states

class CocoDetectionForDistill(torch.utils.data.Dataset):
    def __init__(self, root, ann_file, transforms):
        self.root = root
        self.coco = COCO(ann_file)
        self.ids = list(sorted(self.coco.imgs.keys()))
        self.transforms = transforms

    def __getitem__(self, index):
        img_id = self.ids[index]
        path = self.coco.loadImgs(img_id)[0]["file_name"]
        img = Image.open(os.path.join(self.root, path)).convert("RGB")
        return self.transforms(img), 0

    def __len__(self):
        return len(self.ids)

def setup_ddp():
    dist.init_process_group(backend="nccl")
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))

def cleanup_ddp():
    dist.destroy_process_group()

def main_training_function(rank, world_size, config):
    device = rank
    is_main_process = (rank == 0)
    
    if is_main_process:
        print(f"Running DDP on {world_size} GPUs.")
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")
        run_name = f"run_ddp_convnext_{timestamp}_lr{config['learning_rate']}_bs{config['batch_size_per_gpu']}"
        try:
            from kaggle_secrets import UserSecretsClient
            from huggingface_hub import login
            secrets = UserSecretsClient()
            hf_token = secrets.get_secret("HUGGINGFACE_TOKEN")
            wandb_key = secrets.get_secret("WANDB_API_KEY")
            login(token=hf_token)
            wandb.login(key=wandb_key)
            wandb.init(project=config["wandb_project"], config=config, name=run_name)
        except Exception:
            hf_token = None
            print("Could not log in, continuing without W&B.")
    else:
        hf_token = None

    dist.barrier()
    
    teacher_model = HuggingFaceTeacherWrapper(config["teacher_hf_id"], token=hf_token).to(device)
    teacher_model.eval()

    if is_main_process:
        print("Loading student model on main process...")
        torch.hub.load("lyuwenyu/RT-DETR", "rtdetrv2_l", pretrained=True, trust_repo=True)

    dist.barrier()

    student_hub_model = torch.hub.load("lyuwenyu/RT-DETR", "rtdetrv2_l", pretrained=True, trust_repo=True)
    student_model = student_hub_model.model.to(device)

    with torch.no_grad():
        x = torch.randn(1, 3, 640, 640).to(device)
        student_features_list = student_model.encoder(student_model.backbone(x))
        student_channels = [f.shape[1] for f in student_features_list]
    
    teacher_dims = teacher_model.feature_dims 
    projection_layers = nn.ModuleList([
        nn.Conv2d(student_channels[i], teacher_dims[i], kernel_size=1) for i in range(len(student_channels))
    ]).to(device)
    
    if is_main_process:
        for i in range(len(projection_layers)):
            print(f"Projection layer {i}: {student_channels[i]} -> {teacher_dims[i]}")

    student_model = DDP(student_model, device_ids=[device], find_unused_parameters=True)
    projection_layers = DDP(projection_layers, device_ids=[device], find_unused_parameters=True)
    
    transforms = T.Compose([
        T.Resize((640, 640)), T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    train_dataset = CocoDetectionForDistill(root=config["dataset_dir"]+"/train2017", ann_file=config["dataset_dir"]+"/annotations/instances_train2017.json", transforms=transforms)
    val_dataset = CocoDetectionForDistill(root=config["dataset_dir"]+"/val2017", ann_file=config["dataset_dir"]+"/annotations/instances_val2017.json", transforms=transforms)
    
    if is_main_process:
        print(f"Data loaded: {len(train_dataset)} training images, {len(val_dataset)} validation images.")

    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank, shuffle=True)
    val_sampler = DistributedSampler(val_dataset, num_replicas=world_size, rank=rank, shuffle=False)

    train_loader = DataLoader(train_dataset, batch_size=config["batch_size_per_gpu"], shuffle=False, num_workers=config["num_workers"], pin_memory=True, drop_last=True, sampler=train_sampler)
    val_loader = DataLoader(val_dataset, batch_size=config["batch_size_per_gpu"], shuffle=False, num_workers=config["num_workers"], pin_memory=True, drop_last=False, sampler=val_sampler)

    params = list(student_model.module.backbone.parameters()) + list(student_model.module.encoder.parameters()) + list(projection_layers.module.parameters())
             
    optimizer = torch.optim.AdamW(params, lr=config["learning_rate"], weight_decay=config["weight_decay"])
    criterion = nn.MSELoss()
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=config['scheduler_factor'], patience=config['scheduler_patience'], verbose=is_main_process)

    if is_main_process and wandb.run:
        wandb.watch((student_model, projection_layers), log="all", log_freq=100)
    
    best_val_loss = float('inf')
    early_stopping_counter = 0

    if is_main_process:
        print("Starting training...")
        
    for epoch in range(config["epochs"]):
        train_sampler.set_epoch(epoch)
        start = time.time()
        student_model.train()
        projection_layers.train()
        total_train_loss = 0.0
        
        train_iterator = tqdm(train_loader, desc=f"Epoch {epoch+1}/{config['epochs']} [Train]") if is_main_process else train_loader

        for images, _ in train_iterator:
            images = images.to(device)
            
            with torch.no_grad():
                teacher_features_list = teacher_model(images)
            
            student_features_list = student_model.module.encoder(student_model.module.backbone(images))
            
            total_loss = 0
            for i in range(len(student_features_list)):
                student_feat = student_features_list[i]
                teacher_feat = teacher_features_list[i]
                
                projected_feat = projection_layers.module[i](student_feat)
                teacher_resized = F.interpolate(teacher_feat, size=projected_feat.shape[-2:], mode="bilinear", align_corners=False)
                
                total_loss += criterion(projected_feat, teacher_resized)
            
            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()
            total_train_loss += total_loss.item()
        
        train_loss_tensor = torch.tensor(total_train_loss).to(device)
        dist.all_reduce(train_loss_tensor, op=dist.ReduceOp.SUM)
        avg_train_loss = train_loss_tensor.item() / (len(train_loader) * world_size)

        student_model.eval()
        projection_layers.eval()
        total_val_loss = 0.0
        
        val_iterator = tqdm(val_loader, desc=f"Epoch {epoch+1}/{config['epochs']} [Val]") if is_main_process else val_loader
        with torch.no_grad():
            for images, _ in val_iterator:
                images = images.to(device)
                teacher_features_list = teacher_model(images)
                student_features_list = student_model.module.encoder(student_model.module.backbone(images))
                
                loss = 0
                for i in range(len(student_features_list)):
                    projected = projection_layers.module[i](student_features_list[i])
                    teacher_resized = F.interpolate(teacher_features_list[i], size=projected.shape[-2:], mode="bilinear", align_corners=False)
                    loss += criterion(projected, teacher_resized)
                total_val_loss += loss.item()
                
        val_loss_tensor = torch.tensor(total_val_loss).to(device)
        dist.all_reduce(val_loss_tensor, op=dist.ReduceOp.SUM)
        avg_val_loss = val_loss_tensor.item() / (len(val_loader) * world_size)
        
        if is_main_process:
            duration = time.time() - start
            print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Duration: {duration:.2f}s")
            if wandb.run: wandb.log({"epoch": epoch + 1, "train/avg_loss": avg_train_loss, "val/avg_loss": avg_val_loss})
            scheduler.step(avg_val_loss)
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                early_stopping_counter = 0
                print(f"Validation loss improved. Saving best model...")
                best_weights = {**student_model.module.backbone.state_dict(), **student_model.module.encoder.state_dict()}
                torch.save({'model': best_weights}, config["best_weights_filename"])
            else:
                early_stopping_counter += 1
                print(f"Validation loss did not improve. Counter: {early_stopping_counter}/{config['early_stopping_patience']}")

        stop_training = torch.tensor(1 if early_stopping_counter >= config['early_stopping_patience'] else 0, device=device)
        dist.all_reduce(stop_training, op=dist.ReduceOp.MAX)
        if stop_training.item() == 1:
            if is_main_process: print("Early stopping triggered.")
            break
            
    if is_main_process:
        print("\nDistillation finished.")
        if wandb.run: wandb.finish()

if __name__ == "__main__":
    setup_ddp()
    rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])

    config = {
        "learning_rate": 1e-4, "epochs": 50, "batch_size_per_gpu": 16,
        "num_workers": 2, "weight_decay": 1e-5,
        "teacher_hf_id": "facebook/dinov3-convnext-base-pretrain-lvd1689m",
        "dataset_dir": "/kaggle/input/dsp-pre-final/processed_taco_coco",
        "scheduler_patience": 3, "scheduler_factor": 0.1,
        "early_stopping_patience": 7,
        "best_weights_filename": "/kaggle/working/FINAL/DISTILL-CONVNEXT/distilled_rtdetr_convnext_teacher_BEST.pth",
        "final_weights_filename": "/kaggle/working/FINAL/DISTILL-CONVNEXT/distilled_rtdetr_convnext_teacher_FINAL.pth",
        "wandb_project": "Distill-RTDETR-ConvNeXt-Teacher",
    }
    main_training_function(rank, world_size, config)
    cleanup_ddp()

Writing trainer_convnext.py


In [4]:
!torchrun --nproc_per_node=2 trainer_convnext.py

W1030 15:37:20.378000 69 torch/distributed/run.py:793] 
W1030 15:37:20.378000 69 torch/distributed/run.py:793] *****************************************
W1030 15:37:20.378000 69 torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W1030 15:37:20.378000 69 torch/distributed/run.py:793] *****************************************
Running DDP on 2 GPUs.
[rank1]:[W1030 15:37:32.942637281 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1]  using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key f

In [5]:
%%writefile trainer_vit.py
import os
import math
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoConfig
from PIL import Image
from pycocotools.coco import COCO
from torchvision import transforms as T
from tqdm import tqdm
import wandb
import datetime
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler

class HuggingFaceTeacherWrapper(nn.Module):
    def __init__(self, model_id: str, token: str = None):
        super().__init__()
        if int(os.environ.get("RANK", 0)) == 0:
            print(f"Loading teacher model '{model_id}' from Hugging Face...")
        
        config = AutoConfig.from_pretrained(model_id, token=token)
        self._model = AutoModel.from_pretrained(model_id, config=config, token=token)
        self.feature_dim = self._model.config.hidden_size
        
        if int(os.environ.get("RANK", 0)) == 0:
            print(f"Architecture: ViT. Feature dim: {self.feature_dim}")

    def forward(self, x: Tensor) -> Tensor:
        outputs = self._model(pixel_values=x)
        patch_tokens = outputs.last_hidden_state[:, 1:, :]
        return patch_tokens

class CocoDetectionForDistill(torch.utils.data.Dataset):
    def __init__(self, root, ann_file, transforms):
        self.root = root
        self.coco = COCO(ann_file)
        self.ids = list(sorted(self.coco.imgs.keys()))
        self.transforms = transforms

    def __getitem__(self, index):
        img_id = self.ids[index]
        path = self.coco.loadImgs(img_id)[0]["file_name"]
        img = Image.open(os.path.join(self.root, path)).convert("RGB")
        return self.transforms(img), 0

    def __len__(self):
        return len(self.ids)

def setup_ddp():
    dist.init_process_group(backend="nccl")
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))

def cleanup_ddp():
    dist.destroy_process_group()

def main_training_function(rank, world_size, config):
    device = rank
    is_main_process = (rank == 0)
    
    if is_main_process:
        print(f"Running DDP on {world_size} GPUs.")
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")
        run_name = f"run_ddp_vit_{timestamp}_lr{config['learning_rate']}_bs{config['batch_size_per_gpu']}"
        try:
            from kaggle_secrets import UserSecretsClient
            from huggingface_hub import login
            secrets = UserSecretsClient()
            hf_token = secrets.get_secret("HUGGINGFACE_TOKEN")
            wandb_key = secrets.get_secret("WANDB_API_KEY")
            login(token=hf_token)
            wandb.login(key=wandb_key)
            wandb.init(project=config["wandb_project"], config=config, name=run_name)
        except Exception:
            hf_token = None
            print("Could not log in, continuing without W&B.")
    else:
        hf_token = None

    dist.barrier()
    
    teacher_model = HuggingFaceTeacherWrapper(config["teacher_hf_id"], token=hf_token).to(device)
    teacher_model.eval()

    if is_main_process:
        print("Loading student model on main process...")
        torch.hub.load("lyuwenyu/RT-DETR", "rtdetrv2_l", pretrained=True, trust_repo=True)

    dist.barrier()

    student_hub_model = torch.hub.load("lyuwenyu/RT-DETR", "rtdetrv2_l", pretrained=True, trust_repo=True)
    student_model = student_hub_model.model.to(device)

    with torch.no_grad():
        x = torch.randn(1, 3, 640, 640).to(device)
        student_channels = student_model.encoder(student_model.backbone(x))[-1].shape[1]
    
    teacher_channels = teacher_model.feature_dim
    projection_layer = nn.Linear(student_channels, teacher_channels).to(device)

    student_model = DDP(student_model, device_ids=[device], find_unused_parameters=True)
    projection_layer = DDP(projection_layer, device_ids=[device], find_unused_parameters=True)
    
    transforms = T.Compose([
        T.Resize((640, 640)), T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    train_dataset = CocoDetectionForDistill(root=config["dataset_dir"]+"/train2017", ann_file=config["dataset_dir"]+"/annotations/instances_train2017.json", transforms=transforms)
    val_dataset = CocoDetectionForDistill(root=config["dataset_dir"]+"/val2017", ann_file=config["dataset_dir"]+"/annotations/instances_val2017.json", transforms=transforms)
    
    if is_main_process: print(f"Data loaded: {len(train_dataset)} train, {len(val_dataset)} val.")

    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank, shuffle=True)
    val_sampler = DistributedSampler(val_dataset, num_replicas=world_size, rank=rank, shuffle=False)

    train_loader = DataLoader(train_dataset, batch_size=config["batch_size_per_gpu"], shuffle=False, num_workers=config["num_workers"], pin_memory=True, drop_last=True, sampler=train_sampler)
    val_loader = DataLoader(val_dataset, batch_size=config["batch_size_per_gpu"], shuffle=False, num_workers=config["num_workers"], pin_memory=True, drop_last=False, sampler=val_sampler)

    params = list(student_model.module.backbone.parameters()) + list(student_model.module.encoder.parameters()) + list(projection_layer.module.parameters())
             
    optimizer = torch.optim.AdamW(params, lr=config["learning_rate"], weight_decay=config["weight_decay"])
    criterion = nn.MSELoss()
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=config['scheduler_factor'], patience=config['scheduler_patience'], verbose=is_main_process)

    if is_main_process and wandb.run:
        wandb.watch((student_model, projection_layer), log="all", log_freq=100)
    
    best_val_loss = float('inf')
    early_stopping_counter = 0

    if is_main_process: print("Starting training...")
        
    for epoch in range(config["epochs"]):
        train_sampler.set_epoch(epoch)
        start = time.time()
        student_model.train()
        projection_layer.train()
        total_train_loss = 0.0
        
        train_iterator = tqdm(train_loader, desc=f"Epoch {epoch+1}/{config['epochs']} [Train]") if is_main_process else train_loader

        for images, _ in train_iterator:
            images = images.to(device)
            
            with torch.no_grad():
                teacher_tokens = teacher_model(images)
            
            student_features_2d = student_model.module.encoder(student_model.module.backbone(images))[-1]
            
            b, c, h, w = student_features_2d.shape
            student_tokens = student_features_2d.flatten(2).permute(0, 2, 1)

            projected_tokens = projection_layer(student_tokens)

            teacher_tokens_resized = F.interpolate(
                teacher_tokens.permute(0, 2, 1),
                size=student_tokens.shape[1],
                mode='linear',
                align_corners=False
            ).permute(0, 2, 1)
            
            loss = criterion(projected_tokens, teacher_tokens_resized)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()
        
        train_loss_tensor = torch.tensor(total_train_loss).to(device)
        dist.all_reduce(train_loss_tensor, op=dist.ReduceOp.SUM)
        avg_train_loss = train_loss_tensor.item() / (len(train_loader) * world_size)

        student_model.eval()
        projection_layer.eval()
        total_val_loss = 0.0
        
        val_iterator = tqdm(val_loader, desc=f"Epoch {epoch+1}/{config['epochs']} [Val]") if is_main_process else val_loader
        with torch.no_grad():
            for images, _ in val_iterator:
                images = images.to(device)
                teacher_tokens = teacher_model(images)
                student_features_2d = student_model.module.encoder(student_model.module.backbone(images))[-1]
                
                b, c, h, w = student_features_2d.shape
                student_tokens = student_features_2d.flatten(2).permute(0, 2, 1)
                
                projected = projection_layer(student_tokens)
                
                teacher_resized = F.interpolate(
                    teacher_tokens.permute(0, 2, 1),
                    size=student_tokens.shape[1],
                    mode='linear',
                    align_corners=False
                ).permute(0, 2, 1)
                
                loss = criterion(projected, teacher_resized)
                total_val_loss += loss.item()
                
        val_loss_tensor = torch.tensor(total_val_loss).to(device)
        dist.all_reduce(val_loss_tensor, op=dist.ReduceOp.SUM)
        avg_val_loss = val_loss_tensor.item() / (len(val_loader) * world_size)
        
        if is_main_process:
            duration = time.time() - start
            print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Duration: {duration:.2f}s")
            if wandb.run: wandb.log({"epoch": epoch + 1, "train/avg_loss": avg_train_loss, "val/avg_loss": avg_val_loss})
            scheduler.step(avg_val_loss)
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                early_stopping_counter = 0
                print(f"Validation loss improved. Saving best model...")
                best_weights = {**student_model.module.backbone.state_dict(), **student_model.module.encoder.state_dict()}
                torch.save({'model': best_weights}, config["best_weights_filename"])
            else:
                early_stopping_counter += 1
                print(f"Validation loss did not improve. Counter: {early_stopping_counter}/{config['early_stopping_patience']}")

        stop_training = torch.tensor(1 if early_stopping_counter >= config['early_stopping_patience'] else 0, device=device)
        dist.all_reduce(stop_training, op=dist.ReduceOp.MAX)
        if stop_training.item() == 1:
            if is_main_process: print("Early stopping triggered.")
            break
            
    if is_main_process:
        print("\nDistillation finished.")
        if wandb.run: wandb.finish()

if __name__ == "__main__":
    setup_ddp()
    rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])

    config = {
        "learning_rate": 1e-4, "epochs": 50, "batch_size_per_gpu": 16,
        "num_workers": 2, "weight_decay": 1e-5,
        "teacher_hf_id": "facebook/dinov3-vitb16-pretrain-lvd1689m", 
        "dataset_dir": "/kaggle/input/dsp-pre-final/processed_taco_coco",
        "scheduler_patience": 3, "scheduler_factor": 0.1,
        "early_stopping_patience": 7,
        "best_weights_filename": "/kaggle/working/FINAL/DISTILL-VIT/distilled_rtdetr_vit_teacher_BEST.pth",
        "final_weights_filename": "/kaggle/working/FINAL/DISTILL-VIT/distilled_rtdetr_vit_teacher_FINAL.pth",
        "wandb_project": "Distill-RTDETR-DINOv3-ViT-Teacher",
    }
    main_training_function(rank, world_size, config)
    cleanup_ddp()

Writing trainer_vit.py


In [6]:
!torchrun --nproc_per_node=2 trainer_vit.py

W1030 17:38:31.802000 2207 torch/distributed/run.py:793] 
W1030 17:38:31.802000 2207 torch/distributed/run.py:793] *****************************************
W1030 17:38:31.802000 2207 torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W1030 17:38:31.802000 2207 torch/distributed/run.py:793] *****************************************
Running DDP on 2 GPUs.
[rank1]:[W1030 17:38:38.868469987 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1]  using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc

# Finetune RT-DETR

In [7]:
!rm -rf /kaggle/working/RT-DETR

In [8]:
%cd /kaggle/working
!git clone https://github.com/lyuwenyu/RT-DETR.git
!cd RT-DETR/rtdetrv2_pytorch && pip install -r requirements.txt -q
!pip install -q protobuf==3.20.3
!pip install -q tensorboard
!pip install -q --upgrade pip
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
!pip install -q --upgrade numpy scipy scikit-learn
!pip install -q timm pycocotools faster-coco-eval
!pip install -q --upgrade transformers lightly-train
!pip install -q wandb
!pip install -q -U "numpy<2.1" matplotlib --force-reinstall --no-cache-dir

/kaggle/working
Cloning into 'RT-DETR'...
remote: Enumerating objects: 1100, done.[K
remote: Counting objects: 100% (23/23), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 1100 (delta 8), reused 5 (delta 5), pack-reused 1077 (from 2)[K
Receiving objects: 100% (1100/1100), 660.70 KiB | 6.48 MiB/s, done.
Resolving deltas: 100% (522/522), done.
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
google-api-core 1.34.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<4.0.0dev,>=3.19.5, but you have protobuf 6.33.0 which is incompatible.
google-cloud-translate 3.12.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have prot

In [9]:
%%writefile /kaggle/working/RT-DETR/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_taco_finetune_convnext.yml
__include__: [
  '../dataset/coco_detection.yml',
  '../runtime.yml',
  './include/dataloader.yml',
  './include/rtdetrv2_r50vd.yml',
]

output_dir: /kaggle/working/FINAL/FINETUNE_DISTILLED/rtdetrv2_finetune_taco_convnext_teacher

RTDETR:
  backbone: PResNet

PResNet:
  depth: 50
  variant: d
  freeze_at: 0
  return_idx: [1, 2, 3]
  num_stages: 4
  freeze_norm: True
  pretrained: False

task: detection
remap_mscoco_category: false
tuning: '/kaggle/working/FINAL/DISTILL-CONVNEXT/distilled_rtdetr_convnext_teacher_BEST.pth'
compile: true
epoches: 50

num_classes: 60

train_dataloader:
  num_workers: 4
  dataset:
    type: CocoDetection
    img_folder: /kaggle/input/dsp-pre-final/processed_taco_coco/train2017
    ann_file: /kaggle/input/dsp-pre-final/processed_taco_coco/annotations/instances_train2017.json

val_dataloader:
  num_workers: 4
  dataset:
    type: CocoDetection
    img_folder: /kaggle/input/dsp-pre-final/processed_taco_coco/val2017
    ann_file: /kaggle/input/dsp-pre-final/processed_taco_coco/annotations/instances_val2017.json

batch_size: 16

optimizer:
  type: AdamW
  params:
    - params: '^(?=.*backbone)'
      lr: 0.00001
    - params: '^(?=.*encoder)'
      lr: 0.00005
  lr: 0.0001
  weight_decay: 0.0001
  betas: [0.9, 0.999]

lr_scheduler:
  type: MultiStepLR
  milestones: [40]
  gamma: 0.1

checkpoint_freq: 10

Writing /kaggle/working/RT-DETR/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_taco_finetune_convnext.yml


In [10]:
%cd /kaggle/working/RT-DETR/rtdetrv2_pytorch/

!torchrun --nproc_per_node=2 tools/train.py \
    -c configs/rtdetrv2/rtdetrv2_taco_finetune_convnext.yml \
    --use-amp \
    --seed=0

/kaggle/working/RT-DETR/rtdetrv2_pytorch
W1030 20:10:48.179000 4396 torch/distributed/run.py:793] 
W1030 20:10:48.179000 4396 torch/distributed/run.py:793] *****************************************
W1030 20:10:48.179000 4396 torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W1030 20:10:48.179000 4396 torch/distributed/run.py:793] *****************************************
2025-10-30 20:10:50.601606: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-30 20:10:50.601606: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:0

In [11]:
%%writefile /kaggle/working/RT-DETR/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_taco_finetune_vit.yml
__include__: [
  '../dataset/coco_detection.yml',
  '../runtime.yml',
  './include/dataloader.yml',
  './include/rtdetrv2_r50vd.yml',
]

output_dir: /kaggle/working/FINAL/FINETUNE_DISTILLED/rtdetrv2_finetune_taco_vit_teacher

RTDETR:
  backbone: PResNet

PResNet:
  depth: 50
  variant: d
  freeze_at: 0
  return_idx: [1, 2, 3]
  num_stages: 4
  freeze_norm: True
  pretrained: False

task: detection
remap_mscoco_category: false
tuning: '/kaggle/working/FINAL/DISTILL-VIT/distilled_rtdetr_vit_teacher_BEST.pth'
compile: true
epoches: 50

num_classes: 60

train_dataloader:
  num_workers: 4
  dataset:
    type: CocoDetection
    img_folder: /kaggle/input/dsp-pre-final/processed_taco_coco/train2017
    ann_file: /kaggle/input/dsp-pre-final/processed_taco_coco/annotations/instances_train2017.json

val_dataloader:
  num_workers: 4
  dataset:
    type: CocoDetection
    img_folder: /kaggle/input/dsp-pre-final/processed_taco_coco/val2017
    ann_file: /kaggle/input/dsp-pre-final/processed_taco_coco/annotations/instances_val2017.json

batch_size: 16

optimizer:
  type: AdamW
  params:
    - params: '^(?=.*backbone)'
      lr: 0.00002   
  lr: 0.00002   
  weight_decay: 0.0001
  betas: [0.9, 0.999]

lr_scheduler:
  type: MultiStepLR
  milestones: [40]
  gamma: 0.1

checkpoint_freq: 10

Writing /kaggle/working/RT-DETR/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_taco_finetune_vit.yml


In [12]:
%cd /kaggle/working/RT-DETR/rtdetrv2_pytorch/

!torchrun --nproc_per_node=2 tools/train.py \
    -c configs/rtdetrv2/rtdetrv2_taco_finetune_vit.yml \
    --use-amp \
    --seed=0

/kaggle/working/RT-DETR/rtdetrv2_pytorch
W1030 21:34:01.734000 10042 torch/distributed/run.py:793] 
W1030 21:34:01.734000 10042 torch/distributed/run.py:793] *****************************************
W1030 21:34:01.734000 10042 torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W1030 21:34:01.734000 10042 torch/distributed/run.py:793] *****************************************
2025-10-30 21:34:04.167425: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-30 21:34:04.167448: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 

# With baseline model

In [13]:
%%writefile /kaggle/working/RT-DETR/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_taco_finetune_BASELINE.yml
__include__: [
  '../dataset/coco_detection.yml',
  '../runtime.yml',
  './include/dataloader.yml',
  './include/rtdetrv2_r50vd.yml',
]

output_dir: /kaggle/working/FINAL/FINETUNE_BASELINE/rtdetrv2_finetune_taco_finetune_BASELINE

RTDETR:
  backbone: PResNet

PResNet:
  depth: 50
  variant: d
  freeze_at: 0
  return_idx: [1, 2, 3]
  num_stages: 4
  freeze_norm: True
  pretrained: False

task: detection
remap_mscoco_category: false

compile: true
epoches: 50
num_classes: 60


train_dataloader:
  num_workers: 4
  dataset:
    type: CocoDetection
    img_folder: /kaggle/input/dsp-pre-final/processed_taco_coco/train2017
    ann_file: /kaggle/input/dsp-pre-final/processed_taco_coco/annotations/instances_train2017.json

val_dataloader:
  num_workers: 4
  dataset:
    type: CocoDetection
    img_folder: /kaggle/input/dsp-pre-final/processed_taco_coco/val2017
    ann_file: /kaggle/input/dsp-pre-final/processed_taco_coco/annotations/instances_val2017.json

batch_size: 16

optimizer:
  type: AdamW
  params:
    - params: '^(?=.*backbone)'
      lr: 0.00002   
  lr: 0.00002   
  weight_decay: 0.0001
  betas: [0.9, 0.999]

lr_scheduler:
  type: MultiStepLR
  milestones: [40]
  gamma: 0.1

checkpoint_freq: 10

Writing /kaggle/working/RT-DETR/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_taco_finetune_BASELINE.yml


In [14]:
%cd /kaggle/working/RT-DETR/rtdetrv2_pytorch/

!torchrun --nproc_per_node=2 tools/train.py \
    -c configs/rtdetrv2/rtdetrv2_taco_finetune_BASELINE.yml \
    --use-amp \
    --seed=0

/kaggle/working/RT-DETR/rtdetrv2_pytorch
W1030 22:57:29.114000 15688 torch/distributed/run.py:793] 
W1030 22:57:29.114000 15688 torch/distributed/run.py:793] *****************************************
W1030 22:57:29.114000 15688 torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W1030 22:57:29.114000 15688 torch/distributed/run.py:793] *****************************************
2025-10-30 22:57:31.524595: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-30 22:57:31.524597: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 

In [15]:
!cp -r /kaggle/working/RT-DETR/rtdetrv2_pytorch/configs/rtdetrv2/* /kaggle/working/FINAL/CONFIG/

In [16]:
!pip install -q ultralytics
%cd /kaggle/working

/kaggle/working


In [17]:
%cd /kaggle/working
!git clone https://github.com/lyuwenyu/RT-DETR.git
!cd RT-DETR/rtdetrv2_pytorch && pip install -r requirements.txt -q
!pip install -q protobuf==3.20.3
!pip install -q tensorboard
!pip install -q --upgrade pip
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
!pip install -q --upgrade numpy scipy scikit-learn
!pip install -q timm pycocotools faster-coco-eval
!pip install -q --upgrade transformers lightly-train
!pip install -q wandb
!pip install -q -U "numpy<2.1" matplotlib --force-reinstall --no-cache-dir

/kaggle/working
fatal: destination path 'RT-DETR' already exists and is not an empty directory.
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
google-api-core 1.34.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<4.0.0dev,>=3.19.5, but you have protobuf 6.33.0 which is incompatible.
google-cloud-translate 3.12.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 6.33.0 which is incompatible.
google-cloud-bigtable 2.32.0 requires google-api-core[grpc]<3.0.0,>=2.17.0, but you have google-api-core 1.34.1 which is incompatible.
bigframes 2.12.0 requires google-cloud-bigquery[bqstorage,pandas]>=3.31.0, but you have google-cloud-bigquery 3.

In [18]:
import os
import json
from tqdm import tqdm
import yaml

YOLO_DATA_ROOT = '/kaggle/working/FINAL/YOLO/taco_yolo'
IMAGE_TRAIN_DIR_DEST = os.path.join(YOLO_DATA_ROOT, 'images', 'train')
IMAGE_VAL_DIR_DEST = os.path.join(YOLO_DATA_ROOT, 'images', 'val')
LABEL_TRAIN_DIR_DEST = os.path.join(YOLO_DATA_ROOT, 'labels', 'train')
LABEL_VAL_DIR_DEST = os.path.join(YOLO_DATA_ROOT, 'labels', 'val')

os.makedirs(IMAGE_TRAIN_DIR_DEST, exist_ok=True)
os.makedirs(IMAGE_VAL_DIR_DEST, exist_ok=True)
os.makedirs(LABEL_TRAIN_DIR_DEST, exist_ok=True)
os.makedirs(LABEL_VAL_DIR_DEST, exist_ok=True)

IMAGE_TRAIN_DIR_SRC = '/kaggle/input/dsp-pre-final/processed_taco_coco/train2017'
IMAGE_VAL_DIR_SRC = '/kaggle/input/dsp-pre-final/processed_taco_coco/val2017'

!cp -n {IMAGE_TRAIN_DIR_SRC}/* {IMAGE_TRAIN_DIR_DEST}/
!cp -n {IMAGE_VAL_DIR_SRC}/* {IMAGE_VAL_DIR_DEST}/

COCO_ANNOTATIONS_TRAIN = '/kaggle/input/dsp-pre-final/processed_taco_coco/annotations/instances_train2017.json'
COCO_ANNOTATIONS_VAL = '/kaggle/input/dsp-pre-final/processed_taco_coco/annotations/instances_val2017.json'

def convert_coco_to_yolo(json_file, output_labels_dir):
    with open(json_file) as f: data = json.load(f)
    images_map = {img['id']: (img['file_name'], img['width'], img['height']) for img in data['images']}
    for ann in tqdm(data['annotations'], desc=f"Converting {os.path.basename(json_file)}"):
        image_id, class_id = ann['image_id'], ann['category_id']
        if image_id not in images_map: continue
        file_name, img_w, img_h = images_map[image_id]
        box = ann['bbox']
        x, y, w, h = box
        x_center, y_center = (x + w / 2) / img_w, (y + h / 2) / img_h
        norm_w, norm_h = w / img_w, h / img_h
        label_file_name = os.path.splitext(file_name)[0] + '.txt'
        label_file_path = os.path.join(output_labels_dir, label_file_name)
        with open(label_file_path, 'a') as f:
            f.write(f"{class_id} {x_center:.6f} {y_center:.6f} {norm_w:.6f} {norm_h:.6f}\n")

convert_coco_to_yolo(COCO_ANNOTATIONS_TRAIN, LABEL_TRAIN_DIR_DEST)
convert_coco_to_yolo(COCO_ANNOTATIONS_VAL, LABEL_VAL_DIR_DEST)

with open(COCO_ANNOTATIONS_TRAIN) as f: coco_data = json.load(f)
categories = sorted(coco_data['categories'], key=lambda x: x['id'])
class_names = [cat['name'] for cat in categories]

taco_yaml_content = {
    'path': YOLO_DATA_ROOT, 'train': 'images/train', 'val': 'images/val',
    'nc': len(class_names), 'names': class_names
}

YAML_PATH = os.path.join(YOLO_DATA_ROOT, 'taco.yaml')
with open(YAML_PATH, 'w') as f: yaml.dump(taco_yaml_content, f, sort_keys=False)

Converting instances_train2017.json: 100%|██████████| 4004/4004 [00:00<00:00, 28758.61it/s]
Converting instances_val2017.json: 100%|██████████| 776/776 [00:00<00:00, 28581.54it/s]


In [19]:
import yaml
import json
import os

COCO_ANNOTATIONS_TRAIN = '/kaggle/input/dsp-pre-final/processed_taco_coco/annotations/instances_train2017.json'
YOLO_DATA_ROOT = '/kaggle/working/FINAL/YOLO/taco_yolo'

with open(COCO_ANNOTATIONS_TRAIN) as f:
    coco_data = json.load(f)

categories = sorted(coco_data['categories'], key=lambda x: x['id'])
class_names = [cat['name'] for cat in categories]

taco_yaml_content = {
    'path': YOLO_DATA_ROOT,
    'train': 'images/train',
    'val': 'images/val',
    'nc': len(class_names),
    'names': class_names
}

YAML_PATH = os.path.join(YOLO_DATA_ROOT, 'taco.yaml')
with open(YAML_PATH, 'w') as f:
    yaml.dump(taco_yaml_content, f, sort_keys=False)

with open(YAML_PATH, 'r') as f:
    print(f.read())

path: /kaggle/working/FINAL/YOLO/taco_yolo
train: images/train
val: images/val
nc: 60
names:
- Aluminium foil
- Battery
- Aluminium blister pack
- Carded blister pack
- Other plastic bottle
- Clear plastic bottle
- Glass bottle
- Plastic bottle cap
- Metal bottle cap
- Broken glass
- Food Can
- Aerosol
- Drink can
- Toilet tube
- Other carton
- Egg carton
- Drink carton
- Corrugated carton
- Meal carton
- Pizza box
- Paper cup
- Disposable plastic cup
- Foam cup
- Glass cup
- Other plastic cup
- Food waste
- Glass jar
- Plastic lid
- Metal lid
- Other plastic
- Magazine paper
- Tissues
- Wrapping paper
- Normal paper
- Paper bag
- Plastified paper bag
- Plastic film
- Six pack rings
- Garbage bag
- Other plastic wrapper
- Single-use carrier bag
- Polypropylene bag
- Crisp packet
- Spread tub
- Tupperware
- Disposable food container
- Foam food container
- Other plastic container
- Plastic glooves
- Plastic utensils
- Pop tab
- Rope & strings
- Scrap metal
- Shoe
- Squeezable tube
- Pla

In [20]:
%%writefile training_yolov11.py
from ultralytics import YOLO
import wandb
import os

output_path = '/kaggle/working/FINAL/YOLO/yolo_checkpoints'
data_yaml_path = '/kaggle/working/FINAL/YOLO/taco_yolo/taco.yaml' 
os.makedirs(output_path, exist_ok=True)

try:
    from kaggle_secrets import UserSecretsClient
    secrets = UserSecretsClient()
    wandb_key = secrets.get_secret("WANDB_API_KEY")
    wandb.login(key=wandb_key)
except Exception as e:
    print(f"{e}")
    
wandb.init(
    project='yolo_runs_taco',
    name='yolo11l_taco_finetune_baseline',
    job_type='fine-tuning',
    dir='/kaggle/working/FINAL/YOLO/'
)

model = YOLO('yolo11l.pt')

results = model.train(
    data=data_yaml_path,
    epochs=50,
    imgsz=640,
    batch=32,
    project=output_path,
    name='yolo11l_finetune_baseline',
    exist_ok=True,
    device=[0, 1]
)
wandb.finish()

Writing training_yolov11.py


In [21]:
!python training_yolov11.py

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnamthse182380[0m ([33mnamthse182380-fpt-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.21.0
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/FINAL/YOLO/wandb/run-20251031_002530-fsq4qebx[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33myolo11l_taco_finetune_baseline[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/namthse182380