# Drone Object Detection Challenge - Complete Pipeline

Notebook này chứa toàn bộ code để thực hiện Drone Object Detection Challenge. Bao gồm:

1. **Setup và cài đặt dependencies**
2. **Khám phá dữ liệu**
3. **Training model**
4. **Evaluation với ST-IoU metric**
5. **Export ONNX/TorchScript (để build TensorRT trên Jetson NX)**
6. **Tạo submission file**

## Mục tiêu
- Detect object trong video drone dựa trên reference images
- Optimize cho NVIDIA Jetson Xavier NX (≤50M parameters)
- Đạt ≥25 FPS real-time inference
- Maximize ST-IoU metric

In [29]:
# Import các thư viện cần thiết
import os
import sys
import json
import yaml
import cv2
import torch
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from typing import Dict, List, Tuple
import warnings
warnings.filterwarnings('ignore')

# Make the project root (parent of the notebook's working directory) importable.
# Guarded so that re-running this cell does not append duplicate entries.
project_root = Path().absolute().parent
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Environment report
print(f"Project root: {project_root}")
print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Query name and memory for the same (current) device
    current_device = torch.cuda.current_device()
    print(f"CUDA device: {torch.cuda.get_device_name(current_device)}")
    print(f"CUDA memory: {torch.cuda.get_device_properties(current_device).total_memory / 1e9:.1f} GB")

Project root: /Users/macos/Downloads/Zalo/drone_detection
Python version: 3.12.3 (v3.12.3:f6650f9ad7, Apr  9 2024, 08:18:47) [Clang 13.0.0 (clang-1300.0.29.30)]
PyTorch version: 2.8.0
CUDA available: False


## Cài đặt Dependencies

Trước khi chạy, hãy đảm bảo đã cài đặt tất cả các dependencies cần thiết.

In [None]:
# Cài đặt dependencies cho OSD-YOLOv10 - Production
import subprocess
import sys

# Packages required by the OSD-YOLOv10 production pipeline
packages = ["ultralytics", "timm", "albumentations", "tensorboard", "opencv-python", "einops"]

# Install everything with a single pip invocation: pip resolves the whole set
# together, and its start-up cost and upgrade notice are paid once instead of
# once per package. `sys.executable -m pip` targets the interpreter that this
# kernel is running on.
subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", *packages])

print("Dependencies installed successfully!")


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mp

Dependencies installed successfully!



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


## Khám phá dữ liệu

In [31]:
# Dataset locations: the "data" folder sits next to the project directory
data_dir = project_root.parent.joinpath("data")
train_dir = data_dir.joinpath("train")
test_dir = data_dir.joinpath("public_test")

In [32]:
# Load the training annotations.
# The encoding is passed explicitly so that reading the JSON file does not
# depend on the platform's default locale encoding.
annotations_path = train_dir / "annotations" / "annotations.json"
with open(annotations_path, encoding="utf-8") as f:
    annotations = json.load(f)

## Load Model và Setup Training

Bây giờ chúng ta sẽ load model và setup training pipeline.

In [None]:
# Import models và setup
from models.ultra_light_detector import UltraLightDroneDetector, create_ultra_light_model
from models.light_osd_yolov10 import LightOSDYOLOv10
from utils.dataset import DroneDataset, collate_fn
from utils.evaluation import compute_st_iou, evaluate_video

# Load config from Python file
from configs.default import config

# Point the config's data section at this machine's directories
# NOTE(review): 'train_dir' receives the data root (data_dir), not train_dir —
# presumably the loader appends the split name itself; confirm.
data_section = config['data']
data_section['train_dir'] = str(data_dir)
data_section['test_dir'] = str(test_dir)

In [34]:
# Build the production model on the GPU when one is available, else on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Instantiate the ultra-light model from the config and move it to `device`
model = create_ultra_light_model(config).to(device)

print(f"Model created successfully!")
print(f"Parameters: {model.count_parameters():,}")
# Two bytes per parameter, divided by 1024**2
print(f"Memory (FP16): {(model.count_parameters() * 2) / (1024**2):.1f} MB")

Model created successfully!
Parameters: 12,689,322
Memory (FP16): 24.2 MB


In [None]:
# Create training dataset
import utils.dataset
from utils.dataset import DroneDataset, collate_fn

# Training split; all dataset options are collected in one mapping
dataset_options = dict(
    data_dir=str(data_dir),
    split="train",
    frame_sampling_rate=5,
    max_frames_per_video=100,
    augmentation=True,
    image_size=(640, 640),
)
train_dataset = DroneDataset(**dataset_options)

print(f"Training dataset created: {len(train_dataset)} samples")

Training dataset created: 1400 samples


## Training Pipeline

Bây giờ chúng ta sẽ implement training loop để train model.

In [42]:
# Training Setup
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from tqdm import tqdm

def train_one_epoch(model, dataloader, optimizer, device):
    """Run one optimisation pass over ``dataloader`` and return the mean loss.

    Args:
        model: Called as ``model(images, object_images, targets)``. Batches
            whose output has no ``'losses'`` key are skipped without a step.
        dataloader: Iterable of batch dicts with the keys ``'images'``,
            ``'object_images'``, ``'video_ids'``, ``'bboxes'`` and ``'labels'``.
        optimizer: Optimizer stepped once per batch that produced a loss.
        device: Device that inputs and targets are moved to.

    Returns:
        Sum of ``total_loss`` over the batches that produced one, divided by
        the number of such batches (0.0 when there were none).
    """
    model.train()
    loss_sum = 0.0
    stepped_batches = 0

    for batch in tqdm(dataloader, desc="Training"):
        images = batch['images'].to(device)
        object_images = batch['object_images'].to(device)

        # One target dict per sample; samples without boxes get empty tensors
        targets = []
        for idx, _ in enumerate(batch['video_ids']):
            sample_boxes = batch['bboxes'][idx]
            if len(sample_boxes) == 0:
                targets.append({
                    'boxes': torch.empty(0, 4, device=device),
                    'labels': torch.empty(0, dtype=torch.long, device=device)
                })
                continue
            targets.append({
                'boxes': sample_boxes.to(device),
                'labels': batch['labels'][idx].to(device)
            })

        optimizer.zero_grad()
        output = model(images, object_images, targets)

        # No loss reported for this batch: nothing to back-propagate
        if 'losses' not in output:
            continue

        loss = output['losses']['total_loss']
        loss.backward()
        # Clip the global gradient norm to 1.0 before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        loss_sum += loss.item()
        stepped_batches += 1

    # max(..., 1) avoids dividing by zero when no batch produced a loss
    return loss_sum / max(stepped_batches, 1)

# Optimizer and LR schedule, both driven by the 'training' config section
training_cfg = config['training']
optimizer = AdamW(
    model.parameters(),
    lr=training_cfg['learning_rate'],
    weight_decay=training_cfg['weight_decay'],
)
scheduler = CosineAnnealingLR(optimizer, T_max=training_cfg['epochs'])

# Shuffled training batches; memory is pinned only when CUDA is available
# NOTE(review): the recorded run of the training cell sat at 0/175 batches for
# 03:25 before being interrupted — worth checking whether num_workers=4 is
# the cause.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=training_cfg['batch_size'],
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
)

In [43]:
# Training loop: run every epoch and checkpoint whenever the epoch loss improves
num_epochs = config['training']['epochs']
best_loss = float('inf')

checkpoints_dir = project_root / "checkpoints"
checkpoints_dir.mkdir(exist_ok=True)

for epoch in range(num_epochs):
    epoch_loss = train_one_epoch(model, train_dataloader, optimizer, device)
    scheduler.step()

    improved = epoch_loss < best_loss
    if improved:
        # New best: snapshot model, optimizer and scheduler state with the config
        best_loss = epoch_loss
        best_model_path = checkpoints_dir.joinpath("best_model.pth")
        checkpoint = dict(
            model_state_dict=model.state_dict(),
            optimizer_state_dict=optimizer.state_dict(),
            scheduler_state_dict=scheduler.state_dict(),
            config=config,
            epoch=epoch + 1,  # stored 1-based
            loss=epoch_loss,
        )
        torch.save(checkpoint, best_model_path)

    print(f"Epoch {epoch + 1}/{num_epochs} - Loss: {epoch_loss:.4f} - Best: {best_loss:.4f}")

print("Training completed!")

Training:   0%|          | 0/175 [03:25<?, ?it/s]



KeyboardInterrupt: 

## Evaluation với ST-IoU Metric

Bây giờ chúng ta sẽ evaluate model sử dụng Spatio-Temporal IoU metric.

In [None]:
# Evaluation với ST-IoU
def evaluate_model_on_dataset(model, dataset, max_samples=500, confidence_threshold=0.5):
    """Score the model sample by sample with ST-IoU.

    Inputs are moved to the module-level ``device``.

    Args:
        model: Called as ``model(images, object_images)``; the first entry of
            ``output['predictions']`` must provide ``'boxes'``, ``'scores'``
            and ``'similarities'``.
        dataset: Indexable dataset whose samples provide ``'image'``,
            ``'object_images'``, ``'frame_id'`` and ``'bboxes'``.
        max_samples: Upper bound on the number of samples evaluated; the
            first ``min(max_samples, len(dataset))`` indices are used.
        confidence_threshold: Predictions whose score is not strictly greater
            than this value are dropped. Defaults to 0.5, the cut-off that
            was previously hard-coded.

    Returns:
        List with one ST-IoU value per evaluated sample.
    """
    model.eval()
    st_ious = []
    num_samples = min(max_samples, len(dataset))

    with torch.no_grad():
        for i in tqdm(range(num_samples), desc="Evaluating"):
            sample = dataset[i]
            frame_id = sample['frame_id']

            # Add the batch dimension expected by the model
            images = sample['image'].unsqueeze(0).to(device)
            object_images = sample['object_images'].unsqueeze(0).to(device)

            predictions = model(images, object_images)['predictions'][0]

            # Keep only sufficiently confident predictions
            pred_list = []
            for box, score, sim in zip(predictions['boxes'], predictions['scores'], predictions['similarities']):
                confidence = score.item()
                if confidence > confidence_threshold:
                    pred_list.append({
                        'frame': frame_id,
                        'bbox': box.tolist(),
                        'confidence': confidence,
                        'similarity': sim.item()
                    })

            # Ground truth: the first four values of every annotated box
            gt_list = [
                {'frame': frame_id, 'bbox': [box[k].item() for k in range(4)]}
                for box in sample['bboxes']
            ]

            # NOTE(review): a sample with neither predictions nor ground truth
            # also scores 0.0 — confirm this matches the challenge's metric.
            if pred_list and gt_list:
                st_iou = compute_st_iou(pred_list, gt_list)
            else:
                st_iou = 0.0

            st_ious.append(st_iou)

    return st_ious

# Restore the best checkpoint (when one was saved) and evaluate it
# NOTE(review): torch.load defaults to weights_only=True from PyTorch 2.6 on;
# that only works if the pickled 'config' holds plain containers/primitives —
# confirm.
best_model_path = checkpoints_dir / "best_model.pth"
if best_model_path.exists():
    checkpoint = torch.load(best_model_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
else:
    # Make it visible that the scores below describe the in-memory weights
    print(f"Warning: {best_model_path} not found; evaluating the current in-memory weights")

# NOTE(review): this scores train_dataset, which was built with
# augmentation=True — treat the number as a sanity check, not a held-out metric.
st_ious = evaluate_model_on_dataset(model, train_dataset)

# Plain Python float instead of a NumPy scalar (the value is pickled into the
# final checkpoint later on), and 0.0 instead of NaN when nothing was evaluated
avg_st_iou = float(np.mean(st_ious)) if st_ious else 0.0

print("Evaluation Results:")
print(f"Average ST-IoU: {avg_st_iou:.4f}")
print(f"Samples evaluated: {len(st_ious)}")

## Save Model và Checkpoints

Lưu model đã train để sử dụng sau này.

In [None]:
# Save the final weights together with the config and summary metrics
param_count = model.count_parameters()
final_checkpoint = {
    'model_state_dict': model.state_dict(),
    'config': config,
    'model_parameters': param_count,
    # Store a plain Python float: a NumPy scalar (as returned by np.mean) is
    # not on torch.load()'s default allow-list under weights_only=True
    # (the default since PyTorch 2.6), which would make this file unloadable
    # without extra opt-in.
    'avg_st_iou': float(avg_st_iou)
}

final_model_path = checkpoints_dir / "final_model.pth"
torch.save(final_checkpoint, final_model_path)

print(f"Final model saved to: {final_model_path}")
print(f"Model parameters: {param_count:,}")
print(f"Average ST-IoU: {avg_st_iou:.4f}")

## Generate Submission File

Tạo file submission cho test set theo format yêu cầu.

In [None]:
# Generate Submission
def _load_reference_images(object_images_dir, num_refs=3, size=(224, 224)):
    """Load the reference crops of one video as a single batch tensor.

    Reads ``*.jpg`` files in sorted order, converts BGR to RGB, resizes to
    ``size`` and scales to [0, 1]. Files that OpenCV cannot decode are
    skipped. When fewer than ``num_refs`` images are available the last one
    is repeated.

    Returns:
        Tensor of shape ``(1, num_refs, 3, H, W)`` on the module-level
        ``device``, or ``None`` when no image could be read.
    """
    ref_images = []
    for img_file in sorted(object_images_dir.glob("*.jpg")):
        img = cv2.imread(str(img_file))
        if img is None:
            # cv2.imread reports an unreadable/corrupt file by returning None
            continue
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, size)
        img = img.astype(np.float32) / 255.0
        ref_images.append(torch.from_numpy(img).permute(2, 0, 1))
        if len(ref_images) == num_refs:
            # Only the first num_refs images are ever used
            break

    if not ref_images:
        return None

    # Pad by repeating the last image until there are exactly num_refs
    ref_images += [ref_images[-1]] * (num_refs - len(ref_images))
    return torch.stack(ref_images).unsqueeze(0).to(device)


def _detect_in_video(model, video_file, ref_images_tensor, confidence_threshold, input_size=640):
    """Run the model on every frame of ``video_file`` and collect detections.

    Each frame is converted to RGB, resized to ``input_size`` x ``input_size``
    and scaled to [0, 1]. Box coordinates returned by the model are multiplied
    by ``frame size / input_size`` (this assumes the model reports boxes in
    pixels of the resized input — TODO confirm).

    Returns:
        List of detection dicts with ``frame``, ``x1``, ``y1``, ``x2``,
        ``y2``, ``confidence`` and ``similarity``.
    """
    detections = []
    cap = cv2.VideoCapture(str(video_file))
    try:
        frame_idx = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            h, w = frame.shape[:2]
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_resized = cv2.resize(frame_rgb, (input_size, input_size))
            frame_normalized = frame_resized.astype(np.float32) / 255.0
            frame_tensor = torch.from_numpy(frame_normalized).permute(2, 0, 1).unsqueeze(0).to(device)

            predictions = model(frame_tensor, ref_images_tensor)['predictions'][0]

            for box, score, sim in zip(predictions['boxes'], predictions['scores'], predictions['similarities']):
                confidence = float(score.item())
                if confidence <= confidence_threshold:
                    continue

                x1, y1, x2, y2 = box.cpu().numpy()
                detections.append({
                    "frame": frame_idx,
                    "x1": int(x1 * w / input_size),
                    "y1": int(y1 * h / input_size),
                    "x2": int(x2 * w / input_size),
                    "y2": int(y2 * h / input_size),
                    "confidence": confidence,
                    "similarity": float(sim.item())
                })

            frame_idx += 1
    finally:
        # Release the decoder even if inference raises mid-video
        cap.release()

    return detections


def generate_submission(model, test_dir, output_path, confidence_threshold=0.3):
    """Run inference on every test video and write the submission JSON.

    Every sub-directory of ``test_dir / "samples"`` is treated as one video:
    reference images come from ``object_images/*.jpg`` and frames from
    ``drone_video.mp4``. Videos without readable reference images or without
    the video file get an empty detection list.

    Args:
        model: Called as ``model(frame, reference_images)``; the first entry
            of ``output['predictions']`` must provide ``'boxes'``,
            ``'scores'`` and ``'similarities'``.
        test_dir: Test-set root (``str`` or ``Path``).
        output_path: Destination JSON file (``str`` or ``Path``); missing
            parent directories are created.
        confidence_threshold: Detections whose score is not strictly greater
            than this value are dropped.

    Returns:
        The submission: a list of ``{"video_id": ..., "detections": [...]}``
        dicts, one per video directory, ordered by directory path.
    """
    test_dir = Path(test_dir)
    output_path = Path(output_path)

    samples_dir = test_dir / "samples"
    # Sorted so the submission order is reproducible (iterdir order is arbitrary)
    video_dirs = sorted(d for d in samples_dir.iterdir() if d.is_dir())

    submission = []
    model.eval()

    with torch.no_grad():
        for video_dir in tqdm(video_dirs, desc="Processing test videos"):
            detections = []

            ref_images_tensor = _load_reference_images(video_dir / "object_images")
            video_file = video_dir / "drone_video.mp4"
            if ref_images_tensor is not None and video_file.exists():
                detections = _detect_in_video(model, video_file, ref_images_tensor, confidence_threshold)

            submission.append({"video_id": video_dir.name, "detections": detections})

    # parents=True: do not fail when intermediate directories are missing
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding="utf-8") as f:
        json.dump(submission, f, indent=2)

    return submission

# Write the submission for the public test set and print a short summary
submission_dir = project_root.joinpath("submissions")
submission_path = submission_dir.joinpath("final_submission.json")

submission = generate_submission(model, test_dir, submission_path)

# Count detections across all videos
total_detections = 0
for video_entry in submission:
    total_detections += len(video_entry['detections'])

print(f"Submission generated!")
print(f"Total videos: {len(submission)}")
print(f"Total detections: {total_detections}")
print(f"Saved to: {submission_path}")

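## Export ONNX/TorchScript cho Jetson NX

Export model sang ONNX và TorchScript để deploy trên Jetson Xavier NX. Lưu ý: cell này chưa tạo TensorRT engine; engine cần được build từ file ONNX ngay trên thiết bị Jetson (ví dụ bằng `trtexec`).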

In [None]:
# Export Model for Deployment
def export_model_for_deployment(model, export_dir):
    """Export the model as ONNX, TorchScript and a state-dict checkpoint.

    Writes three files into ``export_dir``: ``drone_detector.onnx``,
    ``drone_detector_traced.pt`` and ``drone_detector_complete.pth``.
    No TensorRT engine is built here. Example inputs are created on the
    module-level ``device`` and the module-level ``config`` is stored in the
    checkpoint.

    Args:
        model: Model called as ``model(image, reference_images)``; must
            provide ``count_parameters()``.
        export_dir: Target directory (``str`` or ``Path``); created together
            with any missing parents.

    Returns:
        ``export_dir`` as a ``Path``.
    """
    export_dir = Path(export_dir)
    # parents=True: do not fail when intermediate directories are missing
    export_dir.mkdir(parents=True, exist_ok=True)

    model.eval()

    # Example inputs used only to record the graph
    dummy_image = torch.randn(1, 3, 640, 640, device=device)
    dummy_ref_images = torch.randn(1, 3, 3, 224, 224, device=device)
    example_inputs = (dummy_image, dummy_ref_images)

    # Tracing/export needs no autograd bookkeeping
    with torch.no_grad():
        # Export ONNX (batch dimension of both inputs is dynamic)
        onnx_path = export_dir / "drone_detector.onnx"
        torch.onnx.export(
            model, example_inputs, str(onnx_path),
            export_params=True, opset_version=11, do_constant_folding=True,
            input_names=['image', 'reference_images'], output_names=['predictions'],
            dynamic_axes={'image': {0: 'batch_size'}, 'reference_images': {0: 'batch_size'}}
        )

        # Export TorchScript.
        # strict=False: elsewhere in this file the model output is indexed as
        # output['predictions'][0]['boxes'], i.e. it is a dict/list container,
        # which the tracer refuses in strict mode.
        # NOTE(review): tracing records one fixed container structure —
        # verify the traced module on real inputs before deploying it.
        traced_path = export_dir / "drone_detector_traced.pt"
        traced_model = torch.jit.trace(model, example_inputs, strict=False)
        traced_model.save(str(traced_path))

    # Save the weights together with the config and the parameter count
    complete_model_path = export_dir / "drone_detector_complete.pth"
    torch.save({
        'model_state_dict': model.state_dict(),
        'config': config,
        'model_parameters': model.count_parameters()
    }, complete_model_path)

    return export_dir

# Run the export into <project_root>/exports and report where the files went
export_dir = project_root.joinpath("exports")
export_results = export_model_for_deployment(model, export_dir)

print(
    f"Model exported successfully!",
    f"Files saved to: {export_results}",
    f"Ready for deployment on Jetson Xavier NX!",
    sep="\n",
)