## Load Custom Dataset

In [126]:
import os
import random
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
import numpy as np

class CustomDataset(Dataset):
    def __init__(self, 
                 root_dir, 
                 dataset = 'custom',
                 img_size = 224,
                 transform=None,
                 is_train = False,
                 len_clip=16,
                 split_ratio=0.8,
                 sampling_rate=1):
        self.root_dir = root_dir
        self.dataset = dataset
        self.transform = transform

        self.img_size = img_size
        self.len_clip = len_clip
        self.is_train = is_train
        self.sampling_rate = sampling_rate
        
        
        self.classes = self.load_classes(os.path.join(root_dir, 'classes.txt'))
        self.samples = self.load_samples()

        self.train_size = int(split_ratio * len(self.samples))
        self.train_dataset, self.test_dataset = self.samples[:self.train_size], self.samples[self.train_size:]

    def __len__(self):
        if self.is_train:
            return len(self.train_dataset)
        else:
            return len(self.test_dataset)
        

    def __getitem__(self, idx):
        frame_idx, video_clip, target = self.pull_item(idx)

        return frame_idx, video_clip, target
    
    def pull_item(self, idx):
        if self.is_train:
            img_path, label_path = self.train_dataset[idx]
            d = random.randint(1, 2)
            max_num = len(self.train_dataset)
        else:
            img_path, label_path = self.test_dataset[idx]
            d = self.sampling_rate
            max_num = len(self.train_dataset)
        img_split = label_path.split("/")
        img_id = img_split[-1].split("_")[-1].split(".")[0]
        video_clip = []
        for i in reversed(range(self.len_clip)):
            img_id_temp = int(img_id) - i * d
            if img_id_temp < 1:
                img_id_temp = 1
            elif img_id_temp > max_num:
                img_id_temp = max_num
            
            frame = Image.open(img_path).convert('RGB')

            if self.transform:
                frame = self.transform(frame)
            ow, oh = frame.size(2), frame.size(1)
            # ow, oh = frame.width, frame.height



            video_clip.append(frame)

            frame_id = f"{img_split[6]}_{img_split[7]}_{img_split[8]}"
        
        if os.path.getsize(label_path):
            target = np.loadtxt(label_path)
        else:
            target = None 

        label = target[..., :1]
        # print('label', label)
        boxes = target[..., 1:]
        # print('boxes', boxes)
        target = np.concatenate([boxes, label], axis=-1).reshape(-1, 5)  
        # print('target', target)
        # print('bbox', target[:, :4])
        # print('labels', target[:, -1])
        
        target = torch.as_tensor(target).float()
        
        # transform
        # video_clip, target = self.transform(video_clip, target)
        # List [T, 3, H, W] -> [3, T, H, W]
        video_clip = torch.stack(video_clip, dim=1)
        # print("Video clip shape", video_clip.shape)

        # reformat target
        target = {
            'boxes': target[:, :4].float(),      # [N, 4]
            'labels': target[:, -1].long(),    # [N,]
            'orig_size': [ow, oh],
            'video_idx':frame_id
            }

        # print(target)

        return frame_id, video_clip, target     
        
        

    def load_classes(self, classes_file):
        with open(classes_file, 'r') as file: 
            classes = [line.strip() for line in file.readlines()]
            self.num_classes = len(classes)
            return classes
        

    def load_samples(self):
        samples = []

        for class_folder in os.listdir(self.root_dir):
            class_folder_path = os.path.join(self.root_dir, class_folder)
            # print("class_folder_path: ", class_folder_path)
            if os.path.isdir(class_folder_path):
                data_folder_path = os.path.join(class_folder_path, "labels")
                # print("data folder path:", data_folder_path)
                for file in os.listdir(data_folder_path):
                    if file.endswith(".txt"):
                        frame_num = file.split("_")[-1].split(".")[0]
                        label_path = os.path.join(data_folder_path, file)
                        img_path = os.path.join(class_folder_path, "images", f"frame_{frame_num}.jpg")
                        # print(img_path, label_path)
                        if os.path.exists(label_path) and os.path.exists(img_path):
                            samples.append((img_path, label_path))
        # print(len(samples))               
        return samples

In [108]:
class CollateFunc(object):
    def __call__(self, batch):
        batch_frame_id = []
        batch_key_target = []
        batch_video_clips = []

        for sample in batch:
            key_frame_id = sample[0]
            video_clip = sample[1]
            key_target = sample[2]
            
            batch_frame_id.append(key_frame_id)
            batch_video_clips.append(video_clip)
            batch_key_target.append(key_target)

        # List [B, 3, T, H, W] -> [B, 3, T, H, W]
        batch_video_clips = torch.stack(batch_video_clips)
        
        return batch_frame_id, batch_video_clips, batch_key_target

In [109]:
collate_fn = CollateFunc()

In [102]:
import cv2
cv2.setNumThreads(0)
cv2.ocl.setUseOpenCL(False)

import os
import time
import argparse
from copy import deepcopy
import torch
import torch.backends.cudnn as cudnn
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

from utils import distributed_utils
from utils.com_flops_params import FLOPs_and_Params
from utils.solver.optimizer import build_optimizer
from utils.solver.warmup_schedule import build_warmup

from config import build_dataset_config, build_model_config
from models import build_model

In [103]:
class args:
    batch_size = 8
    cuda = False
    save_folder = "./checkpoint/custom"
    dataset = "custom_dataset"
    version = "yowo_v2_large"
    loss_conf_weight = 1
    loss_cls_weight = 1
    loss_reg_weight = 5
    topk_candicate = 10
    center_sampling_radius = 2.5
    sybn = False
    focal_loss = False
    freeze_backbone_2d = False
    freeze_backbone_3d = False
    distributed = False
    resume = None
    topk = 16
    base_lr = 1e-5
    len_clip = 16

In [60]:
world_size = distributed_utils.get_world_size()
per_gpu_batch = args.batch_size // world_size
print('World size: {}'.format(world_size))
if args.distributed:
    distributed_utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(distributed_utils.get_sha()))

# path to save model
path_to_save = os.path.join(args.save_folder, args.dataset, args.version)
os.makedirs(path_to_save, exist_ok=True)

# cuda
if args.cuda:
    print('use cuda')
    cudnn.benchmark = True
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


World size: 1


In [61]:
d_cfg = build_dataset_config(args)
m_cfg = build_model_config(args)

Dataset Config: CUSTOM_DATASET 
Model Config: YOWO_V2_LARGE 


In [110]:
def random_jitter(img, jitter):
    if random.random() < jitter:
        angle = random.uniform(-10, 10)
        img = transforms.functional.rotate(img, angle)
    return img

def adjust_exposure(img, exposure):
    img = transforms.functional.adjust_brightness(img, 1 + random.uniform(-exposure, exposure))
    return img

def img_augmentation(img_size=224, jitter=0.2, hue=0.1, saturation=1.5, exposure=1.5):
    transform = transforms.Compose([
        transforms.Resize((img_size, img_size)),
        transforms.Lambda(lambda x: random_jitter(x, jitter)),
        transforms.ColorJitter(hue=hue, saturation=saturation),
        transforms.Lambda(lambda x: adjust_exposure(x, exposure)),
        transforms.ToTensor(),
    ])
    return transform

augmentation = img_augmentation()

In [118]:
def img_basetransform(img_size=224, pixel_mean=[0., 0., 0.], pixel_std=[1., 1., 1.]):
    transform = transforms.Compose([
        transforms.Resize((img_size, img_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=pixel_mean, std=pixel_std)
    ])
    return transform

basetransform = img_basetransform()


In [127]:
root_dir = '/home/longbach/Desktop/motion-det-dataset/processed_data'

# Create an instance of your custom dataset with a split ratio of 0.8
custom_dataset = CustomDataset(root_dir, transform=augmentation, is_train=True)

# Create a data loader for training
train_batch_size = 1
train_data_loader = DataLoader(custom_dataset, batch_size=train_batch_size, shuffle=True, collate_fn=collate_fn, drop_last=False, pin_memory=True)

# Create an instance of your custom dataset with a split ratio of 0.2 for validation
val_dataset = CustomDataset(root_dir, transform=basetransform, is_train=False)

# Create a data loader for validation
val_batch_size = 1
val_data_loader = DataLoader(val_dataset, batch_size=val_batch_size, shuffle=False, collate_fn=collate_fn, drop_last=False, pin_memory=True)

In [128]:
for i in train_data_loader:
    print(i)
    break

(['ch5_20231207_labels_frame_0320.txt'], tensor([[[[[0.1686, 0.1647, 0.1647,  ..., 0.0471, 0.0471, 0.0471],
           [0.1686, 0.1647, 0.1647,  ..., 0.0510, 0.0471, 0.0471],
           [0.1686, 0.1686, 0.1647,  ..., 0.0510, 0.0471, 0.0471],
           ...,
           [0.1137, 0.1294, 0.1412,  ..., 0.0314, 0.0314, 0.0314],
           [0.1098, 0.1294, 0.1451,  ..., 0.0314, 0.0314, 0.0314],
           [0.1098, 0.1294, 0.1451,  ..., 0.0314, 0.0314, 0.0314]],

          [[0.6549, 0.6471, 0.6431,  ..., 0.1804, 0.1725, 0.1804],
           [0.6588, 0.6471, 0.6431,  ..., 0.1843, 0.1725, 0.1804],
           [0.6667, 0.6549, 0.6431,  ..., 0.1922, 0.1804, 0.1804],
           ...,
           [0.4235, 0.5020, 0.5412,  ..., 0.1137, 0.1137, 0.1137],
           [0.4353, 0.5176, 0.5725,  ..., 0.1098, 0.1137, 0.1137],
           [0.4353, 0.5176, 0.5765,  ..., 0.1098, 0.1137, 0.1137]],

          [[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.

In [129]:
for i in val_data_loader:
    print(i)
    break

(['ch2_20230712_labels_frame_11552.txt'], tensor([[[[[0.4078, 0.6510, 0.2549,  ..., 0.1176, 0.0902, 0.0902],
           [0.6667, 0.4588, 0.0745,  ..., 0.1176, 0.0902, 0.0902],
           [0.6431, 0.1686, 0.0706,  ..., 0.1137, 0.0863, 0.0941],
           ...,
           [0.2392, 0.1569, 0.0471,  ..., 0.1490, 0.1451, 0.1412],
           [0.3216, 0.2510, 0.1569,  ..., 0.1608, 0.1529, 0.1451],
           [0.2000, 0.3059, 0.2510,  ..., 0.1647, 0.1569, 0.1490]],

          [[0.4078, 0.6510, 0.2549,  ..., 0.1176, 0.0902, 0.0902],
           [0.6667, 0.4588, 0.0745,  ..., 0.1176, 0.0902, 0.0902],
           [0.6431, 0.1686, 0.0706,  ..., 0.1137, 0.0863, 0.0941],
           ...,
           [0.2392, 0.1569, 0.0471,  ..., 0.1490, 0.1451, 0.1412],
           [0.3216, 0.2510, 0.1569,  ..., 0.1608, 0.1529, 0.1451],
           [0.2000, 0.3059, 0.2510,  ..., 0.1647, 0.1569, 0.1490]],

          [[0.4078, 0.6510, 0.2549,  ..., 0.1176, 0.0902, 0.0902],
           [0.6667, 0.4588, 0.0745,  ..., 0.1176, 0

In [130]:
model, criterion = build_model(
    args=args,
    d_cfg=d_cfg,
    m_cfg=m_cfg,
    device=device,
    num_classes=5, 
    trainable=True,
    resume=args.resume
    )
model = model.to(device).train()

# DDP
model_without_ddp = model
if args.distributed:
    model = DDP(model, device_ids=[args.gpu])
    model_without_ddp = model.module

# SyncBatchNorm
if args.sybn and args.distributed:
    print('use SyncBatchNorm ...')
    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

# Compute FLOPs and Params
if distributed_utils.is_main_process():
    model_copy = deepcopy(model_without_ddp)
    FLOPs_and_Params(
        model=model_copy,
        img_size=d_cfg['test_size'],
        len_clip=args.len_clip,
        device=device)
    del model_copy


Build YOWO_V2_LARGE ...
2D Backbone: YOLO_FREE_LARGE
--pretrained: True
FPN: pafpn_elan
Head: Decoupled Head
Head: Decoupled Head
Head: Decoupled Head
Loading 2D backbone pretrained weight: YOLO_FREE_LARGE
3D Backbone: RESNEXT101
--pretrained: True
Loading pretrained weight ...
Loading 3D backbone pretrained weight: RESNEXT101
Head: Decoupled Head
Head: Decoupled Head
Head: Decoupled Head
[INFO] Register count_convNd() for <class 'torch.nn.modules.conv.Conv2d'>.
[INFO] Register count_normalization() for <class 'torch.nn.modules.batchnorm.BatchNorm2d'>.
[INFO] Register zero_ops() for <class 'torch.nn.modules.container.Sequential'>.
[INFO] Register zero_ops() for <class 'torch.nn.modules.pooling.MaxPool2d'>.
[INFO] Register count_convNd() for <class 'torch.nn.modules.conv.Conv3d'>.
[INFO] Register count_normalization() for <class 'torch.nn.modules.batchnorm.BatchNorm3d'>.
[INFO] Register zero_ops() for <class 'torch.nn.modules.activation.ReLU'>.
[INFO] Register zero_ops() for <class 'tor

In [131]:
# optimizer
base_lr = d_cfg["base_lr"]
accumulate = d_cfg["accumulate"]
optimizer, start_epoch = build_optimizer(d_cfg, model_without_ddp, base_lr, args.resume)

# lr scheduler
lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, d_cfg["lr_epoch"], d_cfg["lr_decay_ratio"])

# warmup scheduler
warmup_scheduler = build_warmup(d_cfg, base_lr=base_lr)

# training configuration
max_epoch = d_cfg["max_epoch"]
epoch_size = 10
warmup = True

Optimizer: adamw
--momentum: 0.9
--weight_decay: 0.0005
WarmUpScheduler: linear
--base_lr: 0.0001
--warmup_factor: 0.00066667
--wp_iter: 500


In [50]:
def print_log(lr, epoch, max_epoch, iter_i, epoch_size, loss_dict, time, accumulate):
    # basic infor
    log =  '[Epoch: {}/{}]'.format(epoch+1, max_epoch)
    log += '[Iter: {}/{}]'.format(iter_i, epoch_size)
    log += '[lr: {:.6f}]'.format(lr[0])
    # loss infor
    for k in loss_dict.keys():
        if k == 'losses':
            log += '[{}: {:.2f}]'.format(k, loss_dict[k] * accumulate)
        else:
            log += '[{}: {:.2f}]'.format(k, loss_dict[k])

    # other infor
    log += '[time: {:.2f}]'.format(time)

    # print log infor
    print(log, flush=True)

In [132]:
# start to train
t0 = time.time()
for epoch in range(start_epoch, max_epoch):
    if args.distributed:
        train_data_loader.batch_sampler.sampler.set_epoch(epoch)            

    # train one epoch
    for iter_i, (frame_ids, video_clips, targets) in enumerate(train_data_loader):
        ni = iter_i + epoch * epoch_size

        # warmup
        if ni < d_cfg['wp_iter'] and warmup:
            warmup_scheduler.warmup(ni, optimizer)

        elif ni == d_cfg['wp_iter'] and warmup:
            # warmup is over
            print('Warmup is over')
            warmup = False
            warmup_scheduler.set_lr(optimizer, lr=base_lr, base_lr=base_lr)

        # to device
        video_clips = video_clips.to(device)

        # inference
        outputs = model(video_clips)
        
        # loss
        loss_dict = criterion(outputs, targets)
        losses = loss_dict['losses']

        # reduce            
        loss_dict_reduced = distributed_utils.reduce_dict(loss_dict)

        # check loss
        if torch.isnan(losses):
            print('loss is NAN !!')
            continue

        # Backward
        losses /= accumulate
        losses.backward()

        # Optimize
        if ni % accumulate == 0:
            optimizer.step()
            optimizer.zero_grad()
                
        # Display
        if distributed_utils.is_main_process() and iter_i % 10 == 0:
            t1 = time.time()
            cur_lr = [param_group['lr']  for param_group in optimizer.param_groups]
            print_log(cur_lr, epoch,  max_epoch, iter_i, epoch_size,loss_dict_reduced, t1-t0, accumulate)
        
            t0 = time.time()

    lr_scheduler.step()
    


RuntimeError: selected index k out of range