In [1]:

import torch
import torch.nn as nn
import torch.nn.functional as F


def conv_bn_act(in_ch, out_ch, k=3, s=1, p=1):
    return nn.Sequential(
        nn.Conv2d(in_ch, out_ch, k, s, p, bias=False),
        nn.BatchNorm2d(out_ch),
        nn.SiLU()
    )

class C3(nn.Module):
    def __init__(self, in_ch, out_ch, n=1):
        super().__init__()
        hidden = out_ch // 2
        self.cv1 = conv_bn_act(in_ch, hidden, k=1, s=1, p=0)
        self.cv2 = conv_bn_act(in_ch, hidden, k=1, s=1, p=0)
        self.m = nn.Sequential(
            *[nn.Sequential(conv_bn_act(hidden, hidden), conv_bn_act(hidden, hidden))
              for _ in range(n)]
        )
        self.cv3 = conv_bn_act(2 * hidden, out_ch, k=1, s=1, p=0)

    def forward(self, x):
        y1 = self.cv1(x)
        y2 = self.cv2(x)
        y1 = self.m(y1)
        return self.cv3(torch.cat([y1, y2], dim=1))

class SPPF(nn.Module):
    def __init__(self, ch, pool_k=5):
        super().__init__()
        hidden = ch // 2
        self.cv1 = conv_bn_act(ch, hidden, k=1, s=1, p=0)
        self.cv2 = conv_bn_act(hidden * 4, ch, k=1, s=1, p=0)
        self.pool = nn.MaxPool2d(kernel_size=pool_k, stride=1, padding=pool_k // 2)

    def forward(self, x):
        x = self.cv1(x)
        y1 = self.pool(x)
        y2 = self.pool(y1)
        y3 = self.pool(y2)
        return self.cv2(torch.cat([x, y1, y2, y3], dim=1))


class YOLOHead(nn.Module):
    def __init__(self, in_ch, num_anchors, num_classes):
        super().__init__()
        self.conv = conv_bn_act(in_ch, in_ch)
        self.pred = nn.Conv2d(in_ch, num_anchors * (num_classes + 5), 1, 1, 0)
        self.num_anchors = num_anchors
        self.num_classes = num_classes

    def forward(self, x):
        x = self.conv(x)
        B, _, H, W = x.shape
        pred = self.pred(x)
        pred = pred.view(B, self.num_anchors, self.num_classes + 5, H, W)
        return pred.permute(0, 1, 3, 4, 2)


class YOLOv5(nn.Module):
    def __init__(self, nc=80, anchors=None):
        super().__init__()
        if anchors is None:
            anchors = [
                [[10,13],[16,30],[33,23]],
                [[30,61],[62,45],[59,119]],
                [[116,90],[156,198],[373,326]],
            ]

        self.nc = nc
        self.anchors = torch.tensor(anchors, dtype=torch.float32)

        # Backbone 
        self.stem = nn.Sequential(conv_bn_act(3, 64, k=6, s=2, p=2), C3(64, 64, n=1))
        self.down1 = nn.Sequential(conv_bn_act(64, 128, k=3, s=2, p=1), C3(128, 128, n=3))
        self.down2 = nn.Sequential(conv_bn_act(128, 256, k=3, s=2, p=1), C3(256, 256, n=3))
        self.down3 = nn.Sequential(conv_bn_act(256, 512, k=3, s=2, p=1), C3(512, 512, n=1))
        self.sppf = SPPF(512)

        # Neck (PANet-style)
        self.up1 = conv_bn_act(512, 256, k=1, s=1, p=0)
        self.pan1 = C3(512, 256, n=1)
        self.up2 = conv_bn_act(256, 128, k=1, s=1, p=0)
        self.pan2 = C3(256, 128, n=1)

        # Heads
        self.head_small = YOLOHead(128, num_anchors=3, num_classes=nc)
        self.head_medium = YOLOHead(256, num_anchors=3, num_classes=nc)
        self.head_large = YOLOHead(512, num_anchors=3, num_classes=nc)

    def forward(self, x):
        # Backbone
        x = self.stem(x)
        x1 = self.down1(x)
        x2 = self.down2(x1)
        x3 = self.down3(x2)
        x3 = self.sppf(x3)

        # Neck
        p5 = x3
        p5_up = F.interpolate(self.up1(p5), scale_factor=2, mode="nearest")
        p4 = self.pan1(torch.cat([p5_up, x2], dim=1))
        p4_up = F.interpolate(self.up2(p4), scale_factor=2, mode="nearest")
        p3 = self.pan2(torch.cat([p4_up, x1], dim=1))

        # Heads
        out_small = self.head_small(p3)
        out_medium = self.head_medium(p4)
        out_large = self.head_large(p5)

        return [out_small, out_medium, out_large]


In [2]:
import torch
from torch.utils.data import Dataset
import os
import cv2
import numpy as np
from pycocotools.coco import COCO
import random

class COCODetection(Dataset):
    def __init__(self, root, annFile, img_size=640, transforms=None):
        self.root = root
        self.coco = COCO(annFile)
        self.ids = list(sorted(self.coco.imgs.keys()))
        self.img_size = img_size
        self.transforms = transforms

        # Build mapping for COCO category IDs → continuous [0–79]
        coco_categories = sorted(cat['id'] for cat in self.coco.dataset['categories'])
        self.cat_id_to_idx = {cat_id: idx for idx, cat_id in enumerate(coco_categories)}

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        img_id = self.ids[idx]
        img_info = self.coco.loadImgs(img_id)[0]
        path = os.path.join(self.root, img_info['file_name'])

        # Load image
        img = cv2.imread(path)
        if img is None:
            raise ValueError(f"Image at {path} not found")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        h0, w0 = img.shape[:2]
        scale = self.img_size / max(h0, w0)
        nh, nw = int(h0 * scale), int(w0 * scale)
        img_resized = cv2.resize(img, (nw, nh))

        canvas = np.full((self.img_size, self.img_size, 3), 114, dtype=np.uint8)
        canvas[:nh, :nw] = img_resized
        img = canvas

        # Load annotations
        anns = self.coco.loadAnns(self.coco.getAnnIds(imgIds=img_id))
        boxes = []
        labels = []

        for a in anns:
            x, y, w, h = a['bbox']
            x_c = (x + w / 2) * scale / self.img_size
            y_c = (y + h / 2) * scale / self.img_size
            w_n = (w * scale) / self.img_size
            h_n = (h * scale) / self.img_size

            if w_n <= 0 or h_n <= 0:
                continue

            boxes.append([x_c, y_c, w_n, h_n])
            labels.append(self.cat_id_to_idx[a['category_id']])  # Remap ID

        target = {
            "boxes": torch.tensor(boxes, dtype=torch.float32) if boxes else torch.zeros((0, 4)),
            "labels": torch.tensor(labels, dtype=torch.long) if labels else torch.zeros((0,), dtype=torch.long)
        }

        img = img.astype(np.float32) / 255.0
        img = np.transpose(img, (2, 0, 1))

        return torch.tensor(img, dtype=torch.float32), target


In [3]:
import torch
import torch.nn.functional as F
import math

def bbox_iou(box1, box2, eps=1e-7):
    # boxes in x1y1x2y2
    inter_x1 = torch.max(box1[...,0], box2[...,0])
    inter_y1 = torch.max(box1[...,1], box2[...,1])
    inter_x2 = torch.min(box1[...,2], box2[...,2])
    inter_y2 = torch.min(box1[...,3], box2[...,3])
    inter_w = (inter_x2 - inter_x1).clamp(0)
    inter_h = (inter_y2 - inter_y1).clamp(0)
    inter = inter_w * inter_h
    area1 = (box1[...,2]-box1[...,0]).clamp(0) * (box1[...,3]-box1[...,1]).clamp(0)
    area2 = (box2[...,2]-box2[...,0]).clamp(0) * (box2[...,3]-box2[...,1]).clamp(0)
    union = area1 + area2 - inter + eps
    return inter / union

def xywh2xyxy(x):
    # x: [N,4] x_c,y_c,w,h normalized
    x_c, y_c, w, h = x[...,0], x[...,1], x[...,2], x[...,3]
    x1 = x_c - w/2
    y1 = y_c - h/2
    x2 = x_c + w/2
    y2 = y_c + h/2
    return torch.stack([x1,y1,x2,y2], dim=-1)

def compute_loss(preds, targets, anchors, strides, device, num_classes=80):
    
    #preds: list of prediction tensors [B, A, H, W, 5+nc] for 3 scales
    #targets: list of targets per image (boxes normalized to 0-1 and labels)
    #anchors: tensor shape (3,3,2) or list of 3 lists of 3 pairs
    #strides: list of strides per scale (tuple)
    #returns total_loss, dict breakdown
   
    bce = torch.nn.BCEWithLogitsLoss(reduction='mean')
    mse = torch.nn.MSELoss(reduction='mean')
    device = device
    loss_obj = torch.tensor(0., device=device)
    loss_cls = torch.tensor(0., device=device)
    loss_box = torch.tensor(0., device=device)
    # naive matching: for every target box, find best scale based on size and assign to its grid cell & best anchor by IoU
    B = preds[0].shape[0]
    for b in range(B):
        t = targets[b]
        if t['boxes'].numel() == 0:
            # no objects: encourage objectness to 0 across all preds
            for p in preds:
                obj_pred = p[b,...,4]
                loss_obj = loss_obj + bce(obj_pred, torch.zeros_like(obj_pred))
            continue
        boxes = t['boxes'].to(device)  # normalized xywh
        labels = t['labels'].to(device)
        # for each box, compute which scale to place (based on area) and compute losses
        for i_box in range(boxes.shape[0]):
            box = boxes[i_box:i_box+1]  # 1,4
            lab = labels[i_box:i_box+1]
            # choose scale by width (very rough heuristic)
            w = box[0,2]
            if w < 0.06:
                scale_idx = 0
            elif w < 0.2:
                scale_idx = 1
            else:
                scale_idx = 2
            p = preds[scale_idx][b]  # [A, H, W, 5+nc]
            A, H, W, _ = p.shape
            # map center to grid
            gx = box[0,0] * W
            gy = box[0,1] * H
            gi = int(gx.clamp(0, W-1).item())
            gj = int(gy.clamp(0, H-1).item())
            # for anchor matching pick best anchor by comparing anchor box ratios vs target
            anchor_set = anchors[scale_idx].to(device) / strides[scale_idx]
            # compute IoU between scaled anchors (centered) and target w,h
            tw = box[0,2] * W
            th = box[0,3] * H
            anchor_wh = anchor_set
            # convert to xyxy
            a_box = torch.zeros((anchor_wh.shape[0],4), device=device)
            a_box[:,0:2] = torch.stack([torch.zeros_like(anchor_wh[:,0]), torch.zeros_like(anchor_wh[:,1])], dim=1)
            a_box[:,2] = anchor_wh[:,0]
            a_box[:,3] = anchor_wh[:,1]
            t_box = torch.tensor([0,0,tw,th], device=device).unsqueeze(0)
            # intersection over anchors: use min(w,h) heuristic
            inter = torch.min(a_box[:,2], t_box[:,2]) * torch.min(a_box[:,3], t_box[:,3])
            area_a = a_box[:,2]*a_box[:,3]
            area_t = tw*th
            ious = inter / (area_a + area_t - inter + 1e-9)
            best_anchor = int(torch.argmax(ious).item())
            # predictions at anchor/grid
            pred = p[best_anchor, gj, gi]  # (5+nc)
            # decode pred: tx,ty,tw,th
            # For simplicity assume pred[...,0:2] are offsets (sigmoid), 2:4 are log-space, 4 is obj, 5: are class logits
            px = (torch.sigmoid(pred[0]) + gx - gi) / W  # normalized
            py = (torch.sigmoid(pred[1]) + gy - gj) / H
            pw = (torch.exp(pred[2]) * anchor_wh[best_anchor,0]) / (W)
            ph = (torch.exp(pred[3]) * anchor_wh[best_anchor,1]) / (H)
            pred_box = torch.stack([px, py, pw, ph], dim=0).unsqueeze(0)
            # box loss: use IoU between predicted box and target
            iou = bbox_iou(xywh2xyxy(pred_box), xywh2xyxy(box.to(device)))
            loss_box = loss_box + (1.0 - iou).squeeze()
            # objectness: target 1 for this cell/anchor, 0 elsewhere
            obj_pred = pred[4]
            loss_obj = loss_obj + bce(obj_pred.unsqueeze(0), torch.ones((1,), device=device))
            # classification
            cls_pred = pred[5:]
            target_cls = torch.zeros_like(cls_pred)
            target_cls[lab] = 1.0
            loss_cls = loss_cls + bce(cls_pred.unsqueeze(0), target_cls.unsqueeze(0))
    # normalize by batch size
    denom = max(1, B)
    return (loss_box/denom, loss_obj/denom, loss_cls/denom), {"box":loss_box.item()/denom, "obj":loss_obj.item()/denom, "cls":loss_cls.item()/denom}


In [None]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
import os

class Config:
    img_root = "/kaggle/input/2017-2017/train2017/train2017"
    ann = "/kaggle/input/2017-2017/annotations_trainval2017/annotations/instances_train2017.json"
    img_size = 640
    batch = 8
    epochs = 10
    lr = 1e-3
    num_classes = 80
    save_dir = "./weights"

cfg = Config()


def train(cfg):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Training on: {device}")

    # Initialize model
    model = YOLOv5(nc=cfg.num_classes).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.lr, weight_decay=5e-4)

    anchors = torch.tensor([
        [[10,13],[16,30],[33,23]],
        [[30,61],[62,45],[59,119]],
        [[116,90],[156,198],[373,326]]
    ], dtype=torch.float32)
    strides = [8, 16, 32]

    # Dataset and DataLoader
    train_ds = COCODetection(cfg.img_root, cfg.ann, img_size=cfg.img_size)
    loader = DataLoader(train_ds, batch_size=cfg.batch, shuffle=True, num_workers=4, collate_fn=collate_fn)

    model.train()
    for epoch in range(1, cfg.epochs + 1):
        pbar = tqdm(loader, desc=f"Epoch {epoch}")
        epoch_loss = 0.0

        for imgs, targets in pbar:
            imgs = imgs.to(device)

            # Forward
            preds = model(imgs)

            # Loss computation
            loss_vals, breakdown = compute_loss(preds, targets, anchors, strides, device, num_classes=cfg.num_classes)
            loss = sum(loss_vals)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            pbar.set_postfix(loss=epoch_loss / (pbar.n + 1),
                             box=breakdown['box'],
                             obj=breakdown['obj'],
                             cls=breakdown['cls'])

        # Save checkpoints
        os.makedirs(cfg.save_dir, exist_ok=True)
        torch.save({'model': model.state_dict(),
                    'optimizer': optimizer.state_dict()},
                   os.path.join(cfg.save_dir, f"ckpt_epoch{epoch}.pt"))

def collate_fn(batch):
    imgs = torch.stack([i[0] for i in batch], 0)
    targets = [i[1] for i in batch]
    return imgs, targets


if __name__ == "__main__":
    train(cfg)


Training on: cuda
loading annotations into memory...
Done (t=21.71s)
creating index...
index created!


Epoch 1:  99%|█████████▉| 14637/14786 [3:56:02<02:35,  1.04s/it, box=11.6, cls=0.583, loss=7.55, obj=0.116]     