In [24]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms

from torchvision.datasets import CocoDetection
from typing import Iterable

print(torch.__version__)
print(torch.cuda.is_available())

2.3.1
True


In [88]:
transform = transforms.Compose([
    transforms.ToTensor()
])
dataset = CocoDetection(
    root=os.path.join("data", "images"), 
    annFile=os.path.join("data", "coco_annotations.json"), 
    transform=transform
)
train_size = int(0.7 * len(dataset))
valid_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - valid_size
train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, valid_size, test_size])
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=32, shuffle=False, num_workers=4)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)

loading annotations into memory...
Done (t=0.15s)
creating index...
index created!
cuda


In [89]:
class BoundingBoxModel(nn.Module):
    def __init__(self):
        super(BoundingBoxModel, self).__init__()
        
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.fc1 = self._initialize_fc1()
        self.fc2 = nn.Linear(128, 4)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self._run_first_layers(x)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        
        return x

    def _initialize_fc1(self) -> nn.Linear:
        with torch.no_grad():
            dummy_input = torch.zeros(1, 3, 150, 200)
            x = self._run_first_layers(dummy_input)
            input_size = x.size(1)
            
            return nn.Linear(input_size, 128)
        
    def _run_first_layers(self, x: torch.Tensor) -> torch.Tensor:
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = x.view(-1, x.size(1) * x.size(2) * x.size(3))
        
        return x

In [90]:
def extract_bboxes(targets: list[dict]) -> list[torch.Tensor]:
    bboxes = []
    
    for target in targets:
        xs, ys, widths, heights = target["bbox"]
    
        for idx, _ in enumerate(xs):
            x1, y1, width, height = xs[idx], ys[idx], widths[idx], heights[idx]
            # Convert COCO format (x, y, width, height) to (x1, y1, x2, y2)
            x2, y2 = x1 + width, y1 + height
    
            bboxes.append(torch.IntTensor([x1, y1, x2, y2]))

    return bboxes


model = BoundingBoxModel().to(device)
criterion = nn.SmoothL1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [91]:
num_epochs = 25

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    
    for images, targets in train_loader:
        images = images.to(device)
        bboxes = extract_bboxes(targets)
        bboxes = torch.stack(bboxes).to(device)

        optimizer.zero_grad()

        outputs = model(images)
        loss = criterion(outputs, bboxes)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {running_loss / len(train_loader)}")

    model.eval()

    val_loss = 0.0

    with torch.no_grad():
        for images, targets in valid_loader:
            images = images.to(device)
            bboxes = [extract_bboxes(target) for target in targets]
            bboxes = torch.stack(bboxes).to(device)
            outputs = model(images)
            loss = criterion(outputs, bboxes)
            val_loss += loss.item()

    print(f"Validation Loss: {val_loss / len(valid_loader)}")

print("Training complete")

Epoch 1, Loss: 19.600274926965888


TypeError: string indices must be integers, not 'str'