In [17]:
import torch
import os
import torch
from PIL import Image
from torch.utils.data import Dataset
import albumentations as A
from albumentations.pytorch import ToTensorV2
from torch.utils.data import DataLoader
import torch.nn as nn
import numpy as np

from platform import python_version

print("Torch version:", torch.__version__)
print("CUDA version:", torch.version.cuda)
print("cuDNN version:", torch.backends.cudnn.version())
print("Python version:", python_version())

!nvcc --version

Torch version: 2.1.0.dev20230506
CUDA version: 11.8
cuDNN version: 8700
Python version: 3.11.3
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Tue_May__3_19:00:59_Pacific_Daylight_Time_2022
Cuda compilation tools, release 11.7, V11.7.64
Build cuda_11.7.r11.7/compiler.31294372_0


In [53]:
class FXBoundingBoxDataset(Dataset):
    def __init__(self, img_dir, annotation_dir, img_list, transform=None):
        self.img_dir = img_dir
        self.annotation_dir = annotation_dir
        self.img_list = img_list
        self.transform = transform

    def __len__(self):
        return len(self.img_list)   
    
    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.img_list[idx])
        img = Image.open(img_path).convert('RGB')
        img = np.array(img)

        annotation_path = os.path.join(self.annotation_dir, self.img_list[idx].replace('.jpg', '.txt'))
        with open(annotation_path, 'r') as f:
            annotations = f.readlines()

        boxes = []
        labels = []
        for ann in annotations:
            ann_split = ann.strip().split()
            print("ann_split", ann_split)
            label, x, y, w, h = map(float, ann_split)
            boxes.append([x - w/2, y - h/2, x + w/2, y + h/2])
            labels.append(int(label))

        boxes = torch.tensor(boxes)
        labels = torch.tensor(labels, dtype=torch.int64)
        print("boxes", boxes)
        print("labels", labels)
        if self.transform:
            transformed = self.transform(image=img, bboxes=boxes, bboxes_labels=['labels'])
            
            img = transformed["image"]
            boxes = transformed["bbox"]
            labels = transformed["labels"]

        target = {'boxes': boxes, 'labels': labels}

        return img, target

In [54]:
def get_train_transforms():
    return A.Compose([
        A.RandomResizedCrop(height=416, width=416, scale=(0.5, 1.0), p=0.5),
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        A.RandomBrightnessContrast(p=0.2),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], max_pixel_value=255.0, p=1.0),
        ToTensorV2(p=1.0)
    ], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bboxes_labels']))


In [55]:
img_dir = 'train/images'
labels_dir = 'train/labels'
img_list = os.listdir(img_dir)
train_transforms = get_train_transforms()
train_dataset = FXBoundingBoxDataset(img_dir, labels_dir, img_list, transform=train_transforms)

In [56]:
batch_size = 16
num_workers = 0

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, collate_fn=None)

In [57]:
train_loader.__iter__().__next__()

ann_split ['2', '0.478584729981378', '0.64111328125', '0.29981378026070765', '0.1201171875']
boxes tensor([[0.3287, 0.5811, 0.6285, 0.7012]])
labels tensor([2])


KeyError: 'bbox'

In [None]:
class PatchEmbedding(nn.Module):
    def __init__(self, in_channels=3, patch_size=16, embedding_dim=768):
        super().__init__()
        self.patch_size = patch_size
        self.projection = nn.Sequential(
            nn.Conv2d(in_channels, embedding_dim, kernel_size=patch_size, stride=patch_size),
            nn.Flatten(1)
        )

    def forward(self, x):
        return self.projection(x)

class VisionTransformer(nn.Module):
    def __init__(self, in_channels=3, patch_size=16, embedding_dim=768, num_heads=12, num_layers=12, output_dim=4, img_size=224, dropout_rate=0.1):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.patch_embedding = PatchEmbedding(in_channels, patch_size, embedding_dim)
        self.positional_embedding = nn.Parameter(torch.randn(1, (img_size // patch_size) ** 2 + 1, embedding_dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, embedding_dim))
        self.dropout = nn.Dropout(dropout_rate)

        self.transformer_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=embedding_dim,
                nhead=num_heads,
                dropout=dropout_rate,
            ) for _ in range(num_layers)
        ])

        self.regressor = nn.Linear(embedding_dim, output_dim)

    def forward(self, x):
        x = self.patch_embedding(x)
        batch_size, num_patches, _ = x.shape

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        x = torch.cat([cls_tokens, x], dim=1)

        x += self.positional_embedding
        x = self.dropout(x)

        for layer in self.transformer_layers:
            x = layer(x)

        cls_token_final = x[:, 0]
        output = self.regressor(cls_token_final)

        return output


In [None]:
import torch.optim as optim

# Instantiate the model, loss function, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = VisionTransformer(output_dim=4).to(device)
criterion = nn.SmoothL1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Number of epochs
num_epochs = 10

# Train the model
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    running_loss = 0.0

    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)

    epoch_loss = running_loss / len(train_dataset)

    print(f"Loss: {epoch_loss:.4f}\n")

print("Training complete.")
