In [35]:
import torch
import cv2
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import sentencepiece as spm
import matplotlib.pyplot as plt
import multiprocessing
import time
from torch.utils.data import Dataset, DataLoader

if torch.backends.mps.is_available():  # Check for Apple Silicon GPU availability (requires PyTorch 1.12 or later)
    device = torch.device("mps")
elif torch.cuda.is_available():  # Check for NVIDIA GPU availability
    device = torch.device("cuda")
else:
    device = torch.device("cpu")  # Fall back to CPU

print(f"Using device: {device}")

Using device: mps


## Load images to RAM

In [36]:
def dump_images(df):
  for i, row in df.iterrows():
    image = cv2.imdecode(np.frombuffer(row['image'], np.uint8), cv2.IMREAD_COLOR)
    cv2.imwrite(f'./easy-500/images/{i}.jpg', image)

images = pd.read_parquet('easy-500/images.parquet')
annotations = pd.read_parquet('easy-500/labels.parquet')

### Preprocessing Step

In [37]:
# Group by index and aggregate into lists
# t1 = annotations.set_index('image_id')
# aggregated_annotations = t1.groupby(t1.index).agg(list)

# print(aggregated_annotations.shape)
print(annotations[:2])
print(images.shape)

   image_id    x    y  orientation  radius  class
0         0  269  450     0.000000      17      0
1         0  533  299     0.663225      45      1
(500, 2)


### YOLO simplified Example 

In [38]:
class AsymmetricYOLO(nn.Module):
    def __init__(self, imageWidth = 1024, imageHeight = 512, W=32, H=16, B=3, C=3, device = device):
        super(AsymmetricYOLO, self).__init__()
        self.device     = device
        self.imageWidth = imageWidth
        self.imageHeight= imageHeight
        self.W          = W # Width of Grid
        self.H          = H # Height of Grid
        self.B          = B # How many boxes in Grid Cell to search
        self.C          = C # Amount of Classes (Red Player, Blue Player, Ball)
        self.BoxSize    = 5
        
        self.input_channels     = 3     # The number of input channels. For RGB images, this is 3.
        self.output_channels    = 16    # The number of output channels. This layer will create 16 filters, each producing a separate output channel.
        self.kernel_size        = 21     # The size of the filter applied to the input image. A size of 3 means a 3x3 filter.
        self.stride             = 7    # The stride of the convolution. A stride of 1 means the filter moves one pixel at a time as it slides across the image.
        
        # Padding added to the edges of the input. A padding of 1 adds a one-pixel border of zeros around the input image, 
        # allowing the output size to be the same as the input size when using a stride of 1 and a 3x3 kernel.
        self.padding            = 0   
        
        # Define the CNN architecture
        self.conv_layers = nn.Sequential(
            nn.Conv2d(
                self.input_channels, 
                self.output_channels, 
                kernel_size=self.kernel_size, 
                stride=self.stride, 
                padding=self.padding
            ),
            nn.BatchNorm2d(self.output_channels),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # Add more convolutional layers as needed
        ).to(self.device)

        # Assuming the feature map size here is compatible with the input image size
        # feature_size = 512 * 256 * self.output_channels
        # and grid size after the convolutions and pooling

        # Temporarily forward a dummy input through the conv_layers to find feature_size
        dummy_input = torch.autograd.Variable(torch.rand(1, 3, 1024, 512)).to(self.device)
        output_feat = self.conv_layers(dummy_input)
        feature_size = int(np.prod(output_feat.size()[1:]))  # Multiply dimensions for feature size

        self.output_layer = nn.Sequential(
            nn.Flatten(),
            nn.Linear(feature_size, W * H * (C + B * self.BoxSize)),  # Adjust 'feature_size' based on the output of the last conv layer
            nn.LeakyReLU(0.1),
        ).to(self.device)

    def forward(self, x):
        # x = self.conv_layers(x)
        # x = self.output_layer(x)
        # x = x.view(-1, self.H, self.W, self.C + self.B * self.BoxSize)  # Reshape to match the grid size and predictions
        return x

### Dataset


In [39]:
class SportsDataset(Dataset):
    def __init__(self, images, annotations, device):
        self.images = images
        self.annotations = annotations.set_index('image_id')
        self.device = device

    def __len__(self):
        return len(self.images)
    
    def decodeAnnotation(self, annotation):
        bboxes          = []
        classes         = []
        orientations    = []

        
        for _, object in annotation.iterrows():
            bboxes.append([
               object['x'],
               object['y'],
               object['radius']
            ]) 
           
            classes.append(object['class'])
            orientations.append(object['orientation'])

        return {
            'bboxes': bboxes, 
            'classes': classes, 
            'orientations': orientations
        }
    
    def normalization(self, image):
        
         # Normalize pixel values
        image   = image / 255.0  # Scale pixels to [0, 1]
        
        # Manually normalize the image using specified means and stds
        mean    = np.array([0.485, 0.456, 0.406])
        std     = np.array([0.229, 0.224, 0.225])
        image   = (image - mean) / std  # Apply normalization
        
        return image

    def __getitem__(self, idx):
        images_row = self.images.iloc[idx]
        annotations = self.annotations.loc[idx]
        
        # Decode image from binary data
        image = cv2.imdecode(np.frombuffer(images_row['image'], np.uint8), cv2.IMREAD_COLOR)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB

        image = self.normalization(image)

        decodedAnnotation = self.decodeAnnotation(annotations)

        images = torch.tensor(image, dtype=torch.float32, device = self.device).permute(2, 0, 1)
        annotations = {
           'bboxes':        torch.tensor(decodedAnnotation['bboxes'], dtype=torch.float32, device = self.device), 
           'classes':       torch.tensor(decodedAnnotation['classes'], dtype=torch.int64, device = self.device), 
           'orientations':  torch.tensor(decodedAnnotation['orientations'], dtype=torch.float32, device = self.device),
        }

        return images, annotations


### initialization

In [40]:
batch_size=2
lr=0.001
epochs = 1

ImagesDataset = SportsDataset(images, annotations, device)
ImagesDataLoader = DataLoader(ImagesDataset, batch_size, shuffle=True)

In [41]:

Yolo = AsymmetricYOLO(
    imageWidth = 1024,
    imageHeight = 512,
    W=32, 
    H=16, 
    B=3, 
    C=3, 
    device = device,
).to(device)

optimizer = torch.optim.Adam(Yolo.parameters(), lr)
loss_function = nn.MSELoss()  # Simplified loss function for illustration

In [42]:

for epoch in range(epochs):
    for images, annotations in ImagesDataLoader:
        outputs = Yolo(images)
        # loss = loss_function(outputs, annotations)
        # optimizer.zero_grad()
        # loss.backward()
        # optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

NameError: name 'loss' is not defined

In [None]:
print(outputs)

tensor([[[[-1.1418, -1.1247, -1.1247,  ..., -0.8335, -0.8335, -0.8507],
          [-1.1075, -1.1247, -1.1418,  ..., -0.8507, -0.8507, -0.8678],
          [-1.0904, -1.1075, -1.1418,  ..., -0.8507, -0.8335, -0.8507],
          ...,
          [-1.3815, -1.3473, -1.3644,  ..., -1.1418, -1.1589, -1.1075],
          [-1.3815, -1.3644, -1.3644,  ..., -1.1589, -1.1589, -1.1075],
          [-1.4158, -1.3815, -1.3987,  ..., -1.1760, -1.1760, -1.1075]],

         [[ 0.1176,  0.1352,  0.1352,  ...,  0.1702,  0.1702,  0.1527],
          [ 0.1527,  0.1352,  0.1176,  ...,  0.1527,  0.1527,  0.1352],
          [ 0.1702,  0.1527,  0.1176,  ...,  0.1527,  0.1702,  0.1527],
          ...,
          [ 0.0826,  0.1176,  0.1001,  ...,  0.1176,  0.1001,  0.1527],
          [ 0.0826,  0.1001,  0.1001,  ...,  0.1001,  0.1001,  0.1527],
          [ 0.0476,  0.0826,  0.0651,  ...,  0.0826,  0.0826,  0.1527]],

         [[-1.5081, -1.4907, -1.4907,  ..., -1.3687, -1.3687, -1.3861],
          [-1.4733, -1.4907, -

In [None]:


data_loader = DataLoader(dataset, batch_size, shuffle=True)
optimizer = torch.optim.Adam(Yolo.parameters(), lr)
loss_function = nn.MSELoss()  # Simplified loss function for illustration

for epoch in range(epochs):
    for images, labels in data_loader:
        outputs = model(images)
        loss = loss_function(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")