In [1]:
import numpy as np
import sys
import pandas as pd
import os
import tqdm
import torch
import torch.nn as nn
import cv2

from torchvision import transforms
from torch.utils.data import DataLoader, Dataset

from sklearn.metrics import roc_auc_score

from create_dataset import create_dataset_df


In [2]:
DATASET_PATH = r"C:\Users\Eren\Downloads\Ball-Detection.v1i.voc\train"
IMAGE_SIZE = 224



In [63]:
class CustomCNNYedek(nn.Module):
    
    def __init__(self, in_channels=3, out_channels_cnn=3, bboxes=4):
        super().__init__()
        kernel_size = 3
        stride = 1
        padding = 1
        
        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=32, kernel_size=kernel_size, stride=stride, padding=padding)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=kernel_size, stride=stride, padding=padding)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=kernel_size, stride=stride, padding=padding)
        self.conv4 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=kernel_size, stride=stride, padding=padding)
        self.conv5 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=kernel_size, stride=stride, padding=padding)
        
        self.batchnorm1 = nn.BatchNorm2d(32)
        self.batchnorm2 = nn.BatchNorm2d(64)
        self.batchnorm3 = nn.BatchNorm2d(128)
        self.maxpool = nn.MaxPool2d(kernel_size=2)
        self.relu = nn.ReLU()
        self.fc = nn.Flatten()
        self.cnn_layer = nn.Linear(7*7*128, out_channels_cnn)
        self.regressor = nn.Linear(7*7*128, bboxes)
    
    def cnn_layers(self, x):
        x = self.relu(x)
        x = self.maxpool(x)
        return x
    
    def feature_extractor(self, x):

        x = self.conv1(x)
        x = self.batchnorm1(x)
        x = self.cnn_layers(x)
        
        x = self.conv2(x)
        x = self.batchnorm2(x)
        x = self.cnn_layers(x)
        
        x = self.conv3(x)
        x = self.batchnorm2(x)
        x = self.cnn_layers(x)
        
        x = self.conv4(x)
        x = self.batchnorm3(x)
        x = self.cnn_layers(x)
        
        x = self.conv5(x)
        x = self.batchnorm3(x)
        x = self.cnn_layers(x)
        
        x = self.fc(x)
        return x
    
    def forward(self, x):
        x = self.feature_extractor(x)
        classifier_op = self.cnn_layer(x)
        regressor_op = self.regressor(x)
        return (regressor_op, classifier_op)

In [4]:
class CustomTensorDataset(Dataset):
    def __init__(self, tensors, transforms=None):
        self.tensors = tensors
        self.transforms = transforms
    
    def __getitem__(self, index):
        label = self.tensors[1][index]
        bbox = self.tensors[2][index]
        
        image = cv2.imread(os.path.join(DATASET_PATH, self.tensors[0][index]))
        image = torch.tensor(image, dtype=torch.float32).permute(2, 0, 1)
        
        if self.transforms:
            image = self.transforms(image)
        
        return (image, label, bbox)
    
    def __len__(self):
        return self.tensors[0].shape[0]

In [None]:
class CustomCNN(nn.Module):
    
    def __init__(self, in_channels=3, out_channels_cnn=3, bboxes=4):
        super().__init__()
        kernel_size = 3
        stride = 1
        padding = 1
        
        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=64, kernel_size=kernel_size, stride=stride, padding=padding)
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=96, kernel_size=kernel_size, stride=stride, padding=padding)
        self.conv3 = nn.Conv2d(in_channels=96, out_channels=128, kernel_size=kernel_size, stride=stride, padding=padding)
        self.conv4 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=kernel_size, stride=stride, padding=padding)
        self.conv5 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=kernel_size, stride=stride, padding=padding)
        
        self.batchnorm1 = nn.BatchNorm2d(32)
        self.batchnorm2 = nn.BatchNorm2d(64)
        self.batchnorm3 = nn.BatchNorm2d(128)
        self.maxpool = nn.MaxPool2d(kernel_size=2)
        self.relu = nn.ReLU()
        self.fc = nn.Flatten()
        self.cnn_layer = nn.Linear(7*7*128, out_channels_cnn)
        self.regressor = nn.Linear(7*7*128, bboxes)
    
    def cnn_layers(self, x):
        x = self.relu(x)
        x = self.maxpool(x)
        return x
    
    def feature_extractor(self, x):

        x = self.conv1(x)
        x = self.batchnorm1(x)
        x = self.cnn_layers(x)
        
        x = self.conv2(x)
        x = self.batchnorm2(x)
        x = self.cnn_layers(x)
        
        x = self.conv3(x)
        x = self.batchnorm2(x)
        x = self.cnn_layers(x)
        
        x = self.conv4(x)
        x = self.batchnorm3(x)
        x = self.cnn_layers(x)
        
        x = self.conv5(x)
        x = self.batchnorm3(x)
        x = self.cnn_layers(x)
        
        x = self.fc(x)
        return x
    
    def forward(self, x):
        x = self.feature_extractor(x)
        classifier_op = self.cnn_layer(x)
        regressor_op = self.regressor(x)
        return (regressor_op, classifier_op)

In [41]:
train_set, validation_set, test_set = create_dataset_df(DATASET_PATH)



In [43]:
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor(),
])

train_images = train_set["file_names"].values
train_class = torch.from_numpy(train_set["class"].values)
train_bbox = torch.from_numpy(train_set[['xmin', 'xmax', 'ymin', 'ymax']].values)

val_images = validation_set["file_names"].values
val_class = torch.from_numpy(validation_set["class"].values)
val_bbox = torch.from_numpy(validation_set[['xmin', 'xmax', 'ymin', 'ymax']].values)

test_images = test_set["file_names"].values
test_class = torch.from_numpy(test_set["class"].values)
test_bbox = torch.from_numpy(test_set[['xmin', 'xmax', 'ymin', 'ymax']].values)

train_tensor = CustomTensorDataset((train_images, train_class, train_bbox), transforms=transform)
val_tensor = CustomTensorDataset((val_images, val_class, val_bbox), transforms=transform)
test_tensor = CustomTensorDataset((test_images, test_class, test_bbox), transforms=transform)


In [62]:
model = CustomCNN(in_channels=3, out_channels_cnn=3, bboxes=4)

lr=0.0001

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
opt = torch.optim.Adam(model.parameters(), lr=lr)

classLossFunc = nn.CrossEntropyLoss()
bboxLossFunc = nn.MSELoss()
bbox_l1_loss_func = nn.L1Loss(reduction="mean")

model = model.to(device)

In [60]:
EPOCHS = 15
train_loss = []
train_accuracy = []
test_loss = []
test_accuracy = []


for epoch in tqdm.tqdm(range(EPOCHS)):
    correct = 0
    iterations = 0
    iter_loss = 0
    model.train()
    for i, (images, labels, bbox) in enumerate(train_tensor):
        images = images.to(device)
        labels = labels.to(device)
        bbox = bbox.to(device)

        regressor, classifier = model(images.unsqueeze(0))

        _, predicted = torch.max(classifier, 1) ## To get the labels of predicted 
        predicted_bbox = bbox + regressor ## to get the bbox of the predicted (add the regression offset with the original bbox)

        clf_loss = classLossFunc(classifier, labels.unsqueeze(0))
        #reg_loss = bboxLossFunc(predicted_bbox, bbox)
        
        #total_loss = (clf_loss + reg_loss).clone().detach().requires_grad_(True)
        reg_loss = bbox_l1_loss_func(predicted_bbox, bbox, reduction="none").sum(1)
        reg_loss = reg_loss.sum()
        total_loss = clf_loss + (reg_loss/1000) 
        
        opt.zero_grad()
        total_loss.backward()
        opt.step()
        
        iter_loss += total_loss.item()
        correct += (predicted == labels).sum().item()
        iterations += 1
        
    train_loss.append(iter_loss / iterations)
    train_accuracy.append((100 * correct / len(train_tensor)))
    print(f"Epoch [{epoch + 1} / {EPOCHS}], Training Loss: {train_loss[-1]:.3f}, Training Accuracy: {train_accuracy[-1]:.3f}")

  0%|                                                                                           | 0/15 [00:00<?, ?it/s]


RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same