In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as T

from pathlib import Path

import cv2

from math import exp, pi
import math

import matplotlib.pyplot as plt

from tqdm import tqdm

from PIL import Image
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Number of samples per batch; passed to the DataLoaders built at the end of the notebook.
batch_size = 1

In [4]:
# Pick the GPU when PyTorch can see one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

In [5]:
# The dataset root and the annotation folder are stated once; every other
# path is derived from them so a relocation only needs a single edit.
dataset_root = Path('/kaggle/input/table-tennis-ball-position-detection-dataset/openttgames/openttgames')
annot_root = Path('/kaggle/input/openttannot')

train_images_path = dataset_root / 'images' / 'train'
train_labels_path = dataset_root / 'labels' / 'train'

test_images_path = dataset_root / 'images' / 'test'
test_labels_path = dataset_root / 'labels' / 'test'

train_annot_file = annot_root / 'train_dataset_info.csv'
test_annot_file = annot_root / 'test_dataset_info.csv'


def _load_annotations(annot_file, game_id=0):
    """Read an annotation CSV, drop incomplete rows and keep a single game.

    Args:
        annot_file: path of the CSV file to read.
        game_id: value of the ``game_id`` column to keep.

    Returns:
        A DataFrame holding only the rows without missing values whose
        ``game_id`` equals ``game_id``.
    """
    annotations = pd.read_csv(annot_file).dropna()
    return annotations[annotations['game_id'] == game_id]


train_df = _load_annotations(train_annot_file)
test_df = _load_annotations(test_annot_file)

# Show only a preview instead of dumping the whole frame into the output.
train_df.head()

In [6]:
class BallDetectionStage(nn.Module):
    """Convolutional stage that scores every x and every y position of its input.

    The network takes a 3-channel image batch and returns two tensors of
    sigmoid activations: one with 320 values per sample (x axis) and one with
    128 values per sample (y axis).

    The first fully connected layer expects 2560 flattened features. With six
    stride-2 poolings and 256 final channels this matches a 128x320 (HxW)
    input: 256 * (128 / 64) * (320 / 64) = 2560.
    """

    def __init__(self):
        super().__init__()

        # Modules are created in a fixed order so parameter registration (and
        # therefore state-dict layout) stays stable.
        self.conv_1x1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=1, stride=1, padding=0)
        self.batch_norm = nn.BatchNorm2d(64)
        self.relu_1 = nn.ReLU()

        self.conv_block_1 = self._make_conv_block(64, 64)
        self.conv_block_2 = self._make_conv_block(64, 64)
        self.dropout_2d_1 = nn.Dropout2d(p=0.5)

        self.conv_block_3 = self._make_conv_block(64, 128)
        self.conv_block_4 = self._make_conv_block(128, 128)
        self.dropout_2d_2 = nn.Dropout2d(p=0.5)

        self.conv_block_5 = self._make_conv_block(128, 256)
        self.conv_block_6 = self._make_conv_block(256, 256)
        self.dropout_2d_3 = nn.Dropout2d(p=0.5)

        self.flatten = nn.Flatten()

        self.general_fc_block = nn.Sequential(
            nn.Linear(in_features=2560, out_features=1792),
            nn.ReLU(),
            # Plain element-wise dropout: the tensor here is 2-D (batch,
            # features), which channel-wise Dropout2d is not meant for.
            nn.Dropout(p=0.5)
        )

        self.x_fc_block = nn.Sequential(
            nn.Linear(in_features=1792, out_features=640),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=640, out_features=320),
            nn.Sigmoid()
        )

        self.y_fc_block = nn.Sequential(
            nn.Linear(in_features=1792, out_features=256),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=256, out_features=128),
            nn.Sigmoid()
        )

    @staticmethod
    def _make_conv_block(in_channels, out_channels):
        """Build a 3x3 conv -> batch norm -> ReLU -> 2x2 max-pool block."""
        return nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        )

    def forward(self, x):
        """Run the stage.

        Args:
            x: image batch with 3 channels; the spatial size must flatten to
                2560 features after the convolutional part (128x320 input).

        Returns:
            Tuple ``(x_out, y_out)`` with shapes ``(batch, 320)`` and
            ``(batch, 128)``; all values are sigmoid outputs in [0, 1].
        """
        x = self.conv_1x1(x)
        x = self.batch_norm(x)
        x = self.relu_1(x)

        x = self.conv_block_1(x)
        x = self.conv_block_2(x)
        x = self.dropout_2d_1(x)

        x = self.conv_block_3(x)
        x = self.conv_block_4(x)
        x = self.dropout_2d_2(x)

        x = self.conv_block_5(x)
        x = self.conv_block_6(x)
        x = self.dropout_2d_3(x)

        x = self.flatten(x)

        # Shared representation feeding the two axis-specific heads.
        general_out = self.general_fc_block(x)

        x_out = self.x_fc_block(general_out)
        y_out = self.y_fc_block(general_out)

        return x_out, y_out

In [7]:
class DetectionModel(nn.Module):
    """Two-stage detector: a global stage on the downscaled frame, then a
    local stage on a crop of the full-resolution frame.

    The crop is centred on the position with the highest global score and is
    shifted inwards where it would otherwise cross an image border.
    """

    def __init__(self):
        super().__init__()
        self.global_stage = BallDetectionStage()
        self.local_stage = BallDetectionStage()

    def __crop_image(self, image, x_c, y_c, height=128, width=320):
        """Cut a ``height`` x ``width`` window around ``(x_c, y_c)``.

        Args:
            image: tensor laid out as (batch, channels, rows, columns).
            x_c: column of the desired window centre, in pixels.
            y_c: row of the desired window centre, in pixels.
            height: number of rows in the window.
            width: number of columns in the window.

        Returns:
            ``image[:, :, y_low:y_low + height, x_low:x_low + width]`` where
            the window origin is clamped so the window stays inside the image.
        """
        img_height, img_width = image.shape[2], image.shape[3]

        # Clamp the top-left corner to [0, size - window] on each axis; this
        # covers the left/top and the right/bottom border cases in one step
        # and always produces a non-empty window with low < high.
        x_low = max(0, min(x_c - width // 2, img_width - width))
        y_low = max(0, min(y_c - height // 2, img_height - height))

        return image[:, :, y_low:y_low + height, x_low:x_low + width]

    def forward(self, in_image, in_resized):
        """Run both stages.

        Args:
            in_image: full-resolution image batch (batch, channels, rows, columns).
            in_resized: the same frames downscaled to the global stage input size.

        Returns:
            ``((global_x, global_y), (local_x, local_y))`` - the outputs of the
            global stage and of the local stage applied to the crops.
        """
        out_global_x, out_global_y = self.global_stage(in_resized)

        # The arg-max only selects the crop location, so it is taken on
        # detached scores, one peak per sample in the batch.
        peaks_x = torch.argmax(out_global_x.detach(), dim=1).tolist()
        peaks_y = torch.argmax(out_global_y.detach(), dim=1).tolist()

        img_height, img_width = in_image.shape[2], in_image.shape[3]
        bins_x, bins_y = out_global_x.shape[1], out_global_y.shape[1]

        crops = []
        for sample, (peak_x, peak_y) in enumerate(zip(peaks_x, peaks_y)):
            # Rescale the peak index from score-vector bins to image pixels.
            g_x = (peak_x * img_width) // bins_x
            g_y = (peak_y * img_height) // bins_y
            crops.append(self.__crop_image(in_image[sample:sample + 1], g_x, g_y))
        cropped_image = torch.cat(crops, dim=0)

        out_local_x, out_local_y = self.local_stage(cropped_image)

        return (out_global_x, out_global_y), (out_local_x, out_local_y)

In [8]:
class OpenTTDataset(Dataset):
    """Frames plus per-axis Gaussian targets for the global and local stages.

    Each item is ``(image, resized_image, (x_global, y_global),
    (x_local, y_local))``: the full frame and its 320x128 resize as
    channel-first float tensors scaled to [0, 1], followed by float32 target
    vectors of length 320 (x) and 128 (y).

    The annotation columns ``a, b, c, d`` are used as a normalised box
    (centre x, centre y, width, height) - this is how
    ``__transform_coords_to_corners`` consumes them.

    NOTE(review): the local target is computed for a crop centred on the
    annotated position, so away from the borders it always peaks at the crop
    centre - confirm this matches how the crop is chosen at training time.
    """

    # Width/height of the downscaled frame and of the local crop; also the
    # lengths of the x and y target vectors.
    STAGE_WIDTH, STAGE_HEIGHT = 320, 128
    # Frame size used to turn normalised coordinates into pixels for the local
    # stage. Presumably the native resolution of the images - TODO confirm.
    FRAME_WIDTH, FRAME_HEIGHT = 1920, 1080
    # Standard deviations of the Gaussian targets.
    GLOBAL_SD, LOCAL_SD = 5, 12

    def __init__(self, df, image_dir, transforms=None):
        """
        Args:
            df: annotations with columns ``file_name``, ``a``, ``b``, ``c``, ``d``.
            image_dir: directory (``pathlib.Path``) containing the image files.
            transforms: stored on the instance; not applied by ``__getitem__``.
        """
        super().__init__()
        self.df = df
        self.images_ids = df['file_name'].unique()
        self.image_dir = image_dir
        self.transforms = transforms

    def __transform_coords_to_corners(self, coords, width=320, height=128):
        """Convert the first row of normalised (cx, cy, w, h) to pixel corners."""
        x_1 = int(width * coords[0, 0] - width * coords[0, 2] / 2)
        y_1 = int(height * coords[0, 1] - height * coords[0, 3] / 2)
        x_2 = int(width * coords[0, 0] + width * coords[0, 2] / 2)
        y_2 = int(height * coords[0, 1] + height * coords[0, 3] / 2)
        return x_1, y_1, x_2, y_2

    @staticmethod
    def __offset_in_crop(center, crop, full):
        """Position of ``center`` inside a ``crop``-wide window over ``[0, full)``.

        The window is centred on ``center`` except near the borders, where it
        is flush with the border and the offset moves away from the middle.
        """
        half = crop // 2
        if 0 <= center < half:
            return center
        if full - half <= center < full:
            return crop - (full - center)
        return half

    def __get_coords(self, coords, stage):
        """Return the integer ``(x, y)`` target position for ``stage``.

        ``'global'`` gives the box centre in downscaled-frame pixels;
        ``'local'`` gives the box centre relative to the local crop.

        Raises:
            ValueError: if ``stage`` is neither ``'global'`` nor ``'local'``.
        """
        if stage == 'global':
            x_1, y_1, x_2, y_2 = self.__transform_coords_to_corners(
                coords, width=self.STAGE_WIDTH, height=self.STAGE_HEIGHT)
            return x_1 + (x_2 - x_1) // 2, y_1 + (y_2 - y_1) // 2
        if stage == 'local':
            x_1, y_1, x_2, y_2 = self.__transform_coords_to_corners(
                coords, width=self.FRAME_WIDTH, height=self.FRAME_HEIGHT)
            x_c = x_1 + (x_2 - x_1) // 2
            y_c = y_1 + (y_2 - y_1) // 2
            x_local = self.__offset_in_crop(x_c, self.STAGE_WIDTH, self.FRAME_WIDTH)
            y_local = self.__offset_in_crop(y_c, self.STAGE_HEIGHT, self.FRAME_HEIGHT)
            return x_local, y_local
        raise ValueError(f"stage must be 'global' or 'local', got {stage!r}")

    def __norm_distrib(self, x, m=400, sd=20):
        """Normal density with mean ``m`` and deviation ``sd``.

        ``x`` may be a scalar or a NumPy array; the result has the same shape.
        """
        return np.exp(-((x - m) ** 2) / (2 * sd ** 2)) / (sd * ((2 * pi) ** 0.5))

    def __gaussian_target(self, length, mean, sd):
        """Float32 tensor with the normal density evaluated at 0..length-1."""
        values = self.__norm_distrib(np.arange(length), m=mean, sd=sd)
        return torch.from_numpy(values).to(torch.float32)

    def __getitem__(self, indx):
        fname = self.images_ids[indx]
        image_path = self.image_dir / fname

        image = cv2.imread(str(image_path), cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread signals an unreadable/missing file by returning None.
            raise FileNotFoundError(f'Could not read image: {image_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image /= 255.0

        resized_image = cv2.resize(image, (self.STAGE_WIDTH, self.STAGE_HEIGHT))

        # HWC -> CHW for PyTorch.
        image = torch.permute(torch.tensor(image), (2, 0, 1))
        resized_image = torch.permute(torch.tensor(resized_image), (2, 0, 1))

        coords = self.df[self.df['file_name'] == fname][['a', 'b', 'c', 'd']].values

        x_global, y_global = self.__get_coords(coords, stage='global')
        x_local, y_local = self.__get_coords(coords, stage='local')

        global_target = (
            self.__gaussian_target(self.STAGE_WIDTH, x_global, self.GLOBAL_SD),
            self.__gaussian_target(self.STAGE_HEIGHT, y_global, self.GLOBAL_SD),
        )
        local_target = (
            self.__gaussian_target(self.STAGE_WIDTH, x_local, self.LOCAL_SD),
            self.__gaussian_target(self.STAGE_HEIGHT, y_local, self.LOCAL_SD),
        )

        return image, resized_image, global_target, local_target

    def __len__(self):
        # Items are indexed through the unique file names, so that is the
        # length; the frame may hold several rows for one file.
        return len(self.images_ids)

In [9]:
class Averager:
    """Keeps the latest value and a weighted running mean of all values seen."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget everything recorded so far."""
        for field in ('val', 'avg', 'sum', 'count'):
            setattr(self, field, 0)

    def update(self, val: float, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [10]:
def train_one_epoch(train_data_loader, model, loss_fn, optimizer, device):
    """Train ``model`` for one pass over ``train_data_loader``.

    Args:
        train_data_loader: yields ``(image, image_resized, global_target,
            local_target)`` where each target is an ``(x, y)`` pair of tensors.
        model: called as ``model(image, image_resized)``; returns
            ``(global_pred, local_pred)``, each an ``(x, y)`` pair.
        loss_fn: callable ``loss_fn(prediction, target)`` returning a scalar tensor.
        optimizer: optimizer updating the model parameters.
        device: device the batch tensors are moved to.

    Returns:
        An ``Averager`` whose ``avg`` is the mean per-sample loss (a Python
        float) over the epoch.
    """
    model.train()

    summary_loss = Averager()

    prog_bar = tqdm(train_data_loader, total=len(train_data_loader))

    for image, image_resized, global_target, local_target in prog_bar:

        image = image.to(device)
        image_resized = image_resized.to(device)

        global_target = [coords.to(device) for coords in global_target]
        local_target = [coords.to(device) for coords in local_target]

        optimizer.zero_grad()

        global_pred, local_pred = model(image, image_resized)

        # Index 0 is the x axis, index 1 the y axis; both stages contribute.
        loss_x = loss_fn(global_pred[0], global_target[0]) + loss_fn(local_pred[0], local_target[0])
        loss_y = loss_fn(global_pred[1], global_target[1]) + loss_fn(local_pred[1], local_target[1])

        total_loss = loss_x + loss_y

        total_loss.backward()
        optimizer.step()

        # .item() stores a plain float: accumulating the tensor itself would
        # keep every batch's autograd graph alive for the whole epoch.
        # Weight by the real batch size so a short last batch counts correctly.
        summary_loss.update(total_loss.item(), image.size(0))
        prog_bar.set_postfix(loss=summary_loss.avg)

    return summary_loss

In [11]:
class SaveBestModel:
    """Callable that checkpoints a model whenever the tracked loss gets lower."""

    def __init__(self, best_loss = 1000):
        # Lowest loss seen so far; a new value has to be strictly below it.
        self.best_loss = best_loss

    def __call__(self, current_loss, epoch, model):
        """Write ``model.state_dict()`` to 'checkpoint.pth' if ``current_loss`` is a new minimum."""
        improved = self.best_loss > current_loss
        if not improved:
            return

        self.best_loss = current_loss
        print(f'Best model found for epoch {epoch+1}')
        torch.save(model.state_dict(), 'checkpoint.pth')

In [12]:
def train(train_data_loader, test_data_loader, model, loss_fn, optimizer, device, epochs):
    """Train for ``epochs`` epochs, validating and checkpointing after each one.

    After every epoch the mean training loss and the mean validation distance
    are printed. The model is checkpointed when the validation distance
    reaches a new minimum.

    Args:
        train_data_loader: loader used by ``train_one_epoch``.
        test_data_loader: loader used by ``validate``.
        model: model to optimise.
        loss_fn: loss passed to ``train_one_epoch``.
        optimizer: optimizer passed to ``train_one_epoch``.
        device: device the batches are moved to.
        epochs: number of epochs to run.
    """
    # Start from +inf so the first epoch always produces a checkpoint,
    # whatever the scale of the metric.
    save_best_model = SaveBestModel(best_loss=float('inf'))
    for epoch in range(epochs):
        print(f'TRAIN EPOCH {epoch+1}')
        summary_loss = train_one_epoch(train_data_loader, model, loss_fn, optimizer, device)
        print(f'SUMMARY EPOCH LOSS: {summary_loss.avg}')
        summary_distance = validate(test_data_loader, model, device)
        print(f'SUMMARY EPOCH DISTANCE: {summary_distance.avg}')
        # Select on the held-out metric: the training loss keeps falling even
        # when the model is only memorising the training frames.
        save_best_model(summary_distance.avg, epoch, model)
        print('---------------------------')

In [13]:
def _decode_ball_position(global_maps, local_maps, image_size, crop_size=(320, 128)):
    """Combine global and local score vectors into full-image pixel coordinates.

    For each axis the global peak is rescaled to image pixels and a crop
    window of ``crop_size`` is centred on it, shifted inwards at the borders.
    The local peak is then added to the window origin.

    Args:
        global_maps: ``(x, y)`` pair of tensors shaped (batch, bins).
        local_maps: ``(x, y)`` pair of tensors shaped (batch, crop length).
        image_size: ``(width, height)`` of the full image in pixels.
        crop_size: ``(width, height)`` of the local crop; must match the crop
            used by the model.

    Returns:
        ``(x, y)`` integer tensors with one coordinate per sample.
    """
    decoded = []
    for g_map, l_map, full, crop in zip(global_maps, local_maps, image_size, crop_size):
        g_peak = torch.argmax(g_map.detach().cpu(), dim=1)
        l_peak = torch.argmax(l_map.detach().cpu(), dim=1)
        center = torch.div(g_peak * full, g_map.shape[1], rounding_mode='floor')
        # Same border handling as the crop itself: origin stays in [0, full - crop].
        origin = torch.clamp(center - crop // 2, min=0, max=full - crop)
        decoded.append(origin + l_peak)
    return decoded[0], decoded[1]


def validate(test_data_loader, model, device):
    """Measure the mean pixel distance between predicted and target positions.

    Args:
        test_data_loader: yields ``(image, resized_image, global_target,
            local_target)`` batches.
        model: called as ``model(image, resized_image)``.
        device: device the image tensors are moved to.

    Returns:
        An ``Averager`` whose ``avg`` is the mean Euclidean distance, in
        full-image pixels, over all samples seen.
    """
    model.eval()

    averager = Averager()

    with torch.no_grad():
        prog_bar = tqdm(test_data_loader, total=len(test_data_loader))

        for image, resized_image, global_target, local_target in prog_bar:
            image = image.to(device)
            resized_image = resized_image.to(device)

            global_pred, local_pred = model(image, resized_image)

            image_size = (image.shape[3], image.shape[2])  # (width, height)

            x_pred, y_pred = _decode_ball_position(global_pred, local_pred, image_size)
            x_target, y_target = _decode_ball_position(global_target, local_target, image_size)

            squared = (x_target - x_pred) ** 2 + (y_target - y_pred) ** 2
            distances = torch.sqrt(squared.to(torch.float32))

            # One distance per sample; weight the batch mean by its real size.
            averager.update(distances.mean().item(), distances.numel())
            prog_bar.set_postfix(distance = averager.avg)

    return averager

In [None]:
# Fix the PyTorch RNG so weight initialisation, dropout and shuffling are
# repeatable across runs.
torch.manual_seed(42)

train_dataset = OpenTTDataset(train_df, train_images_path)
train_data_loader = DataLoader(train_dataset, batch_size, shuffle=True)

test_dataset = OpenTTDataset(test_df, test_images_path)
# Evaluation does not benefit from shuffling; a fixed order keeps runs comparable.
test_data_loader = DataLoader(test_dataset, batch_size, shuffle=False)

model = DetectionModel()
model = model.to(device)

params = [p for p in model.parameters() if p.requires_grad]

# Both detection stages end in a Sigmoid, so their outputs are per-position
# probabilities rather than logits. BCELoss consumes probabilities directly,
# whereas CrossEntropyLoss would apply a softmax on top of the sigmoid.
loss_fn = torch.nn.BCELoss()
optimizer = torch.optim.Adam(params)

train(train_data_loader, test_data_loader, model, loss_fn, optimizer, device, 30)