# global variables

In [1]:
# label codes: the background code plus one code per object class
LABEL_CODE_BACKGROUND = 0
LABELS_CODES = [LABEL_CODE_BACKGROUND, 1, 2, 3]

# the number of classes includes the background
NUMBER_OF_CLASSES = len(LABELS_CODES)

# description and color associated with each object class (the background has neither)
LABEL_CODE_TO_DESC = {1: 'monorail', 2: 'person', 3: 'forklift'}
LABEL_CODE_TO_COLOR = {1: 'red', 2: 'green', 3: 'blue'}

# length of the longest description, used to right-align the descriptions when printing
LENGTH_LONGEST_LABEL = len(max(LABEL_CODE_TO_DESC.values(), key=len))

# dependencies

In [2]:
import csv
import json
import numpy as np
import random

# load metadata

In [14]:
# training metadata: a list of (image path, mask path, labels/boxes path) entries
# collected from several json files
data = []

# fraction of the additional persons entries that is kept for training
FRACTION_ADDITIONAL_PERSONS = 0.8

# fixed seed: without it every run of this cell would keep a different subset of persons
# and the statistics computed below would not be reproducible
SEED_SAMPLING_PERSONS = 42

# train
with open('data/train.json', 'r') as f:
    data.extend(json.load(f))

# train additional - persons
with open('data/train-additional-persons.json', 'r') as f:
    persons = json.load(f)

# keep only a random subset of the additional persons
# a dedicated generator is used so that the selection is the same on every run
# and the state of the global random generator is left untouched
persons = random.Random(SEED_SAMPLING_PERSONS).sample(
    persons, int(len(persons) * FRACTION_ADDITIONAL_PERSONS)
)
data.extend(persons)

# train additional - forklifts
with open('data/train-additional-forklifts.json', 'r') as f:
    data.extend(json.load(f))

# the training set is small and the validation set is even smaller..
# so small that any metric computed on it probably wouldn't be particularly reliable
# at this point it's probably better to use the validation set as additional training data
with open('data/eval-persons-forklifts.json', 'r') as f:
    data.extend(json.load(f))

# unpack train metadata into separate lists
path_files_images_train, path_files_masks_train, path_files_labels_boxes_train = map(list, zip(*data))

# test
with open('data/test.json', 'r') as f:
    path_files_images_test, path_files_masks_test, path_files_labels_boxes_test = map(list, zip(*json.load(f)))

# object detection

In [15]:
# which data should be evaluated?
# the statistics computed below read the labels/boxes files listed here: the training ones are selected,
# assign path_files_labels_boxes_test instead to analyze the test set
PATH_FILES_LABELS_BOXES = path_files_labels_boxes_train

## samples (images), instances (objects) and boxes aspect ratios for each class

In [16]:
# for each class (background excluded) collect:
# - the indexes of the samples (images) that contain at least one object of the class
# - the number of instances (objects) of the class
# - the aspect ratios (width / height) of the boxes of the class
# storing the samples indexes in a set is a simple way to count each sample only once per class
labels_codes_objects = [label for label in LABELS_CODES if label != LABEL_CODE_BACKGROUND]
samples_indexes_per_class = {label: set() for label in labels_codes_objects}
instances_per_class = {label: 0 for label in labels_codes_objects}
boxes_aspect_ratios_per_class = {label: [] for label in labels_codes_objects}

# for each file collect samples, instances and aspect ratios per class
for i, path_file_labels_boxes in enumerate(PATH_FILES_LABELS_BOXES):

    # read ground truth labels and boxes
    # newline='' is the way the csv module documentation asks files to be opened
    with open(path_file_labels_boxes, 'r', newline='') as f:
        for row in csv.reader(f):

            # csv.reader returns an empty list for a blank line (e.g. a trailing one): nothing to count
            if not row:
                continue

            # each row is expected to contain exactly: label, xmin, ymin, xmax, ymax
            label, xmin, ymin, xmax, ymax = row

            # format ground truth data
            # NOTE(review): the + 1.0 presumably accounts for inclusive pixel coordinates - confirm
            label = int(label)
            width = float(xmax) - float(xmin) + 1.0
            height = float(ymax) - float(ymin) + 1.0

            # register the sample as containing the class
            samples_indexes_per_class[label].add(i)

            # increment instances counter
            instances_per_class[label] += 1

            # add aspect ratio to the list
            boxes_aspect_ratios_per_class[label].append(width / height)


# calculate the number of samples per class
samples_per_class = {label: len(indexes) for label, indexes in samples_indexes_per_class.items()}

In [18]:
def _print_banner(*titles):
    """Print an empty line followed by the given title lines framed by two rows of asterisks."""
    print('\n************************')
    for title in titles:
        print(title)
    print('************************')


def _print_counts_per_class(counts_per_class, total):
    """Print, for each class, its right-aligned description, its count and its share of `total`.

    counts_per_class: dict mapping a label code to an integer count
    total: denominator of the percentages; when it is 0 every share is printed as 0%
           instead of failing with a division by zero
    """
    for label, count in counts_per_class.items():
        text_desc = f'{LABEL_CODE_TO_DESC[label]:>{LENGTH_LONGEST_LABEL}}'
        text_values_absolute = format(count, ",")
        percentage = count / total * 100 if total else 0.0
        text_values_percentages = f'{percentage:.0f}%'
        print(f'> {text_desc}: {text_values_absolute:>5} - {text_values_percentages}')


# print samples
# the total is the sum of the per class counts: a sample that contains objects of more than one class
# contributes once for each of them, so the percentages are relative to this sum
total_samples = sum(samples_per_class.values())
_print_banner('***      samples     ***')
_print_counts_per_class(samples_per_class, total_samples)

# print instances
total_instances = sum(instances_per_class.values())
_print_banner('***    instances     ***')
_print_counts_per_class(instances_per_class, total_instances)

# print aspect ratios
percentiles = [10, 20, 30, 40, 50, 60, 70, 80, 90]
_print_banner('***   aspect ratios  ***', '***   (percentiles)  ***')
for label, aspect_ratios in boxes_aspect_ratios_per_class.items():
    print(f'> {LABEL_CODE_TO_DESC[label]}')

    # percentiles are not defined for a class without boxes
    if len(aspect_ratios) == 0:
        print('   - no boxes')
        continue

    percentile_values = np.percentile(np.array(aspect_ratios), percentiles)
    for percentile, value in zip(percentiles, percentile_values):
        print(f'   - p{percentile}: {value:.3f}')


************************
***      samples     ***
************************
> monorail: 1,340 - 34%
>   person: 1,443 - 37%
> forklift: 1,153 - 29%

************************
***    instances     ***
************************
> monorail: 1,861 - 34%
>   person: 2,097 - 38%
> forklift: 1,535 - 28%

************************
***   aspect ratios  ***
***   (percentiles)  ***
************************
> monorail
   - p10: 0.300
   - p20: 0.503
   - p30: 0.801
   - p40: 1.233
   - p50: 1.570
   - p60: 2.044
   - p70: 2.502
   - p80: 3.376
   - p90: 5.129
> person
   - p10: 0.315
   - p20: 0.385
   - p30: 0.467
   - p40: 0.557
   - p50: 0.662
   - p60: 0.781
   - p70: 0.955
   - p80: 1.277
   - p90: 2.571
> forklift
   - p10: 0.461
   - p20: 0.587
   - p30: 0.728
   - p40: 0.845
   - p50: 0.975
   - p60: 1.134
   - p70: 1.272
   - p80: 1.442
   - p90: 1.930
