# Risiko Detection: Boxes Analysis #
Here we will analyze the properties of the bounding boxes specified as text files inside the directory specified as *labels_path*.

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
import cv2
# Default figure size used by the plots in this notebook
plt.rcParams['figure.figsize'] = [9, 9]

# Directory holding the bounding-box text files analysed below
labels_path = "Generated_Dataset/train/labels"
# Directory holding the images; an image is looked up by the base name of its
# label file with a ".jpg" extension
images_path = "Generated_Dataset/train/images"

#### Read Data ####
Read the label files, compute box areas and aspect ratios, and compute some summary statistics.

In [None]:
# Names of the regular files found in labels_path, sorted alphabetically
# (sub-directories are ignored).
labels_files = sorted(
    name
    for name in os.listdir(labels_path)
    if os.path.isfile(os.path.join(labels_path, name))
)

# When True, every image is opened to read its real size in pixels; the box
# loading code below then uses those sizes to rescale the boxes. When False,
# no image is read and the image statistics are skipped.
load_imgs = False

# get img size in pixel
if load_imgs:
    # One row per label file: (width, height)
    img_dims = np.zeros([len(labels_files), 2], dtype=np.float32)
    for i, label_file in enumerate(labels_files):
        img_path = os.path.join(images_path, os.path.splitext(label_file)[0] + ".jpg")
        img = cv2.imread(img_path)
        # cv2.imread does not raise on a missing/corrupt file, it returns None
        if img is None:
            raise FileNotFoundError(
                f"Could not read image '{img_path}' for label file '{label_file}'"
            )
        # img.shape is (rows, cols, ...) i.e. (height, width, ...)
        img_dims[i, 0] = img.shape[1]
        img_dims[i, 1] = img.shape[0]

    img_width = img_dims[:, 0]
    img_height = img_dims[:, 1]
    img_area = img_width * img_height
    img_stats = np.column_stack(
        [img_dims, img_width / img_height, img_area, np.sqrt(img_area)]
    )

    df = pd.DataFrame(img_stats)
    del img_stats
    df.columns = ["img width", "img height", "img ratio", "img area", "img root area"]
    # An expression nested inside an `if` is not displayed by the notebook,
    # so the summary is printed explicitly.
    print(df.describe())

In [None]:
# Per-file label arrays, one (n_boxes, 5) array per kept file. After scaling,
# columns 1-2 are the box centre and columns 3-4 the box width/height in
# pixels of the std_size canvas; column 0 is left untouched apart from rounding.
labels_list = []
std_size = np.array([1000, 1000], dtype=np.float32) # size of the image to be input of nn

# Indices (into labels_files) of the files that contain at least one box
kept_ids = []

for i, label_file in enumerate(labels_files):
    lbl = np.loadtxt(os.path.join(labels_path, label_file), dtype=np.float32)
    if lbl.size == 0:
        # Empty label file: nothing to analyse
        continue
    if lbl.ndim == 1:
        # A file with a single row is loaded as a 1-D array
        lbl = np.expand_dims(lbl, 0)

    if load_imgs:
        # Scale the image so that its longest side matches std_size
        max_dim_id = np.argmax(img_dims[i])
        scale = std_size[max_dim_id] / img_dims[i, max_dim_id]
        rescaled_size = np.round(img_dims[i] * scale)
    else:
        rescaled_size = std_size

    # Scale boxes centers and sizes
    lbl[:, 1:] *= np.hstack([rescaled_size, rescaled_size])

    # Shift boxes centers by the offset of the rescaled image, centred in the
    # std_size canvas
    lbl[:, 1:3] += (std_size - rescaled_size) / 2

    np.round(lbl, out=lbl)

    labels_list.append(lbl)
    kept_ids.append(i)

# Drop the files without boxes so that labels_list[i] always describes
# labels_files[i] (and img_dims[i]); otherwise every index after a skipped
# file would point at the wrong file name.
labels_files = [labels_files[i] for i in kept_ids]
if load_imgs:
    img_dims = img_dims[kept_ids]

# Stack all boxes once instead of growing the array inside the loop
# (quadratic copying, and the `dtype=` keyword of np.vstack is not available
# in older NumPy releases).
if labels_list:
    boxes = np.vstack([lbl[:, 1:] for lbl in labels_list]).astype(np.float32, copy=False)
else:
    boxes = np.empty([0, 4], dtype=np.float32)

sizes = boxes[:, 2:]
# NOTE: despite its name, `areas` holds the square root of the box area
# (a linear "size" of the box); the name is kept because later cells use it.
areas = np.sqrt(sizes[:, 0] * sizes[:, 1])
ratios = sizes[:, 0] / sizes[:, 1]

box_stat_data = np.column_stack([sizes, areas, ratios])

df = pd.DataFrame(box_stat_data)
# The third column is sqrt(width * height), hence "Root Area"
df.columns = ["Width", "Height", "Root Area", "Ratio"]
df.describe(percentiles=[0.02,0.1,0.25,0.5,0.75,0.9,0.98])

Find Image with objects that have the smallest area to inspect it manually (check whether it's actually worth it to allow detection for such small objects)

In [None]:
# Search every image for the single box with the smallest width * height.
min_area = np.inf
min_dims = None  # full label row of the smallest box, None when there are no boxes
min_fname = ""

# NOTE(review): this pairs labels_list[i] with labels_files[i]; that is only
# correct if the code building labels_list appends one entry per label file
# without skipping any - verify.
for label_file, lbl in zip(labels_files, labels_list):
    # Columns 3 and 4 hold the box width and height
    inst_areas = lbl[:, 3] * lbl[:, 4]
    min_id = np.argmin(inst_areas)
    if inst_areas[min_id] < min_area:
        min_dims = lbl[min_id]
        min_area = inst_areas[min_id]
        min_fname = label_file

print(min_area)
print(min_dims)
print(min_fname)

Now we check some statistics about the IoU in each image

In [None]:
def IoU(boxes:np.ndarray) -> float:
    """Return the largest pairwise Intersection-over-Union among `boxes`.

    Parameters
    ----------
    boxes : np.ndarray
        Array of shape (n_boxes, 4) whose columns are
        (center x, center y, width, height).

    Returns
    -------
    float
        Maximum IoU over all pairs of distinct boxes. 0.0 when there are
        fewer than two boxes (no pair exists). A pair whose union has zero
        area (two degenerate boxes) counts as IoU 0 instead of NaN.
    """
    n_boxes = boxes.shape[0]
    if n_boxes < 2:
        return 0.0

    # Convert (center, size) to corner coordinates
    half_dims = boxes[:, 2:] / 2
    x1 = boxes[:, 0] - half_dims[:, 0]
    x2 = boxes[:, 0] + half_dims[:, 0]
    y1 = boxes[:, 1] - half_dims[:, 1]
    y2 = boxes[:, 1] + half_dims[:, 1]

    # Broadcasting a row against a column yields every (i, j) pair
    left = np.maximum(x1, x1[:, np.newaxis])
    top = np.maximum(y1, y1[:, np.newaxis])
    right = np.minimum(x2, x2[:, np.newaxis])
    bottom = np.minimum(y2, y2[:, np.newaxis])

    # Negative extents mean the boxes do not overlap along that axis
    inter_area = np.maximum(right - left, 0) * np.maximum(bottom - top, 0)

    box_area = (x2 - x1) * (y2 - y1)
    union_area = box_area + box_area[:, np.newaxis] - inter_area

    # Divide only where the union is positive; other entries stay at 0
    iou = np.divide(
        inter_area,
        union_area,
        out=np.zeros_like(union_area),
        where=union_area > 0,
    )

    # Ignore the diagonal: a box compared with itself
    off_diagonal = ~np.eye(n_boxes, dtype=bool)

    return np.max(iou[off_diagonal])

# Maximum pairwise IoU of each image. Images with fewer than two boxes have
# no pair to compare and are left out.
# The iteration variable is deliberately not named `boxes`: that name is the
# array of all boxes built earlier in this notebook and must not be overwritten.
max_img_iou_list = np.array(
    [IoU(img_labels[:, 1:]) for img_labels in labels_list if img_labels.shape[0] > 1],
    dtype=np.float32,
)
df = pd.DataFrame(max_img_iou_list)
df.columns= ["Max IoU each img"]
df.describe(percentiles=[0.02,0.1,0.25,0.5,0.75,0.9,0.98])

#### Plot all points ####

In [None]:
# Scatter every box as a (width, height) point
box_widths = sizes[:, 0]
box_heights = sizes[:, 1]
plt.scatter(box_widths, box_heights, s=5, marker='.')

# Axes start at the origin and leave a 20-unit margin past the largest box
plt.xlim([0, box_widths.max() + 20])
plt.ylim([0, box_heights.max() + 20])

# Solid major grid, dotted minor grid
for grid_kind, grid_style in (('major', '-'), ('minor', ':')):
    plt.grid(which=grid_kind, linestyle=grid_style, linewidth='0.5', color='black')
plt.minorticks_on()
plt.show()

Get histogram on areas to estimate size

In [None]:
# Draw the histogram of `areas` on a wide figure, then put the default
# square figure size back for the following plots.
plt.rcParams.update({'figure.figsize': [20, 5]})
plt.hist(areas, bins=250)
plt.show()
plt.rcParams.update({'figure.figsize': [9, 9]})
# Emits a single blank line after the plot
print()

In [None]:
# Side-by-side histograms of box widths (left) and heights (right).
# The figure size is passed directly to plt.subplots, so the global
# rcParams are not modified by this cell.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 5))
ax1.hist(sizes[:,0], bins=250)
ax1.set_title("Width")
ax2.hist(sizes[:,1], bins=250)
ax2.set_title("Height")

# Column-wise extrema: [width, height]
print("Maximum size for the bounding boxes is:", str(np.max(sizes,0)))
print("Minimum size for the bounding boxes is:", str(np.min(sizes,0)))

## KMeans Clustering ##
Now we run some clustering to identify patterns in the data. This will be useful when generating the neural network model and detecting the tanks and flags.
In particular, these results will be useful when choosing the anchor box sizes and ratios.

#### Clustering plot function ####

In [None]:
def plot_clusters(points:np.ndarray, labels:np.ndarray, k:int, centers=None):
    """Scatter-plot 2-D points coloured by cluster label.

    Parameters
    ----------
    points : np.ndarray
        Array whose first two columns are used as x and y coordinates.
    labels : np.ndarray
        Cluster index of each row of `points`; values 0 .. k-1 are plotted.
    k : int
        Number of clusters to draw. The colour palette has six entries and
        is reused cyclically when k is larger.
    centers : np.ndarray, optional
        Cluster centres (first two columns are x and y); drawn as red
        markers when given.
    """
    colors = ["green", "darkorange", "purple", "blue", "lightblue", "lightgreen"]

    # plot each of the k clusters with its own colour
    for i in range(k):
        in_cluster = labels == i
        plt.scatter(
            points[in_cluster, 0],
            points[in_cluster, 1],
            s=5,
            c=colors[i % len(colors)],  # wrap around instead of raising IndexError
            marker='.',
            label="cluster " + str(i),
        )

    # plot the centroids
    if centers is not None:
        plt.scatter(centers[:, 0], centers[:, 1], s=20, marker='o', c='red', edgecolor='black', label='centroids')

    plt.legend(scatterpoints=1)
    plt.minorticks_on()
    # Axes start at the origin and leave a 20-unit margin past the largest point
    plt.xlim([0,points[:,0].max()+20])
    plt.ylim([0,points[:,1].max()+20])
    plt.grid(which='major', linestyle='-', linewidth=0.5, color='black')
    plt.grid(which='minor', linestyle=':', linewidth=0.5, color='black')
    plt.show()

#### Clustering on Size ####

In [None]:
# Cluster the boxes on the one-dimensional `areas` feature
k_sizes = 3
km_size = KMeans(n_clusters=k_sizes, n_init='auto')

# KMeans expects a 2-D (n_samples, n_features) array
km_sizes_output = km_size.fit_predict(areas.reshape(areas.shape[0], 1))

# Rounded cluster centres, printed in ascending order
print(np.sort(np.round(km_size.cluster_centers_), axis=0))

# Show the clusters in the (width, height) plane
plot_clusters(sizes, km_sizes_output, k_sizes)

#### Clustering on Ratios

In [None]:
# Cluster the boxes on their width / height ratio
k_ratios = 3
km_ratios = KMeans(n_clusters=k_ratios, n_init='auto')

# KMeans expects a 2-D (n_samples, n_features) array
km_ratios_output = km_ratios.fit_predict(np.expand_dims(ratios, 1))

# Cluster centres in the order returned by KMeans
print(km_ratios.cluster_centers_)

# Show the clusters in the (width, height) plane
plot_clusters(sizes, km_ratios_output, k_ratios)