# Risiko Detection: Boxes Analysis #
Here we analyze the properties of the bounding boxes stored as text files in the directory given by *labels_path*.

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
import cv2
# Default figure size (matplotlib expresses it in inches) for the plots below.
plt.rcParams['figure.figsize'] = [9, 9]

# Directories holding the label text files and the images they annotate;
# each label file is paired with the ".jpg" image of the same base name.
labels_path = "Professor_Material/real_images/labels"
images_path = "Professor_Material/real_images/images"

#### Read Data ####
Read labels files, compute areas and ratios and do some statistics

In [None]:
# Regular files of the labels directory in sorted order, so that index i of
# labels_list / img_dims always refers to labels_files[i].
labels_files = sorted(
    name for name in os.listdir(labels_path)
    if os.path.isfile(os.path.join(labels_path, name))
)

labels_list = []  # one (n_boxes, 5) array per label file, box columns in pixels
img_dims = []     # one [width, height, width, height] scale vector per image
for filename in labels_files:
    # ndmin=2 keeps a file containing a single box as a (1, 5) array instead
    # of a flat vector, so the column slicing below works for it too.
    lbl = np.loadtxt(os.path.join(labels_path, filename), dtype=np.float32, ndmin=2)

    img_path = os.path.join(images_path, os.path.splitext(filename)[0] + ".jpg")
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals a missing/unreadable file by returning None.
        raise FileNotFoundError(
            f"Could not read image {img_path!r} for label file {filename!r}"
        )

    # OpenCV image shape is (rows, cols, channels), i.e. (height, width, ...).
    # The box columns are used as (x, y, width, height) by the rest of the
    # notebook (see the "Width"/"Height" columns and IoU below), so x-like
    # columns must be scaled by the image width and y-like ones by the height.
    # NOTE(review): assumes labels are "class x_center y_center w h" normalized
    # to [0, 1] -- confirm against the labelling tool.
    img_height, img_width = img.shape[:2]
    img_dim = np.array([img_width, img_height, img_width, img_height], dtype=np.float32)

    # Convert normalized coordinates to whole pixels, in place.
    np.multiply(lbl[:, 1:], img_dim, out=lbl[:, 1:])
    np.floor(lbl, out=lbl)

    img_dims.append(img_dim)
    labels_list.append(lbl)

# Stack all boxes once at the end (stacking inside the loop re-copies the whole
# array on every iteration). The leading empty array keeps the (0, 4) float32
# result when there are no label files.
boxes = np.vstack(
    [np.empty([0, 4], dtype=np.float32)] + [lbl[:, 1:] for lbl in labels_list]
)

sizes = boxes[:, 2:]
# NOTE: despite the name this is sqrt(width * height), i.e. the side of the
# square with the same area as the box.
areas = np.sqrt(sizes[:, 0] * sizes[:, 1])
ratios = sizes[:, 0] / sizes[:, 1]

box_stat_data = np.hstack([sizes, areas[:, np.newaxis], ratios[:, np.newaxis]])

df = pd.DataFrame(box_stat_data, columns=["Width", "Height", "Area", "Ratio"])
df.describe()

Find the image containing the object with the smallest area, so it can be inspected manually (to check whether it is actually worth allowing detection of such small objects).

In [None]:
# min_dims = [image index, smallest box area, and the values of label columns
# 3 and 4 (the two box dimensions) for that box].
# Start from +inf rather than a fixed threshold so that the true minimum is
# always found, even when every box is larger than some arbitrary cut-off.
min_dims = np.array([0, np.inf, np.inf, np.inf], np.float32)
for i, lbl in enumerate(labels_list):
    if lbl.size == 0:
        # No boxes in this image: argmin would fail on an empty array.
        continue
    inst_areas = lbl[:, 3] * lbl[:, 4]
    min_id = np.argmin(inst_areas)
    if inst_areas[min_id] < min_dims[1]:
        min_dims[:] = i, inst_areas[min_id], lbl[min_id, 3], lbl[min_id, 4]

print(min_dims)
print(labels_files[int(min_dims[0])])

Now we check some statistics about the IoU in each image

In [None]:
def IoU(boxes:np.ndarray) -> float:
    """Return the largest pairwise IoU among the given boxes.

    Args:
        boxes: array-like of shape (n, 4+) whose first four columns are
            (x_center, y_center, width, height) of each box.

    Returns:
        The maximum intersection-over-union over all pairs of *distinct*
        boxes. Returns 0.0 when there are fewer than two boxes (no pair
        exists), and a pair whose union has zero area counts as IoU 0.
    """
    boxes = np.asarray(boxes, dtype=np.float64)
    if boxes.ndim < 2 or boxes.shape[0] < 2:
        # No pair of boxes to compare; np.max on an empty array would raise.
        return 0.0

    # Convert center/size representation to corner coordinates.
    half_dims = boxes[:, 2:] / 2
    x1 = boxes[:, 0] - half_dims[:, 0]
    x2 = boxes[:, 0] + half_dims[:, 0]
    y1 = boxes[:, 1] - half_dims[:, 1]
    y2 = boxes[:, 1] + half_dims[:, 1]

    # Broadcasting a row vector against a column vector yields the (n, n)
    # matrix of pairwise intersection rectangles.
    inter_x1 = np.maximum(x1, x1[:, np.newaxis])
    inter_y1 = np.maximum(y1, y1[:, np.newaxis])
    inter_x2 = np.minimum(x2, x2[:, np.newaxis])
    inter_y2 = np.minimum(y2, y2[:, np.newaxis])

    # Negative extents mean the boxes do not overlap along that axis.
    inter_area = np.maximum(inter_x2 - inter_x1, 0) * np.maximum(inter_y2 - inter_y1, 0)

    box_area = (x2 - x1) * (y2 - y1)
    union_area = box_area + box_area[:, np.newaxis] - inter_area

    # Guard the division: degenerate (zero-area) pairs would give 0/0 = NaN,
    # which would otherwise propagate through the maximum.
    iou = np.divide(
        inter_area, union_area, out=np.zeros_like(inter_area), where=union_area > 0
    )

    # Ignore the diagonal (each box compared with itself).
    off_diagonal = ~np.eye(iou.shape[0], dtype=bool)
    return float(iou[off_diagonal].max())

# One value per image: the highest IoU between any two boxes of that image.
# A comprehension is used (instead of looping with a variable named `boxes`)
# so the global `boxes` array built while reading the data is not overwritten.
max_img_iou_list = np.array(
    [IoU(lbl[:, 1:]) for lbl in labels_list], dtype=np.float32
)
df = pd.DataFrame(max_img_iou_list, columns=["Max IoU each img"])
df.describe()

#### Plot all points ####

In [None]:
# Scatter every box as a point: first size column on x, second on y.
widths, heights = sizes[:, 0], sizes[:, 1]
plt.scatter(widths, heights, s=5, marker='.')

# Start both axes at the origin and leave a 20-unit margin past the largest box.
plt.xlim([0, widths.max() + 20])
plt.ylim([0, heights.max() + 20])

# Solid major grid, dotted minor grid.
for grid_level, grid_style in (('major', '-'), ('minor', ':')):
    plt.grid(which=grid_level, linestyle=grid_style, linewidth='0.5', color='black')
plt.minorticks_on()
plt.show()

Get histogram on areas to estimate size

In [None]:
# Use a wide, short canvas for the histogram of `areas`
# (which holds sqrt(width * height) of every box).
plt.rcParams.update({'figure.figsize': [20, 5]})
plt.hist(areas, bins=100)
plt.show()

In [None]:
# Side-by-side histograms: left panel for the first size column, right panel
# for the second.
fig, (ax1, ax2) = plt.subplots(1, 2)
for panel, column in zip((ax1, ax2), (0, 1)):
    panel.hist(sizes[:, column], bins=100)

# Restore the square default figure size for the following cells.
plt.rcParams['figure.figsize'] = [9, 9]

## KMeans Clustering ##
Now we run some clustering to identify patterns in the data. This will be useful when designing the neural network model that detects the tanks and flags.
In particular, these results will be useful when choosing the anchor box sizes and ratios.

#### Clustering plot function ####

In [None]:
def plot_clusters(points:np.ndarray, labels:np.ndarray, k:int, centers=None):
    """Scatter-plot 2-D points colored by cluster and show the figure.

    Args:
        points: array of shape (n, 2+); columns 0 and 1 are used as x and y.
        labels: array of length n with the cluster index (0..k-1) of each point.
        k: number of clusters to draw.
        centers: optional array whose columns 0 and 1 are the x/y of the
            cluster centroids; drawn as red markers when given.
    """
    colors = ["green", "darkorange", "purple", "blue", "lightblue", "lightgreen"]

    # One scatter call per cluster so each gets its own color and legend entry.
    # The palette is cycled so that k larger than its length does not fail.
    for i in range(k):
        members = labels == i
        plt.scatter(points[members, 0], points[members, 1], s=5, c=colors[i % len(colors)], marker='.', label="cluster " + str(i))

    # Overlay the centroids, when provided.
    if centers is not None:
        plt.scatter(centers[:, 0], centers[:, 1], s=20, marker='o', c='red', edgecolor='black', label='centroids')

    plt.legend(scatterpoints=1)
    plt.minorticks_on()
    # Axes start at the origin with a 20-unit margin past the largest point.
    plt.xlim([0,points[:,0].max()+20])
    plt.ylim([0,points[:,1].max()+20])
    plt.grid(which='major', linestyle='-', linewidth='0.5', color='black')
    plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
    plt.show()

#### Clustering on Size ####

In [None]:
# Group the boxes into k_sizes clusters using both size columns as features.
k_sizes = 3
km_size = KMeans(n_clusters=k_sizes, n_init='auto')
km_sizes_output = km_size.fit_predict(sizes)

# Centroids rounded to whole units for readability.
print(np.round(km_size.cluster_centers_))

plot_clusters(sizes, km_sizes_output, k_sizes, centers=km_size.cluster_centers_)

#### Clustering on Ratios

In [None]:
# Cluster the boxes on their ratio (first size column / second) alone.
k_ratios = 3
km_ratios = KMeans(n_clusters=k_ratios, n_init='auto')
# KMeans expects a 2-D (n_samples, n_features) input, hence the single column.
km_ratios_output = km_ratios.fit_predict(ratios.reshape(-1, 1))
# Report the centers of the ratio model (not those of the size model).
print(km_ratios.cluster_centers_)

# Points are still drawn in size space, colored by ratio cluster; the ratio
# centroids are 1-D, so they cannot be drawn on this plot.
plot_clusters(sizes, km_ratios_output, k_ratios)

#### A combination of both clusterings ####
First we separate the data into the clusters identified by the size clustering, then we run the ratio clustering on each subset.

In [None]:
# Split sizes and ratios according to the size-cluster assignment of each box.
cluster_masks = [km_sizes_output == cluster_id for cluster_id in range(k_sizes)]
sizes_subsets = [sizes[mask] for mask in cluster_masks]
ratios_subsets = [ratios[mask] for mask in cluster_masks]

# Re-cluster each size group on ratio alone; the same estimator object is
# refitted for every subset.
k_ratios = 3
km_ratios = KMeans(n_clusters=k_ratios, n_init='auto')
for subset_sizes, subset_ratios in zip(sizes_subsets, ratios_subsets):
    # KMeans needs a 2-D (n_samples, 1) input.
    km_ratios_output = km_ratios.fit_predict(subset_ratios.reshape(-1, 1))
    plot_clusters(subset_sizes, km_ratios_output, k_ratios)
    

In [None]:
# Show the most recent ratio-cluster assignments (when the cells are run in
# order, those of the last size subset processed above).
print(km_ratios_output)