# Dataset analysis

### Imports

In [None]:
# render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline
# load the autoreload extension and reload all modules before executing code,
# so that edits to external modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Locations of the project-local sources that are appended to sys.path further down.
# NOTE: the dict name `pathes` (sic) is kept because other cells reference it.
pathes = {}
pathes['project_root'] = os.path.abspath('../')
pathes['source_1'] = os.path.join(pathes['project_root'], 'src/thirdparty/cocoapi/PythonAPI/')
pathes['source_2'] = os.path.join(pathes['project_root'], 'src/datasets')

# --- check paths
# Fail early with an explicit error that lists every missing location. (A bare
# `raise` outside an `except` block only produces "RuntimeError: No active
# exception to re-raise", which hides the actual problem.)
missing = ['{}: {}'.format(name, path) for name, path in pathes.items() if not os.path.exists(path)]
if missing:
    raise FileNotFoundError('Required path(s) not found:\n  ' + '\n  '.join(missing))

In [None]:
# add the cocoapi PythonAPI directory and the project's datasets directory
# to the module search path
for source_key in ('source_1', 'source_2'):
    sys.path.append(pathes[source_key])

In [None]:
from pycocotools.coco import COCO
from coco_to_pd import coco_ann_to_pd, coco_pred_to_pd, extract_info

### Load the COCO annotation file and convert it to a pandas DataFrame

In [None]:
# Root directory of the MS COCO dataset. Set the MS_COCO_ROOT environment variable
# to point at a different location; otherwise the original hard-coded path is used.
# abspath keeps the root absolute even when the override is a relative path.
path_data_gt = os.path.abspath(
    os.environ.get('MS_COCO_ROOT', '/media/andrii/earth/work/data/MS_COCO')
)
# directory with the val2017 images (trailing slash kept as in the original value)
image_gt_dir = os.path.join(path_data_gt, 'val2017/')
# instance annotations of the val2017 split
ann_file = os.path.join(path_data_gt, 'annotations', 'instances_val2017.json')

In [None]:
# `image_gt_dir` already contains the dataset root, so it is used as-is. Joining it
# onto `path_data_gt` a second time only worked because the root is absolute
# (os.path.join discards earlier components when a later one is absolute); with a
# relative root the prefix would have been duplicated.
image_path = image_gt_dir

In [None]:
# load the COCO ground-truth annotations
coco = COCO(ann_file)
# convert the annotations to a pandas DataFrame
df = coco_ann_to_pd(coco, image_path)

In [None]:
# preview the first 10 rows of the annotation table
df.head(10)

### General statistics

In [None]:
# overall size of the dataset: rows of `df` are counted as boxes,
# distinct image ids as images
n_boxes_all = len(df)
n_images_all = len(df['image_id'].unique())
print("Number of boxes in the dataset: {}".format(n_boxes_all))
print("Number of images: {}".format(n_images_all))

In [None]:
# number of boxes of each class; the index of the result holds the class names
boxes_per_class = df['box_class'].value_counts()
all_classes = boxes_per_class.index.values
n_classes = len(all_classes)
print("Number of classes: {}".format(n_classes))
print("Classes: \n{}".format(all_classes))

In [None]:
# Number of boxes and images per class
grouped = df.groupby('box_class')

# shared formats keep the header, the class rows and the total row aligned;
# the ratio is rounded to two decimals so it fits its column
header_fmt = '{:<20}{:<10}{:<10}{}'
row_fmt = '{:<20}{:<10}{:<10}{:<10.2f}'

print(header_fmt.format('Class', '# images', '# boxes', '# boxes / # images'))
print('='*60)
# NOTE: the loop variable is deliberately not called `cl`: that name holds the class
# selected for the histogram cells, and re-running this cell would silently
# overwrite it with the last class of the loop.
for class_name, df_cl in grouped:
    n_images = len(df_cl['image_id'].unique())
    n_boxes = len(df_cl)
    # str() so that non-string class labels can be padded as well
    print(row_fmt.format(str(class_name), n_images, n_boxes, n_boxes / n_images))

n_images_total = len(df['image_id'].unique())
n_boxes_total = len(df)
print('='*60)
# guard the ratio so that an empty table prints 0.00 instead of raising ZeroDivisionError
ratio_total = n_boxes_total / n_images_total if n_images_total else 0.0
print(row_fmt.format('Total', n_images_total, n_boxes_total, ratio_total))

### Histograms

In [None]:
# class whose boxes are analysed separately in the per-class statistics
cl = 'person'

In [None]:
# number of boxes in each image: once over all classes, once for class `cl` only
boxes_per_im = df['image_id'].value_counts()
is_selected_class = df['box_class'] == cl
boxes_per_im_class = df.loc[is_selected_class, 'image_id'].value_counts()

In [None]:
# --- what to plot and how to label it
q = boxes_per_im
x_name, y_name = "#boxes", "#images"
# when plotting a single class, "\n [class]: " + cl can be appended to the title
title_name = "#boxes per image"
nbins = 10

# --- summary statistics of the per-image box counts
counts = q.values
print('Total (images): {:^5}'.format(len(counts)))
print('Total  (boxes): {:^5}'.format(np.sum(counts)))
print('Mean: {:^5}'.format(np.mean(counts)))
print('Max: {:^5}'.format(np.max(counts)))
print('Min: {:^5}'.format(np.min(counts)))

# --- histogram with `nbins` equal-width bins
histo, bin_edges = np.histogram(q, nbins)

# auxiliary values for the display: shift every edge by half a bin width to get
# the bin centers and drop the last one, which lies beyond the final bin
half_width = (bin_edges[1] - bin_edges[0])/2
bin_edges_ = (bin_edges + half_width)[:-1]
delta = bin_edges_[1] - bin_edges_[0]

# --- bar plot of the histogram, one bar per bin, centered on the bin center
plt.bar(bin_edges_, histo, width=delta, align='center', facecolor='g', alpha=0.3)
plt.xlabel(x_name)
plt.ylabel(y_name)
plt.title(title_name)
plt.grid(True)


In [None]:
# per-bin counts of the histogram computed in the previous cell
print(histo)

### 2D histograms of box centers

In [None]:
# center of every box: (box_x, box_y) shifted by half of (box_w, box_h)
box_origins = df[['box_x', 'box_y']].values
box_sizes = df[['box_w', 'box_h']].values
boxes_centers = box_origins + box_sizes / 2

# same computation restricted to the boxes of class `cl`
df_selected = df[df['box_class'] == cl]
boxes_centers_class = df_selected[['box_x', 'box_y']].values + df_selected[['box_w', 'box_h']].values / 2

In [None]:
# 2D histogram (50 x 50 bins) of the box centers over all classes
plt.hist2d(boxes_centers[:,0], boxes_centers[:,1], (50, 50), cmap=plt.cm.jet)
# label the figure so that it can be read on its own
plt.xlabel('box center x')
plt.ylabel('box center y')
plt.title('Box centers')
# trailing semicolon suppresses the Colorbar repr in the cell output
plt.colorbar(label='#boxes');

In [None]:
# 2D histogram (10 x 10 bins) of the box centers of class `cl` only
plt.hist2d(boxes_centers_class[:,0], boxes_centers_class[:,1], (10, 10), cmap=plt.cm.jet)
# label the figure (including the selected class) so that it can be read on its own
plt.xlabel('box center x')
plt.ylabel('box center y')
plt.title('Box centers' + "\n [class]: " + cl)
# trailing semicolon suppresses the Colorbar repr in the cell output
plt.colorbar(label='#boxes');