# Task 1 - Analyze the dataset
Parse the ground-truth information into a dictionary. Each key is the ground-truth file name of a picture and each value is a list with one entry per signal, holding that signal's information. 

This data structure lets us look up directly the picture that we want to process.

In [None]:
#!/usr/bin/python
# -*- coding: utf-8 -*-

# Enable matplotlib inline display
%matplotlib inline

# Import built-in modules
import os

# Import third party modules
import imageio
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Import local functions
from utils import get_files_from_dir, get_gt_data, get_img, get_n_samples_of_col, \
    get_unique_values_of_col, get_patch, gt_to_img, gt_to_mask, parse_gt_data, ROOT_DIR


In [None]:

# Dataset splits, both resolved relative to ROOT_DIR
TRAIN_DIR = os.path.join(ROOT_DIR, 'dataset', 'train')
TEST_DIR = os.path.join(ROOT_DIR, 'dataset', 'test')

# Sub-folders of the training split: ground-truth annotations and masks
TRAIN_GTS_DIR = os.path.join(TRAIN_DIR, 'gt')
TRAIN_MASKS_DIR = os.path.join(TRAIN_DIR, 'mask')


In [None]:


# Entries found in the ground-truth and mask directories of the training split
gt_filenames = get_files_from_dir(TRAIN_GTS_DIR)
mask_filenames = get_files_from_dir(TRAIN_MASKS_DIR)

# Maps every ground-truth file name to a list holding one descriptor dict
# per annotation line of that file
data = dict()

for gt in gt_filenames:
    # Fetch the mask image paired with this ground-truth file, once per file
    mask_path = gt_to_mask(gt)
    mask_img = get_img(TRAIN_MASKS_DIR, mask_path)

    signals = list()
    for line in get_gt_data(TRAIN_GTS_DIR, gt):
        # presumably top-left / bottom-right corner coordinates followed by
        # the signal type -- confirm against parse_gt_data
        tly, tlx, bry, brx, signal_type = parse_gt_data(line)

        # Box geometry is computed from the coordinates exactly as parsed,
        # i.e. before any rounding
        w = brx - tlx
        h = bry - tly

        # NOTE(review): a zero-height box would break the form_factor
        # division -- presumably the ground truth contains none; confirm
        d = {
            'type': signal_type.strip(),
            'width': w,
            'height': h,
            'bbox_area': w * h,
            'form_factor': w / h,
            # Rounded corners are the ones used to cut the mask patch
            'tly': round(tly),
            'tlx': round(tlx),
            'bry': round(bry),
            'brx': round(brx),
        }

        mask_patch = get_patch(mask_img, d['tlx'], d['tly'], d['brx'], d['bry'])
        mask_area = np.count_nonzero(mask_patch)
        d['mask_area'] = mask_area
        # NOTE(review): mask_area comes from the rounded patch while
        # bbox_area uses the unrounded box, so the ratio is approximate
        d['filling_ratio'] = mask_area / d['bbox_area']
        # Boolean view of the patch: True wherever the mask is non-zero
        d['mask'] = mask_patch != 0

        signals.append(d)

    data[gt] = signals


Show the information as a pandas DataFrame so that the signals can be filtered easily by type.


In [None]:
# Only these keys of each signal dict become columns of the table
columns = ['type','width','height','form_factor','bbox_area','mask_area','filling_ratio']

# One row per signal, indexed by (key of `data`, position of the signal in its list)
rows_by_signal = {
    (gt_name, position): signal
    for gt_name, signals in data.items()
    for position, signal in enumerate(signals)
}

# Build the table and sort it by signal type
df = pd.DataFrame.from_dict(rows_by_signal, orient='index', columns=columns).sort_values(['type'])
df