# Item 1 - Analyze the dataset
Parse the ground-truth information into a dictionary and a pandas table.

In [None]:
#!/usr/bin/python
# -*- coding: utf-8 -*-

%matplotlib inline

import numpy as np
import os, sys
import imageio
import pandas as pd
import matplotlib.pyplot as plt

# Folders with the ground-truth annotation files and with the mask images
path_txt = os.path.join('dataset', 'train', 'gt')
path_mask = os.path.join('dataset', 'train', 'mask')

dirs_txt = os.listdir(path_txt)
dirs_mask = os.listdir(path_mask)

# data[annotation file name] -> list with one dict of measurements per annotated signal
data = dict()

for gt in dirs_txt:
    # Only the read needs the file handle; release it before the image work starts
    with open(os.path.join(path_txt, gt)) as f:
        lines = f.readlines()

    # The mask that belongs to gt.<id>.txt is looked up as mask.<id>.png
    im_name = gt.replace('gt', 'mask').replace('txt', 'png')
    im_open = imageio.imread(os.path.join(path_mask, im_name))

    lista = list()

    # Extract ground-truth information
    for line in lines:
        # split() without arguments tolerates repeated blanks, tabs and the
        # trailing newline; blank lines are skipped instead of crashing
        fields = line.split()
        if not fields:
            continue

        # Still exactly five fields per line: four coordinates and the class
        tly, tlx, bry, brx, tipo = fields
        tly, tlx, bry, brx = map(float, [tly, tlx, bry, brx])
        d = dict()
        d['type'] = tipo

        w = brx - tlx
        h = bry - tly

        d['width'] = w
        d['height'] = h
        d['bbox_area'] = w*h
        # A zero-height box has no aspect ratio; NaN keeps the row without aborting the parse
        d['form_factor'] = w/h if h else float('nan')

        # Integer pixel coordinates used to crop the mask
        d['tly'] = round(tly)
        d['tlx'] = round(tlx)
        d['bry'] = round(bry)
        d['brx'] = round(brx)

        # Mask area = number of non-zero mask pixels inside the bounding box
        sub_mask = im_open[d['tly']:d['bry'], d['tlx']:d['brx']]
        mask_area = np.count_nonzero(sub_mask)
        d['mask_area'] = mask_area
        # Same guard as above for an empty bounding box
        d['filling_ratio'] = mask_area / d['bbox_area'] if d['bbox_area'] else float('nan')

        lista.append(d)

    data[gt] = lista
    

In [None]:
# Flatten the per-file lists into one table: each row is one signal, indexed
# by (annotation file name, position of the signal inside that file)
columns = ['type','width','height','form_factor','bbox_area','mask_area','filling_ratio']
df = pd.DataFrame.from_dict(
    {
        (gt_name, position): signal
        for gt_name, signals in data.items()
        for position, signal in enumerate(signals)
    },
    orient='index',
    columns=columns,
)
# Order the rows by signal class
df = df.sort_values(['type'])
#print(df)
#df['form_factor'].plot(figsize=(10, 7))

In [None]:
# Statistical calculations
# Per-class standard deviation of every measurement column, rounded to two decimals.
# The 'std' alias selects pandas' own groupby standard deviation explicitly;
# passing the np.std callable is deprecated in recent pandas and is announced
# to change meaning (the callable would be applied as-is).
type_counts = df.groupby('type').aggregate('std').round(2)
type_counts

#type_counts = df.groupby('type').aggregate(np.median)
#type_counts = df.groupby('type').aggregate(np.average)
#type_counts = df.groupby('type').aggregate(np.std)

In [None]:
# Group data by "type" (letter) and get statistics:

# Function = get_stats (only shape, aspect ratio, etc, NOT COLOUR)

# We want size and shape statistics for each letter (signal class)

# Item 2 - Split training dataset

Hold out 30% of the training images of each class to set up a validation dataset.

In [None]:
# Count number of signals per class, listed in the fixed order A..F
# (a class with no signals shows up as NaN after the reindex)
n_signals = df['type'].value_counts(sort=False).reindex(['A','B','C','D','E','F'])
print(n_signals)

# Plot the counts computed above instead of recomputing them.
# sort_columns is not passed any more: it had no effect on a Series and the
# argument was removed from pandas plotting in pandas 2.0.
n_signals.plot(figsize=(10, 7), kind='bar')

In [None]:
# Keep only the rows of a single class ("B") and display them
df_filtered = df.loc[df['type'].eq("B")]
df_filtered

In [None]:
# Randomly sample 70% of the filtered rows to use as the training selection.
# No random_state is passed, so the selection changes from run to run.
df_sorted_train = df_filtered.sample(frac=0.7)
df_sorted_train

In [None]:
# Show the training selection as (annotation file name, signal position) pairs,
# built from the two levels of the row index
tuple(zip(*(df_sorted_train.index.get_level_values(level).tolist() for level in (0, 1))))


In [None]:
# Validation selection = rows of the class whose index is not in the training sample.
# The rows are selected by index label: comparing row *values* (as
# drop_duplicates does) also discards different signals that happen to share
# identical measurements, so they would end up in neither set.
df_sorted_test = df_filtered.loc[~df_filtered.index.isin(df_sorted_train.index)]
tuple(zip(df_sorted_test.index.get_level_values(0).tolist(),df_sorted_test.index.get_level_values(1).tolist()))

Functions to do all the process in one cell

In [None]:
def split_class(signal_class, train_percentage, random_state=None, table=None):
    """Randomly split the signals of one class into training and validation.

    Parameters
    ----------
    signal_class : value compared against the 'type' column.
    train_percentage : fraction (0..1) of the class rows sampled for training.
    random_state : optional seed forwarded to ``DataFrame.sample`` to make the
        split reproducible; ``None`` keeps the original unseeded behaviour.
    table : optional DataFrame with a 'type' column and a two-level row index;
        when omitted the notebook-level ``df`` is used, as before.

    Returns
    -------
    (train_images, val_images) : two tuples of (level 0, level 1) index pairs.
        Every row of the class appears in exactly one of them.
    """
    if table is None:
        table = df

    def _index_pairs(frame):
        # Rebuild the (level 0, level 1) pairs of the row index as plain tuples
        index = frame.index
        return tuple(zip(index.get_level_values(0).tolist(),
                         index.get_level_values(1).tolist()))

    # Choose one signal class
    class_rows = table[table['type'] == signal_class]

    # Randomly sample the chosen percentage
    train_rows = class_rows.sample(frac=train_percentage, random_state=random_state)

    # Validation rows are the ones whose index was not sampled. Selecting by
    # index (not by row values) keeps distinct signals that happen to have
    # identical measurements, which drop_duplicates(keep=False) would discard.
    val_rows = class_rows.loc[~class_rows.index.isin(train_rows.index)]

    return _index_pairs(train_rows), _index_pairs(val_rows)

def split_dataset(data,percentage, classes):
    """Split every listed class and concatenate the per-class results.

    ``data`` is accepted but not used by this function; each class in
    ``classes`` is split by ``split_class(signal_class, percentage)``.
    Returns two lists (train, validation) holding the items produced for all
    classes, in the order the classes were given.
    """
    train_images, val_images = [], []

    for signal_class in classes:
        class_train, class_val = split_class(signal_class, percentage)
        train_images.extend(class_train)
        val_images.extend(class_val)

    return train_images, val_images


# Split all six signal classes, sending 70% of each class to the training set
classes = list('ABCDEF')
train_images, val_images = split_dataset(data, percentage=0.7, classes=classes)
#print(train_images)
#print(val_images)
    

# Item 3 - Separation by colour


Para cada imagen de prueba, nos basamos en el diccionario y usamos el bounding box que ya calculamos.

Recortamos la imagen, le calculamos el histograma. Sumamos los histogramas de todas las imágenes y obtenemos la suma total.

Luego hacemos lo mismo y al recortar la roi de cada imagen convertimos la roi de rgb a hsv, calculamos el histograma
y obtenemos la suma de todos los histogramas. 

Con estas cosas deberíamos poder obtener los thresholds a aplicar luego.


Hacer gráficos chetos de los histogramas. Ajustar con multiples gaussianas, calcular promedio, mediana, std, etc.
Compararlos y elegir el mejor.

Luego aplicar las máscaras con los thresholds calculados

In [None]:
# Convert RGB images to hue (HSV) images

# Get stats from HUE images

# Plot and compare

# Select thresholds (RGB y HUE)

# Create masks using previous thresholds



# Calculate histograms:

In [None]:
from skimage import color

# Folder with the training JPG images and the names of the entries found in it
path_jpg = os.path.join('dataset', 'train', 'jpg')
dirs_jpg = os.listdir(path_jpg)

def rgb_histogram(gt_dictionary, path_jpg, n_bins=255, value_range=(1, 255)):
    """Accumulate R, G and B histograms over every annotated bounding box.

    Parameters
    ----------
    gt_dictionary : mapping from annotation file name to a list of dicts, each
        with the integer keys 'tly', 'bry', 'tlx', 'brx' used to crop the image.
    path_jpg : folder containing the JPG images; the image name is derived
        from the annotation name by dropping 'gt.' and replacing 'txt' by 'jpg'.
    n_bins, value_range : binning forwarded to ``np.histogram``. The defaults
        reproduce the binning this notebook has always used (255 bins over
        [1, 255]): values outside the range, such as 0, are ignored and the
        bin width is not an integer. Pass ``n_bins=256, value_range=(0, 256)``
        to get one bin per 8-bit level.

    Returns
    -------
    (bins, r_hist, g_hist, b_hist) : the ``n_bins + 1`` bin edges and the three
        summed histograms, each of length ``n_bins``. With an empty dictionary
        the histograms are all zeros (previously this raised because ``bins``
        was never assigned).
    """
    # Same equally spaced edges np.histogram builds for a given range
    bins = np.linspace(value_range[0], value_range[1], n_bins + 1)

    r_hist = np.zeros(n_bins)
    g_hist = np.zeros(n_bins)
    b_hist = np.zeros(n_bins)

    for gt, values in gt_dictionary.items():
        if not values:
            # Nothing annotated in this file: do not even open the image
            continue

        # Decode the image once per file, not once per bounding box
        jpg_name = gt.replace('gt.', '').replace('txt', 'jpg')
        image = imageio.imread(os.path.join(path_jpg, jpg_name))

        for v in values:
            jpg_roi = image[v['tly']:v['bry'], v['tlx']:v['brx']]

            # Channels 0, 1, 2 feed the R, G and B accumulators (updated in place)
            for channel, hist in enumerate((r_hist, g_hist, b_hist)):
                hist += np.histogram(jpg_roi[:,:,channel], bins=n_bins, range=value_range)[0]

    return bins, r_hist, g_hist, b_hist


def hsv_histogram(gt_dictionary, path_jpg, n_bins=255, value_range=(1, 255)):
    """Accumulate H, S and V histograms over every annotated bounding box.

    Each crop is converted with ``color.rgb2hsv`` and multiplied by 255 before
    being histogrammed.

    Parameters
    ----------
    gt_dictionary : mapping from annotation file name to a list of dicts, each
        with the integer keys 'tly', 'bry', 'tlx', 'brx' used to crop the image.
    path_jpg : folder containing the JPG images; the image name is derived
        from the annotation name by dropping 'gt.' and replacing 'txt' by 'jpg'.
    n_bins, value_range : binning forwarded to ``np.histogram``. The defaults
        reproduce the binning this notebook has always used (255 bins over
        [1, 255]); scaled values outside that range (for example below 1) are
        ignored. Pass a different range to include them.

    Returns
    -------
    (bins, h_hist, s_hist, v_hist) : the ``n_bins + 1`` bin edges and the three
        summed histograms, each of length ``n_bins``. With an empty dictionary
        the histograms are all zeros (previously this raised because ``bins``
        was never assigned).
    """
    # Same equally spaced edges np.histogram builds for a given range
    bins = np.linspace(value_range[0], value_range[1], n_bins + 1)

    h_hist = np.zeros(n_bins)
    s_hist = np.zeros(n_bins)
    v_hist = np.zeros(n_bins)

    for gt, values in gt_dictionary.items():
        if not values:
            # Nothing annotated in this file: do not even open the image
            continue

        # Decode the image once per file, not once per bounding box
        jpg_name = gt.replace('gt.', '').replace('txt', 'jpg')
        image = imageio.imread(os.path.join(path_jpg, jpg_name))

        for v in values:
            jpg_roi = image[v['tly']:v['bry'], v['tlx']:v['brx']]
            hsv_roi = color.rgb2hsv(jpg_roi)*255

            # Channels 0, 1, 2 feed the H, S and V accumulators (updated in place)
            for channel, hist in enumerate((h_hist, s_hist, v_hist)):
                hist += np.histogram(hsv_roi[:,:,channel], bins=n_bins, range=value_range)[0]

    return bins, h_hist, s_hist, v_hist

################################################
# Calculate histograms with traffic signal data:
################################################

# RGB histograms: bin edges plus one accumulated histogram per channel,
# summed over every bounding box stored in data

bins, r_hist, g_hist, b_hist = rgb_histogram(data, path_jpg)

# HSV histograms: same accumulation after converting each crop to HSV

hbins, h_hist, s_hist, v_hist = hsv_histogram(data, path_jpg)


# Plot histograms:

In [None]:
# Questions: why can't I use the variable 'bins' to plot the histograms?
# Why do I get a divergence at 255?
# Why am I dividing by zero when converting to HSV?

def plot_histogram(hist, color_name, color_plot, y_max=60000):
    """Draw one histogram as a bar chart in its own figure.

    Parameters
    ----------
    hist : sequence of bar heights, one per bin.
    color_name : text placed before ' histogram' in the figure title.
    color_plot : colour passed to ``plt.bar``.
    y_max : upper limit of the y axis (the lower limit is 0). The default is
        the value that used to be hard-coded; pass ``None`` to let matplotlib
        choose the limits.
    """
    # One bar position per bin, whatever the histogram length is
    # (a fixed np.arange(255) only worked for 255-bin histograms)
    x = np.arange(len(hist))

    plt.figure(figsize=(7,7))
    plt.bar(x, hist, color=color_plot)
    if y_max is not None:
        plt.ylim((0, y_max))
    plt.title(color_name + ' histogram')
    plt.xlabel('8bit quantification')
    plt.ylabel('Total number of px')
    plt.show()
    
# Plot the RGB histograms first and the HSV histograms after them, one figure
# per channel; each entry pairs a histogram with its title prefix and bar colour
for _hist, _title, _bar_colour in (
    (r_hist, 'Red', 'r'),
    (g_hist, 'Green', 'g'),
    (b_hist, 'Blue', 'b'),
    (h_hist, 'H', 'r'),
    (s_hist, 'S', 'g'),
    (v_hist, 'V', 'b'),
):
    plot_histogram(_hist, _title, _bar_colour)


# Item 4 - Evaluate colour masks 

Comparar las máscaras obtenidas con el ground truth.

Podemos ver si sirven las funciones que ya nos dieron hechas.

# Lo que sigue a partir de aqui fueron pruebas hechas el martes:

In [None]:
# Inspect the first two signal entries stored for one annotation file
print(data['gt.00.005025.txt'][0])
print(data['gt.00.005025.txt'][1])

# Print the filling ratio of the first signal listed for every annotation file
for gt in dirs_txt:
    print(data[gt][0]['filling_ratio'])

In [None]:
# Recompute mask area and filling ratio for every annotated signal, mask by mask
for mask in dirs_mask:
    # mask.<id>.png -> gt.<id>.txt, the key used in data
    gt = mask.replace('mask', 'gt').replace('png', 'txt')
    m = imageio.imread(os.path.join(path_mask, mask))

    # data[gt] is a list with one dict per signal, so every entry is updated
    # from its own bounding-box crop (indexing the list itself with a string
    # key raised TypeError)
    for d in data[gt]:
        roi = m[d['tly']:d['bry'], d['tlx']:d['brx']]
        # Count the non-zero pixels rather than summing pixel values, so the
        # area does not depend on the value used for "on" pixels
        mask_area = np.count_nonzero(roi)
        d['mask_area'] = mask_area
        d['filling_ratio'] = mask_area / d['bbox_area']

        print(d['mask_area'], d['filling_ratio'], mask)
    

In [None]:
# Compare the annotation name derived from the first mask file name with the
# first annotation file name returned by os.listdir
print(dirs_mask[0].replace('mask', 'gt').replace('png', 'txt'))
print(dirs_txt[0])


In [None]:
import matplotlib.pyplot as plt

% matplotlib inline

# Load one mask and display the crop covering rows 146-200 and columns 1324-1374
# NOTE(review): int8 is signed; if the mask stores 255 for "on" pixels they wrap to -1 — confirm the mask value range
m = imageio.imread(os.path.join(path_mask,'mask.00.005025.png')).astype(np.int8)

plt.imshow(m[146:201, 1324:1375])
