# Notes

Based on feedback:

* Better not to compute mean values on the three configurations
* Not really meaningful to examine the MAE and validation MAE of the last epoch


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pickle

In [None]:
from ipypublish import nb_setup
# https://stackoverflow.com/a/39566040/11552622
# Font sizes applied to axes titles, axes labels and tick labels
rcparams = {
    'axes.titlesize':13,
    'axes.labelsize':9,
    'xtick.labelsize':8,
    'ytick.labelsize':8
}
# Rebinds `plt` (imported earlier as matplotlib.pyplot) to the object returned
# by nb_setup -- presumably the configured pyplot module; confirm against ipypublish
plt = nb_setup.setup_matplotlib(rcparams=rcparams)

# Introduction

## Aim

Implement a framework to extract the data generated by the script `script_stats_dnn.py` and concatenate it. The framework processes the data of a single dataset. Further (but easy) manipulations are required to merge everything together (e.g. `pandas.join`).


## Quick implementation overview

The script is run from a folder containing the `X.mat` and `Y.mat` files. It creates a new directory for each tested DNN configuration. For instance, this is the file hierarchy of the root folder once the script has finished:

```
.
├── 1_hlayers_16_neurons_40_aks
├── 1_hlayers_32_neurons_40_aks
├── 1_hlayers_64_neurons_40_aks
├── 1_hlayers_8_neurons_40_aks
├── 2_hlayers_16_neurons_40_aks
├── 2_hlayers_32_neurons_40_aks
├── 2_hlayers_64_neurons_40_aks
├── 2_hlayers_8_neurons_40_aks
├── 4_hlayers_16_neurons_40_aks
├── 4_hlayers_32_neurons_40_aks
├── 4_hlayers_64_neurons_40_aks
├── 4_hlayers_8_neurons_40_aks
├── 6_hlayers_16_neurons_40_aks
├── 6_hlayers_32_neurons_40_aks
├── 6_hlayers_64_neurons_40_aks
├── 6_hlayers_8_neurons_40_aks
├── X.mat
└── Y.mat
```

Each folder has the structure:

```
1_hlayers_16_neurons_40_aks/
├── coefmaxs.npy
├── coefmins.npy
├── DNN_0D_Model.h5
├── DNN_Performance.eps
├── history.bin
├── log.txt
├── Losses.eps
├── parammaxs.npy
├── parammins.npy
├── Ytestpred.txt
└── Ytest.txt
```

We're interested in the files:

* `history.bin`: a pickled dictionary containing the Keras model's history (losses)
* 

## Setup

In [None]:
path = '/media/maousi/Data/tmp/dnn_stats/4000RPM_Pulse'

# Prepare code for script 

## Aim

Implement and test some routines that will be subsequently used in a script.

## Walking around file hierarchy

### Aim

Given root folder, obtain the list of all files to load

## Let's go


In [None]:
def get_list(rootdir, fileformat):
    """
    Recursively collect the paths of the files located under `rootdir`
    whose name ends with `fileformat`.

    :param rootdir: directory from which the walk starts
    :param fileformat: file name suffix to match, example: `.txt`
    :return: list of matching paths, each one being the directory yielded by
        `os.walk` joined with the file name
    """
    return [
        os.path.join(current_dir, name)
        for current_dir, _, names in os.walk(rootdir)
        for name in names
        if name.endswith(fileformat)
    ]

In [None]:
get_list(path, '.bin')

In [None]:
ls = get_list(path, '.bin')

## Load, extract, create DataFrame

### Aim

Given the list of history files, load them, extract the losses, generate DataFrame.

### Routine: parse dnn config from file path



In [None]:
def parse_config(filepath):
    """
    Read the DNN configuration encoded in the name of the directory that
    contains `filepath`.

    The directory name is split on `_` and the tokens at positions 0, 2 and 4
    are converted to integers (e.g. `1_hlayers_16_neurons_40_aks`).

    :param filepath: path of a file stored inside a configuration directory
    :return: dict with the integer entries `hlayers`, `neurons` and `aks`
    """
    folder_name = os.path.basename(os.path.dirname(filepath))
    tokens = folder_name.split('_')
    return {
        'hlayers': int(tokens[0]),
        'neurons': int(tokens[2]),
        'aks': int(tokens[4]),
    }

In [None]:
parse_config(ls[0]), parse_config(ls[1])

### Routine : Extract losses

In [None]:
with open(ls[0], 'rb') as f:
    data = pickle.load(f)
data.keys()

In [None]:
plt.plot(data['loss'])
plt.plot(data['val_loss'])
data['loss'][-1], data['val_loss'][-1]

In [None]:
def load_losses(filepath):
    """
    Unpickle the dictionary of lists stored in `filepath` and keep only the
    last element of every list.

    :param filepath: path of the pickled history file
    :return: dict with the same keys as the pickled dictionary, each mapped
        to the last item of the corresponding list
    """
    with open(filepath, 'rb') as stream:
        history = pickle.load(stream)

    final_values = {}
    for name, values in history.items():
        final_values[name] = values[-1]
    return final_values

In [None]:
load_losses(ls[0])

### Wrapper

In [None]:
def process_stats_data(rootdir):
    """
    Build one table row per history file found under `rootdir`.

    Each row merges the configuration parsed from the file path with the
    last recorded value of every entry of the history file (history entries
    win if a key is present in both).

    :param rootdir: directory searched recursively for files ending with
        `history.bin`
    :return: DataFrame sorted by `hlayers`, `neurons` and `aks`
    """
    rows = [
        {**parse_config(history_file), **load_losses(history_file)}
        for history_file in get_list(rootdir, 'history.bin')
    ]
    table = pd.DataFrame(rows)
    return table.sort_values(['hlayers', 'neurons', 'aks'])

In [None]:
df = process_stats_data(path)
df

In [None]:
df.sort_values(['val_loss']).head()

## Plot - Investigate loss <-> architecture

### Aim 

Visualize the losses as a function of the number of layers and the number of neurons.

In [None]:
sns.set(style='whitegrid')

In [None]:
def plot_dnn_stats(df):
    """
    Plot the validation loss (left panel) and the validation MAE (right
    panel) against the number of neurons, with one curve per number of
    hidden layers.

    :param df: DataFrame with the columns `hlayers`, `neurons`, `val_loss`
        and `val_mae`
    """
    fig, ax = plt.subplots(1, 2, figsize=(8, 3))

    # One curve per distinct number of hidden layers, in increasing order so
    # that colours and legend entries are deterministic. The former
    # `value_counts('hlayers')` passed the column name as the `normalize`
    # argument and ordered the curves by frequency.
    ls_hlayers = sorted(df['hlayers'].unique())
    for hlayers in ls_hlayers:
        sub = df[df.hlayers == hlayers]
        ax[0].plot(sub.neurons, sub.val_loss, '-o', label=str(hlayers))
        ax[1].plot(sub.neurons, sub.val_mae, '-o')

    ax[0].legend(title='Nb layers')
    ax[0].set_xlabel('Number of neurons')
    ax[0].set_ylabel('Validation loss')
    ax[1].set_xlabel('Number of neurons')
    ax[1].set_ylabel('Mean absolute error')
    # NOTE(review): tight_layout recomputes the subplot spacing, so the wspace
    # set just before it is likely overridden -- confirm the intended spacing
    plt.subplots_adjust(wspace=.4)
    plt.tight_layout()

In [None]:
plot_dnn_stats(df)
plt.savefig('figs/dnn_architecture_stats.eps')

## General wrapper

In [None]:
def perform_dnn_architecture_analysis(basefolder, subfolders, suffixes):
    """
    For every (subfolder, suffix) pair: print the subfolder name, gather its
    statistics, plot them and save the figure as
    `figs/dnn_architecture_stats_<suffix>.eps`.

    :param basefolder: directory containing the dataset subfolders
    :param subfolders: names of the dataset subfolders to process
    :param suffixes: figure file name suffixes, paired with `subfolders`
        in order
    """
    for folder, suffix_name in zip(subfolders, suffixes):
        dataset_dir = os.path.join(basefolder, folder)
        print(folder)

        plot_dnn_stats(process_stats_data(dataset_dir))
        plt.savefig(f'figs/dnn_architecture_stats_{suffix_name}.eps')

In [None]:
perform_dnn_architecture_analysis('/media/maousi/Data/tmp/dnn_stats', 
                                 ['4000RPM_Pulse', '5000RPM_Pulse', '6000RPM_Pulse'],
                                 ['4000RPM_AP', '5000RPM_AP', '6000RPM_AP'])

In [None]:
!ls $path

In [None]:
tmp = path+'/2_hlayers_32_neurons_40_aks/'

In [None]:
!ls $tmp

# Aggregate and analyze results

## Aim

Combine the `results.csv` files of all datasets and plot them.

## Prerequisites

Run the script `script_stats_dnn.py --analyze` in each folder that is dedicated to a dataset.

## Setup

In [None]:
path = '/media/maousi/Data/tmp/dnn_stats/'
!ls $path

## Combine in a single dataframe

In [None]:
# Map the name of every entry found in `path` to its full path
folders = {e : os.path.join(path, e) for e in os.listdir(path)}
# Keep only the entries whose full path is a directory
folders = filter(lambda e: os.path.isdir(e[1]), folders.items())
folders = dict(folders)
folders

In [None]:
# Keys = RPM (integer prefix of the folder name, before 'RPM'),
# value = path of the results file inside that folder
files = {
    int(folder_name.split('RPM')[0]): os.path.join(folder_path, 'results.csv')
    for folder_name, folder_path in folders.items()
}
files

In [None]:
# Read each results file, tag its rows with the RPM it belongs to,
# then stack everything into a single DataFrame
df = []
for rpm, file in files.items():
    tmp = pd.read_csv(file)
    tmp['RPM'] = rpm
    df.append(tmp)
# `df` is rebound from the list of per-RPM frames to the concatenated frame
df = pd.concat(df, ignore_index=True)

In [None]:
df

In [None]:
df.to_csv('data/dnn_stats.csv', index=False)

## Plot

In [None]:
df.head()

In [None]:
df.neurons.unique()

In [None]:
def plot_dataset1(df):
    """
    Plot the validation loss against the number of coefficients (`aks`),
    with one subplot per number of hidden layers and one line per number of
    neurons.

    :param df: DataFrame with the columns `hlayers`, `neurons`, `aks` and
        `val_loss`
    """
    # Hue = neurons
    # x = number of coeffs
    # y = loss
    # separate plots = hlayers
    hue_var, plots = 'neurons', 'hlayers'
    x, y = 'aks', 'val loss'
    ncol = 2

    hue_values = df[hue_var].unique()
    plot_values = df[plots].unique()
    n_plots = len(plot_values)
    # Request exactly one colour per hue level; slicing the default palette
    # ran out of colours when there were more levels than palette entries
    colors = sns.color_palette(n_colors=len(hue_values))

    df = df.rename({'val_loss' : y}, axis=1).sort_values(x)

    # Round the number of rows up so that an odd number of panels is not
    # truncated; squeeze=False keeps `ax` two-dimensional even with one row,
    # which the `ax[-1]` / `ax[:, 0]` indexing below relies on
    nrow = max(1, -(-n_plots // ncol))
    fig, ax = plt.subplots(nrow, ncol, sharex=True, sharey=True, squeeze=False)

    for a, plot_value in zip(ax.ravel(), plot_values):
        # Restrict to the rows of this panel; without this filter every
        # panel displayed the same (whole) dataset
        panel = df[df[plots] == plot_value]
        for color, hueval in zip(colors, hue_values):
            sub = panel[panel[hue_var] == hueval]
            a.plot(sub[x], sub[y], color=color)
            # Markers share the colour of the line they belong to
            a.scatter(sub[x], sub[y], s=12, color=color)

    # x axis: label only the bottom row
    for a in ax[-1]:
        a.set_xlabel(x)
    # y axis: label only the first column
    for a in ax[:, 0]:
        a.set_ylabel(y)

    plt.subplots_adjust(wspace=.07, hspace=.1)

plot_dataset1(df[df.RPM == 4000])

## Once again

In [None]:
def plot_dnn_stats(df):
    """
    Draw two side-by-side panels against the number of neurons, with one
    curve per number of hidden layers: the validation loss on the left and
    the difference `val_loss - loss` on the right.

    :param df: DataFrame with the columns `hlayers`, `neurons`, `loss` and
        `val_loss`
    """
    fig, (left, right) = plt.subplots(1, 2, figsize=(8, 3))

    palette = sns.color_palette()
    for idx, n_layers in enumerate(df['hlayers'].unique()):
        rows = df[df.hlayers == n_layers]
        # Same colour on both panels for a given number of layers
        left.plot(rows.neurons, rows.val_loss, '-o', label=str(n_layers), color=palette[idx])
        right.plot(rows.neurons, rows.val_loss - rows.loss, '-o', color=palette[idx])

    left.legend(title='Nb layers')
    left.set_xlabel('Number of neurons')
    left.set_ylabel('Validation loss')
    right.set_xlabel('Number of neurons')
    right.set_ylabel('Validation loss - training loss')

    plt.subplots_adjust(wspace=.4)
    plt.tight_layout()
    for axis in (left, right):
        axis.grid(ls='--')

In [None]:
# One figure per dataset, restricted to the runs with aks == 40
plot_dnn_stats(df[np.logical_and(df.RPM == 4000, df.aks == 40)])
plt.suptitle('RPM = 4000', size=14)
plot_dnn_stats(df[np.logical_and(df.RPM == 5000, df.aks == 40)])
plt.suptitle('RPM = 5000', size=14)
plot_dnn_stats(df[np.logical_and(df.RPM == 6000, df.aks == 40)])
plt.suptitle('RPM = 6000', size=14)
# NOTE(review): every plot_dnn_stats call opens a new figure through
# plt.subplots, so this saves only the figure created last (RPM = 6000)
plt.savefig('figs/test.eps')

# Average over 3 datasets



In [None]:
df.shape

In [None]:
df.head()

In [None]:
avg = df.groupby(['hlayers', 'neurons', 'aks']).mean().drop('RPM', axis=1)
avg

In [None]:
avg = avg.reset_index().sort_values(['hlayers', 'neurons'])
avg

In [None]:
avg.to_csv('data/dnn_stats_avg.csv', index=False)

In [None]:
plot_dnn_stats(avg[avg.aks == 40])

# Best configurations

In [None]:
avg[avg.aks == 50].sort_values('val_mae')

In [None]:
avg[avg.aks == 40].sort_values('val_loss')

In [None]:
avgtmp = avg[avg.aks == 40].sort_values('val_loss')
print(avgtmp.to_latex(index=False, escape=True))

# Plot effect of number of coefficients



# Overfitting evaluation



In [None]:
def parse_config(filepath):
    """
    Read the DNN configuration and the RPM encoded in the two directory
    levels above `filepath`.

    The parent directory name is split on `_` and its tokens 0, 2 and 4 give
    `hlayers`, `neurons` and `aks`. The grandparent directory name is split
    on `RPM` and the leading part gives `RPM`.

    :param filepath: path such as
        `<N>RPM.../<L>_hlayers_<N>_neurons_<C>_aks/history.bin`
    :return: dict with the integer entries `hlayers`, `neurons`, `aks`
        and `RPM`
    """
    config_dir = os.path.dirname(filepath)
    tokens = os.path.basename(config_dir).split('_')

    # (key, position of the corresponding token in the directory name)
    layout = (('hlayers', 0), ('neurons', 2), ('aks', 4))
    res = {name: int(tokens[position]) for name, position in layout}

    # The RPM is the prefix of the dataset directory, one level further up
    dataset_name = os.path.basename(os.path.dirname(config_dir))
    res['RPM'] = int(dataset_name.split('RPM')[0])
    return res

In [None]:
files = get_list(path, 'history.bin')

In [None]:
parse_config(files[0]), files[0]

In [None]:
parse_config(files[-1]), files[-1]

In [None]:
# Load data
# File structure: dictionary
# key = (RPM, layers, neurons, aks)
# value = dictionary {'loss': [...], 'val_loss': [...], ...}
data = {}

for f in files:
    # Configuration and RPM are parsed from the directories containing the file
    conf = parse_config(f)
    key = (conf['RPM'], conf['hlayers'], conf['neurons'], conf['aks'])
    
    # Open the 'history' binary file containing losses and mae
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files
    with open(f, 'rb') as file:
        data[key] = pickle.load(file)
    # Progress indicator: one dot per loaded file
    print('.', end='')

In [None]:
len(data), len(files)

In [None]:
with open('data/dnn_stats_data.bin', 'wb') as f:
    pickle.dump(data, f)

## Plot

Idea: plot losses and validation losses of all three data sets for a given DNN architecture and input size. Compute and display mean loss, validation loss, mae and validation mae.

In [None]:
def plot_statistics(data, configs, rpms, display_values, fun_descr):
    """
    Plot the training and validation loss curves on a grid with one row per
    configuration and one column per RPM. Every row is annotated with the
    mean, over the RPMs, of the last recorded value of each displayed metric.

    :param data: dict mapping `(rpm,) + config` tuples to a history dict that
        holds `loss`, `val_loss` and every name of `display_values`, each one
        being a sequence
    :param configs: sequence of tuples, one per row; each tuple is appended
        to `(rpm,)` to form a key of `data`
    :param rpms: sequence of RPM values, one per column
    :param display_values: names of the history entries whose last value is
        averaged and displayed
    :param fun_descr: callable turning a config tuple into the heading of the
        row annotation
    """
    nrow = len(configs)
    ncol = len(rpms)
    # squeeze=False keeps `axgrid` two-dimensional even with a single
    # configuration or a single RPM; otherwise the row-wise iteration below
    # would receive bare Axes objects and fail
    fig, axgrid = plt.subplots(nrow, ncol, figsize=(2.8*ncol + .5, 1.75*nrow + .5),
                               sharey='row', sharex='col', squeeze=False)

    for i, (axes, config) in enumerate(zip(axgrid, configs)):
        # Plot the curves and collect the last value of each displayed metric
        last_val = {val: [] for val in display_values}
        for rpm, a in zip(rpms, axes):
            key = (rpm, ) + config
            a.plot(data[key]['loss'], label='loss', lw=.5)
            a.plot(data[key]['val_loss'], label='val loss', lw=.5)
            for val in display_values:
                last_val[val].append(data[key][val][-1])

        # Legend and column titles only on the first row
        if i == 0:
            axes[0].legend()
            for rpm, a in zip(rpms, axes):
                a.set_title(f'{rpm} RPM')

        axes[0].set_ylabel('Loss')

        # Row annotation: heading followed by the mean of the last values
        text = '\n'.join([
            'mean ' + val.replace("_", " ") + ': ' +
            '{:.3}'.format(np.mean(last_val[val]))
            for val in display_values
        ])
        # NOTE(review): `\textbf` presumably relies on LaTeX text rendering
        # being enabled in matplotlib -- confirm
        text = '\\textbf{'+fun_descr(config)+'}' + '\n' + text
        # Placed left of the first column, in axes coordinates
        axes[0].text(-1.5, .5, text, transform=axes[0].transAxes,
                     verticalalignment='center')

    # Epoch label only under the bottom row
    for a in axgrid[-1]:
        a.set_xlabel('Epochs')
    # General plot properties
    fig.subplots_adjust(wspace=.05, hspace=.05)
    plt.tight_layout()

In [None]:
configs = [(2, 64, 40),
           (4, 32, 40),
           (2, 32, 40),
           (6, 32, 40),
           (4, 64, 40),
           (1, 64, 40),
           (6, 16, 40)]
fun_descr = lambda config: f'{config[0]} layers, {config[1]} neurons'
plot_statistics(data, configs, [4000, 5000, 6000], 
                ['loss', 'val_loss', 'mae', 'val_mae'], fun_descr)
plt.savefig('figs/dnn_statistics_sorted_40aks.eps', bbox_inches='tight')

# Sort by validation loss

In [None]:
best = avg.sort_values('val_loss').reset_index(drop=True)
best.head(10)

In [None]:
def df2config(df):
    """
    Convert the rows of `df` into configuration tuples.

    :param df: object exposing the attributes `hlayers`, `neurons` and `aks`
        as iterables (e.g. a DataFrame with these columns)
    :return: list of `(hlayers, neurons, aks)` tuples, in row order
    """
    return list(zip(df.hlayers, df.neurons, df.aks))

In [None]:
df2config(best.head(10))

In [None]:
best[best.aks == 40]

# Compare effect of aks


## For the assumed best architecture



In [None]:
a = df2config(best.head(1))[0]
a

In [None]:
# Every averaged run of the 2-layer / 64-neuron architecture, best validation loss first
# NOTE(review): the architecture is hard-coded here rather than taken from `a`,
# the best configuration computed in the previous cell -- confirm this is intended
sub = avg[np.logical_and(avg.hlayers == 2, avg.neurons == 64)].sort_values('val_loss')
configs = df2config(sub)
# Row heading: L = hidden layers, N = neurons, C = aks (third element of the config tuple)
fun_descr = lambda config: f'L={config[0]}, N={config[1]}, C={config[2]}'
plot_statistics(data, configs, [4000, 5000, 6000], 
                ['loss', 'val_loss', 'mae', 'val_mae'], fun_descr)
plt.savefig('figs/dnn_statistics_2layers_64neurons.eps', bbox_inches='tight')

## Sort irrespective of the number of coefs



In [None]:
best.head(10)

In [None]:
configs = df2config(best.head(10))
fun_descr = lambda config: f'L={config[0]}, N={config[1]}, C={config[2]}'
plot_statistics(data, configs, [4000, 5000, 6000], 
                ['loss', 'val_loss', 'mae', 'val_mae'], fun_descr)
plt.savefig('figs/dnn_statistics_best.eps', bbox_inches='tight')