In [98]:
# Directory that holds the raw training logs (*.txt) and the CSV files derived from them.
training_dir = '../results/training'
# Name of the training run; it is part of the CSV file names and of the plot directory.
run_identifier = 'balancedLoss'  # must be set, not allowed to be empty
# Validation statistic used to select the best epoch: 'Loss' or 'Accuracy'.
model_selection_criterion = 'Loss'

# Fail early on an invalid configuration instead of much later with a KeyError
# or with output files that carry an empty run name.
if not run_identifier:
    raise ValueError('run_identifier must be set, not allowed to be empty')
if model_selection_criterion not in ('Loss', 'Accuracy'):
    raise ValueError(
        f"model_selection_criterion must be 'Loss' or 'Accuracy', got {model_selection_criterion!r}"
    )

In [None]:
import glob
import os
import pandas as pd
import re

# Convert every raw training log (*.txt) in training_dir into two CSV files:
# one with the per-batch training statistics, one with the validation statistics.
ignored_prefixes = ('Device', 'Number', 'Trigger', 'Best', 'Training', 'Process')
train_columns = ['Epoch', 'Batch', 'Training Loss', 'Training Accuracy']
val_columns = ['Epoch', 'Validation Loss', 'Validation Accuracy']

for log_file in glob.glob(f'{training_dir}/*.txt'):
    # collect plain records and build each DataFrame once (no quadratic concat in the loop)
    train_records = []
    val_records = []
    # reset per file so an epoch from the previously parsed log can never leak into this one
    epoch = None

    # the context manager guarantees that the file handle is closed again
    with open(log_file, 'r') as log_handle:
        for line in log_handle:
            if line.startswith(ignored_prefixes) or line == '\n':
                continue
            elif line.startswith('Epoch'):
                # the value after the first ':' becomes the epoch of all following lines
                epoch = line.split(':')[1].strip()
            elif line.startswith(('Batch', 'Validation')):
                if epoch is None:
                    raise ValueError(f'Statistics found before any epoch header in {log_file}: {line}')
                # splitting on ',' and ':' leaves the values at the odd positions
                splitted_line = re.split('[,:]+', line)
                if line.startswith('Batch'):
                    train_records.append({'Epoch': epoch,
                                          'Batch': splitted_line[1].strip(),
                                          'Training Loss': splitted_line[3].strip(),
                                          'Training Accuracy': splitted_line[5].strip()})
                else:
                    val_records.append({'Epoch': epoch,
                                        'Validation Loss': splitted_line[1].strip(),
                                        'Validation Accuracy': splitted_line[3].strip()})
            else:
                print(f'Unexpected line found in {log_file}: {line}')

    # passing the columns explicitly keeps the CSV header even for a log without any records
    logs_train = pd.DataFrame(train_records, columns=train_columns)
    logs_val = pd.DataFrame(val_records, columns=val_columns)

    # drop the first '_'-separated token and the file extension of the log file name
    filename_prefix = '_'.join(re.split(r'\.|_', os.path.basename(log_file))[1:-1])
    logs_train.to_csv(f'{training_dir}/{filename_prefix}_training.csv', index=False)
    logs_val.to_csv(f'{training_dir}/{filename_prefix}_validation.csv', index=False)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def create_smoothed_lineplot(merged, measure, mx, cut, epochs, plots_subdir):
    """Plot one measure of training and validation over time and save it as a PNG.

    Parameters
    ----------
    merged : pandas.DataFrame
        Data to plot. Must contain the columns 'Batch_cont' (x position),
        'TYPE' (used as hue; rows with the value 'Validation' define the x tick
        positions), 'Epoch' (its unique integer values label the x ticks) and
        the column named by ``measure``.
    measure : str
        Column plotted on the y axis; also used in title, axis label, legend
        title and (lower-cased) in the file name.
    mx, cut, epochs
        Run settings; only used for the title and the file name. ``cut`` must
        be a string because it is lower-cased for the title.
    plots_subdir : str
        Directory the figure is written to.

    Returns
    -------
    None
        The figure is written to
        ``{plots_subdir}/max{mx}_cut{cut}_epochs{epochs}_{measure}_over_time.png``
        and closed afterwards.
    """
    fig, ax = plt.subplots(figsize=(16, 8))
    # draw on the axes created above instead of relying on the implicit current axes
    sns.lineplot(data=merged, x='Batch_cont', y=measure, hue='TYPE', ax=ax)

    ax.set_title(f'Training and Validation {measure} over Time\n(max sequence length: {mx}k, cutted: {cut.lower()} normalization, epochs: {epochs})', fontsize=22)
    # NOTE(review): the x axis is clamped to 0..4 although the tick positions below
    # come from 'Batch_cont' -- confirm that this range is intended.
    ax.set(xlim=(0, 4))
    ax.set_xlabel('Epoch', fontsize=18)
    ax.set_ylabel(measure, fontsize=18)

    # one x tick per validation row, labelled with the epoch numbers
    validation_positions = merged.loc[merged['TYPE'] == 'Validation', 'Batch_cont'].tolist()
    epoch_labels = merged['Epoch'].astype(int).unique()
    ax.set_xticks(validation_positions, epoch_labels, fontsize=14)
    ax.tick_params(axis='y', labelsize=14)

    # title_fontsize is set on this legend only; the global rcParams stay untouched
    ax.legend(title=f'{measure} of', fontsize=14, title_fontsize=14, loc='upper left', bbox_to_anchor=(1, 1))

    fig.tight_layout()
    fig.savefig(f'{plots_subdir}/max{mx}_cut{cut}_epochs{epochs}_{measure.lower()}_over_time.png', dpi=300, facecolor='white', edgecolor='none')
    # close exactly this figure so that figures do not pile up when called in a loop
    plt.close(fig)

In [None]:
import numpy as np
import pandas as pd

from sympy import divisors

# For every run configuration: select the best epoch, collect the mean statistics
# per epoch and plot smoothed loss/accuracy curves. The summaries are stored as CSV.
plots_subdir = f'../plots/{run_identifier}'
os.makedirs(plots_subdir, exist_ok=True)

# smoothing averages more than this many consecutive batches into one plotted point
min_combine_factor = 60

# plain records are collected and turned into DataFrames once after the loops
best_model_records = []
mean_records = []

for mx in [4, 6, 8]:
    for cut in ['Before', 'After']:
        for epochs in [15, 30]:
            run_settings = {'Maximum Sequence Length': mx,
                            'Cutting Method': cut,
                            'Number of Epochs': epochs}

            # get dataframes with loss and accuracy
            current_logs_train = pd.read_csv(f'{training_dir}/max{mx}_cut{cut}_{epochs}epochs_{run_identifier}_training.csv')
            current_logs_val = pd.read_csv(f'{training_dir}/max{mx}_cut{cut}_{epochs}epochs_{run_identifier}_validation.csv')

            # extract and store best model: the lowest validation loss or the
            # highest validation accuracy, depending on the selection criterion
            criterion_values = current_logs_val[f'Validation {model_selection_criterion}']
            if model_selection_criterion == 'Loss':
                best_label = criterion_values.idxmin()
            else:
                best_label = criterion_values.idxmax()
            # idxmin/idxmax return an index label, hence .loc instead of .iloc
            current_bm = current_logs_val.loc[best_label]
            best_model_records.append({**run_settings,
                                       'Epoch': int(current_bm['Epoch']),
                                       'Loss': current_bm['Validation Loss'],
                                       'Accuracy': current_bm['Validation Accuracy']})

            # extract mean training statistics per epoch
            means_train = current_logs_train.groupby('Epoch')[['Training Loss', 'Training Accuracy']].mean()

            # combine mean statistics of training with validation results; the epoch is
            # taken from the group label so that it matches the epoch of the validation rows
            for ep, (loss, acc) in zip(means_train.index, means_train.values):
                mean_records.append({**run_settings,
                                     'Type': 'Training',
                                     'Epoch': int(ep),
                                     'Mean Loss': loss,
                                     'Mean Accuracy': acc})
            for _, val_row in current_logs_val.iterrows():
                mean_records.append({**run_settings,
                                     'Type': 'Validation',
                                     'Epoch': int(val_row['Epoch']),
                                     'Mean Loss': val_row['Validation Loss'],
                                     'Mean Accuracy': val_row['Validation Accuracy']})

            # prepare smoothing of line plot (e.g. combine_factor = 3 means to average 3 consecutive row values to one new number).
            # The smallest divisor of n_batches above the threshold is used; if there is none,
            # all batches of an epoch are averaged instead of failing with StopIteration.
            # Assumes every epoch logs the same number of batches -- TODO confirm.
            n_batches = current_logs_train['Batch'].nunique()
            combine_factor = next((d for d in divisors(n_batches) if d > min_combine_factor),
                                  max(n_batches, 1))
            group_ids = np.arange(len(current_logs_train)) // combine_factor
            current_logs_train_reduced = current_logs_train.groupby(group_ids).mean()

            # prepare dataframe with training logs for merging; explicit renaming, because
            # str.lstrip() strips a set of characters and not a prefix
            current_logs_train_reduced = current_logs_train_reduced.rename(
                columns={'Training Loss': 'Loss', 'Training Accuracy': 'Accuracy'})
            current_logs_train_reduced['TYPE'] = 'Training'

            # prepare dataframe with validation logs for merging; validation results are
            # placed at the last batch of their epoch
            current_logs_val_plot = current_logs_val.rename(
                columns={'Validation Loss': 'Loss', 'Validation Accuracy': 'Accuracy'})
            current_logs_val_plot['Batch'] = n_batches - 1
            current_logs_val_plot['TYPE'] = 'Validation'

            # merge both dataframes for easier plotting
            current_merged = pd.concat([current_logs_train_reduced, current_logs_val_plot], ignore_index=True)
            # continuous batch position over all epochs; Batch * (Epoch + 1) would let
            # later epochs start at 0 again and overlap with the earlier ones
            current_merged['Batch_cont'] = current_merged['Epoch'] * n_batches + current_merged['Batch']

            # plot loss and accuracy over time
            for measure in ['Loss', 'Accuracy']:
                create_smoothed_lineplot(current_merged, measure, mx, cut, epochs, plots_subdir)

best_models = pd.DataFrame(best_model_records,
                           columns=['Maximum Sequence Length', 'Cutting Method', 'Number of Epochs', 'Epoch', 'Loss', 'Accuracy'])
means = pd.DataFrame(mean_records,
                     columns=['Maximum Sequence Length', 'Cutting Method', 'Number of Epochs', 'Type', 'Epoch', 'Mean Loss', 'Mean Accuracy'])

best_models.to_csv(f'{training_dir}/best_models_{run_identifier}.csv', index=False)
means.to_csv(f'{training_dir}/means_{run_identifier}.csv', index=False)

In [None]:
# TODO: plot runtime of best models
# TODO: plot confusion matrix & typical metrics derived from the best models

In [None]:
import pandas as pd

# read the stored best-model table of the current run and display it
best_models_csv = f'{training_dir}/best_models_{run_identifier}.csv'
b = pd.read_csv(best_models_csv)
b

In [None]:
# show only the best models of the runs with cutting method 'Before'
b.loc[b['Cutting Method'].eq('Before')]

In [None]:
import pandas as pd

# read the stored per-epoch mean statistics of the current run and display them
means_csv = f'{training_dir}/means_{run_identifier}.csv'
m = pd.read_csv(means_csv)
m

In [None]:
# show only the validation rows of the mean statistics
m.loc[m['Type'].eq('Validation')]