In [1]:
# Directory holding the raw training logs (*.txt) and the CSV files derived from them.
# Plain string literal: the former f-prefix had no placeholders to interpolate.
training_dir = '../results/training'

In [2]:
import glob
import re
from pathlib import Path

import pandas as pd

# Convert every raw training log (*.txt) in `training_dir` into two CSV files:
# one with the per-batch training metrics and one with the per-epoch validation
# metrics. Metric values are kept as the exact text found in the log.
TRAIN_COLUMNS = ['Epoch', 'Batch', 'Training Loss', 'Training Accuracy']
VAL_COLUMNS = ['Epoch', 'Validation Loss', 'Validation Accuracy']
# Lines starting with one of these prefixes carry no metrics and are skipped.
IGNORED_PREFIXES = ('Device', 'Number', 'Trigger', 'Best', 'Training', 'Process')

for log_file in glob.glob(f'{training_dir}/*.txt'):
    # Collect plain dicts and build each DataFrame once at the end; growing a
    # DataFrame with pd.concat per row is quadratic in the number of log lines.
    train_rows = []
    val_rows = []
    # Reset per file so an epoch value never leaks over from the previous log.
    epoch = None

    # The context manager guarantees the log file handle is closed again.
    with open(log_file, 'r') as log_handle:
        for line in log_handle:
            if line.startswith(IGNORED_PREFIXES) or line == '\n':
                continue
            elif line.startswith('Epoch'):
                epoch = line.split(':')[1].strip()
            elif line.startswith('Batch'):
                # Splitting on runs of ',' and ':' puts the batch number, loss
                # and accuracy at the odd positions 1, 3 and 5.
                fields = re.split('[,:]+', line)
                train_rows.append({'Epoch': epoch,
                                   'Batch': fields[1].strip(),
                                   'Training Loss': fields[3].strip(),
                                   'Training Accuracy': fields[5].strip()})
            elif line.startswith('Validation'):
                fields = re.split('[,:]+', line)
                val_rows.append({'Epoch': epoch,
                                 'Validation Loss': fields[1].strip(),
                                 'Validation Accuracy': fields[3].strip()})
            else:
                print(f'Unexpected line found in {log_file}: {line}')

    # The run name is made of the last three '_'-separated parts of the file
    # name, with everything from the first '.' of the last part removed. Only
    # the file name is used, so '_' in directory names cannot end up in it.
    name_parts = Path(log_file).name.split('_')[-3:]
    name_parts[-1] = name_parts[-1].split('.')[0]
    run_name = '_'.join(name_parts)

    logs_train = pd.DataFrame(train_rows, columns=TRAIN_COLUMNS)
    logs_val = pd.DataFrame(val_rows, columns=VAL_COLUMNS)
    logs_train.to_csv(f'{training_dir}/{run_name}_training.csv', index=False)
    logs_val.to_csv(f'{training_dir}/{run_name}_validation.csv', index=False)

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

def create_custom_lineplot(merged, measure, mx, cut, epochs):
    """Plot one measure for training and validation over time and save it as PNG.

    Parameters
    ----------
    merged : pandas.DataFrame
        Must provide the columns 'Batch_cont' (x position), 'Epoch', 'TYPE'
        (hue; the rows with TYPE == 'Validation' define the x tick positions)
        and the column named by `measure`.
    measure : str
        Name of the column to plot on the y axis; also used in the title, the
        axis label, the legend title and (lower-cased) the output file name.
    mx, cut, epochs
        Run parameters; only used for the title and the output file name.
        `cut` must be a string because it is lower-cased for the title.

    Returns
    -------
    None
        The figure is written to '../plots/' and closed afterwards.
    """
    fig, ax = plt.subplots(figsize=(16, 8))
    # Draw on the axes created above instead of relying on the implicit "current axes".
    sns.lineplot(data=merged,
                 x='Batch_cont',
                 y=measure,
                 hue='TYPE',
                 ax=ax)

    ax.set_title(f'Training and Validation {measure} over Time\n(max sequence length: {mx}k, cutted: {cut.lower()} normalization, epochs: {epochs})', fontsize=22)

    # Span the whole data range. The previous fixed limit of (0, 4) hid every
    # point whose 'Batch_cont' exceeds 4, including the tick positions below.
    # NaN or non-positive maxima leave the automatic limits untouched.
    x_max = merged['Batch_cont'].max()
    if x_max > 0:
        ax.set_xlim(0, x_max)

    ax.set_xlabel('Epoch', fontsize=18)
    ax.set_ylabel(measure, fontsize=18)

    # One tick per validation row, labelled with the epoch numbers.
    validation_positions = merged.loc[merged['TYPE'] == 'Validation', 'Batch_cont'].tolist()
    epoch_labels = merged['Epoch'].astype(int).unique()
    ax.set_xticks(validation_positions, epoch_labels, fontsize=14)
    ax.tick_params(axis='y', labelsize=14)

    # The legend title size is passed per legend; the function no longer
    # changes the global matplotlib rcParams as a side effect.
    ax.legend(title=f'{measure} of', fontsize=14, title_fontsize=14, loc='upper left', bbox_to_anchor=(1, 1))

    fig.tight_layout()
    fig.savefig(f'../plots/max{mx}_cut{cut}_epochs{epochs}_{measure.lower()}_over_time.png', dpi=300, facecolor='white', edgecolor='none')
    # Close exactly this figure so repeated calls in a loop do not pile up open figures.
    plt.close(fig)

In [4]:
import numpy as np
import pandas as pd

from sympy import divisors

# For every training run (maximum sequence length x cutting method x number of
# epochs): record the best validation epoch, record the per-epoch mean loss and
# accuracy, and plot loss and accuracy over time. Both summary tables are
# written to '../data/' at the end of the cell.
BEST_MODEL_COLUMNS = ['Maximum Sequence Length', 'Cutting Method', 'Number of Epochs', 'Epoch', 'Loss', 'Accuracy']
MEAN_COLUMNS = ['Maximum Sequence Length', 'Cutting Method', 'Number of Epochs', 'Type', 'Epoch', 'Mean Loss', 'Mean Accuracy']
# Smoothing of the training curve: the number of consecutive rows averaged into
# one plotted point is the smallest divisor of the batch count above this value.
MIN_COMBINE_FACTOR = 60

# Rows are collected as dicts and turned into DataFrames once, after the loops,
# instead of concatenating a one-row DataFrame per record.
best_model_records = []
mean_records = []

for mx in [4, 6, 8]:
    for cut in ['Before', 'After']:
        for epochs in [5, 15]:
            # get dataframes with loss and accuracy
            current_logs_train = pd.read_csv(f'{training_dir}/max{mx}_cut{cut}_{epochs}epochs_training.csv')
            current_logs_val = pd.read_csv(f'{training_dir}/max{mx}_cut{cut}_{epochs}epochs_validation.csv')

            # extract and store best model (based on validation accuracy);
            # idxmax returns an index label, so the row is looked up with .loc
            current_bm = current_logs_val.loc[current_logs_val['Validation Accuracy'].idxmax()]
            best_model_records.append({'Maximum Sequence Length': mx,
                                       'Cutting Method': cut,
                                       'Number of Epochs': epochs,
                                       'Epoch': int(current_bm['Epoch']),
                                       'Loss': current_bm['Validation Loss'],
                                       'Accuracy': current_bm['Validation Accuracy']})

            # extract mean statistics per epoch
            means_train = current_logs_train.groupby('Epoch')[['Training Loss', 'Training Accuracy']].mean()
            means_val = current_logs_val.groupby('Epoch')[['Validation Loss', 'Validation Accuracy']].mean()

            # store mean statistics per epoch (all training rows first, then
            # validation); the epoch is the group-by key itself rather than a
            # running counter, so it matches the 'Epoch' column of the logs
            for log_type, epoch_means in (('Training', means_train), ('Validation', means_val)):
                for ep, (loss, acc) in zip(epoch_means.index, epoch_means.to_numpy()):
                    mean_records.append({'Maximum Sequence Length': mx,
                                         'Cutting Method': cut,
                                         'Number of Epochs': epochs,
                                         'Type': log_type,
                                         'Epoch': ep,
                                         'Mean Loss': loss,
                                         'Mean Accuracy': acc})

            # prepare smoothing of line plot (e.g. combine_factor = 3 means to average 3 consecutive row values to one new number);
            # falls back to one point per epoch when no divisor is large enough
            n_batches = current_logs_train['Batch'].nunique()
            combine_factor = next((d for d in divisors(n_batches) if d > MIN_COMBINE_FACTOR), n_batches)
            block_ids = np.arange(len(current_logs_train)) // combine_factor

            # prepare dataframe with training logs for merging; the columns are
            # renamed explicitly because str.lstrip('Training ') strips a set
            # of characters, not the prefix
            current_logs_train_reduced = (current_logs_train
                                          .groupby(block_ids)
                                          .mean()
                                          .rename(columns={'Training Loss': 'Loss', 'Training Accuracy': 'Accuracy'})
                                          .assign(TYPE='Training'))

            # prepare dataframe with validation logs for merging: each
            # validation row is placed on the last batch of its epoch
            last_batch = current_logs_train['Batch'].max()
            current_logs_val = (current_logs_val
                                .rename(columns={'Validation Loss': 'Loss', 'Validation Accuracy': 'Accuracy'})
                                .assign(Batch=last_batch, TYPE='Validation'))

            # merge both dataframes for easier plotting
            current_merged = pd.concat([current_logs_train_reduced, current_logs_val], ignore_index=True)
            # Continuous x position: epochs are laid out one after another.
            # The former Batch * (Epoch + 1) stretched every epoch from x = 0,
            # so the training curves of different epochs overlapped.
            epoch_stride = last_batch + 1
            current_merged['Batch_cont'] = current_merged['Epoch'] * epoch_stride + current_merged['Batch']

            # plot loss and accuracy over time
            for measure in ['Loss', 'Accuracy']:
                create_custom_lineplot(current_merged, measure, mx, cut, epochs)

best_models = pd.DataFrame(best_model_records, columns=BEST_MODEL_COLUMNS)
means = pd.DataFrame(mean_records, columns=MEAN_COLUMNS)

best_models.to_csv('../data/best_models.csv', index=False)
means.to_csv('../data/means.csv', index=False)

In [5]:
import pandas as pd

# Reload the best-model summary from disk and display it.
best_models_csv = '../data/best_models.csv'
b = pd.read_csv(best_models_csv)
b

Unnamed: 0,Maximum Sequence Length,Cutting Method,Number of Epochs,Epoch,Loss,Accuracy
0,4,Before,5,0,0.233314,93.614047
1,4,Before,15,0,0.233185,93.617026
2,4,After,5,1,0.227655,93.613196
3,4,After,15,1,0.228414,93.614047
4,6,Before,5,4,0.224448,93.014888
5,6,Before,15,13,0.215062,93.16233
6,6,After,5,0,0.240566,93.02326
7,6,After,15,9,0.218368,93.066516
8,8,Before,5,4,0.233898,92.360004
9,8,Before,15,13,0.2162,92.82103


In [6]:
# Best models of the runs with maximum sequence length 4 and cutting method 'Before'.
is_max_len_4 = b['Maximum Sequence Length'] == 4
is_cut_before = b['Cutting Method'] == 'Before'
b[is_max_len_4 & is_cut_before]

Unnamed: 0,Maximum Sequence Length,Cutting Method,Number of Epochs,Epoch,Loss,Accuracy
0,4,Before,5,0,0.233314,93.614047
1,4,Before,15,0,0.233185,93.617026


In [7]:
import pandas as pd

# Reload the per-epoch mean statistics from disk and display them.
means_csv = '../data/means.csv'
m = pd.read_csv(means_csv)
m

Unnamed: 0,Maximum Sequence Length,Cutting Method,Number of Epochs,Type,Epoch,Mean Loss,Mean Accuracy
0,4,Before,5,Training,0,0.244986,93.092312
1,4,Before,5,Training,1,0.236480,93.101967
2,4,Before,5,Training,2,0.232124,93.106371
3,4,Before,5,Training,3,0.228446,93.110349
4,4,Before,5,Training,4,0.225247,93.124355
...,...,...,...,...,...,...,...
235,8,After,15,Validation,10,0.223671,92.517440
236,8,After,15,Validation,11,0.220612,92.571799
237,8,After,15,Validation,12,0.220124,92.447184
238,8,After,15,Validation,13,0.217241,92.714363


In [8]:
# Training means of all runs with maximum sequence length 4.
has_max_len_4 = m['Maximum Sequence Length'] == 4
is_training = m['Type'] == 'Training'
m[has_max_len_4 & is_training]

Unnamed: 0,Maximum Sequence Length,Cutting Method,Number of Epochs,Type,Epoch,Mean Loss,Mean Accuracy
0,4,Before,5,Training,0,0.244986,93.092312
1,4,Before,5,Training,1,0.23648,93.101967
2,4,Before,5,Training,2,0.232124,93.106371
3,4,Before,5,Training,3,0.228446,93.110349
4,4,Before,5,Training,4,0.225247,93.124355
10,4,Before,15,Training,0,0.246818,93.102179
11,4,Before,15,Training,1,0.238504,93.102073
12,4,Before,15,Training,2,0.233792,93.101225
13,4,Before,15,Training,3,0.230273,93.105681
14,4,Before,15,Training,4,0.227088,93.115866
