# Build evaluation tables for paper

- Precomputed results must be located in the `experiments_output` directory: one sub-directory per run, whose name starts with `task` and which contains a `report.json`.

In [1]:
import json
import os
import pickle
import pandas as pd
import numpy as np
import logging
import matplotlib.pyplot as plt

In [2]:
# Collect every finished experiment run found under `out_dir`.
#
# `experiments` maps the run's directory name to a dict with the keys:
#   'task'   -> 'a' if the directory name starts with 'task-a', otherwise 'b'
#   'report' -> parsed report JSON for the run
#   'config' -> parsed `model_config.json` (only present if that file exists)
experiments = {}
out_dir = 'experiments_output'


def _load_json(path):
    """Read and parse the JSON document stored at ``path``."""
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# os.listdir() yields entries in arbitrary order; sorting keeps the dict
# (and every table built from it) in the same order on every run.
for name in sorted(os.listdir(out_dir)):
    if not name.startswith('task'):
        continue

    run_dir = os.path.join(out_dir, name)
    if not os.path.exists(os.path.join(run_dir, 'report.json')):
        # No report.json for this run - nothing to tabulate.
        continue

    # 'author-only' runs take their scores from a separate report file
    # (presumably limited to samples with an author vector - TODO confirm).
    # Pick the file up front instead of loading report.json and discarding it.
    report_file = 'report_author_vec_found.json' if 'author-only' in name else 'report.json'

    experiments[name] = {
        'task': 'a' if name.startswith('task-a') else 'b',
        'report': _load_json(os.path.join(run_dir, report_file)),
    }

    config_path = os.path.join(run_dir, 'model_config.json')
    if os.path.exists(config_path):
        experiments[name]['config'] = _load_json(config_path)
        

In [3]:
# Metrics taken from the 'micro avg' entry of each report.
metrics = ['f1-score', 'precision', 'recall']

# One flat record per experiment; metric values are scaled by 100.
scores = [
    {
        'name': exp_name,
        'task': exp['task'],
        **{metric: exp['report']['micro avg'][metric] * 100 for metric in metrics},
    }
    for exp_name, exp in experiments.items()
]

# Index by experiment name and add zero-filled `<metric>_diff` placeholder columns.
df = (
    pd.DataFrame(scores)
    .set_index('name')
    .assign(**{metric + '_diff': 0 for metric in metrics})
)


In [4]:
from IPython.display import display

# Render floats in displayed frames with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)

def display_task(df, task_id):
    """Print and display the score table of one task, once per metric ordering.

    Rows of ``df`` whose ``task`` column equals ``task_id`` are selected. For
    each entry of the notebook-level ``metrics`` list, the ``<metric>_diff``
    column is set to the maximum of that metric within the selected rows minus
    the row's own value. The resulting frame is then displayed once per
    metric, sorted in descending order of that metric.

    Args:
        df: Frame with a ``task`` column and one column per entry of ``metrics``.
        task_id: Value of the ``task`` column to select.

    Returns:
        None. Output is produced through ``print`` and ``display``.
    """
    print(f'#### Task {task_id} ####')

    task_scores = df.loc[df['task'] == task_id]

    # Gap to the best score of the task, computed for all metrics at once.
    gaps = {
        metric + '_diff': task_scores[metric].max() - task_scores[metric]
        for metric in metrics
    }
    task_scores = task_scores.assign(**gaps)

    for metric in metrics:
        print(f'Sorted by {metric}')
        ranked = task_scores.sort_values(by=metric, ascending=False)
        display(ranked)

In [5]:
# Show the score tables for task 'a', one table per metric sort order.
display_task(df, 'a')


#### Task a ####
Sorted by f1-score


Unnamed: 0_level_0,f1-score,precision,recall,task,f1-score_diff,precision_diff,recall_diff
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
task-a__bert-german_manual_author-embedding_author-gender_2,87.2,88.76,85.7,a,0.0,0.89,0.0
task-a__bert-german_full,87.12,89.23,85.11,a,0.08,0.42,0.58
task-a__bert-german_manual_no-embedding,86.9,89.65,84.3,a,0.3,0.0,1.39
task-a__bert-german_no-manual_embedding,86.84,89.02,84.75,a,0.36,0.63,0.94
task-a__bert-german_text-only,86.65,89.65,83.86,a,0.55,0.01,1.84
task-a__bert-multilingual_text-only,83.94,86.31,81.7,a,3.26,3.34,3.99
task-a__baseline,77.0,79.0,74.0,a,10.2,10.65,11.7
task-a__author-only,61.99,75.59,52.54,a,25.21,14.07,33.16


Sorted by precision


Unnamed: 0_level_0,f1-score,precision,recall,task,f1-score_diff,precision_diff,recall_diff
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
task-a__bert-german_manual_no-embedding,86.9,89.65,84.3,a,0.3,0.0,1.39
task-a__bert-german_text-only,86.65,89.65,83.86,a,0.55,0.01,1.84
task-a__bert-german_full,87.12,89.23,85.11,a,0.08,0.42,0.58
task-a__bert-german_no-manual_embedding,86.84,89.02,84.75,a,0.36,0.63,0.94
task-a__bert-german_manual_author-embedding_author-gender_2,87.2,88.76,85.7,a,0.0,0.89,0.0
task-a__bert-multilingual_text-only,83.94,86.31,81.7,a,3.26,3.34,3.99
task-a__baseline,77.0,79.0,74.0,a,10.2,10.65,11.7
task-a__author-only,61.99,75.59,52.54,a,25.21,14.07,33.16


Sorted by recall


Unnamed: 0_level_0,f1-score,precision,recall,task,f1-score_diff,precision_diff,recall_diff
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
task-a__bert-german_manual_author-embedding_author-gender_2,87.2,88.76,85.7,a,0.0,0.89,0.0
task-a__bert-german_full,87.12,89.23,85.11,a,0.08,0.42,0.58
task-a__bert-german_no-manual_embedding,86.84,89.02,84.75,a,0.36,0.63,0.94
task-a__bert-german_manual_no-embedding,86.9,89.65,84.3,a,0.3,0.0,1.39
task-a__bert-german_text-only,86.65,89.65,83.86,a,0.55,0.01,1.84
task-a__bert-multilingual_text-only,83.94,86.31,81.7,a,3.26,3.34,3.99
task-a__baseline,77.0,79.0,74.0,a,10.2,10.65,11.7
task-a__author-only,61.99,75.59,52.54,a,25.21,14.07,33.16


In [6]:
# Show the score tables for task 'b', one table per metric sort order.
display_task(df, 'b')

#### Task b ####
Sorted by f1-score


Unnamed: 0_level_0,f1-score,precision,recall,task,f1-score_diff,precision_diff,recall_diff
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
task-b__bert-german_full,64.7,83.78,52.7,b,0.0,0.17,0.32
task-b__bert-german_no-manual_embedding,64.41,82.02,53.03,b,0.29,1.93,0.0
task-b__bert-german_manual_no-embedding,63.96,83.94,51.67,b,0.74,0.0,1.36
task-b__bert-german_text-only,60.51,83.44,47.47,b,4.19,0.5,5.56
task-b__bert-multilingual_text-only,54.08,82.63,40.19,b,10.62,1.31,12.83
task-b__baseline,45.0,67.0,34.0,b,19.7,16.94,19.03
task-b__author-only,32.13,72.39,20.65,b,32.57,11.56,32.38


Sorted by precision


Unnamed: 0_level_0,f1-score,precision,recall,task,f1-score_diff,precision_diff,recall_diff
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
task-b__bert-german_manual_no-embedding,63.96,83.94,51.67,b,0.74,0.0,1.36
task-b__bert-german_full,64.7,83.78,52.7,b,0.0,0.17,0.32
task-b__bert-german_text-only,60.51,83.44,47.47,b,4.19,0.5,5.56
task-b__bert-multilingual_text-only,54.08,82.63,40.19,b,10.62,1.31,12.83
task-b__bert-german_no-manual_embedding,64.41,82.02,53.03,b,0.29,1.93,0.0
task-b__author-only,32.13,72.39,20.65,b,32.57,11.56,32.38
task-b__baseline,45.0,67.0,34.0,b,19.7,16.94,19.03


Sorted by recall


Unnamed: 0_level_0,f1-score,precision,recall,task,f1-score_diff,precision_diff,recall_diff
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
task-b__bert-german_no-manual_embedding,64.41,82.02,53.03,b,0.29,1.93,0.0
task-b__bert-german_full,64.7,83.78,52.7,b,0.0,0.17,0.32
task-b__bert-german_manual_no-embedding,63.96,83.94,51.67,b,0.74,0.0,1.36
task-b__bert-german_text-only,60.51,83.44,47.47,b,4.19,0.5,5.56
task-b__bert-multilingual_text-only,54.08,82.63,40.19,b,10.62,1.31,12.83
task-b__baseline,45.0,67.0,34.0,b,19.7,16.94,19.03
task-b__author-only,32.13,72.39,20.65,b,32.57,11.56,32.38
