# Evaluation (TODO Latex figures without Type3 fonts!!!)

- Tables: Overall / By Label Class
- Figures: By Citation Count / By Year (difference)
- Samples (ACL only?)
- Confusion matrix?

```bash
# Compress all result files (exclude model weights)
tar -cvzf acl_docrel.tar.gz --exclude='*.bin' acl_docrel/*
```


In [439]:
import re
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from collections import defaultdict
from experiments.utils import highlight_max
import matplotlib.patches as mpatches
from matplotlib.container import ErrorbarContainer

import os
import random
import json
import pickle
import logging
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
from IPython.core.display import display

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

INFO:transformers.file_utils:PyTorch version 1.4.0 available.


In [180]:
output_dir = './output/'
folds = 4

metric_cols = {
    'micro avg__precision': 'Precision (micro)',
    'micro avg__recall': 'Recall (micro)',
    'micro avg__f1-score': 'F1 (micro)',
    'macro avg__precision': 'Precision (macro)',
    'macro avg__recall': 'Recall (macro)',
    'macro avg__f1-score': 'F1 (macro)',
}

# Overall results

Table: System, P, R, F micro (std) F macro (std) | for ACL-Anthology + CORD-19


In [359]:
overall_rows = []
exp_names = []

for exp in os.listdir(output_dir):
    exp_dir = os.path.join(output_dir, exp)

    if not exp.endswith('_docrel') or os.path.isfile(exp_dir):
        continue
    
    print(f'Experiment: {exp}')
    
    for fold in range(1, folds+1):
        fold_dir = os.path.join(exp_dir, 'folds', str(fold))
        print(f'- Fold: {fold}')

        if not os.path.exists(fold_dir):
            logger.warning(f'Fold does not exist: {fold_dir}')
            continue

        for sys in os.listdir(fold_dir):
            sys_dir = os.path.join(fold_dir, sys)

            if not os.path.isdir(sys_dir) or sys.startswith('.') or sys.startswith('__'):
                continue

            try:
                metrics = json.load(open(os.path.join(sys_dir, 'metrics.json')))
                metrics = {k.replace('eval_', ''): v for k, v in metrics.items()}
                exp_name = exp.replace('acl_docrel', 'ACL Anthology').replace('cord19_docrel', 'CORD-19')
                row = {                    
                    'system': sys,
                    'experiment': exp_name,
                    'fold': fold,
                    'class': None,
                    'precision': 0.,
                    'recall': 0.,
                    'f1-score': 0.,
                    'support': 0,
                }
                cs = set([k.split('__')[0] for k in metrics.keys() if len(k.split('__')) == 2])

                for c in cs:
                    crow = row.copy()
                    crow['class'] = c
                    for m in ms:
                        crow[m] = metrics[c + '__' + m]
                    overall_rows.append(crow)
                    
                print(f'   - System: {sys}')

            except FileNotFoundError:
                logger.warning(f'Skip {sys_dir} (not all files exists, probably still running..)')
                pass
    #break



Experiment: acl_docrel
- Fold: 1
   - System: roberta-base
   - System: electra-base-discriminator
   - System: bert-base-cased
   - System: baseline-rnn__fasttext__custom
   - System: xlnet-base-cased
   - System: baseline-rnn
   - System: scibert-scivocab-uncased
   - System: covid_bert_base
- Fold: 2
   - System: roberta-base
   - System: electra-base-discriminator
   - System: bert-base-cased
   - System: baseline-rnn__fasttext__custom
   - System: xlnet-base-cased
   - System: baseline-rnn
   - System: scibert-scivocab-uncased
   - System: covid_bert_base
- Fold: 3
   - System: roberta-base
   - System: electra-base-discriminator
   - System: bert-base-cased
   - System: baseline-rnn__fasttext__custom
   - System: xlnet-base-cased
   - System: baseline-rnn
   - System: scibert-scivocab-uncased
   - System: covid_bert_base
- Fold: 4
   - System: roberta-base
   - System: electra-base-discriminator
   - System: bert-base-cased
   - System: baseline-rnn__fasttext__custom
   - System:

In [360]:
df = pd.DataFrame(overall_rows)

In [361]:
df

Unnamed: 0,system,experiment,fold,class,precision,recall,f1-score,support
0,roberta-base,ACL Anthology,1,evaluation,0.000000,0.000000,0.000000,253
1,roberta-base,ACL Anthology,1,discussion,0.000000,0.000000,0.000000,296
2,roberta-base,ACL Anthology,1,introduction,0.508932,0.354996,0.418250,4093
3,roberta-base,ACL Anthology,1,micro avg,0.707644,0.541440,0.613484,21646
4,roberta-base,ACL Anthology,1,none,0.905441,0.951392,0.927848,6069
...,...,...,...,...,...,...,...,...
507,covid_bert_base,ACL Anthology,4,conclusion,0.000000,0.000000,0.000000,281
508,covid_bert_base,ACL Anthology,4,related work,0.634146,0.556905,0.593021,3128
509,covid_bert_base,ACL Anthology,4,other,0.607843,0.589046,0.598297,5368
510,covid_bert_base,ACL Anthology,4,results,0.000000,0.000000,0.000000,286


In [362]:
systems = df['system'].unique().tolist()

In [363]:
for sys in sorted(systems):
    print(sys)
    display(df[(df['system'] == sys) & (df['class'] == 'micro avg')])
    #display(df[(df['system'] == sys) & (df['class'] == 'macro avg')])
    
    

baseline-rnn


Unnamed: 0,system,experiment,fold,class,precision,recall,f1-score,support
83,baseline-rnn,ACL Anthology,1,micro avg,0.760135,0.198374,0.314636,21646
211,baseline-rnn,ACL Anthology,2,micro avg,0.75,0.20206,0.318351,21558
339,baseline-rnn,ACL Anthology,3,micro avg,0.791186,0.197907,0.316616,21500
467,baseline-rnn,ACL Anthology,4,micro avg,0.78738,0.203705,0.323671,21379


baseline-rnn__fasttext__custom


Unnamed: 0,system,experiment,fold,class,precision,recall,f1-score,support
51,baseline-rnn__fasttext__custom,ACL Anthology,1,micro avg,0.771262,0.176799,0.287658,21646
179,baseline-rnn__fasttext__custom,ACL Anthology,2,micro avg,0.752587,0.185546,0.297697,21558
307,baseline-rnn__fasttext__custom,ACL Anthology,3,micro avg,0.781591,0.183256,0.296899,21500
435,baseline-rnn__fasttext__custom,ACL Anthology,4,micro avg,0.75966,0.192198,0.306779,21379


bert-base-cased


Unnamed: 0,system,experiment,fold,class,precision,recall,f1-score,support
35,bert-base-cased,ACL Anthology,1,micro avg,0.714614,0.588931,0.645714,21646
163,bert-base-cased,ACL Anthology,2,micro avg,0.71762,0.587531,0.646093,21558
291,bert-base-cased,ACL Anthology,3,micro avg,0.720533,0.596233,0.652516,21500
419,bert-base-cased,ACL Anthology,4,micro avg,0.718741,0.587492,0.646523,21379


covid_bert_base


Unnamed: 0,system,experiment,fold,class,precision,recall,f1-score,support
115,covid_bert_base,ACL Anthology,1,micro avg,0.713653,0.565555,0.631031,21646
243,covid_bert_base,ACL Anthology,2,micro avg,0.71258,0.575703,0.63687,21558
371,covid_bert_base,ACL Anthology,3,micro avg,0.715581,0.583814,0.643016,21500
499,covid_bert_base,ACL Anthology,4,micro avg,0.707958,0.580102,0.637684,21379


electra-base-discriminator


Unnamed: 0,system,experiment,fold,class,precision,recall,f1-score,support
19,electra-base-discriminator,ACL Anthology,1,micro avg,0.699091,0.543518,0.611566,21646
147,electra-base-discriminator,ACL Anthology,2,micro avg,0.702621,0.547221,0.61526,21558
275,electra-base-discriminator,ACL Anthology,3,micro avg,0.703474,0.554698,0.62029,21500
403,electra-base-discriminator,ACL Anthology,4,micro avg,0.707529,0.552084,0.620215,21379


roberta-base


Unnamed: 0,system,experiment,fold,class,precision,recall,f1-score,support
3,roberta-base,ACL Anthology,1,micro avg,0.707644,0.54144,0.613484,21646
131,roberta-base,ACL Anthology,2,micro avg,0.702251,0.561462,0.624014,21558
259,roberta-base,ACL Anthology,3,micro avg,0.70583,0.556326,0.622223,21500
387,roberta-base,ACL Anthology,4,micro avg,0.702513,0.561018,0.623843,21379


scibert-scivocab-uncased


Unnamed: 0,system,experiment,fold,class,precision,recall,f1-score,support
99,scibert-scivocab-uncased,ACL Anthology,1,micro avg,0.721928,0.617805,0.66582,21646
227,scibert-scivocab-uncased,ACL Anthology,2,micro avg,0.728194,0.620002,0.669757,21558
355,scibert-scivocab-uncased,ACL Anthology,3,micro avg,0.727209,0.628512,0.674268,21500
483,scibert-scivocab-uncased,ACL Anthology,4,micro avg,0.722643,0.620562,0.667724,21379


xlnet-base-cased


Unnamed: 0,system,experiment,fold,class,precision,recall,f1-score,support
67,xlnet-base-cased,ACL Anthology,1,micro avg,0.704042,0.584219,0.638558,21646
195,xlnet-base-cased,ACL Anthology,2,micro avg,0.702576,0.595974,0.644899,21558
323,xlnet-base-cased,ACL Anthology,3,micro avg,0.700539,0.598977,0.645789,21500
451,xlnet-base-cased,ACL Anthology,4,micro avg,0.700089,0.589176,0.639862,21379


In [364]:
df[(df['system'] == 'xlnet-base-cased') & (df['class'] == 'micro avg')]

Unnamed: 0,system,experiment,fold,class,precision,recall,f1-score,support
67,xlnet-base-cased,ACL Anthology,1,micro avg,0.704042,0.584219,0.638558,21646
195,xlnet-base-cased,ACL Anthology,2,micro avg,0.702576,0.595974,0.644899,21558
323,xlnet-base-cased,ACL Anthology,3,micro avg,0.700539,0.598977,0.645789,21500
451,xlnet-base-cased,ACL Anthology,4,micro avg,0.700089,0.589176,0.639862,21379


In [365]:
df.columns

Index(['system', 'experiment', 'fold', 'class', 'precision', 'recall',
       'f1-score', 'support'],
      dtype='object')

In [440]:
#agg_cols = {col_name:  ['mean', 'std', 'count'] for col_name, _ in metric_cols.items()}

agg_cols = {col_name:  ['mean', 'std',] for col_name in ['precision', 'recall', 'f1-score']}

overall_df = df[df['class'].isin(['micro avg', 'macro avg'])]\
    .groupby(['system','class','experiment', ])\
    .agg(agg_cols)\
    .unstack().unstack()\
    .swaplevel(0,2,axis=1).swaplevel(1,3,axis=1).sort_index(axis=1)
    

with pd.option_context('display.float_format', lambda x: ('%.3f' % x)[1:]):   
    display(
        overall_df.style.apply(highlight_max)
    )

experiment,ACL Anthology,ACL Anthology,ACL Anthology,ACL Anthology,ACL Anthology,ACL Anthology,ACL Anthology,ACL Anthology,ACL Anthology,ACL Anthology,ACL Anthology,ACL Anthology
class,macro avg,macro avg,macro avg,macro avg,macro avg,macro avg,micro avg,micro avg,micro avg,micro avg,micro avg,micro avg
Unnamed: 0_level_2,f1-score,f1-score,precision,precision,recall,recall,f1-score,f1-score,precision,precision,recall,recall
Unnamed: 0_level_3,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
system,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4
baseline-rnn,0.061683,0.000719,0.064348,0.001687,0.059252,0.000681,0.318319,0.003878,0.772175,0.020243,0.200511,0.002825
baseline-rnn__fasttext__custom,0.059225,0.001185,0.099773,0.011536,0.05453,0.00165,0.297258,0.007813,0.766275,0.012788,0.18445,0.006356
bert-base-cased,0.257502,0.007443,0.435477,0.087384,0.236415,0.004551,0.647711,0.00322,0.717877,0.002484,0.590047,0.004178
covid_bert_base,0.246851,0.007166,0.368444,0.031464,0.226926,0.006183,0.63715,0.004906,0.712443,0.003238,0.576293,0.007889
electra-base-discriminator,0.222802,0.00268,0.264237,0.002626,0.20503,0.002318,0.616833,0.004227,0.703179,0.003466,0.54938,0.004987
roberta-base,0.228616,0.003366,0.262577,0.003093,0.21092,0.004346,0.620891,0.005003,0.70456,0.002623,0.555061,0.009374
scibert-scivocab-uncased,0.304538,0.003965,0.432449,0.033814,0.275802,0.00459,0.669392,0.003626,0.724994,0.003166,0.62172,0.004681
xlnet-base-cased,0.242897,0.003791,0.327985,0.126468,0.230349,0.003581,0.642277,0.0036,0.701811,0.001839,0.592086,0.006657


In [370]:
# pd.set_option('display.float_format', lambda x: '%.3f' % x)

with pd.option_context('display.float_format', lambda x: ('%.3f' % x)[1:]):   
    overall_tex = overall_df.to_latex().split(r'\midrule', 2)
    body_tex, footer_tex = overall_tex[1].split(r'\bottomrule', 2)    
    header_tex = overall_tex[0]
    
    
    # {} &       mean
    
    header_tex = re.sub(r'(.*)mean(.*)', '', header_tex)
    header_tex = re.sub(r'(.*)system(.*)', '', header_tex)
        
    #print(re.search(r'(.*)mean(.*)', header_tex))
    
    print(
        header_tex\
            .replace(r'multicolumn{2}{l}', r'multicolumn{1}{l}')\
            .replace(r'multicolumn{12}{l}', r'multicolumn{6}{l}')\
            .replace(r'multicolumn{6}{l}', r'multicolumn{3}{l}')\
            .replace('lrrrrrrrrrrrr', 'l|r|r|r|r|r|r')
    )
    print(r'\midrule')
    
    for i, line in enumerate(body_tex.split(r' \\')):
        if (i % 2) == 0:
            print(r'\rowcolor{Gray} ')
        
        print(
            line.replace('& .', r' $\pm$.')
        )
        print(r' \\')
        
    print(r'\bottomrule')
    print(footer_tex)

\begin{tabular}{l|r|r|r|r|r|r}
\toprule
experiment & \multicolumn{3}{l}{ACL Anthology} \\
class & \multicolumn{3}{l}{macro avg} & \multicolumn{3}{l}{micro avg} \\
{} & \multicolumn{1}{l}{f1-score} & \multicolumn{1}{l}{precision} & \multicolumn{1}{l}{recall} & \multicolumn{1}{l}{f1-score} & \multicolumn{1}{l}{precision} & \multicolumn{1}{l}{recall} \\



\midrule
\rowcolor{Gray} 

baseline-rnn                   &          .062  $\pm$.001 &      .064  $\pm$.002 &   .059  $\pm$.001 &      .318  $\pm$.004 &      .772  $\pm$.020 &   .201  $\pm$.003
 \\

baseline-rnn\_\_fasttext\_\_custom &          .059  $\pm$.001 &      .100  $\pm$.012 &   .055  $\pm$.002 &      .297  $\pm$.008 &      .766  $\pm$.013 &   .184  $\pm$.006
 \\
\rowcolor{Gray} 

bert-base-cased                &          .258  $\pm$.007 &      .435  $\pm$.087 &   .236  $\pm$.005 &      .648  $\pm$.003 &      .718  $\pm$.002 &   .590  $\pm$.004
 \\

covid\_bert\_base                &          .247  $\pm$.007 &      .368  $\pm$.0

In [371]:
odf.columns.levels

FrozenList([['acl_docrel'], ['macro avg', 'micro avg'], ['f1-score', 'precision', 'recall'], ['mean', 'std']])

# By label class

- label X: sys_1_prec, sys_1_rec, sys_1_f1, sys_2_prec, ...

In [443]:
selected_system = 'scibert-scivocab-uncased'
selected_systems = [
    'bert-base-cased',
    'scibert-scivocab-uncased'
]

label_df = df[df['system'].isin(selected_systems)]\
    .groupby(['class', 'experiment', 'system' ])\
    .agg({c: 'mean' for c in ['f1-score', 'precision', 'recall']})\
    .unstack()\
    .unstack()\
    .swaplevel(0,2,axis=1).sort_index(axis=1)\
    .reindex(sorted_label_classes)

    #.drop(columns=[('ACL Anthology', 'fold'), ('ACL Anthology', 'support')])\
    #.reindex(sorted_label_classes)
    
with pd.option_context('display.float_format', lambda x: ('%.3f' % x)[1:]):  
    display(label_df)

experiment,ACL Anthology,ACL Anthology,ACL Anthology,ACL Anthology,ACL Anthology,ACL Anthology
system,bert-base-cased,bert-base-cased,bert-base-cased,scibert-scivocab-uncased,scibert-scivocab-uncased,scibert-scivocab-uncased
Unnamed: 0_level_2,f1-score,precision,recall,f1-score,precision,recall
class,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
background,0.097,0.524,0.054,0.246,0.636,0.156
conclusion,0.002,0.25,0.001,0.0,0.0,0.0
discussion,0.002,0.25,0.001,0.0,0.0,0.0
evaluation,0.0,0.0,0.0,0.004,0.25,0.002
experiment,0.314,0.517,0.227,0.366,0.495,0.292
introduction,0.488,0.551,0.438,0.521,0.569,0.48
method,0.0,0.0,0.0,0.0,0.0,0.0
previous work,0.051,0.938,0.027,0.294,0.718,0.186
related work,0.598,0.648,0.555,0.641,0.657,0.625
results,0.0,0.0,0.0,0.012,0.283,0.006


In [372]:
label_classes = set([c.split('__')[0] for c in df.columns if '__' in c and ' avg' not in c])
label_classes

set()

In [189]:
sorted_label_classes = [
    'background',
    'conclusion',
    'discussion',
    'evaluation',
    'experiment',
    'introduction',    
    'method',
    'previous work',
    'related work',
    'results',
    'other',
    'none',
    'macro avg',
    'micro avg',
    'samples avg',
    'weighted avg'
]

#systems = df['system'].unique().tolist()
#footer_classes = ['other', 'none', 'macro avg', 'micro avg', ]


by_label_class_rows = []

for idx, row in df.iterrows():
    for col in row.keys():
        v = col.split('__')
        
        if len(v) == 2 and row['system'] in selected_systems:
            by_label_class_rows.append({
                'label': v[0],
                row['system'] + '__' + v[1]: row[col],
                #'fold': row['fold'],
            })

by_label_class_df = pd.DataFrame(by_label_class_rows)

In [190]:
by_label_class_df.groupby('label').agg({selected_systems[0] + '__support': 'mean'}).values[:,0]

array([  341.25,   289.5 ,   283.  ,   242.75,  1006.25,  4069.75,
       21520.75,   179.75, 21520.75,  6068.75,  5394.25,   200.25,
        3150.  ,   295.25, 21520.75, 21520.75])

In [191]:
by_label_class_df.groupby('label').agg(['mean', 'std', 'count']).reindex(sorted_label_classes)

Unnamed: 0_level_0,bert-base-cased__precision,bert-base-cased__precision,bert-base-cased__precision,bert-base-cased__recall,bert-base-cased__recall,bert-base-cased__recall,bert-base-cased__f1-score,bert-base-cased__f1-score,bert-base-cased__f1-score,bert-base-cased__support,...,scibert-scivocab-uncased__precision,scibert-scivocab-uncased__recall,scibert-scivocab-uncased__recall,scibert-scivocab-uncased__recall,scibert-scivocab-uncased__f1-score,scibert-scivocab-uncased__f1-score,scibert-scivocab-uncased__f1-score,scibert-scivocab-uncased__support,scibert-scivocab-uncased__support,scibert-scivocab-uncased__support
Unnamed: 0_level_1,mean,std,count,mean,std,count,mean,std,count,mean,...,count,mean,std,count,mean,std,count,mean,std,count
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
background,0.524306,0.102925,4,0.053892,0.032251,4,0.09656,0.054327,4,341.25,...,4,0.155641,0.056622,4,0.24591,0.067291,4,341.25,5.057997,4
conclusion,0.25,0.5,4,0.000842,0.001684,4,0.001678,0.003356,4,289.5,...,4,0.0,0.0,4,0.0,0.0,4,289.5,8.185353,4
discussion,0.25,0.5,4,0.000845,0.001689,4,0.001684,0.003367,4,283.0,...,4,0.0,0.0,4,0.0,0.0,4,283.0,15.556349,4
evaluation,0.0,0.0,4,0.0,0.0,4,0.0,0.0,4,242.75,...,4,0.001976,0.003953,4,0.003922,0.007843,4,242.75,8.920949,4
experiment,0.51685,0.040845,4,0.227432,0.027428,4,0.314036,0.019344,4,1006.25,...,4,0.291549,0.026184,4,0.365973,0.016732,4,1006.25,9.673848,4
introduction,0.550998,0.005105,4,0.437522,0.013988,4,0.487598,0.007017,4,4069.75,...,4,0.479862,0.00532,4,0.520798,0.001348,4,4069.75,20.645823,4
method,0.0,0.0,4,0.0,0.0,4,0.0,0.0,4,179.75,...,4,0.0,0.0,4,0.0,0.0,4,179.75,4.856267,4
previous work,0.9375,0.125,4,0.027428,0.031565,4,0.051268,0.05653,4,200.25,...,4,0.186333,0.048796,4,0.293741,0.065023,4,200.25,3.593976,4
related work,0.647622,0.01301,4,0.555243,0.013937,4,0.59788,0.013545,4,3150.0,...,4,0.625103,0.01683,4,0.64068,0.011552,4,3150.0,17.606817,4
results,0.0,0.0,4,0.0,0.0,4,0.0,0.0,4,295.25,...,4,0.006037,0.004381,4,0.011782,0.008494,4,295.25,11.295279,4


# Samples

In [192]:
sys_run_results_fp = os.path.join(exp_dir, 'folds', '1', 'bert-base-cased', 'results.csv')

results_cols = ['from_title', 'to_title', 'true', 'predicted']

results_df = pd.read_csv(sys_run_results_fp)
results_df.head()

FileNotFoundError: [Errno 2] File ./output/.ipynb_checkpoints/folds/1/bert-base-cased/results.csv does not exist: './output/.ipynb_checkpoints/folds/1/bert-base-cased/results.csv'

In [None]:
results_df[results_cols]

In [None]:
without_predictions = pd.isnull(results_df['predicted']).sum()

print(f'Samples for that no probabilty was above classification threshold: {without_predictions:,} / {len(results_df):,}')

In [None]:
results_df[results_df['predicted_background'] > 0][results_cols]

In [None]:
results_df[results_df['true_background'] > 0][results_cols]

In [None]:
results_df[results_df['predicted_related work'] > 0][results_cols]

In [None]:
results_df[results_df['true_related work'] > 0][results_cols]

In [None]:
results_df['true']

In [None]:
results_df['true'].value_counts()[:20]

In [None]:
results_df['predicted'].value_counts()

In [None]:
results_df[results_df['predicted_related work'] > 0][results_cols]

# Multi-label confusion

```

missed classification for... | predicted labels                                | true labels
                                introduction   | background | experiment | ... |
introduction                 |        -        | 12         | 3          | 
(true_introduction=1,predicted_introductioin=0)


```

(color background min/max value)
