# Text classification

This notebook provides functions to run and evaluate a text classifier, with a particular focus on metrics that assess the dataset imbalance and the prediction bias.

The first part of the notebook contains the definition of the functions, followed by an execution on the selected dataset.

In [None]:
# Global experiment settings. They are read by the data-loading cells and by
# ``run_cross_validation`` (which uses them to build the model folder name).

# Name of the CSV file (without extension) under ``data/``: 'webscope_r4' or 'reuters'.
dataset = 'reuters'
# If True, documents without any label are discarded.
# NOTE(review): the loading cell only applies this filter when dataset == 'webscope_r4'.
select_only_labelled = True
# If True, ``process_text`` is applied to every document with its default arguments
# (lower-casing, punctuation stripping and stopword removal; stemming stays off).
do_text_preprocessing = False

In [None]:
import pandas as pd
import numpy as np
import math, random, re, unidecode, os
import matplotlib.pyplot as plt
from scipy import stats
from gensim.parsing import preprocessing as pproc

from sklearn.metrics import coverage_error, label_ranking_average_precision_score, label_ranking_loss
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score
from sklearn.model_selection import KFold

import spacy
from spacy.pipeline import TextCategorizer

## Functions

Compute the proportion of samples for each unique label in ``labels_per_sample``.

Parameters:
- ``labels_per_sample``: array-like object, where the i-th element is the collection of labels assigned to the i-th sample

Returns:
- a dictionary, where keys are labels and values are real numbers in the 0-1 range

In [None]:
def calculate_label_frequencies(labels_per_sample):
    """Return, for every label, its number of occurrences divided by the number of samples.

    Params:
    - ``labels_per_sample``: array-like object where the i-th element is the
      collection of labels assigned to the i-th sample

    Returns:
    - a dictionary mapping each label to ``occurrences / len(labels_per_sample)``
      (empty when no label occurs at all)
    """
    flattened = [label for sample_labels in labels_per_sample for label in sample_labels]
    distinct_labels, counts = np.unique(flattened, return_counts=True)
    n_samples = len(labels_per_sample)

    return {label: count / n_samples for label, count in zip(distinct_labels, counts)}

Same as ``calculate_label_frequencies``, but the values are absolute numbers of occurrences of the label, instead of proportions.

In [None]:
def calculate_label_occurrences(labels_per_sample):
    """Return, for every label, the absolute number of times it occurs.

    Params:
    - ``labels_per_sample``: array-like object where the i-th element is the
      collection of labels assigned to the i-th sample

    Returns:
    - a dictionary mapping each label to its number of occurrences
    """
    flattened = [label for sample_labels in labels_per_sample for label in sample_labels]
    distinct_labels, counts = np.unique(flattened, return_counts=True)

    return dict(zip(distinct_labels, counts))

Normalized version of the coverage error (https://scikit-learn.org/stable/modules/generated/sklearn.metrics.coverage_error.html#sklearn.metrics.coverage_error).

The normalized value is obtained by the formula:

```normalized_cov_err = (cov_err - best_possible_cov_err) / (worst_possible_cov_err - best_possible_cov_err)```

In [None]:
def normalized_coverage_error(y_true, y_score):
    """Rescale scikit-learn's coverage error between its best and worst possible values.

    Params:
    - ``y_true``: one-hot encoded ground-truth labels, one row per sample
    - ``y_score``: predicted scores for all labels, one row per sample

    Returns:
    - ``(cov_err - best) / (worst - best)``, where ``best`` is the average number
      of true labels per sample and ``worst`` is the total number of labels
    """
    raw_error = coverage_error(y_true, y_score)
    # Best case: every true label is ranked before any other label.
    lower_bound = np.average([sum(row) for row in y_true])
    # Worst case: the whole label list has to be scanned.
    upper_bound = len(y_true[0])

    return (raw_error - lower_bound) / (upper_bound - lower_bound)

Helper function for ``avg_exposure_aggregate``. Computes the average exposure assigned by the ranking to the ground truth labels. Follows the definition of "exposure" by Singh et al., 2018.

Params:
- ``y_true``: ground-truth labels associated with a specific sample (one-hot sparse representation)
- ``y_score``: predicted scores for each label
- ``normalize``: if True, the exposure is normalized with the formula ``avg_exp = (avg_exp - worst_possible_exp) / (best_possible_exp-worst_possible_exp)``

Returns:
- a real value representing the average exposure of the ground-truth labels for one specific sample

In [None]:
def _avg_exposure_single(y_true, y_score, normalize=True):
    
    true_labels = [l for l,v in enumerate(y_true) if v]
    ranked_labels = sorted([(l,s) for l,s in enumerate(y_score)], reverse=True, key=lambda x:x[-1])
    
    avg_exp = 0
    for j,(l,s) in enumerate(ranked_labels):
        if l in true_labels:
            avg_exp += 1 / math.log2(2+j)
    
    if normalize:
        best_possible_exp = sum([(1 / math.log2(2+j)) for j in range(len(true_labels))])
        worst_possible_exp = sum([(1 / math.log2(2+j)) for j in range(len(ranked_labels)-1,len(ranked_labels)-len(true_labels)-1,-1)])
        
        try:
            avg_exp = (avg_exp - worst_possible_exp) / (best_possible_exp-worst_possible_exp)
        except ZeroDivisionError:
            avg_exp = 0
        
    return avg_exp

Computes, for each sample, the exposure assigned to the true labels by the rankings of labels. See also ``_avg_exposure_single``.

Params:
- ``y_true``: array-like object where the i-th element contains the ground-truth labels associated with the i-th sample, represented in a one-hot sparse vector
- ``y_score``: array-like object where the i-th element contains the predicted scores for all labels associated with the i-th sample

Returns:
- real value representing the exposure, averaged over all samples

In [None]:
def avg_exposure_aggregate(y_true, y_score):
    """Average the normalized per-sample exposure over all samples.

    Params:
    - ``y_true``: array-like object where the i-th element is the one-hot
      encoding of the ground-truth labels of the i-th sample
    - ``y_score``: array-like object where the i-th element contains the
      predicted scores of all labels for the i-th sample

    Returns:
    - the mean of ``_avg_exposure_single`` over the samples
    """
    total_exposure = 0
    for sample_truth, sample_scores in zip(y_true, y_score):
        total_exposure = total_exposure + _avg_exposure_single(sample_truth, sample_scores)

    return total_exposure / len(y_true)

Compute a variety of metrics given a ground truth and a series of predictions. The metrics included in the analysis are:
- mean imbalance ratio
- coefficient of variation of the imbalance ratio
- label ranking average precision
- label ranking loss
- normalized coverage error
- averaged exposure
- balanced accuracy
- precision
- recall
- f-score
- prediction bias coefficient (here ``fscore_correlation``).

See the paper "Evaluating the Prediction Bias Induced by Label Imbalance in Multi-label Classification" for reference on the metrics.

Params:
- ``true_labels``: array-like object where the i-th element contains the ground-truth labels associated with the i-th sample
- ``true_labels_sparse``: same as ``true_labels``, but the labels for each sample are represented through a sparse one-hot vector
- ``predicted_labels``: array-like object where the i-th element contains the predicted labels associated with the i-th sample
- ``predicted_scores``: array-like object where the i-th element contains the scores assigned to all labels associated with the i-th sample
- ``training_frequencies_dict``: dictionary where keys are labels and values are the proportion of samples in the training set associated with that label
- ``test_frequencies_dict``: dictionary where keys are labels and values are the proportion of samples in the test set associated with that label

Returns:
- a dictionary where keys are the names of the metrics and values are the obtained scores

In [None]:
def compute_metrics(true_labels, true_labels_sparse, predicted_labels, predicted_scores,
                    training_frequencies_dict, test_frequencies_dict):
    """Compute imbalance, ranking, classification and prediction-bias metrics for one test set.

    Params:
    - ``true_labels``: ground-truth labels per sample
    - ``true_labels_sparse``: same as ``true_labels``, one-hot encoded
    - ``predicted_labels``: predicted labels per sample
    - ``predicted_scores``: predicted scores of all labels per sample
    - ``training_frequencies_dict``: label -> proportion of training samples with that label
    - ``test_frequencies_dict``: label -> proportion of test samples with that label

    Returns:
    - a dictionary mapping metric names to the obtained scores

    Side effects:
    - ``plot_correlation`` is called three times (precision, recall, F-score),
      each call with its default ``print_plot=True``.
    """
    # Imbalance ratio of a label: frequency of the most frequent label over its own frequency.
    imbalance_ratios = [max(training_frequencies_dict.values()) / frequency
                        for frequency in training_frequencies_dict.values()]
    mean_ir = np.average(imbalance_ratios)

    metrics = {
        'meanIR': mean_ir,
        'CVIR': np.std(imbalance_ratios) / mean_ir,
        'lraps': label_ranking_average_precision_score(true_labels_sparse, predicted_scores),
        'lrl': label_ranking_loss(true_labels_sparse, predicted_scores),
        'cov_err': normalized_coverage_error(true_labels_sparse, predicted_scores),
        'avg_exp': avg_exposure_aggregate(true_labels_sparse, predicted_scores),
    }

    # Per-label scores are computed for labels present in both sets, ordered by
    # increasing training frequency.
    evaluated_labels = [
        label
        for label in sorted(training_frequencies_dict, key=training_frequencies_dict.get)
        if label in test_frequencies_dict
    ]
    training_frequencies = [training_frequencies_dict[label] for label in evaluated_labels]

    balanced_accuracies, precisions, recalls, fscores = [], [], [], []
    for label in evaluated_labels:
        # One-vs-rest view of the multi-label problem for the current label.
        expected = [1 if label in labels else 0 for labels in true_labels]
        predicted = [1 if label in labels else 0 for labels in predicted_labels]

        balanced_accuracies.append(balanced_accuracy_score(expected, predicted))
        precisions.append(precision_score(expected, predicted, zero_division=0))
        recalls.append(recall_score(expected, predicted, zero_division=0))
        fscores.append(f1_score(expected, predicted, zero_division=0))

    metrics['avg_balanced_accuracy'] = np.average(balanced_accuracies)
    metrics['avg_precision'] = np.average(precisions)
    metrics['avg_recall'] = np.average(recalls)
    metrics['avg_fscore'] = np.average(fscores)

    # Correlation between training frequency and per-label quality (prediction bias).
    for metric_name, per_label_scores, axis_label in (
        ('precision', precisions, 'Precision'),
        ('recall', recalls, 'Recall'),
        ('fscore', fscores, 'F-score'),
    ):
        correlation, asymptote = plot_correlation(training_frequencies, per_label_scores,
                                                  xlabel='Frequency in Training Set',
                                                  ylabel=axis_label,
                                                  label_names=evaluated_labels)
        metrics[metric_name + '_correlation'] = correlation
        metrics['asymptotic_' + metric_name] = asymptote

    return metrics

Draw a scatter plot of the values of ``metric_array_y`` (y-axis) against the values of ``metric_array_x`` (x-axis) and return Spearman's correlation between the two arrays, together with the expected value of y for x == 0, according to the interpolation function.

Params:
- ``metric_array_x``: array-like object with the values for the x-coordinates
- ``metric_array_y``: array-like object with the values for the y-coordinates
- ``polynomial_degree``: degree of the interpolation, representing the trend line (can be 1, 2 or 3)
- ``xlabel``: label for the x-axis
- ``ylabel``: label for the y-axis
- ``label_names``: array-like object where the i-th element contains the label to print for the point at ``(metric_array_x[i], metric_array_y[i])`` 

Returns:
- Spearman's correlation coefficient between ``metric_array_x`` and ``metric_array_y``
- expected value for y when x = 0

In [None]:
def plot_correlation(metric_array_x, metric_array_y, print_plot=True,
                     polynomial_degree=1, xlabel='', ylabel='', label_names=None):
    """Correlate two arrays, fit a polynomial trend and optionally draw a scatter plot.

    Params:
    - ``metric_array_x``: array-like object with the x-coordinates
    - ``metric_array_y``: array-like object with the y-coordinates
    - ``print_plot``: if True, show the scatter plot with the trend line
    - ``polynomial_degree``: degree of the fitted trend polynomial (1, 2 or 3)
    - ``xlabel``: label for the x-axis
    - ``ylabel``: label for the y-axis
    - ``label_names``: optional array-like object; its i-th element annotates
      the point ``(metric_array_x[i], metric_array_y[i])``

    Returns:
    - Spearman's correlation coefficient between the two arrays
    - value of the fitted polynomial at x = 0

    Raises:
    - ``ValueError`` if ``polynomial_degree`` is not 1, 2 or 3
    """
    if polynomial_degree not in (1, 2, 3):
        raise ValueError('polynomial_degree must be 1, 2 or 3, got ' + repr(polynomial_degree))
    if label_names is None:
        label_names = []

    # Fit only the requested degree: fitting higher degrees that are never used
    # triggers rank warnings when few points are available.
    trend = np.poly1d(np.polyfit(metric_array_x, metric_array_y, polynomial_degree))
    asymptotic_value = trend(0)

    correlation, p_value = stats.spearmanr(metric_array_x, metric_array_y)

    if print_plot:
        # rc settings only affect artists created afterwards, so set them first.
        plt.rc('axes', labelsize=24)    # fontsize of the x and y labels
        plt.rc('xtick', labelsize=24)    # fontsize of the tick labels
        plt.rc('ytick', labelsize=24)    # fontsize of the tick labels

        fig, ax = plt.subplots(figsize=(15,8))
        title = ('Spearman Correlation Coefficient: ' + str(round(correlation,2)))
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.scatter(metric_array_x, metric_array_y)
        for x, y, l in zip(metric_array_x, metric_array_y, label_names):
            ax.annotate(l, (x, y), fontsize=22)
        # Draw the fitted polynomial over sorted x values so that curves of
        # degree > 1 are not rendered as a zig-zag.
        sorted_x = np.sort(np.asarray(metric_array_x, dtype=float))
        ax.plot(sorted_x, trend(sorted_x), "r--")
        plt.show()

    return correlation, asymptotic_value

Apply standard pre-processing techniques to a text and return the normalized string.

In [None]:
def process_text(string, lowercase=True, remove_stopwords=True, stemming=False):
    """Normalize a text and return the resulting string.

    Params:
    - ``string``: the text to normalize
    - ``lowercase``: if True, convert the text to lower case
    - ``remove_stopwords``: if True, apply gensim's ``remove_stopwords``
    - ``stemming``: if True, apply gensim's ``stem_text``

    Returns:
    - the normalized text, stripped of leading and trailing whitespace
    """
    
    string = unidecode.unidecode(string)
    if lowercase:
        string = string.lower()
    # Collapse dotted abbreviations ("u.s." -> "us") before punctuation is stripped.
    # The pattern only matches lower-case letters, so upper-case abbreviations are
    # only handled when ``lowercase`` is True.
    # NOTE(review): the pattern also matches a single letter followed by a dot,
    # e.g. the last letter of a word that ends a sentence.
    abbreviations = re.findall(r'(?:[a-z]\.)+', string)
    for abbr in abbreviations:
        # Each match is replaced everywhere in the (already modified) string, so the
        # outcome depends on the order in which the matches are processed.
        string = string.replace(abbr, abbr.replace('.',''))
    string = pproc.strip_punctuation2(string)
    if remove_stopwords:
        string = pproc.remove_stopwords(string)
    if stemming:
        string = pproc.stem_text(string)
    string = string.strip()
    return string

Prepare the samples in the appropriate format for the spaCy's TextCategorizer.

Params:
- ``samples``: list of (text, labels) tuples
- ``unique_labels``: set of all possible unique labels

Returns:
- array-like object with samples ready to be processed by spaCy's TextCategorizer

In [None]:
def prepare_cats(samples, unique_labels):
    """Convert (text, labels) samples into (text, {'cats': {...}}) training tuples.

    Params:
    - ``samples``: iterable of (text, labels) pairs
    - ``unique_labels``: collection of all possible labels

    Returns:
    - a list of ``(text, {'cats': {label: bool}})`` tuples, where the boolean
      tells whether the label is assigned to the sample
    """
    return [
        (text, {'cats': {candidate: (candidate in labels) for candidate in unique_labels}})
        for text, labels in samples
    ]

Convert a list of lists of labels (one per each sample) into an array-like object representing the labels associated with each sample in a one-hot sparse encoding.

Params:
- ``labels_per_sample``: list of lists of labels, one per sample
- ``unique_labels``: ordered collection (e.g. list or tuple) of all possible unique labels; the position of a label determines its column in the encoding

Returns:
- an array-like sparse representation of labels per sample, in a one-hot encoding fashion

In [None]:
def get_sparse_label_representations(labels_per_sample, unique_labels):
    """One-hot encode the labels of every sample.

    Params:
    - ``labels_per_sample``: list of collections of labels, one per sample
    - ``unique_labels``: collection of all possible labels; the iteration order
      defines the columns of the encoding (labels must be hashable)

    Returns:
    - a list with one numpy vector of length ``len(unique_labels)`` per sample,
      holding 1 in the columns of the sample's labels and 0 elsewhere. Labels
      that are not in ``unique_labels`` are ignored.
    """
    ordered_labels = list(unique_labels)
    # Map each label to its column once, instead of scanning the label list for
    # every single label; the first occurrence wins, as with ``list.index``.
    column_of = {}
    for column, label in enumerate(ordered_labels):
        column_of.setdefault(label, column)

    sparse_labels_per_sample = []
    for labels in labels_per_sample:
        sparse_labels = np.zeros(len(ordered_labels))
        for label in labels:
            column = column_of.get(label)
            if column is not None:
                sparse_labels[column] = 1
        sparse_labels_per_sample.append(sparse_labels)
    return sparse_labels_per_sample

Train a text classifier on the provided ``training_samples``.

Params:
- ``nlp``: a spaCy language model already initialized (see https://v2.spacy.io/api/language)
- ``training_samples``: array-like object containing samples as returned by ``prepare_cats``
- ``n_iter``: number of iterations (epochs) to train the spaCy model
- ``batch_size``: number of samples to be processed in the same weight-update of the model

Returns:
- a trained ``TextCategorizer`` model

See https://v2.spacy.io/api/textcategorizer for further reference.

In [None]:
def train_classifier(nlp, training_samples, n_iter=10, batch_size=8):
    """Add a multi-label ``textcat`` pipe to ``nlp`` and train it on ``training_samples``.

    Params:
    - ``nlp``: an initialized spaCy language model; a ``textcat`` pipe is
      appended to it
    - ``training_samples``: iterable of (text, labels) pairs
    - ``n_iter``: number of passes (epochs) over the training samples
    - ``batch_size``: number of samples per model update

    Returns:
    - the trained ``textcat`` pipe of ``nlp``
    """
    # Non-exclusive classes (multi-label) with the "ensemble" architecture.
    textcat = nlp.create_pipe("textcat", config={"exclusive_classes": False, "architecture": "ensemble"})
    nlp.add_pipe(textcat, last=True)

    # Register every label that occurs in the training data.
    observed_labels = []
    for _, sample_labels in training_samples:
        observed_labels.extend(sample_labels)
    for label in np.unique(observed_labels):
        textcat.add_label(label)

    annotated_samples = prepare_cats(training_samples, textcat.labels)

    spacy.util.fix_random_seed()
    frozen_pipes = [name for name in nlp.pipe_names if name != 'textcat']
    with nlp.disable_pipes(*frozen_pipes):  # only the text categorizer is updated
        nlp.begin_training()
        for _ in range(n_iter):
            random.shuffle(annotated_samples)
            for batch in spacy.util.minibatch(annotated_samples, size=batch_size):
                batch_texts = [text for text, _ in batch]
                batch_annotations = [annotation for _, annotation in batch]
                nlp.update(docs=batch_texts, golds=batch_annotations)

    return nlp.get_pipe('textcat')

Apply a previously trained ``TextCategorizer`` model on a set of ``test_samples``.

Params:
- ``nlp``: a spaCy language model already initialized (see https://v2.spacy.io/api/language)
- ``classifier``: previously trained ``TextCategorizer`` model, as returned by ``train_classifier``
- ``test_samples``: list of (text,labels) tuples, one for each sample in the test set
- ``confidence_threshold``: a real value between 0 and 1; a label is assigned to a sample if its predicted score is bigger than this value

Returns:
- ``predicted_scores``: array-like object where the i-th element contains a vector of probabilities values for all labels w.r.t. the i-th element in ``test_samples``
- ``predicted_labels``: list of lists of labels assigned to each sample in ``test_samples`` according to the ``predicted_scores`` and ``confidence_threshold``

In [None]:
def predict(nlp, classifier, test_samples, confidence_threshold=0.5):
    """Score every test sample and select the labels above a confidence threshold.

    Params:
    - ``nlp``: callable turning a text into a document accepted by ``classifier.predict``
    - ``classifier``: trained classifier exposing ``predict(docs)`` and ``labels``
    - ``test_samples``: list of (text, labels) tuples; only the text is used
    - ``confidence_threshold``: a label is assigned when its score is strictly
      greater than this value

    Returns:
    - ``predicted_scores``: the scores returned by ``classifier.predict``
    - ``predicted_labels``: list with the selected labels of every sample, in
      the order of ``classifier.labels``
    """
    test_docs = [nlp(text) for text, _ in test_samples]
    predicted_scores, _ = classifier.predict(test_docs)

    label_names = classifier.labels
    # Pair each label with its score in a single pass instead of collecting the
    # selected indices and searching them again for every label.
    predicted_labels = [
        [label for label, score in zip(label_names, sample_scores) if score > confidence_threshold]
        for sample_scores in predicted_scores
    ]

    return predicted_scores, predicted_labels

Run a k-fold cross validation on all ``samples``, which will be iteratively split into training and test sets.

Params:
- ``samples``: a list of (text, labels) tuples, with all documents to be classified
- ``n_folds``: number of folds for the cross validation
- ``n_training_iter``: corresponds to ``n_iter`` in ``train_classifier``
- ``training_batch_size``: corresponds to ``batch_size`` in ``train_classifier``
- ``confidence_threshold``: see ``predict``
- ``load_classifiers``: load already-trained classifiers (for all folds separately), if previously saved (the path is hard-coded)
- ``save_classifiers``: save the trained classifiers for all folds separately (the path is hard-coded)

Returns:
- a 2-level dictionary with the following structure:
    - in the first level, the keys are 'fold_1', 'fold_2', etc, and 'global'; the latter stores the values of the metrics averaged over all folds
    - in the second level, the keys are the metrics mentioned in ``compute_metrics`` and the values are the obtained scores
    
    Examples: ``metrics['fold_2']['avg_precision']`` or ``metrics['global']['avg_recall']``

In [None]:
def run_cross_validation(samples, n_folds=5, n_training_iter=10, training_batch_size=8, confidence_threshold=0.5,
                         load_classifiers=False, save_classifiers=False):
    """Run a k-fold cross validation over ``samples`` and collect the metrics of every fold.

    The module-level settings ``dataset``, ``select_only_labelled`` and
    ``do_text_preprocessing`` are read to build the name of the folder (under
    ``models``) where the classifiers of the folds are loaded from / saved to.

    Params:
    - ``samples``: list of (text, labels) tuples with all documents
    - ``n_folds``: number of folds
    - ``n_training_iter``: ``n_iter`` passed to ``train_classifier``
    - ``training_batch_size``: ``batch_size`` passed to ``train_classifier``
    - ``confidence_threshold``: passed to ``predict``
    - ``load_classifiers``: load previously saved classifiers instead of training
    - ``save_classifiers``: save the classifiers trained in this run (ignored
      when ``load_classifiers`` is True, since nothing is trained)

    Returns:
    - a dictionary with one entry per fold ('fold_1', 'fold_2', ...) holding the
      output of ``compute_metrics``, plus a 'global' entry mapping every metric
      to a ``(mean, standard deviation)`` tuple computed over the folds

    Raises:
    - ``FileNotFoundError`` if ``load_classifiers`` is True and the model folder
      does not exist
    """
    models_dir = None
    if load_classifiers or save_classifiers:
        folder_name = dataset+'_'
        if select_only_labelled:
            folder_name += 'only_labelled_'
        if do_text_preprocessing:
            folder_name += 'preprocessed_'
        folder_name += ('n_folds_' + str(n_folds) + '_n_training_iter_' + str(n_training_iter) +
                        '_batch_size_' + str(training_batch_size))
        models_dir = os.path.join('models', folder_name)

    # Fail early, before any model is loaded, when there is nothing to load.
    if load_classifiers and not os.path.isdir(models_dir):
        raise FileNotFoundError('No saved classifiers found in ' + models_dir)

    samples = np.array(samples, dtype=object)
    kf = KFold(n_splits=n_folds)
    metrics = {}  # metrics of every fold, plus their aggregation under 'global'
    for fold_number, (train_index, test_index) in enumerate(kf.split(samples), start=1):
        # A fresh pipeline per fold: train_classifier adds a 'textcat' pipe to it.
        nlp = spacy.load("en_core_web_lg")
        print('\nFOLD n.', fold_number)
        training_samples = samples[train_index]
        test_samples = samples[test_index]
        training_frequencies_dict = calculate_label_frequencies([labels for text,labels in training_samples])
        test_frequencies_dict = calculate_label_frequencies([labels for text,labels in test_samples])

        if load_classifiers:
            print('Loading classifier...')
            text_classifier = nlp.create_pipe("textcat", config={"exclusive_classes": False, "architecture": "ensemble"})
            text_classifier.from_disk(os.path.join(models_dir, 'classifier_fold_'+str(fold_number)))
            print('Done.')
        else:
            print('Training classifier...')
            text_classifier = train_classifier(nlp, training_samples, n_iter=n_training_iter, batch_size=training_batch_size)
            print('Done.')
            if save_classifiers:
                # Also creates the parent 'models' folder, so that a finished
                # training is not lost because the folder was missing.
                os.makedirs(models_dir, exist_ok=True)
                text_classifier.to_disk(os.path.join(models_dir, 'classifier_fold_'+str(fold_number)))

        true_labels = [labels for descr,labels in test_samples]
        true_labels_sparse = get_sparse_label_representations(true_labels, text_classifier.labels)
        print('Predicting...')
        predicted_scores, predicted_labels = predict(nlp, text_classifier, test_samples,
                                                     confidence_threshold=confidence_threshold)
        print('Done.')

        metrics['fold_'+str(fold_number)] = compute_metrics(true_labels, true_labels_sparse, predicted_labels, predicted_scores,
                                                            training_frequencies_dict, test_frequencies_dict)

    fold_keys = list(metrics)
    metrics['global'] = {}
    for m in ['meanIR','CVIR','lraps','lrl','cov_err','avg_exp','avg_balanced_accuracy','avg_precision','avg_recall','avg_fscore','precision_correlation',
              'asymptotic_precision','recall_correlation','asymptotic_recall','fscore_correlation','asymptotic_fscore']:
        fold_values = [metrics[fold_key][m] for fold_key in fold_keys]
        metrics['global'][m] = (np.average(fold_values), np.std(fold_values))

    return metrics

## Execution

Read the dataset indicated by ``dataset`` (see above) and apply some basic cleaning.

In [None]:
# Load the selected dataset and turn the comma-separated label strings into lists.
data = pd.read_csv('data/'+dataset+'.csv')
data = data.fillna('')
if dataset == 'webscope_r4':
    # '\N' marks a missing value, 'Action and Adventure' is renamed to
    # 'Action/Adventure' and the '~Delete' marker is removed.
    data['labels'] = [
        '' if raw == '\\N'
        else raw.replace('Action and Adventure', 'Action/Adventure').replace('~Delete', '')
        for raw in data['labels']
    ]
# Empty items are dropped: removing '~Delete' can leave dangling commas, which
# would otherwise turn into empty-string labels.
data['labels'] = [[label for label in raw.split(',') if label] for raw in data['labels']]
if dataset == 'webscope_r4' and select_only_labelled:
    # Filter on the cleaned lists, so that documents whose only labels were
    # removed above are discarded too.
    data = data.loc[data['labels'].map(len) > 0]
data.head()

In [None]:
# Optionally normalize every document with process_text (default arguments).
if do_text_preprocessing:
    data['text'] = data['text'].map(process_text)
data.head()

Print information about the label distribution and plot a bar chart with the number of samples per label.

In [None]:
# Label statistics of the whole dataset.
n_labels_per_doc = [len(labels) for labels in data['labels']]
all_labels = []
for labels in data['labels']:
    all_labels.extend(labels)  # in place: repeated list concatenation is quadratic

# Computed once and reused by all the statistics below.
unique, counts = np.unique(all_labels, return_counts=True)
n_docs = len(data)
avg_labels_per_doc = np.average(n_labels_per_doc)

print('All labels:', unique)
print('Total n. of samples:', n_docs)
print('\nNumber of unique labels:', len(unique))
print('Cardinality (avg. n. of labels per doc):', round(avg_labels_per_doc,2))
print('Density (cardinality / |tot labels|):', round(avg_labels_per_doc/len(unique),2))
print('Max. n. of labels per doc:', round(max(n_labels_per_doc),2))
print('Min. n. of labels per doc:', round(min(n_labels_per_doc),2))

n_labeled_docs = sum(1 for n_labels in n_labels_per_doc if n_labels > 0)
labeled_percentage = 100 * n_labeled_docs / n_docs
print('Percentage of labeled docs:', round(labeled_percentage,2), '%')

# Bar chart of the 20 most frequent labels (percentage of documents per label).
label_frequencies = sorted(zip(unique, counts), key=lambda x : x[-1], reverse=True)[:20]
unique = [l for l,c in label_frequencies]
counts = [100*c/n_docs for l,c in label_frequencies]

plt.rc('axes', labelsize=14)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=14)    # fontsize of the tick labels
y = np.arange(len(unique))
fig, ax = plt.subplots(figsize = (12, 6))
ax.bar(y, counts, tick_label=['' for _ in unique])
# Keep the original 0-21 range, but grow it when a label is more frequent than
# 21%, so that no bar is clipped.
ax.set_ylim([0, max(21, 1.05 * max(counts, default=0))])
ax.set_xlabel('Labels')
ax.set_ylabel('Frequency (%)')
plt.show()

Prepare the samples.

In [None]:
# Pair every document with its labels: one (text, labels) tuple per row.
samples = list(zip(data['text'], data['labels']))
data['tuples'] = pd.Series(samples, index=data.index, dtype=object)

Print information about the imbalance ratio (IR) metrics (see the paper for references).

In [None]:
# Dataset-level imbalance metrics (see Zhu et al. 2018, Charte et al, 2015).
labels_of_samples = [labels for text,labels in samples]
label_frequencies = calculate_label_frequencies(labels_of_samples)
label_occurrences = calculate_label_occurrences(labels_of_samples)

occurrence_values = list(label_occurrences.values())
max_occurrence = max(occurrence_values)
min_occurrence = min(occurrence_values)

# Max IR: most frequent label against the least frequent one.
maxIR = max_occurrence / min_occurrence
print('MAX IR:', round(maxIR,2))

# IR of every label: occurrences of the most frequent label over its own occurrences.
IR_per_label = [max_occurrence / occurrences for occurrences in occurrence_values]

y = np.arange(len(IR_per_label))
fig, ax = plt.subplots(figsize = (12, 6))
ax.bar(y, sorted(IR_per_label, reverse=True))
ax.set_xlabel('Labels')
ax.set_ylabel('IR')
plt.show()

# Mean IR and its coefficient of variation.
meanIR = np.average(IR_per_label)
stdevIR = np.std(IR_per_label)
CVIR = stdevIR / meanIR
print('Mean IR:', meanIR)
print('CV IR:', CVIR)

# LRID, computed from the number of labels (C), the number of samples (N) and
# the occurrences of every label.
C = len(label_occurrences)
N = len(samples)
lrid = -2 * sum([occurrences * math.log(N / (C*occurrences)) for occurrences in occurrence_values])
print('LRID:', round(lrid,2))

Execute the cross validation. See ``run_cross_validation`` for reference about the parameters.

In [None]:
# 10-fold cross validation. With load_classifiers=True the classifiers of all folds
# are read from the model folder instead of being trained, so that folder must
# already exist from a previous run with save_classifiers=True and the same settings.
metrics = run_cross_validation(samples, n_folds=10, training_batch_size=8, n_training_iter=10, confidence_threshold=0.5,
                               save_classifiers=False, load_classifiers=True)

Print metrics.

In [None]:
# Build a table with one row per metric and one column per fold, plus a
# 'global' column formatted as "mean +- standard deviation".
metric_list = ['meanIR','CVIR','lraps','lrl','cov_err','avg_exp','avg_balanced_accuracy','avg_precision','avg_recall','avg_fscore','precision_correlation',
          'asymptotic_precision','recall_correlation','asymptotic_recall','fscore_correlation','asymptotic_fscore']
df_rows = []
for m in metric_list:
    r = {}
    for key, scores in metrics.items():
        if key == 'global':
            mean_value, std_value = scores[m][0], scores[m][1]
            r[key] = str(round(mean_value,2)) +' +- ' + str(round(std_value,2))
        else:
            r[key] = round(scores[m],2)
    df_rows.append(r)

metrics_df = pd.DataFrame(df_rows, index=metric_list)
metrics_df.head(20)