# Introduction

The goal of this assignment is to create a basic program that provides an overview of basic evaluation metrics (in particular, precision, recall, f-score and a confusion matrix) from documents provided in the conll format. You will need to implement the calculations for precision, recall and f-score yourself (i.e. do not use an existing module that spits them out). Make sure that your code can handle the situation where there are no true positives for a specific class.

This notebook provides functions for reading in conll structures with pandas and proposes a structure for calculating your evaluation metrics and producing the confusion matrix. Feel free to adjust the proposed structure if you see fit.

In [1]:
# libraries

import sys
import numpy as np
import pandas as pd
# see tips & tricks on using defaultdict (remove when you do not use it)
from collections import defaultdict, Counter
# module for verifying output
from nose.tools import assert_equal

In [2]:
def extract_annotations(inputfile, annotationcolumn, delimiter='\t'):
    '''
    This function extracts annotations represented in the conll format from a file
    
    :param inputfile: the path to the conll file
    :param annotationcolumn: the name of the column in which the target annotation is provided
    :param delimiter: optional parameter to overwrite the default delimiter (tab)
    :type inputfile: string
    :type annotationcolumn: string
    :type delimiter: string
    :returns: the annotations as a list
    '''
    #https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
    conll_input = pd.read_csv(inputfile, sep=delimiter, on_bad_lines='skip')
    annotations = conll_input[annotationcolumn].tolist()
    return annotations

In [3]:
extract_annotations("datas/minigold.csv","token")

['The',
 'Computational',
 'Lexicology',
 'and',
 'Terminology',
 'Lab',
 'headed',
 'by',
 'Piek',
 'Vossen',
 'offers',
 'mutliple',
 'courses',
 'in',
 'NLP',
 '.']

In [4]:
goldannotations = extract_annotations("datas/minigold.csv","gold")
machineannotations = extract_annotations("datas/miniout1.csv","NER")

In [5]:
goldannotations

['O',
 'B-ORG',
 'I-ORG',
 'I-ORG',
 'B-ORG',
 'I-ORG',
 'O',
 'O',
 'B-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'B-MISC',
 'O']

In [6]:
machineannotations

['O',
 'B-ORG',
 'I-ORG',
 'O',
 'B-ORG',
 'I-ORG',
 'O',
 'O',
 'B-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [7]:
results = {}

for i in machineannotations:
    results[i] = machineannotations.count(i)
    
print(results)

{'O': 10, 'B-ORG': 2, 'I-ORG': 2, 'B-PER': 1, 'I-PER': 1}


In [8]:
results = {}

for i in goldannotations:
    results[i] = goldannotations.count(i)
    
print(results)

{'O': 8, 'B-ORG': 2, 'I-ORG': 3, 'B-PER': 1, 'I-PER': 1, 'B-MISC': 1}


In [9]:
def obtain_counts(goldannotations, machineannotations):
    '''
    This function compares the gold annotations to machine output
    
    :param goldannotations: the gold annotations
    :param machineannotations: the output annotations of the system in question
    :type goldannotations: the type of the object created in extract_annotations
    :type machineannotations: the type of the object created in extract_annotations
    
    :returns: a countainer providing the counts for each predicted and gold class pair
    '''
    evaluation_counts = defaultdict(Counter)

    for gold_annotation, machine_annotation in zip(goldannotations, machineannotations):
        evaluation_counts[gold_annotation][machine_annotation] += 1
        
    return evaluation_counts  

In [10]:
evaluation_counts = obtain_counts(goldannotations,machineannotations)
evaluation_counts

defaultdict(collections.Counter,
            {'O': Counter({'O': 8}),
             'B-ORG': Counter({'B-ORG': 2}),
             'I-ORG': Counter({'I-ORG': 2, 'O': 1}),
             'B-PER': Counter({'B-PER': 1}),
             'I-PER': Counter({'I-PER': 1}),
             'B-MISC': Counter({'O': 1})})

In [11]:
def provide_confusion_matrix(evaluation_counts):
    '''
    Read in the evaluation counts and provide a confusion matrix for each class
    
    :param evaluation_counts: a container from which you can obtain the true positives, false positives and false negatives for each class
    :type evaluation_counts: type of object returned by obtain_counts
    
    :prints out a confusion matrix
    '''
    
    # TIP: provide_output_tables does something similar, but those tables are assuming one additional nested layer
    # your solution can thus be a simpler version of the one provided in provide_output_tables below
    
    confusion_matrix = pd.DataFrame.from_dict({i: evaluation_counts[i] for i in evaluation_counts.keys()}, orient='index')
    confusion_matrix = confusion_matrix.reindex(sorted(confusion_matrix.columns), axis=1)
    confusion_matrix = confusion_matrix.reindex(sorted(confusion_matrix.columns), axis=0)
    confusion_matrix = confusion_matrix.fillna(0)
    confusion_matrix = confusion_matrix.round(0).astype(int)

    return confusion_matrix

In [12]:
confusion_matrix = provide_confusion_matrix(evaluation_counts)
confusion_matrix

Unnamed: 0,B-ORG,B-PER,I-ORG,I-PER,O
B-ORG,2,0,0,0,0
B-PER,0,1,0,0,0
I-ORG,0,0,2,0,1
I-PER,0,0,0,1,0
O,0,0,0,0,8


In [13]:
def calculate_precision_recall_fscore(evaluation_counts):
    '''
    Calculate precision recall and fscore for each class and return them in a dictionary
    
    :param calculate_true_false: a tuple from which you can obtain the true positives, false positives and false negatives for each class
    :type calculate_true_false: type of object returned by obtain_counts
    
    :returns the precision, recall and f-score of each class in a container
    '''
        
    # recall = TP / (TP+FN)
    # precision = TP / (TP+FP)
    # f1_score = (2*precision*recall) / (precision+recall)
    # accuracy =  (TP+TN)/(TP+FP+FN+TN)
    
    conf_matrix = provide_confusion_matrix(evaluation_counts)
    
    sum_of_rows = conf_matrix.sum(axis=1)
    sum_of_columns = conf_matrix.sum(axis=0)
    #total_sum = conf_matrix.sum()
    
    # initialize the lists
    accuracy = []
    recall = []
    precision = []
    f1_score = []
    
    for i in range(len(conf_matrix)):
        precision.append((conf_matrix.iloc[i, i] / sum_of_columns[i]))
        recall.append(conf_matrix.iloc[i, i] / sum_of_rows[i])
        f1_score.append((2* (precision[i] * recall[i])) / (precision[i] + recall[i]))

    return precision,recall,f1_score

In [14]:
precision,recall,f1_score = calculate_precision_recall_fscore(evaluation_counts)
print("B-ORG	B-PER	I-ORG	I-PER	O")
print(precision)
print(recall)
print(f1_score)

B-ORG	B-PER	I-ORG	I-PER	O
[1.0, 1.0, 1.0, 1.0, 0.8888888888888888]
[1.0, 1.0, 0.6666666666666666, 1.0, 1.0]
[1.0, 1.0, 0.8, 1.0, 0.9411764705882353]


In [15]:
def carry_out_evaluation(gold_annotations, systemfile, systemcolumn, delimiter='\t'):
    '''
    Carries out the evaluation process (from input file to calculating relevant scores)
    
    :param gold_annotations: list of gold annotations
    :param systemfile: path to file with system output
    :param systemcolumn: indication of column with relevant information
    :param delimiter: specification of formatting of file (default delimiter set to '\t')
    
    returns evaluation information for this specific system
    '''
    system_annotations = extract_annotations(systemfile, systemcolumn, delimiter)
    evaluation_counts = obtain_counts(gold_annotations, system_annotations)
    provide_confusion_matrix(evaluation_counts)
    evaluation_outcome = calculate_precision_recall_fscore(evaluation_counts)
    
    return evaluation_outcome

In [16]:
def provide_output_tables(evaluations):
    '''
    Create tables based on the evaluation of various systems
    
    :param evaluations: the outcome of evaluating one or more systems
    '''
    #https:stackoverflow.com/questions/13575090/construct-pandas-dataframe-from-items-in-nested-dictionary
    evaluations_pddf = pd.DataFrame.from_dict({(i,j): evaluations[i][j]
                                              for i in evaluations.keys()
                                              for j in evaluations[i].keys()},
                                             orient='index')
    print(evaluations_pddf)
    print(evaluations_pddf.to_latex())

In [17]:
def run_evaluations(goldfile, goldcolumn, systems):
    '''
    Carry out standard evaluation for one or more system outputs
    
    :param goldfile: path to file with goldstandard
    :param goldcolumn: indicator of column in gold file where gold labels can be found
    :param systems: required information to find and process system output
    :type goldfile: string
    :type goldcolumn: integer
    :type systems: list (providing file name, information on tab with system output and system name for each element)
    
    :returns the evaluations for all systems
    '''
    evaluations = {}
    #not specifying delimiters here, since it corresponds to the default ('\t')
    gold_annotations = extract_annotations(goldfile, goldcolumn)
    for system in systems:
        sys_evaluation = carry_out_evaluation(gold_annotations, system[0], system[1])
        evaluations[system[2]] = sys_evaluation
    return evaluations

In [18]:
def identify_evaluation_value(system, class_label, value_name, evaluations):
    '''
    Return the outcome of a specific value of the evaluation
    
    :param system: the name of the system
    :param class_label: the name of the class for which the value should be returned
    :param value_name: the name of the score that is returned
    :param evaluations: the overview of evaluations
    
    :returns the requested value
    '''
    return evaluations[system][class_label][value_name]

In [19]:
def create_system_information(system_information):
    '''
    Takes system information in the form that it is passed on through sys.argv or via a settingsfile
    and returns a list of elements specifying all the needed information on each system output file to carry out the evaluation.
    
    :param system_information is the input as from a commandline or an input file
    '''
    # https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
    systems_list = [system_information[i:i + 3] for i in range(0, len(system_information), 3)]
    return systems_list

In [20]:
def results(label_gold, label_model):
    eval_counts = obtain_counts(label_gold, label_model)
    confusion_matrix = provide_confusion_matrix(eval_counts)
    precision,recall,f1_score = calculate_precision_recall_fscore(eval_counts)
    return confusion_matrix

In [21]:
def main(my_args=None):
    '''
    A main function. This does not make sense for a notebook, but it is here as an example.
    sys.argv is a very lightweight way of passing arguments from the commandline to a script.
    '''
    
    if my_args is None:
        my_args = sys.argv
    
    system_info = create_system_information(my_args[2:])
    evaluations = run_evaluations(my_args[0], my_args[1], system_info)
    provide_output_tables(evaluations)
    check_eval = identify_evaluation_value('system1', 'O', 'f-score', evaluations)
    #if it does not work correctly, this assert statement will indicate that
    assert_equal("%.3f" % check_eval,"0.889")

In [22]:
#my_args = ['datas/minigold.csv','gold','datas/miniout1.csv','NER','system1']
#main(my_args)

In [23]:
evaluation_outcome = carry_out_evaluation(gold_annotations=goldannotations,systemfile="datas/miniout1.csv",
                                          systemcolumn="NER")
print(evaluation_outcome)

([1.0, 1.0, 1.0, 1.0, 0.8888888888888888], [1.0, 1.0, 0.6666666666666666, 1.0, 1.0], [1.0, 1.0, 0.8, 1.0, 0.9411764705882353])


In [24]:
evaluation_outcome = carry_out_evaluation(gold_annotations=goldannotations,systemfile="datas/miniout2.csv",
                                          systemcolumn="NER")
print(evaluation_outcome)

([1.0, 1.0, 0.8888888888888888], [0.6666666666666666, 1.0, 1.0], [0.8, 1.0, 0.9411764705882353])
