# Introduction

The goal of this assignment is to create a basic program that provides an overview of basic evaluation metrics (in particular, precision, recall, f-score and a confusion matrix) from documents provided in the conll format. You will need to implement the calculations for precision, recall and f-score yourself (i.e. do not use an existing module that spits them out). Make sure that your code can handle the situation where there are no true positives for a specific class.

This notebook provides functions for reading in conll structures with pandas and proposes a structure for calculating your evaluation metrics and producing the confusion matrix. Feel free to adjust the proposed structure if you see fit.

In [1]:
# libraries
import warnings
warnings.filterwarnings("ignore")

import csv
import sys
import numpy as np
import pandas as pd
# see tips & tricks on using defaultdict (remove when you do not use it)
from collections import defaultdict, Counter
# module for verifying output
from nose.tools import assert_equal

In [2]:
def extract_annotations(inputfile, annotationcolumn,column_name,delimiter='\t'):
    '''
    This function extracts annotations represented in the conll format from a file
    
    :param inputfile: the path to the conll file
    :param annotationcolumn: the name of the column in which the target annotation is provided
    :param delimiter: optional parameter to overwrite the default delimiter (tab)
    :type inputfile: string
    :type annotationcolumn: string
    :type delimiter: string
    :returns: the annotations as a list
    '''
    #https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
    
    conll_input = pd.read_csv(inputfile, sep=delimiter, on_bad_lines='skip',names=column_name)
    annotations = conll_input[annotationcolumn].tolist()
    return annotations

In [3]:
def extract_annotations_mini(inputfile, annotationcolumn,delimiter='\t'):
    '''
    This function extracts annotations represented in the conll format from a file
    
    :param inputfile: the path to the conll file
    :param annotationcolumn: the name of the column in which the target annotation is provided
    :param delimiter: optional parameter to overwrite the default delimiter (tab)
    :type inputfile: string
    :type annotationcolumn: string
    :type delimiter: string
    :returns: the annotations as a list
    '''
    #https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
    
    conll_input = pd.read_csv(inputfile, sep=delimiter, on_bad_lines='skip')
    annotations = conll_input[annotationcolumn].tolist()
    return annotations

In [4]:
extract_annotations_mini("datas/minigold.csv","token")

['The',
 'Computational',
 'Lexicology',
 'and',
 'Terminology',
 'Lab',
 'headed',
 'by',
 'Piek',
 'Vossen',
 'offers',
 'mutliple',
 'courses',
 'in',
 'NLP',
 '.']

In [5]:
goldannotations = extract_annotations_mini("datas/minigold.csv","gold")
machineannotations = extract_annotations_mini("datas/miniout1.csv","NER")

In [7]:
machineannotations

['O',
 'B-ORG',
 'I-ORG',
 'O',
 'B-ORG',
 'I-ORG',
 'O',
 'O',
 'B-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [8]:
results = {}

for i in machineannotations:
    results[i] = machineannotations.count(i)
    
print(results)

{'O': 10, 'B-ORG': 2, 'I-ORG': 2, 'B-PER': 1, 'I-PER': 1}


In [9]:
results = {}

for i in goldannotations:
    results[i] = goldannotations.count(i)
    
print(results)

{'O': 8, 'B-ORG': 2, 'I-ORG': 3, 'B-PER': 1, 'I-PER': 1, 'B-MISC': 1}


In [10]:
def obtain_counts(goldannotations, machineannotations):
    '''
    This function compares the gold annotations to machine output
    
    :param goldannotations: the gold annotations
    :param machineannotations: the output annotations of the system in question
    :type goldannotations: the type of the object created in extract_annotations
    :type machineannotations: the type of the object created in extract_annotations
    
    :returns: a countainer providing the counts for each predicted and gold class pair
    '''
    evaluation_counts = defaultdict(Counter)
    
    evaluation_counts['O']['O'] = 0
    evaluation_counts['O']['B-ORG'] = 0
    evaluation_counts['O']['I-ORG'] = 0
    evaluation_counts['O']['B-PER'] = 0
    evaluation_counts['O']['I-PER'] = 0
    evaluation_counts['O']['B-MISC'] = 0

    evaluation_counts['B-ORG']['O'] = 0
    evaluation_counts['B-ORG']['B-ORG'] = 0
    evaluation_counts['B-ORG']['I-ORG'] = 0
    evaluation_counts['B-ORG']['B-PER'] = 0
    evaluation_counts['B-ORG']['I-PER'] = 0
    evaluation_counts['B-ORG']['B-MISC'] = 0

    evaluation_counts['I-ORG']['O'] = 0
    evaluation_counts['I-ORG']['B-ORG'] = 0
    evaluation_counts['I-ORG']['I-ORG'] = 0
    evaluation_counts['I-ORG']['B-PER'] = 0
    evaluation_counts['I-ORG']['I-PER'] = 0
    evaluation_counts['I-ORG']['B-MISC'] = 0

    evaluation_counts['B-PER']['O'] = 0
    evaluation_counts['B-PER']['B-ORG'] = 0
    evaluation_counts['B-PER']['I-ORG'] = 0
    evaluation_counts['B-PER']['B-PER'] = 0
    evaluation_counts['B-PER']['I-PER'] = 0
    evaluation_counts['B-PER']['B-MISC'] = 0

    evaluation_counts['I-PER']['O'] = 0
    evaluation_counts['I-PER']['B-ORG'] = 0
    evaluation_counts['I-PER']['I-ORG'] = 0
    evaluation_counts['I-PER']['B-PER'] = 0
    evaluation_counts['I-PER']['I-PER'] = 0
    evaluation_counts['I-PER']['B-MISC'] = 0

    evaluation_counts['B-MISC']['O'] = 0
    evaluation_counts['B-MISC']['B-ORG'] = 0
    evaluation_counts['B-MISC']['I-ORG'] = 0
    evaluation_counts['B-MISC']['B-PER'] = 0
    evaluation_counts['B-MISC']['I-PER'] = 0
    evaluation_counts['B-MISC']['B-MISC'] = 0

    for gold_annotation, machine_annotation in zip(goldannotations, machineannotations):
        evaluation_counts[gold_annotation][machine_annotation] += 1
        
    return evaluation_counts  

In [11]:
evaluation_counts = obtain_counts(goldannotations,machineannotations)
evaluation_counts

defaultdict(collections.Counter,
            {'O': Counter({'O': 8,
                      'B-ORG': 0,
                      'I-ORG': 0,
                      'B-PER': 0,
                      'I-PER': 0,
                      'B-MISC': 0}),
             'B-ORG': Counter({'O': 0,
                      'B-ORG': 2,
                      'I-ORG': 0,
                      'B-PER': 0,
                      'I-PER': 0,
                      'B-MISC': 0}),
             'I-ORG': Counter({'O': 1,
                      'B-ORG': 0,
                      'I-ORG': 2,
                      'B-PER': 0,
                      'I-PER': 0,
                      'B-MISC': 0}),
             'B-PER': Counter({'O': 0,
                      'B-ORG': 0,
                      'I-ORG': 0,
                      'B-PER': 1,
                      'I-PER': 0,
                      'B-MISC': 0}),
             'I-PER': Counter({'O': 0,
                      'B-ORG': 0,
                      'I-ORG': 0,
                

In [12]:
def provide_confusion_matrix(evaluation_counts):
    '''
    Read in the evaluation counts and provide a confusion matrix for each class
    
    :param evaluation_counts: a container from which you can obtain the true positives, false positives and false negatives for each class
    :type evaluation_counts: type of object returned by obtain_counts
    
    :prints out a confusion matrix
    '''
    
    confusion_matrix = pd.DataFrame.from_dict({i: evaluation_counts[i] for i in evaluation_counts.keys()}, orient='index')
    #confusion_matrix = confusion_matrix.reindex(sorted(confusion_matrix.columns), axis=1)
    #confusion_matrix = confusion_matrix.reindex(sorted(confusion_matrix.columns), axis=0)
    confusion_matrix = confusion_matrix.fillna(0)
    #confusion_matrix = confusion_matrix.round(0).astype(int)

    return confusion_matrix

In [13]:
confusion_matrix = provide_confusion_matrix(evaluation_counts)
confusion_matrix

Unnamed: 0,O,B-ORG,I-ORG,B-PER,I-PER,B-MISC
O,8,0,0,0,0,0
B-ORG,0,2,0,0,0,0
I-ORG,1,0,2,0,0,0
B-PER,0,0,0,1,0,0
I-PER,0,0,0,0,1,0
B-MISC,1,0,0,0,0,0


In [14]:
def calculate_precision_recall_fscore(evaluation_counts):
    '''
    Calculate precision recall and fscore for each class and return them in a dictionary
    
    :param calculate_true_false: a tuple from which you can obtain the true positives, false positives and false negatives for each class
    :type calculate_true_false: type of object returned by obtain_counts
    
    :returns the precision, recall and f-score of each class in a container
    '''
        
    # recall = TP / (TP+FN)
    # precision = TP / (TP+FP)
    # f1_score = (2*precision*recall) / (precision+recall)
    # accuracy =  (TP+TN)/(TP+FP+FN+TN)
    
    conf_matrix = provide_confusion_matrix(evaluation_counts)
    
    sum_of_rows = conf_matrix.sum(axis=1)
    sum_of_columns = conf_matrix.sum(axis=0)
    #total_sum = conf_matrix.sum()
    
    # initialize the lists
    accuracy = []
    recall = []
    precision = []
    f1_score = []
    
    for i in range(len(conf_matrix)):
        precision.append((conf_matrix.iloc[i, i] / sum_of_columns[i]))
        recall.append(conf_matrix.iloc[i, i] / sum_of_rows[i])
        f1_score.append((2* (precision[i] * recall[i])) / (precision[i] + recall[i]))

    return precision,recall,f1_score

In [15]:
precision,recall,f1_score = calculate_precision_recall_fscore(evaluation_counts)
print("B-ORG	B-PER	I-ORG	I-PER	O")
print(precision)
print(recall)
print(f1_score)

B-ORG	B-PER	I-ORG	I-PER	O
[0.8, 1.0, 1.0, 1.0, 1.0, nan]
[1.0, 1.0, 0.6666666666666666, 1.0, 1.0, 0.0]
[0.888888888888889, 1.0, 0.8, 1.0, 1.0, nan]


In [16]:
def carry_out_evaluation(gold_annotations, systemfile, systemcolumn, delimiter='\t'):
    '''
    Carries out the evaluation process (from input file to calculating relevant scores)
    
    :param gold_annotations: list of gold annotations
    :param systemfile: path to file with system output
    :param systemcolumn: indication of column with relevant information
    :param delimiter: specification of formatting of file (default delimiter set to '\t')
    
    returns evaluation information for this specific system
    '''
    system_annotations = extract_annotations_mini(systemfile, systemcolumn, delimiter)
    evaluation_counts = obtain_counts(gold_annotations, system_annotations)
    provide_confusion_matrix(evaluation_counts)
    evaluation_outcome = calculate_precision_recall_fscore(evaluation_counts)
    
    return evaluation_outcome

In [17]:
def provide_output_tables(evaluations):
    '''
    Create tables based on the evaluation of various systems
    
    :param evaluations: the outcome of evaluating one or more systems
    '''
    #https:stackoverflow.com/questions/13575090/construct-pandas-dataframe-from-items-in-nested-dictionary
    evaluations_pddf = pd.DataFrame.from_dict({(i,j): evaluations[i][j]
                                              for i in evaluations.keys()
                                              for j in evaluations[i].keys()},
                                             orient='index')
    print(evaluations_pddf)
    print(evaluations_pddf.to_latex())

In [18]:
def run_evaluations(goldfile, goldcolumn, systems):
    '''
    Carry out standard evaluation for one or more system outputs
    
    :param goldfile: path to file with goldstandard
    :param goldcolumn: indicator of column in gold file where gold labels can be found
    :param systems: required information to find and process system output
    :type goldfile: string
    :type goldcolumn: integer
    :type systems: list (providing file name, information on tab with system output and system name for each element)
    
    :returns the evaluations for all systems
    '''
    evaluations = {}
    #not specifying delimiters here, since it corresponds to the default ('\t')
    gold_annotations = extract_annotations_mini(goldfile, goldcolumn)
    for system in systems:
        sys_evaluation = carry_out_evaluation(gold_annotations, system[0], system[1])
        evaluations[system[2]] = sys_evaluation
    return evaluations

In [19]:
def identify_evaluation_value(system, class_label, value_name, evaluations):
    '''
    Return the outcome of a specific value of the evaluation
    
    :param system: the name of the system
    :param class_label: the name of the class for which the value should be returned
    :param value_name: the name of the score that is returned
    :param evaluations: the overview of evaluations
    
    :returns the requested value
    '''
    return evaluations[system][class_label][value_name]

In [20]:
def create_system_information(system_information):
    '''
    Takes system information in the form that it is passed on through sys.argv or via a settingsfile
    and returns a list of elements specifying all the needed information on each system output file to carry out the evaluation.
    
    :param system_information is the input as from a commandline or an input file
    '''
    # https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
    systems_list = [system_information[i:i + 3] for i in range(0, len(system_information), 3)]
    return systems_list

In [21]:
def results(label_gold, label_model):
    eval_counts = obtain_counts(label_gold, label_model)
    confusion_matrix = provide_confusion_matrix(eval_counts)
    precision,recall,f1_score = calculate_precision_recall_fscore(eval_counts)
    return confusion_matrix

In [22]:
def main(my_args=None):
    '''
    A main function. This does not make sense for a notebook, but it is here as an example.
    sys.argv is a very lightweight way of passing arguments from the commandline to a script.
    '''
    
    if my_args is None:
        my_args = sys.argv
    
    system_info = create_system_information(my_args[2:])
    evaluations = run_evaluations(my_args[0], my_args[1], system_info)
    provide_output_tables(evaluations)
    check_eval = identify_evaluation_value('system1', 'O', 'f-score', evaluations)
    #if it does not work correctly, this assert statement will indicate that
    assert_equal("%.3f" % check_eval,"0.889")

----------------------
# Evaluation

## Linear Regression Model with Validation and Train Data

In [27]:
def print_information(filename,targetcolumn:str,predictedcolum:str,column_names:list):

    goldannotations = extract_annotations(filename,targetcolumn,column_names)
    machineannotations = extract_annotations(filename,predictedcolum,column_names)
    evaluation_counts = obtain_counts(goldannotations,machineannotations)
    confusion_matrix = provide_confusion_matrix(evaluation_counts)
    print("\n Confusion matrix : \n", confusion_matrix)

    evaluation_counts = obtain_counts(goldannotations,machineannotations)
    precision,recall,f1_score = calculate_precision_recall_fscore(evaluation_counts)

    print("\n Precision : \n", precision)
    print("\n Recall : \n",recall)
    print("\n F1 Score : \n", f1_score)

In [28]:
# Train Data Model
print_information("outputfile","gold","predicted",["token","tag1","tag2","gold","predicted"])


 Confusion matrix : 
              O  B-ORG  I-ORG  B-PER  I-PER  B-MISC     NaN  I-MISC  B-LOC  \
O       167258     21     38      3      6      15  2178.0    37.0     18   
B-ORG     1394   4191    133     22     21     138     0.0     5.0    385   
I-ORG     1494    130   1703     23     15      38     0.0    12.0    151   
B-PER     1915     25      7   4347    290       2     0.0     0.0     10   
I-PER     2165     34     10    544   1730       3     0.0     6.0     33   
B-MISC     653     70     12     19      1    2478     0.0    56.0    144   
B-LOC     1101    329     45     11      7      35     0.0     2.0   5578   
I-MISC     477     19     27      1      8      87     0.0   515.0     11   
I-LOC      323      8     71     14     13       5     0.0     3.0     48   

        I-LOC  
O           4  
B-ORG      32  
I-ORG     138  
B-PER       4  
I-PER       3  
B-MISC      5  
B-LOC      32  
I-MISC     10  
I-LOC     672  

 Precision : 
 [0.9461364407738432, 0.8682411

In [29]:
# Validation Data Model
print_information("devfile_model.csv","gold","predicted",["token","tag1","tag2","gold","predicted"])


 Confusion matrix : 
             O  B-ORG  I-ORG  B-PER  I-PER  B-MISC    NaN  I-MISC  B-LOC  I-LOC
O       42083      4     11    0.0      1      10  640.0    10.0      3    1.0
B-ORG     479    690     38    5.0     14      23    0.0     3.0     78   11.0
I-ORG     342     47    263    4.0      5      11    0.0     5.0     36   38.0
B-PER     843      2      1  873.0    104       3    0.0     0.0     16    0.0
I-PER     895      5      1  102.0    292       2    0.0     0.0      6    0.0
B-MISC    241     14      2    8.0      1     603    0.0    12.0     41    0.0
B-LOC     395    101      6    4.0      4      15    0.0     0.0   1305    7.0
I-MISC    150      7      2    2.0      4      27    0.0   145.0      2    7.0
I-LOC      69      1     13    0.0      6       2    0.0     3.0     13  150.0

 Precision : 
 [0.924962085412225, 0.7921928817451206, 0.7804154302670623, 0.874749498997996, 0.6774941995359629, 0.8663793103448276, 0.0, 0.8146067415730337, 0.008666666666666666]

 Rec