In [85]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, classification_report

df = pd.read_csv("results.tsv",delimiter="\t")

gold = df.Gold.values
predictions = df.Predicted.values
class_labels = sorted(set(df.Gold.values))

class Score(object):
    def __init__(self, class_label, p, r, f1):
        self.class_label = class_label
        self.precision = p
        self.recall = r
        self.f1 = f1
        
    def summary(self):
        print("{}\tPrecision\tRecall\tF1".format(self.class_label))
        print("{:.2f}\t{:.2f}\t{:.2f}".format(self.precision, self.recall, self.f1))

class Evaluator(object):
    def __init__(self, results_file, neg_class = "None"):
        self.results_file = results_file
        self.neg_class = neg_class
        self.smoothing = 0.00001
        
    def compute_class_score(self, class_label):
        df = pd.read_csv(self.results_file, delimiter="\t")
        if class_label != self.neg_class:
            # ignore the negative majority class
            df = df[df.Gold != self.neg_class]
        tp = len(df[(df.Gold == df.Predicted) & (df.Gold == class_label)])
        fp = len(df[(df.Predicted == class_label) & (df.Gold != class_label)])
        p = tp / (tp + fp + self.smoothing)
        fn = len(df[df.Gold == class_label]) - tp
        r = tp / (tp + fn + self.smoothing)
        f1 = (2 * p * r) / (p + r + self.smoothing)
        return Score(class_label, p, r, f1)
    
    def compute_micro_score(self):
        df = pd.read_csv(self.results_file, delimiter="\t")
        # ignore the negative majority class
        df = df[df.Gold != self.neg_class]
        # compute micro p, r, and f1
        tp = len(df[(df.Gold == df.Predicted)])
        fp = len(df[(df.Predicted != self.neg_class)]) - tp
        fn = len(df[df.Gold != self.neg_class]) - tp
        p = tp / (tp + fp)
        r = tp / (tp + fn)
        f1 = (2 * p * r) / (p + r)

        return Score("MICRO", p, r, f1)
        
    def compute_macro_score(self):
        df = pd.read_csv(self.results_file, delimiter="\t")
        scores = [self.compute_class_score(cl) for cl in set(df.Gold.values)]
        p = 0
        r = 0
        f1 = 0
        pos_classes = [s for s in scores if s.class_label != self.neg_class]
        for s in pos_classes:    
            p += s.precision
            r += s.recall
            f1 += s.f1
            
        return Score("MACRO ({} classes)".format(len(pos_classes)), p/len(pos_classes), r/len(pos_classes), f1/len(pos_classes))
            
    def generate_scores_df(self):
        df = pd.read_csv(self.results_file, delimiter="\t")
        header = ("Class", "Precision", "Recall", "F1")
        scores = [self.compute_class_score(cl) for cl in set(df.Gold.values)]
        data = [(s.class_label, s.precision, s.recall, s.f1) for s in scores]
        macro = self.compute_macro_score()
        micro = self.compute_micro_score()
        data += [(macro.class_label, macro.precision, macro.recall, macro.f1), (micro.class_label, micro.precision, micro.recall, micro.f1)]
        return pd.DataFrame(data, columns=header)

In [86]:
evaluator = Evaluator("results.tsv", "None")

macro= evaluator.compute_macro_score()
macro.summary()
micro = evaluator.compute_micro_score()
micro.summary()

MACRO (2 classes)	Precision	Recall	F1
0.50	0.10	0.17
MICRO	Precision	Recall	F1
1.00	0.18	0.31


In [87]:
evaluator.generate_scores_df().round(2)

Unnamed: 0,Class,Precision,Recall,F1
0,E1 precedes E2,1.0,0.2,0.34
1,,0.84,0.96,0.9
2,E2 precedes E1,0.0,0.0,0.0
3,MACRO (2 classes),0.5,0.1,0.17
4,MICRO,1.0,0.18,0.31
