In [1]:
import json
import pandas as pd
from typing import List
import os
import string 
from itertools import combinations
import spacy
import pickle
import string 
import logging
import math

## paths

In [38]:
# Absolute, machine-specific locations of the second-batch inputs and outputs.
# NOTE(review): these only resolve on the original author's Windows machine;
# adjust before running elsewhere.

# CSV of MTurk alignments (read into a DataFrame by the loading cell).
indir_mturk = r"C:\Users\aviv\OneDrive\Desktop\controlled_reduction_production\cleaned_data\second_batch\mturk_alignments.csv"
# CSV of SuperPAL alignments; the file name is also inspected later to decide
# whether output files get a "_filtered" suffix.
indir_superpal = r"C:\Users\aviv\OneDrive\Desktop\controlled_reduction_production\cleaned_data\second_batch\for_annotator_parser_agreement\filtered_checkpoint_2000.csv"
# JSON holding the per-document spaCy tokenization.
indir_spacy_tokenization = r"C:\Users\aviv\OneDrive\Desktop\controlled_reduction_production\cleaned_data\second_batch\spacy_tokenization.json"
# Directory that receives the result JSON files.
outdir = r"C:\Users\aviv\OneDrive\Desktop\controlled_reduction_production\cleaned_data\second_batch"

## reading content

In [39]:
# Alignment tables, one DataFrame per annotation source.
mturk_df = pd.read_csv(indir_mturk)
superpal_df = pd.read_csv(indir_superpal)

# Per-document tokenization, parsed directly from the open file handle.
with open(indir_spacy_tokenization) as tokenization_file:
    spacy_jsons = json.load(tokenization_file)

## Calculate IAA

In [40]:
def span_to_str(subspans):
    """Serialize ``[low, high]`` pairs as ``"low, high;low, high"``.

    The two limits of a pair are joined with ``", "`` and consecutive pairs
    with ``";"``. An empty ``subspans`` produces the empty string.
    """
    return ";".join(f"{subspan[0]}, {subspan[1]}" for subspan in subspans)

In [43]:
def get_consecutive_subspans(idx_list):
    """Group a sorted list of integers into runs of consecutive values.

    Args:
        idx_list: integers in ascending order (as produced by sorting a set
            of indices). A gap is detected wherever the next value is larger
            than the current value plus one.

    Returns:
        A list of ``[low, high]`` pairs (both limits inclusive), one per run,
        in the order the runs appear. An empty input yields ``[]``; a single
        value ``v`` yields ``[[v, v]]``.
    """
    if not idx_list:
        return []

    idx_subspans = []
    # Track the start of the current run directly instead of using -1 as a
    # "no run open" sentinel, which would be confused with a real index of -1.
    run_start = idx_list[0]
    for prev_idx, next_idx in zip(idx_list, idx_list[1:]):
        if next_idx > prev_idx + 1:
            # Gap found: close the current run and open a new one.
            idx_subspans.append([run_start, prev_idx])
            run_start = next_idx
    # The last run is always still open when the loop ends.
    idx_subspans.append([run_start, idx_list[-1]])
    return idx_subspans

In [44]:
def get_full_idx_spans(span_list):
    """Merge several offset strings into one sorted, de-duplicated index set.

    Args:
        span_list: iterable of span strings of the form
            ``"min,max;min,max;..."``. Missing values (NaN, ``None``,
            ``pd.NA``) are skipped.

    Returns:
        A 3-tuple ``(all_idx, idx_subspans, idx_subspans_str)``:
        ``all_idx`` is the sorted list of every index covered by any span
        (both limits of each ``min,max`` pair are included);
        ``idx_subspans`` is ``all_idx`` grouped into ``[low, high]`` runs by
        ``get_consecutive_subspans``; ``idx_subspans_str`` is those runs
        serialized by ``span_to_str``.

    Raises:
        ValueError: if a ``min,max`` piece does not split into exactly two
            integer-parsable parts.
    """
    # Accumulate into a set: avoids the quadratic cost of repeatedly
    # concatenating lists and removes duplicates as we go.
    covered_idx = set()
    for span in span_list:
        # isinstance (rather than an exact type check) keeps str subclasses
        # out of the missing-value test; pd.isna also recognizes None / pd.NA,
        # on which math.isnan would raise TypeError.
        if not isinstance(span, str) and pd.isna(span):
            continue
        for subspan in span.split(";"):
            min_lim, max_lim = subspan.split(",")
            covered_idx.update(range(int(min_lim), int(max_lim) + 1))
    all_idx = sorted(covered_idx)

    idx_subspans = get_consecutive_subspans(all_idx)
    idx_subspans_str = span_to_str(idx_subspans)
    return all_idx, idx_subspans, idx_subspans_str


In [45]:
def idx_span_to_tkn_span(idx_subspans, curr_spacy_jsons, isSuperPal):
    """Convert character-offset spans into token-id spans.

    Args:
        idx_subspans: list of ``[start, end]`` character offsets. ``start``
            must equal a token's ``idx`` and ``end`` must equal a token's
            ``idx + len(text)``.
        curr_spacy_jsons: mapping of token id (string key) to a dict with at
            least ``'idx'`` (character offset) and ``'text'``.
        isSuperPal: when true and token ``'0'`` is a single space, all token
            offsets are shifted down by one before matching.

    Returns:
        A list of ``[first_token_id, last_token_id]`` pairs, using the keys
        of ``curr_spacy_jsons``, in the same order as ``idx_subspans``.

    Raises:
        IndexError: if a span boundary does not coincide with any token
            boundary (callers rely on this exception type).
    """
    subt_val = 0
    # Some documents start with a space that is tokenized as a separate token,
    # while SuperPAL removes it, so SuperPAL offsets are one lower.
    if isSuperPal and (curr_spacy_jsons['0']['text'] == ' '):
        subt_val = 1

    if not idx_subspans:
        return []

    # Build the boundary lookups once instead of rescanning every token for
    # every span. setdefault keeps the first token in iteration order when
    # several tokens share a boundary.
    start_to_tkn = {}
    end_to_tkn = {}
    for key, value in curr_spacy_jsons.items():
        tkn_start = value['idx'] - subt_val
        start_to_tkn.setdefault(tkn_start, key)
        end_to_tkn.setdefault(tkn_start + len(value['text']), key)

    tkn_subspans = []
    for idx_subspan in idx_subspans:
        span_start, span_end = idx_subspan[0], idx_subspan[1]
        if span_start not in start_to_tkn or span_end not in end_to_tkn:
            raise IndexError(
                f"span [{span_start}, {span_end}] is not aligned to token boundaries"
            )
        tkn_subspans.append([start_to_tkn[span_start], end_to_tkn[span_end]])
    return tkn_subspans

In [54]:
# POS tags whose tokens are ignored when scoring. An empty list keeps every
# non-punctuation token.
NON_CONTENT_POS = ["AUX", "DET", "ADP", "SCONJ", "CONJ", "CCONJ", "PUNCT", "SYM", "X", "SPACE"]


def _content_token_ids(tkn_subspans, curr_spacy_jsons):
    """Expand token spans into the set of token ids that count for scoring.

    A token is dropped when its text is found inside ``string.punctuation``
    (a substring test) or when its ``'pos'`` is listed in ``NON_CONTENT_POS``.
    """
    return {
        tkn_id
        for subspan in tkn_subspans
        for tkn_id in range(int(subspan[0]), int(subspan[1]) + 1)
        if curr_spacy_jsons[str(tkn_id)]['text'] not in string.punctuation
        and curr_spacy_jsons[str(tkn_id)]['pos'] not in NON_CONTENT_POS
    }


def calc_agreement(mturk_tkn_subspans, superpal_tkn_subspans, curr_spacy_jsons):
    """Return the intersection-over-union of the two scored token sets.

    Args:
        mturk_tkn_subspans: list of ``[first, last]`` token-id spans (inclusive).
        superpal_tkn_subspans: list of ``[first, last]`` token-id spans (inclusive).
        curr_spacy_jsons: mapping of token id (string) to a dict with
            ``'text'`` and ``'pos'``.

    Returns:
        ``|intersection| / |union|`` as a float. When both sets are empty
        after filtering the result is ``1.0`` (nothing to disagree on), which
        matches how ``calc_r_p_f1`` scores that case and avoids a
        ZeroDivisionError.
    """
    mturk_tkns = _content_token_ids(mturk_tkn_subspans, curr_spacy_jsons)
    superpal_tkns = _content_token_ids(superpal_tkn_subspans, curr_spacy_jsons)
    all_tkns = mturk_tkns | superpal_tkns
    if not all_tkns:
        return 1.0
    agreement = len(mturk_tkns & superpal_tkns) / len(all_tkns)
    return agreement


def calc_r_p_f1(mturk_tkn_subspans, superpal_tkn_subspans, curr_spacy_jsons):
    """Score the SuperPAL token set against the MTurk token set.

    Args:
        mturk_tkn_subspans: list of ``[first, last]`` token-id spans (inclusive).
        superpal_tkn_subspans: list of ``[first, last]`` token-id spans (inclusive).
        curr_spacy_jsons: mapping of token id (string) to a dict with
            ``'text'`` and ``'pos'``.

    Returns:
        A dict with keys ``"precision"`` (shared / SuperPAL tokens),
        ``"recall"`` (shared / MTurk tokens), ``"F1"``, ``"true_pos"``,
        ``"false_pos"`` and ``"false_neg"``. A ratio with an empty
        denominator is 0, except that precision and recall are both 1 when
        the two sets are empty.
    """
    mturk_tkns = _content_token_ids(mturk_tkn_subspans, curr_spacy_jsons)
    superpal_tkns = _content_token_ids(superpal_tkn_subspans, curr_spacy_jsons)

    tp = len(mturk_tkns & superpal_tkns)
    fp = len(superpal_tkns) - tp
    fn = len(mturk_tkns) - tp

    precision = tp / len(superpal_tkns) if len(superpal_tkns) != 0 else 0
    recall = tp / len(mturk_tkns) if len(mturk_tkns) != 0 else 0

    # Both sides selected nothing scorable: count it as full agreement.
    if len(superpal_tkns) == 0 and len(mturk_tkns) == 0:
        precision, recall = 1, 1

    f1 = 0 if precision + recall == 0 else 2*precision*recall / (precision + recall)
    return {"precision":precision, "recall":recall, "F1":f1, "true_pos":tp, "false_pos":fp, "false_neg":fn}

In [55]:
def calc_avg(scores):
    """Return the arithmetic mean of ``scores`` (sum divided by count).

    Raises ZeroDivisionError when ``scores`` is empty.
    """
    total = sum(scores)
    count = len(scores)
    return total / count

## per-sentence agreements

In [56]:
# Unique summaries that have MTurk alignments; each is scored independently.
summaries = list(set(mturk_df['summaryFile']))

# Per-summary results: summary name -> {sentence id -> scores, "average" -> ...}.
IAA_scores = {}
r_p_f1_scores = {}

docwise_IAA_scores = {}
docwise_r_p_f1_scores = {}

for summary in summaries:
    # One key form is used for every store, so the bookkeeping cannot
    # disagree with itself when a summary value is not already a string.
    summary_key = str(summary)

    curr_mturk_df = mturk_df[mturk_df['summaryFile'] == summary]
    curr_superpal_df = superpal_df[superpal_df['summaryFile'] == summary]
    # The tokenization is looked up via the document named in the first SuperPAL row.
    curr_spacy_doc_name = list(curr_superpal_df["documentFile"])[0]
    curr_spacy_jsons = spacy_jsons[curr_spacy_doc_name]
    SentCharIdx_list = sorted(set(curr_mturk_df["scuSentCharIdx"]))

    # Collected locally and only committed once every sentence has been scored.
    sent_IAA_scores = {}
    sent_r_p_f1_scores = {}

    for sent_idx in SentCharIdx_list:
        curr_idx_mturk_df = curr_mturk_df[curr_mturk_df["scuSentCharIdx"] == sent_idx]
        curr_idx_superpal_df = curr_superpal_df[curr_superpal_df["scuSentCharIdx"] == sent_idx]

        mturk_all_idx, mturk_idx_subspans, mturk_idx_subspans_str = get_full_idx_spans(list(curr_idx_mturk_df["docSpanOffsets"]))
        superpal_all_idx, superpal_idx_subspans, superpal_idx_subspans_str = get_full_idx_spans(list(curr_idx_superpal_df["docSpanOffsets"]))

        try:
            mturk_tkn_subspans = idx_span_to_tkn_span(mturk_idx_subspans, curr_spacy_jsons, False)
            superpal_tkn_subspans = idx_span_to_tkn_span(superpal_idx_subspans, curr_spacy_jsons, True)
        except IndexError:
            # A span that does not line up with token boundaries invalidates
            # the whole summary: report it and drop everything collected so far.
            doc_name = summary.split("_")[0]
            print(f"out of index for document {doc_name}")
            break

        curr_agreement = calc_agreement(mturk_tkn_subspans, superpal_tkn_subspans, curr_spacy_jsons)

        curr_sent_id = list(curr_idx_mturk_df['scuSentTknId'])[0]
        sent_IAA_scores[str(curr_sent_id)] = {"sent_idx":sent_idx, "agreement":curr_agreement}
        sent_r_p_f1_scores[str(curr_sent_id)] = calc_r_p_f1(mturk_tkn_subspans, superpal_tkn_subspans, curr_spacy_jsons)
    else:
        # Reached only when no sentence failed. Snapshot the per-sentence
        # values before adding the "average" entry to the same dicts.
        sent_agreements = [value["agreement"] for value in sent_IAA_scores.values()]
        sent_metrics = list(sent_r_p_f1_scores.values())

        sent_IAA_scores["average"] = calc_avg(sent_agreements)
        sent_r_p_f1_scores["average"] = {"precision":calc_avg([value["precision"] for value in sent_metrics]),
                                         "recall":calc_avg([value["recall"] for value in sent_metrics]),
                                         "F1":calc_avg([value["F1"] for value in sent_metrics]),
                                         "total_tp":sum([value["true_pos"] for value in sent_metrics]),
                                         "total_fp":sum([value["false_pos"] for value in sent_metrics]),
                                         "total_fn":sum([value["false_neg"] for value in sent_metrics])}

        IAA_scores[summary_key] = sent_IAA_scores
        r_p_f1_scores[summary_key] = sent_r_p_f1_scores

print("done")

out of index for document LA062290-0172
out of index for document LA062290-0172
out of index for document LA101990-0114
out of index for document LA112790-0154
out of index for document SJMN91-06256107
out of index for document LA092389-0092
out of index for document SJMN91-06236241
out of index for document LA101889-0066
out of index for document SJMN91-06236241
out of index for document LA092189-0225
out of index for document LA060590-0086
out of index for document SJMN91-06025182
out of index for document LA101889-0066
out of index for document LA060590-0086
out of index for document LA120890-0055
out of index for document LA101590-0066
out of index for document SJMN91-06025182
out of index for document LA112790-0154
out of index for document LA101590-0066
out of index for document SJMN91-06058250
out of index for document AP890119-0221
out of index for document AP890121-0050
out of index for document LA101990-0114
out of index for document AP890121-0050
out of index for document LA

## per doc agreements

In [57]:
# Same scoring as the per-sentence pass, but all spans of a summary are pooled
# before comparison, giving one score per summary.
summaries = list(set(mturk_df['summaryFile']))

docwise_IAA_scores, docwise_r_p_f1_scores = {}, {}

for summary in summaries:
    curr_mturk_df = mturk_df.loc[mturk_df['summaryFile'] == summary]
    curr_superpal_df = superpal_df.loc[superpal_df['summaryFile'] == summary]
    curr_spacy_doc_name = list(curr_superpal_df["documentFile"])[0]
    curr_spacy_jsons = spacy_jsons[curr_spacy_doc_name]
    SentCharIdx_list = sorted(set(curr_mturk_df["scuSentCharIdx"]))

    mturk_offsets = list(curr_mturk_df["docSpanOffsets"])
    superpal_offsets = list(curr_superpal_df["docSpanOffsets"])
    mturk_all_idx, mturk_idx_subspans, mturk_idx_subspans_str = get_full_idx_spans(mturk_offsets)
    superpal_all_idx, superpal_idx_subspans, superpal_idx_subspans_str = get_full_idx_spans(superpal_offsets)

    try:
        mturk_tkn_subspans = idx_span_to_tkn_span(mturk_idx_subspans, curr_spacy_jsons, False)
        superpal_tkn_subspans = idx_span_to_tkn_span(superpal_idx_subspans, curr_spacy_jsons, True)
    except IndexError:
        # Spans that do not align with token boundaries: report and skip the summary.
        doc_name = summary.split("_")[0]
        print(f"out of index for document {doc_name}")
    else:
        curr_agreement = calc_agreement(mturk_tkn_subspans, superpal_tkn_subspans, curr_spacy_jsons)
        docwise_IAA_scores[f'{summary}'] = {"agreement":curr_agreement}
        docwise_r_p_f1_scores[f'{summary}'] = calc_r_p_f1(mturk_tkn_subspans, superpal_tkn_subspans, curr_spacy_jsons)

print("done")

out of index for document LA062290-0172
out of index for document LA062290-0172
out of index for document LA101990-0114
out of index for document LA112790-0154
out of index for document SJMN91-06256107
out of index for document LA092389-0092
out of index for document SJMN91-06236241
out of index for document LA101889-0066
out of index for document SJMN91-06236241
out of index for document LA092189-0225
out of index for document LA060590-0086
out of index for document SJMN91-06025182
out of index for document LA101889-0066
out of index for document LA060590-0086
out of index for document LA120890-0055
out of index for document LA101590-0066
out of index for document SJMN91-06025182
out of index for document LA112790-0154
out of index for document LA101590-0066
out of index for document SJMN91-06058250
out of index for document AP890119-0221
out of index for document AP890121-0050
out of index for document LA101990-0114
out of index for document AP890121-0050
out of index for document LA

## save results

In [58]:
# File-name suffixes that record which configuration produced the results.
if NON_CONTENT_POS:
    only_content_words = "_only_content_words"
else:
    only_content_words = ""

if indir_superpal.endswith("filtered_checkpoint_2000.csv"):
    is_filter = "_filtered"
else:
    is_filter = ""

# Document-level scores first, then the per-sentence scores.
docwise_results_path = os.path.join(outdir,f"IAA_silver_results_docwise{only_content_words}{is_filter}.json")
with open(docwise_results_path, "w") as outfile:
    serialized_docwise = json.dumps(docwise_r_p_f1_scores)
    outfile.write(serialized_docwise)

sentwise_results_path = os.path.join(outdir,f"IAA_silver_results{only_content_words}{is_filter}.json")
with open(sentwise_results_path, "w") as outfile:
    serialized_sentwise = json.dumps(r_p_f1_scores)
    outfile.write(serialized_sentwise)

## IAA total average

In [45]:
# Mean of the per-summary sentence-level averages.
sent_level_averages = [elem['average'] for elem in IAA_scores.values()]
per_sent_IAA_avg = sum(sent_level_averages) / len(sent_level_averages)

# Mean of the per-summary document-level agreements.
doc_level_agreements = [elem['agreement'] for elem in docwise_IAA_scores.values()]
per_doc_IAA_avg = sum(doc_level_agreements) / len(doc_level_agreements)

print(f"per sentence agreement avg:{per_sent_IAA_avg}\nper doc agreement avg:{per_doc_IAA_avg}")

per sentence agreement avg:0.5067591300317594
per doc agreement avg:0.5317787608066044


## Precision, Recall, F1 micro and macro average

In [59]:
# per sent
# Macro: mean over summaries of each summary's own sentence-level average.
sent_average_entries = [elem['average'] for elem in r_p_f1_scores.values()]
per_sent_macro_p = calc_avg([entry['precision'] for entry in sent_average_entries])
per_sent_macro_r = calc_avg([entry['recall'] for entry in sent_average_entries])
per_sent_macro_F1 = calc_avg([entry['F1'] for entry in sent_average_entries])

# Micro: pool the raw counts over all summaries, then compute the ratios once.
per_sent_full_tp = sum([entry['total_tp'] for entry in sent_average_entries])
per_sent_full_fp = sum([entry['total_fp'] for entry in sent_average_entries])
per_sent_full_fn = sum([entry['total_fn'] for entry in sent_average_entries])

per_sent_micro_p = per_sent_full_tp / (per_sent_full_tp + per_sent_full_fp)
per_sent_micro_r = per_sent_full_tp / (per_sent_full_tp + per_sent_full_fn)
per_sent_micro_F1 = 2*per_sent_micro_p*per_sent_micro_r / (per_sent_micro_p + per_sent_micro_r)

# per doc
# No try/except around the macro precision: a failure here must surface
# immediately instead of leaving per_doc_macro_p undefined or stale from an
# earlier run of this cell.
doc_entries = list(docwise_r_p_f1_scores.values())
per_doc_macro_p = calc_avg([entry['precision'] for entry in doc_entries])
per_doc_macro_r = calc_avg([entry['recall'] for entry in doc_entries])
per_doc_macro_F1 = calc_avg([entry['F1'] for entry in doc_entries])

per_doc_full_tp = sum([entry['true_pos'] for entry in doc_entries])
per_doc_full_fp = sum([entry['false_pos'] for entry in doc_entries])
per_doc_full_fn = sum([entry['false_neg'] for entry in doc_entries])

per_doc_micro_p = per_doc_full_tp / (per_doc_full_tp + per_doc_full_fp)
per_doc_micro_r = per_doc_full_tp / (per_doc_full_tp + per_doc_full_fn)
per_doc_micro_F1 = 2*per_doc_micro_p*per_doc_micro_r / (per_doc_micro_p + per_doc_micro_r)

# Sentence-level figures are computed above but intentionally not printed.
# print(f"per sentence:\nmacro_p:{per_sent_macro_p}\nmacro_r:{per_sent_macro_r}\nmacro_F1:{per_sent_macro_F1}\n")

# print(f"per sentence:\nmicro_p:{per_sent_micro_p}\nmicro_r:{per_sent_micro_r}\nmicro_F1:{per_sent_micro_F1}\n")

print(f"per document:\nmacro_p:{per_doc_macro_p}\nmacro_r:{per_doc_macro_r}\nmacro_F1:{per_doc_macro_F1}")

print(f"per document:\nmicro_p:{per_doc_micro_p}\nmicro_r:{per_doc_micro_r}\nmicro_F1:{per_doc_micro_F1}")


# print(f"per document:\nmicro_p:{per_doc_macro_p}\nmicro_r:{per_doc_macro_r}\nmicro_F1:{per_doc_macro_F1}")



per document:
macro_p:0.7686967026110518
macro_r:0.640294414640495
macro_F1:0.6856260378517571
per document:
micro_p:0.759537372691179
micro_r:0.6306435430700874
micro_F1:0.6891151135473768


# Full data (2001+2002) Agreement Calculation

#### All tokens (excluding punctuation)

In [60]:
# Merge the saved all-token results of both batches into one dict per granularity.
indir = r"C:\Users\aviv\OneDrive\Desktop\controlled_reduction_production\cleaned_data"
if indir_superpal.endswith("filtered_checkpoint_2000.csv"):
    is_filter = "_filtered"
else:
    is_filter = ""
subdirs = ["first_batch", "second_batch"]

total_data_sentwise, total_data_docwise = {}, {}

for subdir in subdirs:
    batch_dir = os.path.join(indir, subdir)
    with open(os.path.join(batch_dir, f"IAA_silver_results{is_filter}.json"), 'r') as f:
        data_sentwise = json.load(f)
    with open(os.path.join(batch_dir, f"IAA_silver_results_docwise{is_filter}.json"), 'r') as f:
        data_docwise = json.load(f)

    # Warn before merging if a key is already present from an earlier batch,
    # because update() would silently overwrite it.
    if not set(data_sentwise).isdisjoint(total_data_sentwise):
        print("same doc in both (sentwise)!")
    if not set(data_docwise).isdisjoint(total_data_docwise):
        print("same doc in both (docwise)!")

    total_data_sentwise.update(data_sentwise)
    total_data_docwise.update(data_docwise)

print("done")

done


In [62]:
# per sent
# Macro: mean over summaries of each summary's own sentence-level average.
sent_average_entries = [elem['average'] for elem in total_data_sentwise.values()]
per_sent_macro_p = calc_avg([entry['precision'] for entry in sent_average_entries])
per_sent_macro_r = calc_avg([entry['recall'] for entry in sent_average_entries])
per_sent_macro_F1 = calc_avg([entry['F1'] for entry in sent_average_entries])

# Micro: pool the raw counts over all summaries, then compute the ratios once.
per_sent_full_tp = sum(entry['total_tp'] for entry in sent_average_entries)
per_sent_full_fp = sum(entry['total_fp'] for entry in sent_average_entries)
per_sent_full_fn = sum(entry['total_fn'] for entry in sent_average_entries)

per_sent_micro_p = per_sent_full_tp / (per_sent_full_tp + per_sent_full_fp)
per_sent_micro_r = per_sent_full_tp / (per_sent_full_tp + per_sent_full_fn)
per_sent_micro_F1 = 2*per_sent_micro_p*per_sent_micro_r / (per_sent_micro_p + per_sent_micro_r)

# per doc
doc_entries = list(total_data_docwise.values())
per_doc_macro_p = calc_avg([entry['precision'] for entry in doc_entries])
per_doc_macro_r = calc_avg([entry['recall'] for entry in doc_entries])
per_doc_macro_F1 = calc_avg([entry['F1'] for entry in doc_entries])

per_doc_full_tp = sum(entry['true_pos'] for entry in doc_entries)
per_doc_full_fp = sum(entry['false_pos'] for entry in doc_entries)
per_doc_full_fn = sum(entry['false_neg'] for entry in doc_entries)

per_doc_micro_p = per_doc_full_tp / (per_doc_full_tp + per_doc_full_fp)
per_doc_micro_r = per_doc_full_tp / (per_doc_full_tp + per_doc_full_fn)
per_doc_micro_F1 = 2*per_doc_micro_p*per_doc_micro_r / (per_doc_micro_p + per_doc_micro_r)

print("Agreement over all tokens (excluding punctuation)")

# Sentence-level figures are computed above but intentionally not printed.
# print(f"per sentence:\nmacro_p:{per_sent_macro_p}\nmacro_r:{per_sent_macro_r}\nmacro_F1:{per_sent_macro_F1}\n")

# print(f"per sentence:\nmicro_p:{per_sent_micro_p}\nmicro_r:{per_sent_micro_r}\nmicro_F1:{per_sent_micro_F1}\n")

print(f"per document:\nmacro_p:{per_doc_macro_p}\nmacro_r:{per_doc_macro_r}\nmacro_F1:{per_doc_macro_F1}")

print(f"per document:\nmicro_p:{per_doc_micro_p}\nmicro_r:{per_doc_micro_r}\nmicro_F1:{per_doc_micro_F1}")


Agreement over all tokens (excluding punctuation)
per document:
macro_p:0.7408858889226297
macro_r:0.60041122635158
macro_F1:0.6470388731056482
per document:
micro_p:0.735680098811178
micro_r:0.5880973538704581
micro_F1:0.653662016269531


#### Only Content tokens

In [63]:
# Merge the saved content-token-only results of both batches into one dict per granularity.
indir = r"C:\Users\aviv\OneDrive\Desktop\controlled_reduction_production\cleaned_data"
if indir_superpal.endswith("filtered_checkpoint_2000.csv"):
    is_filter = "_filtered"
else:
    is_filter = ""
subdirs = ["first_batch", "second_batch"]

total_data_sentwise, total_data_docwise = {}, {}

for subdir in subdirs:
    batch_dir = os.path.join(indir, subdir)
    with open(os.path.join(batch_dir, f"IAA_silver_results_only_content_words{is_filter}.json"), 'r') as f:
        data_sentwise = json.load(f)
    with open(os.path.join(batch_dir, f"IAA_silver_results_docwise_only_content_words{is_filter}.json"), 'r') as f:
        data_docwise = json.load(f)

    # Warn before merging if a key is already present from an earlier batch,
    # because update() would silently overwrite it.
    if not set(data_sentwise).isdisjoint(total_data_sentwise):
        print("same doc in both (sentwise)!")
    if not set(data_docwise).isdisjoint(total_data_docwise):
        print("same doc in both (docwise)!")

    total_data_sentwise.update(data_sentwise)
    total_data_docwise.update(data_docwise)

print("done")

done


In [64]:
# per sent
# Macro: mean over summaries of each summary's own sentence-level average.
sent_average_entries = [elem['average'] for elem in total_data_sentwise.values()]
per_sent_macro_p = calc_avg([entry['precision'] for entry in sent_average_entries])
per_sent_macro_r = calc_avg([entry['recall'] for entry in sent_average_entries])
per_sent_macro_F1 = calc_avg([entry['F1'] for entry in sent_average_entries])

# Micro: pool the raw counts over all summaries, then compute the ratios once.
per_sent_full_tp = sum([entry['total_tp'] for entry in sent_average_entries])
per_sent_full_fp = sum([entry['total_fp'] for entry in sent_average_entries])
per_sent_full_fn = sum([entry['total_fn'] for entry in sent_average_entries])

per_sent_micro_p = per_sent_full_tp / (per_sent_full_tp + per_sent_full_fp)
per_sent_micro_r = per_sent_full_tp / (per_sent_full_tp + per_sent_full_fn)
per_sent_micro_F1 = 2*per_sent_micro_p*per_sent_micro_r / (per_sent_micro_p + per_sent_micro_r)

# per doc
doc_entries = list(total_data_docwise.values())
per_doc_macro_p = calc_avg([entry['precision'] for entry in doc_entries])
per_doc_macro_r = calc_avg([entry['recall'] for entry in doc_entries])
per_doc_macro_F1 = calc_avg([entry['F1'] for entry in doc_entries])

per_doc_full_tp = sum([entry['true_pos'] for entry in doc_entries])
per_doc_full_fp = sum([entry['false_pos'] for entry in doc_entries])
per_doc_full_fn = sum([entry['false_neg'] for entry in doc_entries])

per_doc_micro_p = per_doc_full_tp / (per_doc_full_tp + per_doc_full_fp)
per_doc_micro_r = per_doc_full_tp / (per_doc_full_tp + per_doc_full_fn)
per_doc_micro_F1 = 2*per_doc_micro_p*per_doc_micro_r / (per_doc_micro_p + per_doc_micro_r)

# These totals come from the "*_only_content_words" result files, so the label
# must say so; the previous text was copied from the all-token cell.
print("Agreement over content tokens only")

# Sentence-level figures are computed above but intentionally not printed.
# print(f"per sentence:\nmacro_p:{per_sent_macro_p}\nmacro_r:{per_sent_macro_r}\nmacro_F1:{per_sent_macro_F1}\n")

# print(f"per sentence:\nmicro_p:{per_sent_micro_p}\nmicro_r:{per_sent_micro_r}\nmicro_F1:{per_sent_micro_F1}\n")

print(f"per document:\nmacro_p:{per_doc_macro_p}\nmacro_r:{per_doc_macro_r}\nmacro_F1:{per_doc_macro_F1}")

print(f"per document:\nmicro_p:{per_doc_micro_p}\nmicro_r:{per_doc_micro_r}\nmicro_F1:{per_doc_micro_F1}")


Agreement over all tokens (excluding punctuation)
per document:
macro_p:0.7560208081815802
macro_r:0.6053100374624475
macro_F1:0.6560313092704906
per document:
micro_p:0.7524534174916778
micro_r:0.5919061384118347
micro_F1:0.662593269377189
