In [None]:
from collections import defaultdict

# Input files. Both are expected to be in the *SEM ("starsem") format and to
# carry the filename extension "sem" (the output file names built further down
# are derived by cutting the last four characters off `system`).
system = 'path/to/file/with/predictions'  # file with the system predictions
gold = 'path/to/goldstandard/to/evaluate/for'  # test set (gold standard) to evaluate against

# Per-sentence mapping between cue column indices in the two files:
#   source -> gold cue index -> system cue index   (and the reverse direction)
# The same cue can sit in a different column in gold and in the predictions,
# e.g. when a cue at the end of the sentence was annotated after an earlier cue.
gold_cue_idx_to_sys_cue_idx = defaultdict(dict)
sys_cue_idx_to_gold_cue_idx = defaultdict(dict)

# Two-level tables whose leaves are lists that are created on first access.
cue_mistakes = defaultdict(lambda: defaultdict(list))
scope_mistakes = defaultdict(lambda: defaultdict(list))

# Corpus-wide accumulators: one entry per cue / scope column, with the "_"
# placeholders removed.
all_gold_cues, all_sys_cues = [], []
all_gold_scopes, all_sys_scopes = [], []

def _read_sentences(handle):
    """Parse an open *SEM-style file into sentences.

    Returns a list of sentences. Each sentence is a list of rows and each row
    is the list of tab-separated columns of one line. Sentences are separated
    by a blank line. Blank / whitespace-only lines and chunks are skipped so
    that a trailing newline or an extra empty line does not yield a row
    without columns (which would raise IndexError further down).
    """
    return [
        [line.split("\t") for line in chunk.split("\n") if line.strip()]
        for chunk in handle.read().split("\n\n")
        if chunk.strip()
    ]


def _annotation_columns(width, first_cue_col):
    """Return (cue column indices, scope column indices) for a row of `width` columns.

    Cue columns start at `first_cue_col` and repeat every third column; each
    scope column directly follows its cue column. A row with exactly
    `first_cue_col + 1` columns is treated as carrying no cue at all.
    """
    cue_cols = [] if width == first_cue_col + 1 else list(range(first_cue_col, width, 3))
    scope_cols = list(range(first_cue_col + 1, width, 3))
    return cue_cols, scope_cols


def _column_values(sentence, column_indices):
    """Return [(column index, [value of that column in every row]), ...]."""
    return [(idx, [row[idx] for row in sentence]) for idx in column_indices]


# The handles are deliberately not called `sys` / `gold`: that would shadow the
# `sys` module name and replace the `gold` path string with a closed file.
with open(system, encoding='utf8') as sys_file, open(gold, encoding='utf8') as gold_file:
    sys_sents = _read_sentences(sys_file)
    gold_sents = _read_sentences(gold_file)

print("LEN sys:", len(sys_sents))
print("LEN gold:", len(gold_sents))

# Sentences are paired purely by position, so the system file must contain at
# least as many sentences as the gold file.
if len(sys_sents) < len(gold_sents):
    raise ValueError(
        "System file has fewer sentences ({}) than gold file ({}); "
        "cannot align them by position.".format(len(sys_sents), len(gold_sents))
    )
if len(sys_sents) > len(gold_sents):
    print("WARNING: system file has more sentences than gold; the extra ones are ignored.")

# Sentence id (column 1 of the first row) -> column 3 of every row (presumably
# the token itself - confirm against the data).
# NOTE(review): all per-sentence tables are keyed on this id alone, so it is
# assumed to be unique across the whole file - verify for multi-document data.
source_to_sent = {sent[0][1]: [row[3] for row in sent] for sent in gold_sents}

gold_cue_idx_to_cue = {}
sys_cue_idx_to_cue = {}

gold_scope_idx_to_scope = {}
sys_scope_idx_to_scope = {}

# Index of the last column of a row without negation; the first cue column of
# an annotated row sits at the same index.
last_idx_no_neg = 7

for gold_sent, sys_sent in zip(gold_sents, sys_sents):
    source = gold_sent[0][1]

    # The column layout is read off the first row of each sentence.
    gold_cue_idx, gold_scope_idx = _annotation_columns(len(gold_sent[0]), last_idx_no_neg)
    sys_cue_idx, sys_scope_idx = _annotation_columns(len(sys_sent[0]), last_idx_no_neg)

    gold_cues = _column_values(gold_sent, gold_cue_idx)
    gold_scopes = _column_values(gold_sent, gold_scope_idx)
    sys_cues = _column_values(sys_sent, sys_cue_idx)
    sys_scopes = _column_values(sys_sent, sys_scope_idx)

    gold_cue_idx_to_cue[source] = dict(gold_cues)
    gold_scope_idx_to_scope[source] = dict(gold_scopes)
    sys_cue_idx_to_cue[source] = dict(sys_cues)
    sys_scope_idx_to_scope[source] = dict(sys_scopes)

    # --------- CUES ----------
    # A predicted cue is correct when its whole column equals a gold cue
    # column; the columns need not have the same index in both files.
    for idx, gold_cue in gold_cues:
        for sidx, sys_cue in sys_cues:
            if gold_cue == sys_cue:
                gold_cue_idx_to_sys_cue_idx[source][idx] = sidx
                sys_cue_idx_to_gold_cue_idx[source][sidx] = idx
                cue_mistakes[source]["TP"].append((idx, sys_cue))  # True Positives (= correctly predicted cues)

    # Gold cues without a matching system cue are false negatives ...
    for idx, cue in gold_cue_idx_to_cue[source].items():
        if idx not in gold_cue_idx_to_sys_cue_idx[source]:
            cue_mistakes[source]["FN"].append((idx, cue))

    # ... and system cues without a matching gold cue are false positives.
    for idx, cue in sys_cue_idx_to_cue[source].items():
        if idx not in sys_cue_idx_to_gold_cue_idx[source]:
            cue_mistakes[source]["FP"].append((idx, cue))

    # Keep every cue (without the "_" placeholders) so it can be looked up
    # next to its scope later on.
    all_gold_cues.extend([w for w in words if w != "_"] for _, words in gold_cues)
    all_sys_cues.extend([w for w in words if w != "_"] for _, words in sys_cues)

    # --------- SCOPES ----------
    # All gold scopes and all predicted scopes (used to compare avg. length).
    all_gold_scopes.extend([w for w in words if w != "_"] for _, words in gold_scopes)
    all_sys_scopes.extend([w for w in words if w != "_"] for _, words in sys_scopes)

    # Scopes are only judged for cues that were predicted correctly.
    for idx, cue in gold_cues:
        if idx not in gold_cue_idx_to_sys_cue_idx[source]:
            continue
        sidx = gold_cue_idx_to_sys_cue_idx[source][idx]

        # The scope column directly follows its cue column.
        gold_scope = gold_scope_idx_to_scope[source][idx + 1]
        # None when the system row has no column after the matched cue column.
        sys_scope = sys_scope_idx_to_scope[source].get(sidx + 1)

        if gold_scope == sys_scope:
            scope_mistakes["correct"][source].append((cue, sys_scope))
        else:
            scope_mistakes["incorrect"][source].append((cue, gold_scope, sys_scope))
    # print()






In [None]:
# Dump every scope that was predicted exactly right to a text file
# (scope_mistakes['correct'] maps source -> list of (cue, scope) pairs).

with open(f"{system[:len(system)-4]}_SCOPES_CORRECT_ALL_CASES.txt", 'w', encoding='utf8') as f:
    f.write("CORRECT SCOPE PREDICTIONS:\n\n")
    for source, correct_cases in scope_mistakes['correct'].items():
        for cue, scope in correct_cases:
            # One three-line record per correctly predicted scope.
            f.writelines((
                "Source: {}\n".format(source),
                "Cue:   {}\n".format(cue),
                "Scope: {}\n\n".format(scope),
            ))

In [None]:
# GET AN OVERVIEW OF ALL CUES - TP, FP, FN, write to file:

def _count_cue_strings(outcome):
    """Count how often each cue string occurs for one outcome ("TP", "FP" or "FN").

    Every entry of cue_mistakes[source][outcome] is a (column index, cue column)
    pair. The cue column is reduced to its non-"_" items, joined with single
    spaces, and the resulting strings are counted over all sources.

    NB: the cue strings are not lowercased here.

    Returns a plain dict (cue string -> count) in first-seen order, so that
    str() of it looks the same as a hand-built dict.
    """
    counts = {}
    for per_source in cue_mistakes.values():
        for _, cue_list in per_source[outcome]:
            cue_str = " ".join(w for w in cue_list if w != "_")
            counts[cue_str] = counts.get(cue_str, 0) + 1
    return counts


tp_cues_overview = _count_cue_strings("TP")
fp_cues_overview = _count_cue_strings("FP")
fn_cues_overview = _count_cue_strings("FN")


with open(f"{system[:len(system)-4]}_CUES_TP_FP_FN.txt", 'w', encoding='utf8') as f:
    # One section per outcome: a header line, the dict, then a blank line.
    for label, overview in (("TP", tp_cues_overview), ("FP", fp_cues_overview), ("FN", fn_cues_overview)):
        f.write(f"{label}\n")
        f.write(str(overview))
        f.write("\n\n")
       


In [None]:
# Write two per-sentence cue reports in a single pass over cue_mistakes:
#   *_CUES_MISTAKES.txt - sentences with at least one FN or FP cue
#   *_CUES_CORRECT.txt  - sentences with at least one TP cue
# Each report ends with summary counts over all sentences.
with open(f"{system[:len(system)-4]}_CUES_MISTAKES.txt", 'w', encoding='utf8') as mf, open(f"{system[:len(system)-4]}_CUES_CORRECT.txt", 'w', encoding='utf8') as cf:
    n_fn = n_fp = n_tp = 0
    n_sents_with_mistakes = 0
    n_sents_with_tp = 0

    for source, outcomes in cue_mistakes.items():
        false_negatives = outcomes["FN"]
        false_positives = outcomes["FP"]
        true_positives = outcomes["TP"]

        n_fn += len(false_negatives)
        n_fp += len(false_positives)
        n_tp += len(true_positives)

        if false_negatives or false_positives:
            n_sents_with_mistakes += 1
            mf.write(f"SOURCE: {source}\n")
            mf.write(f"{source_to_sent[source]}\n")
            mf.write("FN:\n")
            mf.writelines(f"{fn}\n" for fn in false_negatives)
            mf.write("\nFP:\n")
            mf.writelines(f"{fp}\n" for fp in false_positives)
            mf.write("\n")

        if true_positives:
            n_sents_with_tp += 1
            cf.write(f"SOURCE: {source}\n")
            cf.write(f"{source_to_sent[source]}\n")
            cf.write("TP:\n")
            cf.writelines(f"{tp}\n" for tp in true_positives)
            cf.write("\n")

    # Every gold cue is either found (TP) or missed (FN).
    n_gold_cues = n_fn + n_tp

    mf.write("# sentences with cue mistakes: {}\n".format(n_sents_with_mistakes))
    mf.write("# total cue mistakes: {}\n".format(n_fp + n_fn))
    mf.write("# false negatives: {}\n".format(n_fn))
    mf.write("# false positives: {}\n".format(n_fp))
    mf.write("# total cues in gold standard: {}\n".format(n_gold_cues))

    cf.write("# sentences with at least one correct cue: {}\n".format(n_sents_with_tp))
    cf.write("# total correct cues: {}\n".format(n_tp))
    cf.write("# total cues in gold standard: {}\n".format(n_gold_cues))

In [None]:
# LOOK AT THE CUE WORDS

from collections import Counter

# Every gold cue as (source, cue column index, cue words without "_").
all_cues = [
    (source, cue_i, [w for w in cue_column if w != "_"])
    for source, cues_by_idx in gold_cue_idx_to_cue.items()
    for cue_i, cue_column in cues_by_idx.items()
]

print(len(all_cues))

# Frequency of every (lowercased, space-joined) cue in the gold standard.
gold_cue_counter = Counter(" ".join(cue_lst).lower() for _, _, cue_lst in all_cues)
print("\nTYPE OF CUE AND NUMBER OF OCCURRENCES IN GOLD:")
for w, c in gold_cue_counter.items():
    print(f"{w}: {c}")

In [None]:
# False negatives: gold cues the system did not predict.
# all_fn collects (source, cue column index, cue words without "_").
all_fn = []
with open(f"{system[:len(system)-4]}_CUES_FN.txt", 'w', encoding='utf8') as fn_file:
    fn_file.write("FALSE NEGATIVES\n")
    for source in gold_cue_idx_to_cue:
        for cue_i, cue in cue_mistakes[source]["FN"]:
            all_fn.append((source, cue_i, [w for w in cue if w != "_"]))
            # The sentence first, then the full cue column, then a blank line.
            fn_file.writelines((
                f"{source}: {source_to_sent[source]}\n",
                f"{source}, {cue_i}, {[w for w in cue]}",
                "\n\n",
            ))

    # Per-cue totals (lowercased), appended to the file as CSV-like lines.
    fn_cue_counter = Counter(" ".join(cue_lst).lower() for _, _, cue_lst in all_fn)
    fn_file.write("\n")
    fn_file.write("TYPE OF CUE AND NUMBER OF FALSE NEGATIVES:\n")
    fn_file.write("WORD,COUNT\n")
    fn_file.writelines(f"{w},{c}\n" for w, c in fn_cue_counter.items())

print("\nAll FN:")
print(all_fn)
print()
print("\nFALSE NEGATIVES:")
print(fn_cue_counter)

# False positives: cues the system predicted that are not in the gold standard.
# all_fp collects (source, cue column index, cue words without "_").
all_fp = []
with open(f"{system[:len(system)-4]}_CUES_FP.txt", 'w', encoding='utf8') as fp_file:
    fp_file.write("FALSE POSITIVES\n")
    for source in sys_cue_idx_to_cue:
        for cue_i, cue in cue_mistakes[source]["FP"]:
            all_fp.append((source, cue_i, [w for w in cue if w != "_"]))
            # The sentence first, then the full cue column, then a blank line.
            fp_file.writelines((
                f"{source}: {source_to_sent[source]}\n",
                f"{source}, {cue_i}, {[w for w in cue]}",
                "\n\n",
            ))

    # Per-cue totals (lowercased), appended to the file as CSV-like lines.
    fp_cue_counter = Counter(" ".join(cue_lst).lower() for _, _, cue_lst in all_fp)
    fp_file.write("\n")
    fp_file.write("TYPE OF CUE AND NUMBER OF FALSE POSITIVES:\n")
    fp_file.write("WORD,COUNT\n")
    fp_file.writelines(f"{w},{c}\n" for w, c in fp_cue_counter.items())

print("\nAll FP:")
print(all_fp)
print("\nFALSE POSITIVES:")
print(fp_cue_counter)

In [None]:
# scope_mistakes maps 'correct' / 'incorrect' -> source -> list of cases, so
# the number of keys is the number of distinct sentences in each group.
correct_by_source = scope_mistakes['correct']
incorrect_by_source = scope_mistakes['incorrect']

print("# sentences with a correct cue and at least one correct scope:", len(correct_by_source))
print("# sentences with a correct cue and at least one incorrect scope:", len(incorrect_by_source))

# Union of both key sets: a sentence may appear in both groups.
print("# unique sentences with a correct cue prediction:", len(correct_by_source.keys() | incorrect_by_source.keys()))


In [None]:
# ANALYSIS OF SCOPE PREDICTIONS:
# gold scope length vs. pred scope length
# for what cues does the model fail on scopes
# qualitative analysis, examples

def _mean_scope_length(scopes):
    """Return the average number of items per scope.

    Returns NaN when `scopes` is empty (e.g. the system predicted no negation
    at all) instead of raising ZeroDivisionError.
    """
    if not scopes:
        return float("nan")
    return sum(len(s) for s in scopes) / len(scopes)


print(all_gold_scopes[:10])
print(all_sys_scopes[:10])

# Sanity check: cues and scopes are paired by position below, so both lists
# should have the same length (these two lines should print True).
print(len(all_gold_scopes) == len(all_gold_cues))
print(len(all_sys_scopes) == len(all_sys_cues))

print(f"# gold cues: {len(all_gold_cues)}")
print(f"# sys cues: {len(all_sys_cues)}")

print(f"# gold scopes: {len(all_gold_scopes)}")
print(f"# sys scopes: {len(all_sys_scopes)}")

# "Implicit" scopes are scopes with no items left once "_" has been removed;
# each is stored together with the cue at the same position.
gold_implicit = [(cue, scope) for cue, scope in zip(all_gold_cues, all_gold_scopes) if len(scope) == 0]
sys_implicit = [(cue, scope) for cue, scope in zip(all_sys_cues, all_sys_scopes) if len(scope) == 0]

print("IMPLICIT SCOPES AND THEIR CUE:")
print("GOLD:", gold_implicit)
print("SYS:", sys_implicit)
print()

print(f"# implicit (empty) gold scopes: {len([s for s in all_gold_scopes if len(s) == 0])}")
print(f"# implicit (empty) sys scopes: {len([s for s in all_sys_scopes if len(s) == 0])}")

print("------ SCOPE LENGTHS ------")
print(f"GOLD: {_mean_scope_length(all_gold_scopes)}")
print(f"SYS: {_mean_scope_length(all_sys_scopes)}")


In [None]:
# MORE SCOPE ANALYSIS - WHAT KINDS OF SCOPES FAIL? (FOR WHAT KIND OF CUES ETC.)
from collections import defaultdict

# NB: only scopes of correctly predicted cues are counted here.
print(f"Number of incorrect scopes: {sum([len(scope_mistakes['incorrect'][source]) for source in scope_mistakes['incorrect']])}")
print(f"Number of correct scopes: {sum([len(scope_mistakes['correct'][source]) for source in scope_mistakes['correct']])}")

# Regroup the scope results by cue string (lowercased, "_" removed, joined
# with spaces): cue string -> list of case dicts.
scope_errors_per_cue = defaultdict(list)
scope_correct_per_cue = defaultdict(list)

for source, scope_errors in scope_mistakes['incorrect'].items():
    for cue, gold_scope, sys_scope in scope_errors:
        cue_trimmed = " ".join(w.lower() for w in cue if w != "_")
        scope_errors_per_cue[cue_trimmed].append({'source':source, 'cue':cue, 'gold_scope':gold_scope, 'sys_scope':sys_scope})

for source, scope_correct in scope_mistakes['correct'].items():
    for cue, scope in scope_correct:
        cue_trimmed = " ".join(w.lower() for w in cue if w != "_")
        scope_correct_per_cue[cue_trimmed].append({'source':source, 'cue':cue, 'scope':scope})


# Write the scope statistics (counts, implicit scopes, average lengths and
# per-cue error figures) to *_SCOPES_MISTAKES_STATS.txt.
with open(f"{system[:len(system)-4]}_SCOPES_MISTAKES_STATS.txt", 'w', encoding='utf8') as f:
    f.write(f"# gold cues: {len(all_gold_cues)}\n")
    f.write(f"# sys cues: {len(all_sys_cues)}\n")

    f.write(f"# gold scopes: {len(all_gold_scopes)}\n")
    f.write(f"# sys scopes: {len(all_sys_scopes)}\n\n")

    f.write("IMPLICIT SCOPES AND THEIR CUE:\n")
    f.write("GOLD: {}\n".format(gold_implicit))
    f.write("SYS: {}\n".format(sys_implicit))

    f.write(f"# implicit (empty) gold scopes: {len([s for s in all_gold_scopes if len(s) == 0])}\n")
    f.write(f"# implicit (empty) sys scopes: {len([s for s in all_sys_scopes if len(s) == 0])}\n")

    # The average is undefined when there are no scopes at all (e.g. a system
    # that never predicts negation); write NaN instead of dividing by zero.
    gold_avg_len = sum(len(s) for s in all_gold_scopes) / len(all_gold_scopes) if all_gold_scopes else float("nan")
    sys_avg_len = sum(len(s) for s in all_sys_scopes) / len(all_sys_scopes) if all_sys_scopes else float("nan")

    f.write("\n------ SCOPE LENGTHS ------\n")
    f.write(f"GOLD: {gold_avg_len}\n")
    f.write(f"SYS: {sys_avg_len}\n\n")

    # NB: This is for cues that are correctly predicted:
    f.write(f"Number of incorrect scopes: {sum([len(scope_mistakes['incorrect'][source]) for source in scope_mistakes['incorrect']])}\n")
    f.write(f"Number of correct scopes: {sum([len(scope_mistakes['correct'][source]) for source in scope_mistakes['correct']])}\n")

    f.write("\nCUE\t# scope errors\n")
    for cue, errors in scope_errors_per_cue.items():
        f.write(f"{cue}\t{len(errors)}\n")

    f.write("\nCUE\t# correct scopes\n")
    for cue, correct in scope_correct_per_cue.items():
        f.write(f"{cue}\t{len(correct)}\n")

    f.write("\nCUE\t% scope errors\n")
    for cue, errors in scope_errors_per_cue.items():
        n_errors = len(errors)
        # .get() keeps this a read-only lookup: indexing the defaultdict would
        # insert an empty entry for every cue that has no correct scope.
        n_correct = len(scope_correct_per_cue.get(cue, ()))
        f.write(f"{cue}\t{round((n_errors / (n_errors + n_correct)) * 100, 2)}\n")

    print("Written to {}".format(f"{system[:len(system)-4]}_SCOPES_MISTAKES_STATS.txt"))


# Write every incorrectly predicted scope, grouped by cue string, to
# *_SCOPES_MISTAKES_ALL_CASES.txt.
with open(f"{system[:len(system)-4]}_SCOPES_MISTAKES_ALL_CASES.txt", 'w', encoding='utf8') as f:
    for cue in scope_errors_per_cue:
        f.write(f"CUE: '{cue}'\n------------------------------------------------------------------\n")
        for error in scope_errors_per_cue[cue]:
            # The system scope is None when no scope column was found for the
            # matched system cue; str.join would raise TypeError on it.
            sys_scope = error['sys_scope']
            sys_scope_text = "<no scope predicted>" if sys_scope is None else " ".join(sys_scope)

            f.write(f"Source: {error['source']}\n")
            f.write("Cue: {}\n".format(" ".join(error['cue'])))
            f.write("Gold scope: {}\n".format(" ".join(error['gold_scope'])))
            f.write("Sys scope:  {}\n".format(sys_scope_text))
            f.write("\n")
        f.write("\n")

    print("Written to {}".format(f"{system[:len(system)-4]}_SCOPES_MISTAKES_ALL_CASES.txt"))

In [None]:
# Console version of the per-cue scope figures: number of scope errors,
# number of correct scopes and the error percentage for every cue string.
print("CUE\t# scope errors")
for cue, errors in scope_errors_per_cue.items():
    print(f"{cue}\t{len(errors)}")

print()
print("CUE\t# correct scopes")
for cue, correct in scope_correct_per_cue.items():
    print(f"{cue}\t{len(correct)}")

print("\nCUE\t% scope errors")
for cue, errors in scope_errors_per_cue.items():
    n_errors = len(errors)
    # .get() keeps this a read-only lookup: indexing the defaultdict would
    # insert an empty entry for every cue without a correct scope, and a
    # re-run of this cell would then list those cues with a count of 0.
    n_correct = len(scope_correct_per_cue.get(cue, ()))
    print(f"{cue}\t{round((n_errors / (n_errors + n_correct)) * 100, 2)}")