In [None]:
import itertools
import math
import pickle

import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import numpy as np
import pandas as pd

%matplotlib inline
%cd ../../../

In [None]:
# Load the label dictionary and pull out its "Label" and "Index" columns as
# plain Python lists (same row order as the CSV).
labels_df = pd.read_csv("calibration/labels_dict.csv")
label_strings, indexes = (
    labels_df[column].tolist() for column in ("Label", "Index")
)

class Label():
    """A single tag from the original label set, split into its parts.

    A label consists of a BIO tag, a lexical category tag, and a supersense
    (for nouns, verbs, and prepositions; prepositions in STREUSLE only).

    Attributes:
        original: the label string exactly as it was passed in.
        new: consolidated label string; starts out empty and is meant to be
            filled in by later processing.
        index: the index that was passed in alongside the label.
        bio: the first character of the label string.
        lex: the lexical category, or "_" for continuation tags.
        sup: the lower-cased supersense, or "" when the label has none.
    """

    def __init__(self, label_string, index):
        """Parse ``label_string`` into ``bio``, ``lex`` and ``sup``.

        Args:
            label_string: tag of the form ``<bio>-<lex>[-<supersense>]``, or a
                continuation tag starting with "I" or "i".
            index: identifier stored unchanged on the instance.
        """
        self.original = label_string
        self.new = ""
        self.index = index

        # Continuation tags (anything starting with "I" or "i") carry no
        # lexical category or supersense of their own.
        if label_string.startswith(("I", "i")):
            self.bio = label_string[0]
            self.lex = "_"
            self.sup = ""
            return

        self.bio = label_string[0]

        # Skip the BIO character and the separator that follows it, then cut
        # at the FIRST "-" only. Supersenses such as "Co-Agent" or "Co-Theme"
        # contain a hyphen themselves; splitting once keeps them intact, so no
        # placeholder character is needed and none can leak into `sup`.
        lex, _, sup = label_string[2:].partition("-")
        self.lex = lex
        self.sup = sup.lower()

    def __str__(self):
        """Return the fields as one tab-separated line."""
        return f"{self.index}\t{self.original}\t{self.new}\t{self.bio}\t{self.lex}\t{self.sup}"

# One Label object per (label string, index) pair, in the same order as the
# two source lists.
labels = [Label(text, idx) for text, idx in zip(label_strings, indexes)]

In [None]:
# Lexical categories that get renamed to match the DiMSUM labelset; any
# category not listed here is kept as it is.
labels_map = {
    "CCONJ": "CONJ",
    "DISC": "X",
    "INF": "PART",
    "INF.P": "PART",
    "N": "NOUN",
    "P": "ADP",
    "POSS": "PART",
    "PP": "ADP",
    "PRON.POSS": "PRON",
    "V": "VERB",
    "V.IAV": "VERB",
    "V.LVC.cause": "VERB",
    "V.LVC.full": "VERB",
    "V.VID": "VERB",
    "V.VPC.full": "VERB",
    "V.VPC.semi": "VERB",
    "_": "X",
}

# Every step below only touches the label currently being processed, so the
# work is done in a single sweep over `labels`.
# new_labels maps each new label to the list of indexes from the old tagset
# (e.g. new_labels["O-PRON"] = [1, 28, 34, 54, 70, 78, 80, 100, 142, 242, 251, 288, 309, 348, 371])
new_labels = {}
for label in labels:
    # Step 1: change the lexical category to match the DiMSUM labelset.
    label.lex = labels_map.get(label.lex, label.lex)

    # Step 2: remove preposition supersenses.
    if label.sup.startswith("p"):
        label.sup = ""

    # Step 3: build the new label; the supersense is appended only when
    # one is left.
    stem = f"{label.bio}-{label.lex}"
    label.new = f"{stem}-{label.sup}" if label.sup else stem

    # Step 4: record this label's old index under its new label.
    new_labels.setdefault(label.new, []).append(label.index)

# Persist the new-label -> old-indexes mapping.
with open("calibration/consolidated_labels.pickle", "wb") as f:
    pickle.dump(new_labels, f)