## DiMSUM Confidence Scores

- Modified dimsum_to_jsonl.py to include label field by composing subtag fields into one string
- Read dimsum using dataset_reader
- Convert all.csv to all_consolidated_no_lexcat.csv (scores re-aggregated over the consolidated label set without lexcats)

In [1]:
# Notebook starts in notebooks folder. Change working directory back to streusle-tagger
# NOTE: the target is relative to the current working directory, so running this cell
# more than once in the same kernel session keeps moving further up the directory tree.
# Restart the kernel before re-running it.
%cd ../../../

C:\michael\dev\streusle-tagger


In [2]:
# System imports
import json
import math
import os
import pickle
import sys

from copy import deepcopy

# Add parent of streusle-tagger to path (streusle should be in this folder)
sys.path.append("../../..")

# External imports
import allennlp.nn.util as util
import numpy as np
import pandas as pd

from allennlp.common import Params
from allennlp.common.util import import_submodules
from allennlp.data.dataset_readers import DatasetReader
from allennlp.training.util import datasets_from_params

# Import the project package first so its submodules are loaded before the
# config is turned into objects.
import_submodules("streusle_tagger")

config_path = "training_config/streusle_bert_large_cased/streusle_bert_large_cased_no_constraints.jsonnet"
params = Params.from_file(config_path)

# Each consumer below gets its own deep copy, so `params` itself is left untouched.
datasets = datasets_from_params(deepcopy(params))

params_for_reader = deepcopy(params)
dataset_reader_params = params_for_reader.pop("dataset_reader")
dataset_reader = DatasetReader.from_params(dataset_reader_params)

_jsonnet not loaded, treating training_config/streusle_bert_large_cased/streusle_bert_large_cased_no_constraints.jsonnet as json
Your BERT model appears to be cased, but your indexer is lowercasing tokens.
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.
2723it [00:00, 5945.43it/s]
554it [00:00, 5276.33it/s]
535it [00:00, 10700.93it/s]
Your BERT model appears to be cased, but your indexer is lowercasing tokens.
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


In [3]:
# Paths used by the rest of the notebook.
dimsum_test_path = "data/dimsum16/dimsum16_test_updated_labeled_reformatted.json"
dimsum_consolidated_path = "calibration/confidence_scores/dimsum_test/all_consolidated_no_lexcat.csv"

# Consolidated label set: maps each new label to the old label indexes it covers
# (see how it is iterated when building `old_index_to_new_label`).
# NOTE(review): pickle.load executes arbitrary code from the file -- only load trusted pickles.
with open("calibration/consolidated_labels_no_lexcat.pickle", "rb") as labels_file:
    new_labels = pickle.load(labels_file)

labels_df = pd.read_csv("calibration/labels_dict.csv")

def read(file_path):
    """Yield one instance per sentence stored in a JSON tagging file.

    The file must contain a JSON list of objects, each with the keys
    ``"tokens"``, ``"upos_tags"``, ``"lemmas"`` and ``"label"``. Every object is
    passed to the module-level ``dataset_reader.text_to_instance`` and the
    result is yielded. The running sentence index is printed every 200
    sentences as a progress indicator.

    Parameters
    ----------
    file_path : str
        Path of the JSON file to read.

    Yields
    ------
    Whatever ``dataset_reader.text_to_instance`` returns for each sentence.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # (e.g. cp1252 on Windows), which can fail or mangle non-ASCII tokens.
    # The whole document is parsed up front, so the handle is released before
    # any instance is yielded rather than staying open for the generator's life.
    with open(file_path, 'r', encoding="utf-8") as tagging_file:
        tagging_data = json.load(tagging_file)

    for i, x in enumerate(tagging_data):
        if i % 200 == 0:
            print(i)

        # Copy each field into a fresh list before handing it to the reader.
        tokens = list(x["tokens"])
        upos_tags = list(x["upos_tags"])
        lemmas = list(x["lemmas"])
        labels = list(x["label"])

        yield dataset_reader.text_to_instance(tokens, upos_tags, lemmas, labels)
            
# Materialize the generator so the instances can be iterated more than once.
dimsum_test = [instance for instance in read(dimsum_test_path)]

0
200
400
600
800


In [4]:
# Flatten the per-sentence gold tag sequences into one token-level list.
ground = [
    tag
    for instance in dimsum_test
    for tag in instance.get("tags").labels
]

In [5]:
# If there's a noun or verb supersense label, the lexcat has to be NOUN or VERB
# (exceptions are usually due to MWEs).
corrected_ground = []
for tag in ground:
    fixed_tag = tag
    # The noun rule is checked first; the verb rule is only tried when it does not apply.
    for supersense_marker, lexcat in (("-n.", "NOUN"), ("-v.", "VERB")):
        if supersense_marker in tag and lexcat not in tag:
            # The second argument of str.index is a start offset (not an occurrence
            # count): keep everything up to and including the first "-" found from
            # offset 1, then splice in the lexcat, then resume at the first "-"
            # found from offset 2.
            prefix_end = tag.index("-", 1) + 1
            suffix_start = tag.index("-", 2)
            fixed_tag = tag[:prefix_end] + lexcat + tag[suffix_start:]
            break
    corrected_ground.append(fixed_tag)

In [6]:
# Merge the CSVs for all sentences and save the merged version
confidence_scores_path = "calibration/confidence_scores/dimsum_test"

# os.listdir returns entries in arbitrary order. The merged rows are later
# aligned with the ground-truth tags purely by position, so the per-sentence
# files must be concatenated in a deterministic (sorted) order.
# The startswith("0") filter also keeps the merged all.csv from being re-read.
score_filenames = sorted(
    filename
    for filename in os.listdir(confidence_scores_path)
    if filename.startswith("0") and filename.endswith(".csv")
)

dfs = [pd.read_csv(f"{confidence_scores_path}/{filename}") for filename in score_filenames]
df = pd.concat(dfs, ignore_index=True)

df.to_csv(f"{confidence_scores_path}/all.csv", index=False)

In [7]:
# Reload the merged scores from disk and preview the first rows.
merged_scores_path = f"{confidence_scores_path}/all.csv"
dimsum_df = pd.read_csv(merged_scores_path)
dimsum_df.head()

Unnamed: 0.1,Unnamed: 0,Tokens,Predicted Tags,Predicted Tag Indexes,0,1,2,3,4,5,...,586,587,588,589,590,591,592,593,594,595
0,0,@JoJoLyrics,O-N-n.COMMUNICATION,27,0.003308357,0.000672,0.187076,0.02004502,0.0006822077,0.022586,...,8.651178e-08,1.1915e-08,1.492807e-07,9.625937e-08,1.617215e-07,2.149607e-08,1.509233e-08,3.212832e-07,1.995928e-07,2.42595e-07
1,1,I,O-PRON,1,3.710947e-07,0.995478,2.1e-05,8.467397e-07,3.974892e-07,2e-06,...,1.063442e-10,1.54395e-10,1.109404e-10,1.433815e-10,6.351943e-11,1.549071e-10,1.347726e-10,7.412362e-11,1.141272e-10,4.02788e-11
2,2,hear,O-V-v.perception,45,1.834077e-06,1.9e-05,0.007406,0.0003886628,7.104369e-05,4e-06,...,6.379876e-09,5.383308e-09,7.684921e-09,3.88521e-09,1.200826e-08,8.716266e-09,7.930803e-09,8.395023e-09,6.668426e-09,1.063727e-08
3,3,enough,O-ADV,5,1.044289e-05,0.000255,0.011356,0.09836438,0.001795487,0.849829,...,2.715171e-09,4.363862e-09,3.653439e-09,2.914954e-09,8.935501e-09,4.649278e-09,4.485106e-09,3.533208e-09,6.058807e-09,3.257677e-09
4,4,talking,O-V-v.communication,13,3.740599e-05,0.000319,0.013987,0.001134977,9.664433e-06,0.003871,...,2.200114e-08,4.902813e-08,2.117961e-08,2.007143e-08,3.957958e-08,3.524527e-08,3.407447e-08,3.588781e-08,3.603157e-08,5.895182e-08


In [8]:
# Confidence scores may still be in the middle of being computed, so only the
# leading part of the corrected ground truth that already has scores is attached.
n_scored_tokens = len(dimsum_df)
dimsum_df["Ground"] = corrected_ground[:n_scored_tokens]

In [9]:
# Six metadata columns followed by one score column per consolidated label.
new_df_columns = [
    "Token Index",
    "Tokens",
    "Predicted Tag",
    "Predicted Index",
    "Ground",
    "Ground Index",
    *new_labels.keys(),
]
new_df = pd.DataFrame(columns=new_df_columns)

new_df["Token Index"] = dimsum_df["Unnamed: 0"]
new_df["Tokens"] = dimsum_df["Tokens"]

# Reverse lookup: old label index -> consolidated label that covers it.
old_index_to_new_label = {
    old_index: consolidated_label
    for consolidated_label, old_indexes in new_labels.items()
    for old_index in old_indexes
}

# Position of each consolidated label in the order of `new_labels`.
new_labels_list = list(new_labels)
new_label_to_new_index = {
    consolidated_label: position
    for position, consolidated_label in enumerate(new_labels_list)
}

In [10]:
# Get predicted labels and indexes using consolidated labelset
predicted_labels = [
    old_index_to_new_label[old_index]
    for old_index in dimsum_df["Predicted Tag Indexes"]
]
predicted_label_indexes = [
    new_label_to_new_index[consolidated_label]
    for consolidated_label in predicted_labels
]

In [11]:
# Lexcat replacements applied to manually annotated "lexcat=" notes;
# lexcats that are not listed here are used unchanged.
labels_map = dict([
    ("CCONJ", "CONJ"),
    ("DISC", "X"),
    ("INF", "PART"),
    ("INF.P", "PART"),
    ("N", "NOUN"),
    ("P", "ADP"),
    ("POSS", "PART"),
    ("PP", "ADP"),
    ("PRON.POSS", "PRON"),
    ("V", "VERB"),
    ("V.IAV", "VERB"),
    ("V.LVC.cause", "VERB"),
    ("V.LVC.full", "VERB"),
    ("V.VID", "VERB"),
    ("V.VPC.full", "VERB"),
    ("V.VPC.semi", "VERB"),
    ("_", "X"),
])

In [12]:
# Normalize every ground-truth label:
#   * "natural_object" -> "naturalobject" and "PROPN" -> "NOUN"
#   * labels carrying a manual "lexcat=" note become "<prefix up to first '-'>" + mapped lexcat
#   * labels starting with "I" / "i" collapse to "I-X" / "i-X"
#   * everything else is kept as is
# The former bare `except:` wrapped only str.startswith / list.append, which cannot
# raise for string labels; its only effect was to swallow KeyboardInterrupt/SystemExit
# and record a bogus "-1" label, so it has been removed.
ground_labels_consolidated = []
for label in dimsum_df["Ground"]:
    label = label.replace("natural_object", "naturalobject").replace("PROPN", "NOUN")

    # For dealing with manually annotated cases where "lexcat=" note is provided
    if "lexcat=" in label:
        new_lexcat = label[label.index("=") + 1:]
        # Lexcats without an entry in labels_map are used unchanged.
        mapped_new_lexcat = labels_map.get(new_lexcat, new_lexcat)
        ground_labels_consolidated.append(label[:label.index("-") + 1] + mapped_new_lexcat)
    elif label.startswith("I"):
        ground_labels_consolidated.append("I-X")
    elif label.startswith("i"):
        ground_labels_consolidated.append("i-X")
    else:
        ground_labels_consolidated.append(label)

In [13]:
# Strip the lexcat from every consolidated ground label and look up its index in
# the consolidated label set. Exactly one entry is appended to each list per input
# label, so the two lists always stay aligned.
#
# Previously the "-1" branch fell through: it appended an index, then appended the
# stale `no_lexcat_label` from the previous iteration (or hit a NameError on the
# first one) and a second index, leaving the lists out of step. A bare `except:`
# hid this.
#
# The sentinel stays the string "-1" so existing checks against "-1" keep working.
ground_indexes = []
ground_labels_no_lexcat = []
for label in ground_labels_consolidated:
    if label in ("", "-1"):
        # Unusable label: carry the sentinel through as the label itself.
        no_lexcat_label = "-1"
    elif label.count("-") == 2:
        # Three-part label: keep the first character and the part after the second
        # "-" (str.index's second argument is a start offset), dropping the lexcat.
        no_lexcat_label = label[0] + "-" + label[label.index("-", 2) + 1:]
    else:
        # Any other shape: only the first character is kept.
        no_lexcat_label = label[0]

    ground_labels_no_lexcat.append(no_lexcat_label)
    # Labels missing from the consolidated label set get the "-1" sentinel index.
    ground_indexes.append(new_label_to_new_index.get(no_lexcat_label, "-1"))

In [14]:
# Sanity check: every stripped ground label must have exactly one index.
assert len(ground_labels_no_lexcat) == len(ground_indexes)

In [15]:
# Fill in the prediction / ground-truth metadata columns.
label_columns = {
    "Predicted Tag": predicted_labels,
    "Predicted Index": predicted_label_indexes,
    "Ground": ground_labels_no_lexcat,
    "Ground Index": ground_indexes,
}
for column_name, column_values in label_columns.items():
    new_df[column_name] = column_values

In [16]:
# Preview: the label columns are filled, the score columns are still empty.
new_df.head(n=5)

Unnamed: 0,Token Index,Tokens,Predicted Tag,Predicted Index,Ground,Ground Index,O,I,O-v.stative,O-n.group,...,B-??,B-n.plant,o-v.communication,o-v.emotion,b-n.substance,B-n.other,b-v.possession,b-n.quantity,o-n.naturalobject,b-n.location
0,0,@JoJoLyrics,O-n.communication,17,O,0,,,,,...,,,,,,,,,,
1,1,I,O,0,O,0,,,,,...,,,,,,,,,,
2,2,hear,O-v.perception,26,O-v.perception,26,,,,,...,,,,,,,,,,
3,3,enough,O,0,O,0,,,,,...,,,,,,,,,,
4,4,talking,O-v.communication,7,O-v.communication,7,,,,,...,,,,,,,,,,


In [17]:
def sum_scores(new_label, row_index):
    """Return the total score of one consolidated label for one row.

    Looks up the old label indexes listed under ``new_label`` in the
    module-level ``new_labels`` and adds up the values found at ``row_index``
    in the matching ``dimsum_df`` columns (columns are named by the string
    form of the old index). Returns the integer 0 when the label covers no
    old indexes.
    """
    total = 0

    for old_index in new_labels[new_label]:
        score_column = dimsum_df[str(old_index)]
        total = total + score_column[row_index]

    return total

In [18]:
# Everything after the six metadata columns is a consolidated-label score column.
score_column_names = list(new_df.columns)[6:]

# For each consolidated label, collect its summed score for every row (in row order).
score_columns = {
    label_name: [sum_scores(label_name, row_label) for row_label in new_df.index]
    for label_name in score_column_names
}

for label_name in score_column_names:
    new_df[label_name] = score_columns[label_name]

In [19]:
# Preview: the score columns now hold the summed per-label confidences.
new_df.head(n=5)

Unnamed: 0,Token Index,Tokens,Predicted Tag,Predicted Index,Ground,Ground Index,O,I,O-v.stative,O-n.group,...,B-??,B-n.plant,o-v.communication,o-v.emotion,b-n.substance,B-n.other,b-v.possession,b-n.quantity,o-n.naturalobject,b-n.location
0,0,@JoJoLyrics,O-n.communication,17,O,0,0.311954,0.188062,0.087458,0.021254,...,2.375879e-08,3.950812e-06,2.599846e-07,4.327444e-08,1.617358e-06,2.311371e-08,1.009156e-06,6.765463e-08,9.082696e-08,1.509233e-08
1,1,I,O,0,O,0,0.995519,2.1e-05,3e-06,3e-06,...,1.589938e-08,1.71568e-09,5.441116e-09,2.073083e-10,1.186772e-10,1.868365e-10,6.800536e-10,1.977087e-09,3.764251e-10,1.347726e-10
2,2,hear,O-v.perception,26,O-v.perception,26,0.000691,0.007862,0.000761,2e-05,...,7.995566e-09,7.173501e-08,6.073835e-06,4.889142e-08,1.474222e-07,1.158052e-08,1.779303e-07,3.386133e-07,1.222154e-07,7.930803e-09
3,3,enough,O,0,O,0,0.951448,0.021746,9.5e-05,3.3e-05,...,8.819788e-08,9.614047e-08,9.845042e-08,2.515755e-08,1.67335e-08,3.748133e-09,7.123056e-08,2.861959e-07,4.182288e-08,4.485106e-09
4,4,talking,O-v.communication,7,O-v.communication,7,0.005734,0.295478,0.000332,2.9e-05,...,1.027149e-07,1.736899e-07,1.132739e-06,1.610143e-06,3.967633e-08,2.301025e-08,2.259521e-07,6.651941e-08,1.057945e-07,3.407447e-08


In [20]:
# Persist the consolidated scores without writing the DataFrame index.
new_df.to_csv(path_or_buf=dimsum_consolidated_path, index=False)