## DiMSUM Confidence Scores

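- Modified `dimsum_to_jsonl.py` to include a `label` field, built by composing the subtag fields into one string
- Read the DiMSUM test set using `dataset_reader`
- Convert `all.csv` (scores over the original labels) to `all_consolidated.csv` (scores over the consolidated labels)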

In [1]:
# The kernel starts in the notebook's own folder, three levels below the repository root.
# Move up to the streusle-tagger root so the relative paths used in later cells resolve.
%cd ../../../

C:\Michael\dev\streusle-tagger


In [4]:
# System imports
import json
import math
import os
import pickle
import sys

from copy import deepcopy

# Add parent of streusle-tagger to path (streusle should be in this folder)
sys.path.append("../streusle")

# External imports
import allennlp.nn.util as util
import numpy as np
import pandas as pd

from allennlp.common import Params
from allennlp.common.util import import_submodules
from allennlp.data.dataset_readers import DatasetReader
from allennlp.training.util import datasets_from_params

import_submodules("streusle_tagger")

# Load the experiment configuration; it supplies both the datasets and the "dataset_reader" section.
params = Params.from_file("training_config/streusle_bert_large_cased/streusle_bert_large_cased_no_constraints.jsonnet")
# Each consumer below receives its own deep copy, so `params` itself is never modified here.
# NOTE(review): presumably needed because these calls consume keys from the Params object — confirm.
datasets = datasets_from_params(deepcopy(params))
# Pull the "dataset_reader" section out of a copy and build the reader used by later cells.
dataset_reader_params = deepcopy(params).pop("dataset_reader")
dataset_reader = DatasetReader.from_params(dataset_reader_params)

_jsonnet not loaded, treating training_config/streusle_bert_large_cased/streusle_bert_large_cased_no_constraints.jsonnet as json
Your BERT model appears to be cased, but your indexer is lowercasing tokens.
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.
2723it [00:00, 6119.42it/s]
554it [00:00, 5651.72it/s]
535it [00:00, 12446.63it/s]
Your BERT model appears to be cased, but your indexer is lowercasing tokens.
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


In [None]:
# Destination CSV for the confidence scores re-expressed over the consolidated label set.
dimsum_consolidated_path = "calibration/confidence_scores/dimsum_test/all_consolidated.csv"

# `new_labels` is used by later cells as a mapping from each consolidated label to the
# original label indexes it groups together.
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this project.
with open("calibration/consolidated_labels.pickle", "rb") as pickle_file:
    new_labels = pickle.load(pickle_file)

In [3]:
# Contents of labels_dict.csv (not referenced again in the cells shown in this notebook).
labels_df = pd.read_csv("calibration/labels_dict.csv")

# DiMSUM test set in the JSON layout consumed by `read` below.
dimsum_test_path = "data/dimsum16/dimsum16_test_updated_labeled_reformatted.json"

def read(file_path, reader=None):
    """Yield one instance per sentence stored in a DiMSUM JSON file.

    The file is expected to hold a JSON array of sentence objects, each carrying
    the parallel lists ``"tokens"``, ``"upos_tags"``, ``"lemmas"`` and ``"label"``.

    Args:
        file_path: Path of the JSON file to read.
        reader: Object whose ``text_to_instance(tokens, upos_tags, lemmas, labels)``
            builds each instance. Defaults to the notebook-level ``dataset_reader``.

    Yields:
        Whatever ``reader.text_to_instance`` returns for each sentence, in file order.

    Raises:
        KeyError: If a sentence object lacks one of the four expected keys.
    """
    if reader is None:
        reader = dataset_reader

    # Parse everything up front so the file handle is released before the first
    # yield, instead of staying open for as long as the generator is alive.
    with open(file_path, 'r') as tagging_file:
        tagging_data = json.load(tagging_file)

    for sentence_index, sentence in enumerate(tagging_data):
        # Coarse progress indicator: one line every 200 sentences.
        if sentence_index % 200 == 0:
            print(sentence_index)
        # Pass fresh list copies so the instance never aliases the parsed JSON.
        yield reader.text_to_instance(
            list(sentence["tokens"]),
            list(sentence["upos_tags"]),
            list(sentence["lemmas"]),
            list(sentence["label"]),
        )
            
# Materialise the generator so the instances are held in a list for the cells below.
dimsum_test = list(read(dimsum_test_path))

0


KeyError: 'source'

In [None]:
# Flatten the per-sentence tag labels into a single token-level list.
ground = [
    tag
    for instance in dimsum_test
    for tag in instance.get("tags").labels
]

In [None]:
def _force_lexcat(label, lexcat):
    """Return ``label`` with the field between its first two dashes replaced by ``lexcat``.

    ``label`` is assumed to look like ``<prefix>-<lexcat>-<rest>``. The second dash
    is searched for relative to the first one (``str.index``'s second argument is a
    start offset, not an occurrence count), so the prefix may be any length.

    Raises:
        ValueError: If ``label`` contains fewer than two ``"-"`` separators.
    """
    first_dash = label.index("-")
    second_dash = label.index("-", first_dash + 1)
    return label[:first_dash + 1] + lexcat + label[second_dash:]


# If there's a noun or verb supersense label, the lexcat has to be NOUN or VERB
# (exceptions are usually due to MWEs).
corrected_ground = []
for label in ground:
    if "-n." in label and "NOUN" not in label:
        label = _force_lexcat(label, "NOUN")
    elif "-v." in label and "VERB" not in label:
        label = _force_lexcat(label, "VERB")
    corrected_ground.append(label)

In [None]:
# Merge the CSVs for all sentences and save the merged version.
confidence_scores_path = "calibration/confidence_scores/dimsum_test"

# os.listdir() returns names in arbitrary order, but the merged rows are later paired
# with the ground labels purely by position, so the files are concatenated in sorted order.
# NOTE(review): this assumes the file names are zero-padded so that lexicographic order is
# sentence order. The "0" prefix filter skips all.csv / all_consolidated.csv, but would also
# skip any per-sentence file whose name does not begin with "0" — confirm the naming scheme.
dfs = []
for filename in sorted(os.listdir(confidence_scores_path)):
    if filename.startswith("0") and filename.endswith(".csv"):
        dfs.append(pd.read_csv(f"{confidence_scores_path}/{filename}"))
df = pd.concat(dfs, ignore_index=True)

df.to_csv(f"{confidence_scores_path}/all.csv", index=False)

In [None]:
# Reload the merged table from disk; the following cells work from this frame.
dimsum_df = pd.read_csv(f"{confidence_scores_path}/all.csv")
# Preview the first rows.
dimsum_df.head()

In [None]:
# The confidence-score run may still be in progress, so attach only as many corrected
# ground labels as there are rows scored so far (rows and labels are paired by position).
scored_row_count = len(dimsum_df)
dimsum_df["Ground"] = corrected_ground[:scored_row_count]

In [None]:
# Column layout of the consolidated table: six descriptive columns followed by
# one score column per consolidated label.
new_df_columns = [
    "Token Index", "Tokens", "Predicted Tag", "Predicted Index", "Ground", "Ground Index",
    *new_labels.keys(),
]
new_df = pd.DataFrame(columns=new_df_columns)

new_df["Token Index"] = dimsum_df["Unnamed: 0"]
new_df["Tokens"] = dimsum_df["Tokens"]

# Original label index -> consolidated label that absorbs it.
old_index_to_new_label = {
    old_index: consolidated_label
    for consolidated_label, old_indexes in new_labels.items()
    for old_index in old_indexes
}

# Consolidated label -> its position within the consolidated label set.
new_labels_list = list(new_labels)
new_label_to_new_index = {
    consolidated_label: position
    for position, consolidated_label in enumerate(new_labels_list)
}

In [None]:
# Translate each predicted original-label index into the consolidated label set:
# first to the consolidated label, then to that label's consolidated index.
predicted_labels = [
    old_index_to_new_label[old_index]
    for old_index in dimsum_df["Predicted Tag Indexes"]
]
predicted_label_indexes = [
    new_label_to_new_index[consolidated_label]
    for consolidated_label in predicted_labels
]

In [None]:
# Translates the lexcat written in a manual "lexcat=" note into the lexcat name that is
# looked up in the consolidated label set; names absent from this map are used unchanged.
labels_map = {"CCONJ": "CONJ",
              "DISC": "X",
              "INF": "PART",
              "INF.P": "PART",
              "N": "NOUN",
              "P": "ADP",
              "POSS": "PART",
              "PP": "ADP",
              "PRON.POSS": "PRON",
              "V": "VERB",
              "V.IAV": "VERB",
              "V.LVC.cause": "VERB",
              "V.LVC.full": "VERB",
              "V.VID": "VERB",
              "V.VPC.full": "VERB",
              "V.VPC.semi": "VERB",
              "_": "X"}

# Consolidated index of every ground label, or -1 (after printing the row number and
# label) when the label has no counterpart in the consolidated label set.
ground_indexes = []
for i, label in enumerate(dimsum_df["Ground"]):
    # Normalise spellings that differ from the consolidated label set.
    label = label.replace("natural_object", "naturalobject")
    label = label.replace("PROPN", "NOUN")

    # For dealing with manually annotated cases where a "lexcat=" note is provided:
    # keep everything up to the first dash and append the (mapped) lexcat from the note.
    # A label built this way that is missing from the label set still raises KeyError.
    if "lexcat=" in label:
        new_lexcat = label[label.index("=") + 1:]
        mapped_new_lexcat = labels_map.get(new_lexcat, new_lexcat)
        new_label = label[:label.index("-") + 1] + mapped_new_lexcat
        ground_indexes.append(new_label_to_new_index[new_label])
        continue

    # Every label starting with "I" / "i" collapses onto the single "I-X" / "i-X" label.
    if label.startswith("I"):
        lookup_label = "I-X"
    elif label.startswith("i"):
        lookup_label = "i-X"
    else:
        lookup_label = label

    try:
        ground_indexes.append(new_label_to_new_index[lookup_label])
    except KeyError:
        # Only a missing label is expected here; anything else should surface.
        print(i, "\t", label)
        # Integer sentinel keeps the column numeric alongside the real indexes.
        ground_indexes.append(-1)

In [None]:
# Fill in the prediction and ground columns of the consolidated table.
for column_name, column_values in (
    ("Predicted Tag", predicted_labels),
    ("Predicted Index", predicted_label_indexes),
    ("Ground", dimsum_df["Ground"]),
    ("Ground Index", ground_indexes),
):
    new_df[column_name] = column_values

In [None]:
# Preview the consolidated table after the prediction and ground columns are filled in.
new_df.head()

In [None]:
def sum_scores(new_label, row_index):
    """Add up one row's scores over every original label merged into ``new_label``.

    Reads the notebook-level ``new_labels`` (consolidated label -> original label
    indexes) and ``dimsum_df`` (whose score columns are named by the original
    label index written as a string).

    Args:
        new_label: Key of ``new_labels`` whose grouped scores are summed.
        row_index: Index label of the ``dimsum_df`` row to read.

    Returns:
        The running total, starting from 0 and adding the scores in the order the
        original indexes are listed; 0 if ``new_label`` groups nothing.
    """
    column_names = [str(old_index) for old_index in new_labels[new_label]]

    total = 0
    for column_name in column_names:
        total = total + dimsum_df[column_name][row_index]
    return total

In [None]:
# The first six columns of new_df are descriptive; every remaining column is named
# after a consolidated label and receives that label's score.
score_column_names = list(new_df.columns)[6:]

# A consolidated score is the sum of the scores of the original labels it groups.
# Whole columns are added at a time (rather than looking up one cell per row and label),
# in the same left-to-right order as a per-row running total, so the values are unchanged.
score_columns = {}
for c in score_column_names:
    column_total = pd.Series(0, index=dimsum_df.index)
    for old_index in new_labels[c]:
        column_total = column_total + dimsum_df[str(old_index)]
    score_columns[c] = column_total

for c in score_column_names:
    new_df[c] = score_columns[c]

In [None]:
# Preview the consolidated table once the score columns are filled in.
new_df.head()

In [None]:
# Write the consolidated table to disk without the DataFrame index column.
new_df.to_csv(dimsum_consolidated_path, index=False)