In [None]:
# The kernel starts in the notebook's own folder; move three levels up to the streusle-tagger root so the relative paths used below resolve.
%cd ../../..

In [None]:
# System imports
import json
import math
import os
import sys

from copy import deepcopy

# Add parent of streusle-tagger to path (streusle should be in this folder)
# NOTE(review): the working directory has already been moved by the `%cd ../../..`
# in the first cell, so this relative entry now resolves three levels above the
# new working directory rather than to its direct parent -- confirm this is the
# directory that actually contains the streusle checkout.
sys.path.append("../../..")

# External imports
import allennlp.nn.util as util
import numpy as np
import pandas as pd
import torch

from allennlp.common import Params
from allennlp.common.util import import_submodules
from allennlp.data import Instance
from allennlp.data.dataset import Batch
from allennlp.data.dataset_readers import DatasetReader
from allennlp.models.archival import load_archive
from allennlp.nn.util import logsumexp
from allennlp.training.optimizers import Optimizer
from allennlp.training.util import datasets_from_params

import_submodules("streusle_tagger")

In [None]:
# Experiment configuration the archived model was trained with.
params = Params.from_file("training_config/streusle_bert_large_cased/streusle_bert_large_cased_no_constraints.jsonnet")

# Old model
# archive = load_archive("models/streusle_bert_large_cased_no_constraints/model.tar.gz")

archive = load_archive("models/new_no_constraints/model.tar.gz")
model = archive.model

# Two-way mapping between tag strings and their vocabulary indexes.
index_to_label = model.vocab.get_index_to_token_vocabulary(model._label_namespace)
label_to_index = {label: index for index, label in index_to_label.items()}

# Build the lookup table from (label, index) pairs. Passing the dict itself
# together with `columns=` makes pandas look for dict keys named "Label" and
# "Index", which yields a frame without any rows.
labels_df = pd.DataFrame(list(label_to_index.items()), columns=["Label", "Index"])

In [None]:
# Each consumer works on its own deep copy, so `params` itself is left untouched.
datasets = datasets_from_params(deepcopy(params))

# Build a stand-alone dataset reader from the "dataset_reader" section of the config.
reader_config = deepcopy(params)
dataset_reader_params = reader_config.pop("dataset_reader")
dataset_reader = DatasetReader.from_params(dataset_reader_params)

In [None]:
dimsum_test_path = "data/dimsum16/dimsum16_test_updated_labeled.json"
dimsum_test_reformatted = "data/dimsum16/dimsum16_test_updated_labeled_reformatted.json"

# The source file is expected to hold one JSON value per line. Wrap those values
# in a single JSON array so the whole file can be parsed with one `json.load`.
with open(dimsum_test_path, "r") as f:
    # Strip only the line terminator: slicing off the last character would eat a
    # real character when the final line has no trailing newline. Blank lines
    # are skipped because they would become empty (invalid) array elements.
    records = [line.rstrip("\r\n") for line in f if line.strip()]

with open(dimsum_test_reformatted, "w") as f:
    f.write("[\n" + ",\n".join(records) + "\n]")

In [None]:
def read(file_path):
    """Lazily turn every sentence of a JSON array file into a model instance.

    Each element of the array must provide the keys "tokens", "upos_tags" and
    "lemmas". Gold labels are not read because they are not needed for
    confidence scores. The index of every 200th sentence is printed as a
    simple progress indicator.

    Yields whatever the notebook-level `dataset_reader.text_to_instance`
    returns for (tokens, upos_tags, lemmas).
    """
    with open(file_path, 'r') as tagging_file:
        sentences = json.load(tagging_file)
        for sentence_number, sentence in enumerate(sentences):
            if sentence_number % 200 == 0:
                print(sentence_number)

            # Copy the three parallel sequences so the reader gets fresh lists.
            tokens = list(sentence["tokens"])
            upos_tags = list(sentence["upos_tags"])
            lemmas = list(sentence["lemmas"])

            yield dataset_reader.text_to_instance(tokens, upos_tags, lemmas)
            
# Materialise the generator into a list so the instances can be reused without re-reading the file.
dimsum_test = list(read(dimsum_test_reformatted))

In [None]:
def _as_sequence(logits):
    """Return `logits` as a 2-D (sequence_length, num_tags) tensor.

    A one-dimensional input is treated as a single-token sentence.
    """
    return logits.unsqueeze(0) if logits.dim() == 1 else logits


def denominator(crf, logits):
    """Run the forward-backward algorithm over one sentence.

    Parameters
    ----------
    crf : object exposing `start_transitions` (num_tags,), `end_transitions`
        (num_tags,) and `transitions` (num_tags, num_tags).  Assumes
        `transitions[i, j]` scores moving from tag i to tag j, which is how
        the emission of the *current* tag is broadcast against it.
    logits : tensor of emission scores, either (sequence_length, num_tags) or
        (num_tags,) for a one-token sentence.

    Returns
    -------
    forward_trellis : list with one (num_tags,) tensor per position; entry t
        holds, per tag, the log-sum of the scores of all path prefixes that
        end in that tag at position t (start transition and emissions
        included).
    backward_trellis : list with one (num_tags,) tensor per position; entry t
        holds, per tag, the log-sum of the scores of all path suffixes that
        follow that tag at position t (end transition included, emission at
        t excluded).
    denom : 0-d tensor, the log of the summed scores of all paths.
    """
    logits = _as_sequence(logits)
    sequence_length = logits.size(0)
    transitions = crf.transitions

    # Forward pass: alpha_t[j] = emit_t[j] + logsumexp_i(alpha_{t-1}[i] + trans[i, j]).
    forward_trellis = [crf.start_transitions + logits[0]]
    for position in range(1, sequence_length):
        previous = forward_trellis[-1].unsqueeze(1)  # (num_tags, 1): previous tag on rows
        forward_trellis.append(torch.logsumexp(previous + transitions, dim=0) + logits[position])

    denom = torch.logsumexp(forward_trellis[-1] + crf.end_transitions, dim=0)

    # Backward pass: beta_{t-1}[i] = logsumexp_j(trans[i, j] + emit_t[j] + beta_t[j]),
    # starting from beta_last = end transitions.  Built back-to-front, then
    # reversed so that the list is indexed by word position.
    backward_trellis = [crf.end_transitions]
    for position in range(sequence_length - 1, 0, -1):
        following = (logits[position] + backward_trellis[-1]).unsqueeze(0)  # (1, num_tags)
        backward_trellis.append(torch.logsumexp(transitions + following, dim=1))
    backward_trellis.reverse()

    return forward_trellis, backward_trellis, denom


def numerators(crf, logits, forward_trellis, backward_trellis, word_num):
    """Log of the summed scores of all paths through each tag at `word_num`.

    `forward_trellis` and `backward_trellis` must be the lists returned by
    `denominator` for the same sentence.  `crf` and `logits` are not needed
    once the trellises exist; they are accepted so the signature stays the
    same for existing callers.

    Returns a (num_tags,) tensor; subtracting `denom` and exponentiating gives
    the marginal probability of each tag at that position.
    """
    # Every path through tag j at position t is a prefix ending in j combined
    # with a suffix leaving j, so the two log-sums simply add.
    return forward_trellis[word_num] + backward_trellis[word_num]

In [None]:
def sentence_confidence(crf, sequence_logits):
    """Calculates matrix of confidence scores with num_words rows and num_tags columns.

    Parameters
    ----------
    crf : the CRF whose transition scores are used; it is forwarded to
        `denominator` and `numerators` (no notebook-level model is consulted).
    sequence_logits : emission scores for one sentence, shaped
        (num_words, num_tags), or (num_tags,) for a one-word sentence.

    Returns
    -------
    A list with one row per word; each row is a list of Python floats
    `exp(numerator - denominator)`, one per tag.
    """
    num_words = 1 if sequence_logits.dim() == 1 else sequence_logits.size(0)

    forward_trellis, backward_trellis, denom = denominator(crf, sequence_logits)

    confidence_matrix = []
    for word_num in range(num_words):
        nums = numerators(crf, sequence_logits, forward_trellis, backward_trellis, word_num)
        confidence_matrix.append([math.exp(num - denom) for num in nums])

    return confidence_matrix
        
def dataset_confidence_no_labels(dataset, dataset_name):
    """Creates one CSV file per sentence, containing metadata and confidence scores for all tag-token pairs.

    Parameters
    ----------
    dataset : iterable of instances accepted by `Batch`; each must expose a
        "tokens" field.
    dataset_name : name of the sub-directory of `calibration/confidence_scores`
        that receives the files (created if missing).

    Each file is named after the zero-padded position of the sentence in
    `dataset` and has one row per token with the columns "Tokens",
    "Predicted Tags", "Predicted Tag Indexes" followed by one column per tag
    index holding that tag's confidence score.

    Uses the notebook-level `model` and `index_to_label`.
    """
    save_path = f"calibration/confidence_scores/{dataset_name}"
    os.makedirs(save_path, exist_ok=True)

    columns = ["Tokens", "Predicted Tags", "Predicted Tag Indexes"] + list(range(model.crf.num_tags))

    # Score in inference mode: with dropout active the logits used for the
    # confidence scores and the separately predicted tags would come from
    # different stochastic passes.  The previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for scoring, so skip building autograd graphs.
        with torch.no_grad():
            for instance_number, instance in enumerate(dataset):
                instance_batch = Batch([instance])
                instance_batch.index_instances(model.vocab)
                tensor_dict = instance_batch.as_tensor_dict()

                # Confidence scores
                print(f"Calculating confidence scores for instance {instance_number}...")
                embedded_tokens = model.text_field_embedder(tensor_dict["tokens"])
                # Drop only the batch axis so a one-token sentence stays 2-D.
                logits = model.tag_projection_layer(embedded_tokens).squeeze(0)
                confidence_matrix = sentence_confidence(model.crf, logits)

                # Metadata: one column each for token text, predicted tag and its index
                tokens_list = np.array([[str(token) for token in instance.get("tokens").tokens]]).transpose()
                predicted_tags_indexes = model(**tensor_dict)["tags"][0]
                predicted_tags = np.array([[index_to_label[tag_index] for tag_index in predicted_tags_indexes]]).transpose()
                predicted_tags_indexes = np.array([predicted_tags_indexes]).transpose()
                metadata = np.concatenate((tokens_list, predicted_tags, predicted_tags_indexes), axis=1)

                # Combine metadata and confidence scores
                data = np.concatenate((metadata, confidence_matrix), axis=1)

                # Write to file
                df = pd.DataFrame(data, columns=columns)
                df.to_csv(f"{save_path}/{instance_number:04d}.csv")
    finally:
        model.train(was_training)

In [None]:
# Write one confidence-score CSV per DiMSUM test sentence under calibration/confidence_scores/dimsum_test.
dataset_confidence_no_labels(dimsum_test, "dimsum_test")