# Dialogue and Narrative Coursework - Subtask 1

In [1]:
from datasets import load_dataset, Dataset
import pandas as pd
import numpy as np
import transformers
from transformers import AutoTokenizer, default_data_collator
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

2022-01-10 13:09:16.747413: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-10 13:09:16.747434: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


### Build BERT 

In [2]:
model_checkpoint = "distilbert-base-uncased"
batch_size = 50
train = False
num_epochs = 1
validate_full = True # set False to only validate on teh same contexts as training data
trained_checkpoint = 'data/bert_'+str(num_epochs)+'_epochs'
trained_checkpoint = "data/bert_matt_1/"  # other pretrained model for eval
validate_cuda = True

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [3]:
## A function to load a specific doc2dial dataset

def load_doc2dial_dataset(name='dialogue_domain', split='train'):
  cache_dir = "./data_cache"

  return load_dataset(
      "doc2dial",
      name=name,
      split=split,
      ignore_verifications=True,
      cache_dir=cache_dir,
  )

train_data = load_doc2dial_dataset(name="doc2dial_rc", split="train")
val_data = load_doc2dial_dataset(name="doc2dial_rc", split="validation")

Reusing dataset doc2dial (./data_cache/doc2dial/doc2dial_rc/1.0.1/cf6d3ed4e77cea477387dd51c171a021a09bd314cf3a2cb2a6431ca738c6c0ee)
Reusing dataset doc2dial (./data_cache/doc2dial/doc2dial_rc/1.0.1/cf6d3ed4e77cea477387dd51c171a021a09bd314cf3a2cb2a6431ca738c6c0ee)


In [4]:
def preprocess_function(examples):
    short_questions = [q.split('\t')[0][5:] for q in examples["question"]]

    questions = [q.strip() for q in short_questions]

    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=512,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [5]:
def preprocess_val(examples):
    short_questions = [q.split('\t')[0][5:] for q in examples["question"]]

    questions = [q.strip() for q in short_questions]

    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=512,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

#     offset_mapping = inputs.pop("offset_mapping")
    offset_mapping = inputs["offset_mapping"]
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        # print(start_char, end_char, sequence_ids)
        # print(offset)
        # print()

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    inputs["answers"] = examples["answers"]
    inputs["question"] = questions
    inputs["context"] = examples["context"]
    return inputs

In [6]:
# tokenise data
tok_train_data = train_data.map(preprocess_function, batched=True, remove_columns=train_data.column_names)
tok_val_data = val_data.map(preprocess_function, batched=True, remove_columns=val_data.column_names)

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [7]:
if train:  
    model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
    args = TrainingArguments(
        f"{model_checkpoint}-finetuned-doc2dial",
    #     evaluation_strategy = "epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        evaluation_strategy="steps",
        logging_steps=500
    )

    trainer = Trainer(
        model,
        args,
        train_dataset=tok_train_data,
        eval_dataset=tok_val_data,
        data_collator=default_data_collator,
        tokenizer=tokenizer
        )
    # start training
    trainer.train()
else:
    model = AutoModelForQuestionAnswering.from_pretrained(trained_checkpoint)
    if validate_cuda:
        model.cuda()
    tok_val_data = val_data.map(preprocess_val, batched=True)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [8]:
tok_val_data = val_data.map(preprocess_val, batched=True)

Loading cached processed dataset at ./data_cache/doc2dial/doc2dial_rc/1.0.1/cf6d3ed4e77cea477387dd51c171a021a09bd314cf3a2cb2a6431ca738c6c0ee/cache-b6fde3218e6a26cc.arrow


In [9]:
# get inds of contexts not in the training set
from word_counts import word_counter
import utils
train_data = utils.load_own_rc_data(split="train")
val_data = utils.load_own_rc_data(split="validation")

counter = word_counter(train_data)
skip_inds = counter.no_context_availble(val_data)

good_data_inds = list(range(0, tok_val_data.num_rows))
for ind in skip_inds:
    good_data_inds.remove(ind)

# decide what data to validate
all_inds = list(range(0, tok_val_data.num_rows)) # all data use good_data_inds for context same as training
if validate_full:
    inds_to_val = all_inds
else:
    inds_to_val = good_data_inds


In [10]:
## Evaluate
from tqdm import tqdm
import torch
softmax = torch.nn.functional.softmax

def format_answer(qid, text, proba):
    return {"id":qid, "prediction_text":text, "no_answer_probability":1-proba}



# loop over to get all predictions
all_preds = []
for i in tqdm(inds_to_val):
    example = tok_val_data.select([i])
    short_question = [q.split('\t')[0][5:] for q in example["question"]]
    question = [q.strip() for q in short_question]

    inputs = tokenizer(question, 
                       example["context"], 
                        max_length=512,
                        truncation="only_second",
                        return_offsets_mapping=True,
                        padding="max_length",
                        return_tensors="pt"
        )

    offsets = inputs["offset_mapping"]
    sequence_ids = inputs.sequence_ids()
    
    if validate_cuda:
        inputs["input_ids"] = inputs["input_ids"].cuda()
        inputs["attention_mask"] = inputs["attention_mask"].cuda()
    outputs = model(inputs["input_ids"], inputs["attention_mask"])

    context = example[0]["context"]
    qid = example[0]["id"]
    offsets = example[0]["offset_mapping"]
    
    start_logits = softmax(outputs.start_logits[0], dim=0).cpu().detach().numpy()
    end_logits = softmax(outputs.end_logits[0], dim=0).cpu().detach().numpy()

    start_index = np.argsort(start_logits)[-1]
    end_index = np.argsort(end_logits)[-1]


    ## Checks the answer
    score = (start_logits[start_index] + end_logits[end_index]) / 2.0

    if start_index >= end_index: # No answer
        all_preds.append(format_answer(qid, "", score))
    elif sequence_ids[start_index] == 0 or sequence_ids[start_index] ==None: # Invalid answer
        all_preds.append(format_answer(qid, "", score))
    else:
        start_char = offsets[start_index][0]
        end_char = offsets[end_index][1]
        text = context[start_char: end_char]
        all_preds.append(format_answer(qid, text, score))


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4002/4002 [00:45<00:00, 87.70it/s]


In [11]:
# run eval metrics
import json
file = 'predictions_bert_all_data_1000.json'
with open(file, 'w') as outfile:
    json.dump(all_preds, outfile)

import os
cmd = 'python sharedtask_utils.py --task subtask1 --prediction_json '+file
os.system(cmd)

Reusing dataset doc2dial (/home/matt/.cache/huggingface/datasets/doc2dial/doc2dial_rc/1.0.1/cf6d3ed4e77cea477387dd51c171a021a09bd314cf3a2cb2a6431ca738c6c0ee)
2022-01-10 13:10:31.912401: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-10 13:10:31.912418: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


predictions_bert_all_data_1000.json
{'exact': 7.021489255372314, 'f1': 27.12797122640315, 'total': 4002, 'HasAns_exact': 7.021489255372314, 'HasAns_f1': 27.12797122640315, 'HasAns_total': 4002, 'best_exact': 7.021489255372314, 'best_exact_thresh': 0.8715786337852478, 'best_f1': 27.127971226403155, 'best_f1_thresh': 0.8925076723098755}


0

In [12]:
# good: {'exact': 7.261968800430339, 'f1': 27.17396473274665, 'total': 1859, 'HasAns_exact': 7.261968800430339, 'HasAns_f1': 27.17396473274665, 'HasAns_total': 1859, 'best_exact': 7.261968800430339, 'best_exact_thresh': 0.7820892930030823, 'best_f1': 27.173964732746647, 'best_f1_thresh': 0.8885061740875244}
# all:  {'exact': 7.021489255372314, 'f1': 27.12797122640315, 'total': 4002, 'HasAns_exact': 7.021489255372314, 'HasAns_f1': 27.12797122640315, 'HasAns_total': 4002, 'best_exact': 7.021489255372314, 'best_exact_thresh': 0.871579110622406, 'best_f1': 27.127971226403155, 'best_f1_thresh': 0.8925075531005859}