# Dialogue and Narrative Coursework - Subtask 1

In [1]:
## imports
# !pip install datasets
# !pip install transformers

from datasets import load_dataset, Dataset
import pandas as pd
import numpy as np

### Build BERT 

In [2]:
model_checkpoint = "distilbert-base-uncased"
batch_size = 16

In [3]:
## A function to load a specific doc2dial dataset

def load_doc2dial_dataset(name='dialogue_domain', split='train'):
  cache_dir = "./data_cache"

  return load_dataset(
      "doc2dial",
      name=name,
      split=split,
      ignore_verifications=True,
      cache_dir=cache_dir,
  )

train_data = load_doc2dial_dataset(name="doc2dial_rc", split="train")
val_data = load_doc2dial_dataset(name="doc2dial_rc", split="validation")

Reusing dataset doc2dial (./data_cache/doc2dial/doc2dial_rc/1.0.1/cf6d3ed4e77cea477387dd51c171a021a09bd314cf3a2cb2a6431ca738c6c0ee)
Reusing dataset doc2dial (./data_cache/doc2dial/doc2dial_rc/1.0.1/cf6d3ed4e77cea477387dd51c171a021a09bd314cf3a2cb2a6431ca738c6c0ee)


In [4]:
train_data[2]

{'answers': {'answer_start': [3714],
  'text': ['About ten percent of customers visiting a DMV office do not bring what they need to complete their transaction, and have to come back a second time to finish their business. This can be as simple as not bringing sufficient funds to pay for a license renewal or not having the proof of auto insurance required to register a car. Better yet , don t visit a DMV office at all, and see if your transaction can be performed online, like an address change, registration renewal, license renewal, replacing a lost title, paying a DRA or scheduling a road test. ']},
 'context': 'Many DMV customers make easily avoidable mistakes that cause them significant problems, including encounters with law enforcement and impounded vehicles. Because we see customers make these mistakes over and over again , we are issuing this list of the top five DMV mistakes and how to avoid them. \n\n1. Forgetting to Update Address \nBy statute , you must report a change of ad

In [5]:
## investigate lengths of contexts
max_length = 0
for ex in train_data:
  curr_len = len(ex["context"].split())
  if curr_len > max_length:
    max_length = curr_len

print(len(train_data[0]["context"].split()))
print("Maximum length for a context:", max_length)

781
Maximum length for a context: 4795


In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [7]:
import transformers
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [8]:
tokenizer('What should I do?', "Too lazy to write a context.")

{'input_ids': [101, 2054, 2323, 1045, 2079, 1029, 102, 2205, 13971, 2000, 4339, 1037, 6123, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
'Tell me anything'.split('?')[0]

'Tell me anything'

In [10]:
def preprocess_function(examples):
    short_questions = [q.split('\t')[0][5:] for q in examples["question"]]

    questions = [q.strip() for q in short_questions]

    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=512,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        # print(start_char, end_char, sequence_ids)
        # print(offset)
        # print()

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [11]:
res = preprocess_function(train_data[20:30])
res["start_positions"]

[388, 0, 0, 140, 447, 388, 143, 0, 131, 0]

In [12]:
tok_train_data = train_data.map(preprocess_function, batched=True, remove_columns=train_data.column_names)

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))

2022-01-06 21:53:10.991415: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-06 21:53:10.991433: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.





In [13]:
tok_val_data = val_data.map(preprocess_function, batched=True, remove_columns=val_data.column_names)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [14]:
tok_train_data

Dataset({
    features: ['attention_mask', 'end_positions', 'input_ids', 'start_positions'],
    num_rows: 20598
})

In [15]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

In [16]:
batch_size = 50
args = TrainingArguments(
    f"{model_checkpoint}-finetuned-doc2dial",
#     evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=6,
    weight_decay=0.01,
    evaluation_strategy="steps",
#     logging_strategy="steps",
    logging_steps=100
)

In [17]:
from transformers import default_data_collator

trainer = Trainer(
    model,
    args,
    train_dataset=tok_train_data,
    eval_dataset=tok_val_data,
    data_collator=default_data_collator,
    tokenizer=tokenizer
)

In [18]:
import gc
# del variables
gc.collect()

# trainer.train()

302

In [19]:
checkpoint = "distilbert-base-uncased-finetuned-doc2dial/checkpoint-1000/"
checkpoint = "bert_matt_1/"
trainer = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

In [20]:
def preprocess_val(examples):
    short_questions = [q.split('\t')[0][5:] for q in examples["question"]]

    questions = [q.strip() for q in short_questions]

    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=512,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

#     offset_mapping = inputs.pop("offset_mapping")
    offset_mapping = inputs["offset_mapping"]
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        # print(start_char, end_char, sequence_ids)
        # print(offset)
        # print()

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    inputs["answers"] = examples["answers"]
    inputs["question"] = questions
    inputs["context"] = examples["context"]
    return inputs

In [21]:
tok_val_data = val_data.map(preprocess_val, batched=True)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




Dataset({
    features: ['answers', 'attention_mask', 'context', 'domain', 'end_positions', 'id', 'input_ids', 'offset_mapping', 'question', 'start_positions', 'title'],
    num_rows: 4002
})

In [22]:
# inputs = tokenizer("What is your name?", 
#                    "My name is Russia and I love monsters and the witcher", 
# #                     max_length=512,
#                     truncation="only_second",
#                     return_offsets_mapping=True,
# #                     padding="max_length",
#                     return_tensors="pt"
#     )

# # print(inputs.keys())
# offsets = inputs["offset_mapping"]
# sequence_ids = inputs.sequence_ids()
# sequence_ids



In [23]:
## Evaluate
from tqdm import tqdm
import torch
softmax = torch.nn.functional.softmax

def format_answer(qid, text, proba):
    return {"id":qid, "prediction_text":text, "no_answer_probability":1-proba}

# loop over to get all predictions
all_preds = []
for i in tqdm(range(0, tok_val_data.num_rows)):
    example = tok_val_data.select([i])
    short_question = [q.split('\t')[0][5:] for q in example["question"]]
    question = [q.strip() for q in short_question]

    inputs = tokenizer(question, 
                       example["context"], 
                        max_length=512,
                        truncation="only_second",
                        return_offsets_mapping=True,
                        padding="max_length",
                        return_tensors="pt"
        )

    offsets = inputs["offset_mapping"]
    sequence_ids = inputs.sequence_ids()

    # inputs = tok_val_data.remove_columns(['answers', 'context', 'domain', 'id', 'end_positions',  'question', 'start_positions', 'title'])
    # inputs

    # outputs = trainer(**inputs)
    outputs = trainer(inputs["input_ids"], inputs["attention_mask"])

    context = example[0]["context"]
    qid = example[0]["id"]
    offsets = example[0]["offset_mapping"]
    
    start_logits = softmax(outputs.start_logits[0], dim=0).detach().numpy()
    end_logits = softmax(outputs.end_logits[0], dim=0).detach().numpy()
    
#     start_logits = softmax(outputs.start_logits[i].detach().numpy())
#     end_logits = softmax(outputs.end_logits[i].detach().numpy())

    start_index = np.argsort(start_logits)[-1]
    end_index = np.argsort(end_logits)[-1]


    ## Checks the answer
    score = (start_logits[start_index] + end_logits[end_index]) / 2.0
#     print(start_logits[start_index], end_logits[end_index])

    if start_index >= end_index:
#         print("No answer")
        all_preds.append(format_answer(qid, "", score))
    elif sequence_ids[start_index] == 0 or sequence_ids[start_index] ==None:
#         print('Invalid answer')
        all_preds.append(format_answer(qid, "", score))
    else:
        start_char = offsets[start_index][0]
        end_char = offsets[end_index][1]
        text = context[start_char: end_char]
#         print("starting en ending positions", start_pos, end_pos)
        all_preds.append(format_answer(qid, text, score))


100%|███████████████████████████████████| 4002/4002 [06:33<00:00, 10.17it/s]


In [24]:
# q = val_data[1]["question"]
# q

In [27]:
import json
file = 'predictions_2_steps.json'
with open(file, 'w') as outfile:
    json.dump(all_preds, outfile)

In [31]:
import os
cmd = 'python sharedtask_utils.py --task subtask1 --prediction_json '+file
os.system(cmd)

Reusing dataset doc2dial (/home/matt/.cache/huggingface/datasets/doc2dial/doc2dial_rc/1.0.1/cf6d3ed4e77cea477387dd51c171a021a09bd314cf3a2cb2a6431ca738c6c0ee)
2022-01-06 22:09:51.521104: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-06 22:09:51.521122: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


predictions_2_steps.json
{'exact': 7.021489255372314, 'f1': 27.12797122640315, 'total': 4002, 'HasAns_exact': 7.021489255372314, 'HasAns_f1': 27.12797122640315, 'HasAns_total': 4002, 'best_exact': 7.021489255372314, 'best_exact_thresh': 0.871579110622406, 'best_f1': 27.127971226403155, 'best_f1_thresh': 0.8925075531005859}


0