# Data exploration

In [430]:
from datasets import load_dataset
from transformers import pipeline
from tqdm import tqdm
from transformers import AutoTokenizer, BertForQuestionAnswering, BertTokenizer, DistilBertTokenizerFast, AutoModelForQuestionAnswering, TrainingArguments, Trainer
import time
import numpy as np
import pandas as pd
import torch
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [2]:
split = "train"
cache_dir = "./data_cache"

dialogue_dataset = load_dataset(
    "doc2dial",
    name="dialogue_domain",  # this is the name of the dataset for the second subtask, dialog generation
    split=split,
    ignore_verifications=True,
    cache_dir=cache_dir,
)

Reusing dataset doc2dial (./data_cache/doc2dial/dialogue_domain/1.0.1/c15afdf53780a8d6ebea7aec05384432195b356f879aa53a4ee39b740d520642)


In [3]:
document_dataset = load_dataset(
    "doc2dial",
    name="document_domain",  # this is the name of the dataset for the second subtask, dialog generation
    split=split,
    ignore_verifications=True,
    cache_dir=cache_dir,
)

Reusing dataset doc2dial (./data_cache/doc2dial/document_domain/1.0.1/c15afdf53780a8d6ebea7aec05384432195b356f879aa53a4ee39b740d520642)


## Creating the dataset

Steps:
- [X] Sliding windows from the Document
- [ ] Extract user utterance
- [ ] Extract Dialogue history

### Sliding windows from the Document

In [281]:
# Tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('bert-base-uncased')

# Defining train_dict
train_dict = dict()
train_dict['train_document'] = []
train_dict['train_id_sp'] = []
train_dict['train_user_utterance'] = []
train_dict['train_doc_domain'] = []
train_dict['train_doc_id'] = []
train_dict['train_text_sp'] = []
train_dict['train_dial_id_turn_id'] = []     # necessary for evaluation
train_dict['train_start_pos'] = []     
train_dict['train_end_pos'] = []     
train_dict['train_start_tok'] = []     
train_dict['train_end_tok'] = []  

start = time.time()
for idx, dialogue in tqdm(enumerate(dialogue_dataset)):
    if idx == 10:
        break
    dial_id_turn_id = []       # running list of <dial_id>_<turn_id> for evaluation
    sp_id_list = []            # running list of spans per document
    user_utterance_list = []   # running list of user utterances per document
    
    for turn in dialogue['turns']:
        dial_id_turn_id.append(dialogue['dial_id'] + '_' + str(turn['turn_id']))
        if turn['role'] == 'user':
            # TURN UTTERANCE IS FLATTENED AND ONLY THE [INPUT_IDS] IS STORED
            turn['utterance'] = tokenizer(turn['utterance'], padding=True, truncation=True, return_tensors="pt")['input_ids'].view(-1)
            user_utterance_list.append(turn['utterance'])   # adding user utterance to user_utterance_list
        else:
            references = turn['references']
            ref_sp_id = []
            for ref in references:
                ref_sp_id.append(ref['sp_id'])
            sp_id_list.append(ref_sp_id)          # adding list of sp_ids per dialogue to list of sp_ids per document
    train_dict['train_id_sp'].append(sp_id_list)
    train_dict['train_user_utterance'].append(user_utterance_list)
    train_dict['train_doc_domain'].append(dialogue['domain'])
    train_dict['train_doc_id'].append(dialogue['doc_id'])
    train_dict['train_dial_id_turn_id'].append(dial_id_turn_id)
    
    for doc in document_dataset:
        if doc['doc_id'] == train_dict['train_doc_id'][-1]:
            # DOCUMENT TEXT IS NOT A TENSOR. PREVIOUSLY WE HAD tokenizer( )['index_ids'].view(-1)
            doc['doc_text'] = tokenizer(doc['doc_text'], padding=True, truncation=False, return_tensors="pt")
            train_dict['train_document'].append(doc['doc_text'])          # adding the total document text
            text_sp_2 = []            
            start_sp_list = []         # big start sp list
            end_sp_list = []           # big end sp list        
            start_tok_list = []         # big start token list
            end_tok_list = []           # big end token list     
            for train_spans_id in train_dict['train_id_sp'][-1]:    
                text_sp = ""         
                ref_start_pos_list = []
                ref_end_pos_list = []      
                for span in doc['spans']:                    
                    if span['id_sp'] in train_spans_id:
                        text_sp += span['text_sp']                        
                        ref_start_pos_list.append(span['start_sp'])
                        ref_end_pos_list.append(span['end_sp'])    
                start_pos = np.amin(ref_start_pos_list)
                start_sp_list.append(start_pos)
                # convert start_pos to start_token
                start_tok_pos = doc['doc_text'].char_to_token(start_pos)
                # check that start_tok_pos is not None
                start_tok_list.append(start_tok_pos)
                # convert end_pos to end_token
                end_pos = np.amax(ref_end_pos_list)
                end_sp_list.append(end_pos)
                end_tok_pos = doc['doc_text'].char_to_token(end_pos)
                while end_tok_pos == None:
                    print('This is the faulty character: {end_pos}')
                    end_pos = end_pos - 1
                    end_tok_pos = doc['doc_text'].char_to_token(end_pos)
                    print('This is the faulty character: {end_pos}')
                end_tok_list.append(end_tok_pos)
                text_sp_2.append(text_sp)
            train_dict['train_text_sp'].append(text_sp_2)
            train_dict['train_start_pos'].append(start_sp_list)
            train_dict['train_end_pos'].append(end_sp_list)
            train_dict['train_start_tok'].append(start_tok_list)
            train_dict['train_end_tok'].append(end_tok_list)
            break
end = time.time()
print(f'Total time: {end-start}')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizerFast'.
0it [00:00, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
1it [00:00,  4.58it/s]

This is the faulty character: {end_pos}
This is the faulty character: {end_pos}
This is the faulty character: {end_pos}
This is the faulty character: {end_pos}


5it [00:01,  5.03it/s]

This is the faulty character: {end_pos}
This is the faulty character: {end_pos}
This is the faulty character: {end_pos}
This is the faulty character: {end_pos}


10it [00:02,  4.87it/s]

Total time: 2.0558907985687256





Results:

# Functions for Dataset / Sliding Window

In [316]:
def text_mask(question, text):
    '''   
    text['input_ids'].view(-1)[1:] was on the line below where 'text' is now - 
    need to do this to text before sending it into this function
    
    input_ids: will be the question and the window of the document concat together
    segment_ids: is a mask that makes the two sentences distinct 1's for question 0 for document text
    '''
    input_ids=torch.cat((question, text), 0)
    SEP_token_id=102
    sep_idx = (input_ids == 102).nonzero(as_tuple=False)[0][0].item()
    num_seg_a = sep_idx+1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0]*num_seg_a + [1]*num_seg_b
    assert len(segment_ids) == len(input_ids)
    return input_ids, segment_ids

def add_sep_tokens(windows):
    tmp = []
    sep_token = 102
    for window in windows:
        end = len(window) - 1
        if window[end] != sep_token:
            # add a SEP token at the end
            tmp.append(torch.cat((window, torch.tensor([102])),0))
        else:
            tmp.append(window)
    return tmp

def sliding_windows(question, document, start_token=None, end_token=None, stride=256):
    # tokenized input_ids is the document - remove [CLS] before sending through   
    # TODO: if start_token and end_token != None, check that the context contains the answer. 
    # This is only used during training 
    windows = []
    model_tok_limit = 511  # model can take 512 tokens maximum - -1 to add a sep token at end of each window
    start = 0
    end = model_tok_limit - len(question)
    doc_size = len(document)
        
    # handling edge case of documents smaller than models input (512 tokens)   
    if len(document) <= model_tok_limit:
        end = len(document)
    
    while(start <= doc_size):
        # print(start, end, doc_size)
        training = (start_token != None and end_token != None) 
        question_length=len(question)
        if training:     # check that question is inside the context
            question_inside_context = (start_token >= start and end_token <= end)
        if (not training) or (training and question_inside_context):
            # if not training or answer inside the context
            window = document[start:end]
            windows.append(window)
        
        if end == doc_size: 
            break
        
        start += stride
        # if there are less tokens than the slide amount
        if (doc_size - (start + stride)) < stride:
            end = doc_size
        else:
            end += stride
    
    windows = add_sep_tokens(windows)
    return windows

Create mask for start and end positions. This way we only check the first token after the '.' as start positions, and the tokens before the '.' as end positions.

In [119]:
question=train_dict['train_user_utterance'][0][1]
print(f'Decoded question: {tokenizer.decode(question)}')
# If already tokenized from dataset
text=train_dict['train_document'][0]    # tokenized text
# if simple text
#text='By statute , you must report a change of address to DMV within ten days of moving. That is the case for the address associated with your license, as well as all the addresses associated with each registered vehicle, which may differ.'
#text=tokenizer([text],  return_tensors="pt")['input_ids'].view(-1)

Decoded question: [CLS] can i do my dmv transactions online? [SEP]


In [120]:
''' Sliding Window [1:] to remove the [CLS] that was put in by the tokenizer
    The Model likes '[CLS] Sentence1 [SEP] Sentence2 [SEP]' it doesn't need the [CLS]
'''
windows = []
windows = sliding_windows(question, text['input_ids'][0][1:])

In [121]:
for window in windows:
    print(len(window) + len(question))

512
512
433


In [122]:
model_inputs = []

for window in windows:
    model_inputs.append(text_mask(question, window))
    

In [123]:
len(model_inputs[2][0])

433

## Our Model - BertForQA

In [124]:
model = AutoModelForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [125]:
models = []
for model_input in model_inputs:
    output = model(model_input[0].view(1,-1), token_type_ids=torch.tensor([model_input[1]]))
    models.append([model_input, output])

model_outputs contains a list of [input, output]

    output is the models output

    input is a list containing [input_ids, segment_ids]

        -input_ids is a tokenized input

        -segment_ids is a mask to let the model understand there are two individual sentences

## Functions for after the model has been run

In [14]:
# token id for '.' = 1012
def mask_start_end(input_ids_trunc, segment_ids_trunc, mode):
    """Returns a mask for the start and end logits. 
    input_ids_trunc = tokens (tensor)
    segment_ids_trunc = mask (question / text)
    mode = "start" or 'end'
    return tensor
    """
    a = torch.where(input_ids_trunc == 1012, 1, 0)   # mask=1 for '.'
    a = a * torch.tensor(segment_ids_trunc)          # mask question - text
    if mode=='start':
        b = torch.cat((torch.tensor([0]),a),0)[:-1]     # move the 1s one position to the right
    else:
        b = torch.cat((a, torch.tensor([0])),0)[1:]
    assert len (a) == len(b)
    return b

def tensor_to_positive(tensor, mask):
    """ All the values need to be higher than 0, since 0s are values for the mask
    and we don't want to choose them when selecting the start or end token.
    Return torch.tensor """
    min_value = torch.amin(tensor) 
    tensor_positive = tensor + (mask * np.abs(min_value.detach().numpy()))
    return tensor_positive

In [127]:
model_joint_probability = []

# picked a low number 
sum_joint_prob = -1000
best_tokens = None
answer_start = None
answer_end = None

# calculate the best combined (start+end) probability from each window. Use the best probability as the output
# from the model

for m in models:
    # TODO: check that the context is inside the window
    model_tokens = tokenizer.convert_ids_to_tokens(m[0][0])
    mask_start = mask_start_end(m[0][0], m[0][1], 'start') # [0][0] = input_ids [0][1] = segment_ids
    start_logits_positive = tensor_to_positive(m[1].start_logits * mask_start, mask_start)
    model_answer_start = torch.argmax(start_logits_positive)  # token index for the highest start token
    max_start_prob = m[1].start_logits[0][model_answer_start].item()
    
    mask_end = mask_start_end(m[0][0], m[0][1], 'end')
    end_logits_positive = tensor_to_positive(m[1].end_logits * mask_end, mask_end)
    model_answer_end = torch.argmax(end_logits_positive)
    max_end_prob = m[1].end_logits[0][model_answer_end].item()
    
    model_joint_prob = max_start_prob + max_end_prob
    
    if model_joint_prob > sum_joint_prob:
        sum_joint_prob = model_joint_prob
        tokens = model_tokens
        answer_start = model_answer_start
        answer_end = model_answer_end

if answer_end >= answer_start:
    answer = " ".join(tokens[answer_start:answer_end+1])
else:
    print("I am unable to find the answer to this question. Can you please ask another question?")
    
print("\nQuestion:\n{}".format(tokenizer.decode(question)))
print("\nAnswer:\n{}.".format(answer))

Answer start: 416, Answer end: 360
Answer start: 160, Answer end: 104
Answer start: 373, Answer end: 371
I am unable to find the answer to this question. Can you please ask another question?

Question:
[CLS] can i do my dmv transactions online? [SEP]

Answer:
forgetting to update address by statute , you must report a change of address to d ##m ##v within ten days of moving.


## Fine-tune our model

This is what they do at HuggingFace:

In [18]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [20]:
from datasets import load_dataset
squad = load_dataset("squad")
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

Reusing dataset squad (/Users/ri21540/.cache/huggingface/datasets/squad/plain_text/1.0.0/6b6c4172d0119c74515f44ea0b8262efe4897f2ddb6613e5e915840fdc309c16)


  0%|          | 0/88 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

In [146]:
type(tokenized_squad['train'][0]['input_ids'])

list

### Dataset creation following HuggingFace format

In [392]:
class OurDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

In [382]:
def pad_input(input_ids, segment_ids):
    if len(input_ids) < 512:
        pad_input = torch.tensor(np.zeros(512 - len(input_ids)))
        input_ids = torch.cat((input_ids, pad_input), 0)
        pad_segment = torch.tensor(np.ones(512 - len(segment_ids)))
        segment_ids = torch.cat((torch.tensor(segment_ids), pad_segment), 0)
    return input_ids, segment_ids

In [424]:
data = dict()
data['input_ids'] = []
data['attention_mask'] = []
data['start_positions'] = []
data['end_positions'] = []

for idx_dialogue in tqdm(range(0, len(train_dict['train_document']))):      # num dialogues
    utterances = train_dict['train_user_utterance'][idx_dialogue]
    document = train_dict['train_document'][idx_dialogue]
    for idx_utterance, utterance in enumerate(utterances):
        start_token = train_dict['train_start_tok'][idx_dialogue][idx_utterance]
        end_token = train_dict['train_end_tok'][idx_dialogue][idx_utterance]
#         print(f'Idx dialogue: {idx_dialogue}, Idx utterance: {idx_utterance} -------------------')
        windows = sliding_windows(utterance, document['input_ids'][0][1:], start_token, end_token)
#         print(f'{start_token}, {end_token}, {len(windows)}')
#         print(f'Start token: {start_token}\nEnd token: {end_token}\nUtterance: {tokenizer.decode(utterance)}\nWindow length: {len(windows)}\n\n')
        for window in windows:
            input_ids, segment_ids = text_mask(utterance, window)
            input_ids, segment_ids = pad_input(input_ids, segment_ids)
            data['input_ids'].append(input_ids.to(torch.int64).to(device))
            data['attention_mask'].append(torch.tensor(segment_ids).to(torch.int64).to(device))
            data['start_positions'].append(start_token)
            data['end_positions'].append(end_token)

  data['attention_mask'].append(torch.tensor(segment_ids).to(torch.int64))
100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 685.31it/s]


In [425]:
dataset_training = OurDataset(data)
dataset_validation = OurDataset(data)

We want to freeze the first 60% of the hidden layers: <br>
[:-79] = 60%
[:-2] = 99%

In [None]:
for name, param in list(model.named_parameters()): 
    print(f'{name}') 


In [426]:
for name, param in list(model.named_parameters())[:-2]: 
    print('I will be frozen: {}'.format(name)) 
    param.requires_grad = False

I will be frozen: bert.embeddings.word_embeddings.weight
I will be frozen: bert.embeddings.position_embeddings.weight
I will be frozen: bert.embeddings.token_type_embeddings.weight
I will be frozen: bert.embeddings.LayerNorm.weight
I will be frozen: bert.embeddings.LayerNorm.bias
I will be frozen: bert.encoder.layer.0.attention.self.query.weight
I will be frozen: bert.encoder.layer.0.attention.self.query.bias
I will be frozen: bert.encoder.layer.0.attention.self.key.weight
I will be frozen: bert.encoder.layer.0.attention.self.key.bias
I will be frozen: bert.encoder.layer.0.attention.self.value.weight
I will be frozen: bert.encoder.layer.0.attention.self.value.bias
I will be frozen: bert.encoder.layer.0.attention.output.dense.weight
I will be frozen: bert.encoder.layer.0.attention.output.dense.bias
I will be frozen: bert.encoder.layer.0.attention.output.LayerNorm.weight
I will be frozen: bert.encoder.layer.0.attention.output.LayerNorm.bias
I will be frozen: bert.encoder.layer.0.intermed

In [427]:
from transformers import default_data_collator
data_collator = default_data_collator

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_training,
    eval_dataset=dataset_validation,
    data_collator=data_collator,
    tokenizer=tokenizer
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [428]:
trainer.train()

***** Running training *****
  Num examples = 83
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 18


Epoch,Training Loss,Validation Loss
1,No log,5.76951
2,No log,5.767421
3,No log,5.766675


***** Running Evaluation *****
  Num examples = 83
  Batch size = 16
***** Running Evaluation *****
  Num examples = 83
  Batch size = 16
***** Running Evaluation *****
  Num examples = 83
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=18, training_loss=5.331610361735026, metrics={'train_runtime': 558.6488, 'train_samples_per_second': 0.446, 'train_steps_per_second': 0.032, 'total_flos': 231248041039872.0, 'train_loss': 5.331610361735026, 'epoch': 3.0})

### Output - For Report

This section of text shows that span [49][50][51][52] is what we return. However, the section belows is what the ground truth says. SPan [51] is highlighted in red (we return 51, ground truth doesn't contain it).

- 'About ten percent of customers visiting a DMV office do not bring what they need to complete their transaction, and have to come back a second time to finish their business. This can be as simple as not bringing sufficient funds to pay for a license renewal or not having the proof of auto insurance required to register a car. <font color='red'>Better yet ,</font> don t visit a DMV office at all, and see if your transaction can be performed online, like an address change, registration renewal, license renewal, replacing a lost title, paying a DRA or scheduling a road test. '
- 'About ten percent of customers visiting a DMV office do not bring what they need to complete their transaction, and have to come back a second time to finish their business. This can be as simple as not bringing sufficient funds to pay for a license renewal or not having the proof of auto insurance required to register a car. don t visit a DMV office at all, and see if your transaction can be performed online, like an address change, registration renewal, license renewal, replacing a lost title, paying a DRA or scheduling a road test. '