# NLP Coursework
Mauro Comi, Phillip Sloan

In [2]:
from datasets import load_dataset
from transformers import pipeline
from tqdm import tqdm
from transformers import BertForQuestionAnswering, DistilBertTokenizerFast, AutoModelForQuestionAnswering, TrainingArguments, Trainer
import time
import numpy as np
import pandas as pd
import torch
import json
device = "cuda:0" if torch.cuda.is_available() else "cpu"

We load the two datasets that we used to create our ``training set and validation set.

In [None]:
split = 'train'
cache_dir = "./data_cache"

dialogue_dataset = load_dataset(
    "doc2dial",
    name="dialogue_domain",  # this is the name of the dataset for the second subtask, dialog generation
    split=split,
    ignore_verifications=True,
    cache_dir=cache_dir,
)

document_dataset = load_dataset(
    "doc2dial",
    name="document_domain",  # this is the name of the dataset for the second subtask, dialog generation
    split='train',
    ignore_verifications=True,
    cache_dir=cache_dir,
)

Downloading:   0%|          | 0.00/3.03k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

Downloading and preparing dataset doc2dial/dialogue_domain (download: 5.61 MiB, generated: 7.86 MiB, post-processed: Unknown size, total: 13.47 MiB) to ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c...


Downloading:   0%|          | 0.00/5.88M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset doc2dial downloaded and prepared to ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c. Subsequent calls will reuse this data.
Downloading and preparing dataset doc2dial/document_domain (download: 5.61 MiB, generated: 195.38 MiB, post-processed: Unknown size, total: 200.99 MiB) to ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c...


0 examples [00:00, ? examples/s]

Dataset doc2dial downloaded and prepared to ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c. Subsequent calls will reuse this data.


In [None]:
rc_dataset = load_dataset(
    "doc2dial",
    name="doc2dial_rc",  # this is the name of the dataset for the second subtask, dialog generation
    split=split,
    ignore_verifications=True,
    cache_dir=cache_dir,
)

Downloading and preparing dataset doc2dial/doc2dial_rc (download: 5.61 MiB, generated: 131.12 MiB, post-processed: Unknown size, total: 136.72 MiB) to ./data_cache/doc2dial/doc2dial_rc/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c...


0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset doc2dial downloaded and prepared to ./data_cache/doc2dial/doc2dial_rc/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c. Subsequent calls will reuse this data.


We create a dictionary (`train_dict`) from the two datasets `dialogue_dataset` and `document_dataset`. This dictionary contains information needed to later fine-tune the model.

In [None]:
# Tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('bert-base-uncased')

# Defining train_dict
train_dict = dict()
train_dict['train_document'] = []
train_dict['train_user_utterance'] = []
train_dict['train_text_sp'] = []
train_dict['train_dial_id_turn_id'] = []     # necessary for evaluation
train_dict['train_start_pos'] = []     
train_dict['train_end_pos'] = []     
train_dict['train_start_tok'] = []     
train_dict['train_end_tok'] = []  
train_dict['train_all_utterances'] = []

start = time.time()
for idx, dialogue in tqdm(enumerate(dialogue_dataset)):
    dial_id_turn_id = []       # running list of <dial_id>_<turn_id> for evaluation
    user_utterance_list = []   # running list of user utterances per document
    sp_id_list = []            # running list of spans per document

    all_utterances_list = []
    for idx_turn, turn in enumerate(dialogue['turns']):

        all_utterances_list.append(turn['utterance'])
        if turn['role'] == 'user':            
            # If the previous turn was still the user, we want to concatenate the 
            # current and previous utterances
            if (idx_turn > 0 and dialogue['turns'][idx_turn-1]['role'] == 'user'):
                turn['utterance'] = tokenizer(turn['utterance'], padding=True, truncation=True, return_tensors="pt")['input_ids'].view(-1)[1:]
                previous_utterance = user_utterance_list[-1][:-1]  # the previous utterance ends with [SEP], we remove it
                turn['utterance'] = torch.cat((previous_utterance, turn['utterance']), 0)
                user_utterance_list[-1]= turn['utterance']  # replace last element in list
                dial_id_turn_id[-1] = dialogue['dial_id'] + '_' + str(turn['turn_id'])
            # If the last utterance is by the user, we don't store it
            elif idx_turn == len(dialogue['turns'])-1:
                continue
            else:
              # TURN UTTERANCE IS FLATTENED AND ONLY THE [INPUT_IDS] IS STORED
              turn['utterance'] = tokenizer(turn['utterance'], padding=True, truncation=True, return_tensors="pt")['input_ids'].view(-1)
              user_utterance_list.append(turn['utterance'])   # adding user utterance to user_utterance_list
              dial_id_turn_id.append(dialogue['dial_id'] + '_' + str(turn['turn_id']))
        else:
            references = turn['references']
            ref_sp_id = []
            for ref in references:
                ref_sp_id.append(ref['sp_id'])
            sp_id_list.append(ref_sp_id)          # adding list of sp_ids per dialogue to list of sp_ids per document
        
    train_dict['train_user_utterance'].append(user_utterance_list)
    train_dict['train_all_utterances'].append(all_utterances_list)
    train_dict['train_dial_id_turn_id'].append(dial_id_turn_id)
    
    for doc in document_dataset:
        if doc['doc_id'] == dialogue['doc_id']:
            # DOCUMENT TEXT IS NOT A TENSOR. PREVIOUSLY WE HAD tokenizer( )['index_ids'].view(-1)
            doc['doc_text'] = tokenizer(doc['doc_text'], padding=True, truncation=False, return_tensors="pt")
            train_dict['train_document'].append(doc['doc_text'])          # adding the total document text
            text_sp_2 = []            
            start_tok_list = []         # big start token list
            end_tok_list = []           # big end token list     
            for train_spans_id in sp_id_list:    
                text_sp = ""         
                ref_start_pos_list = []
                ref_end_pos_list = []      
                for span in doc['spans']:                    
                    if span['id_sp'] in train_spans_id:
                        text_sp += span['text_sp']                        
                        ref_start_pos_list.append(span['start_sp'])
                        ref_end_pos_list.append(span['end_sp'])    
                start_pos = np.amin(ref_start_pos_list)
                # convert start_pos to start_token
                start_tok_pos = doc['doc_text'].char_to_token(start_pos)
                # check that start_tok_pos is not None, if it is go to the next character
                while start_tok_pos == None:
                    start_pos = start_pos + 1
                    start_tok_pos = doc['doc_text'].char_to_token(start_pos)
                start_tok_list.append(start_tok_pos)
                # convert end_pos to end_token
                end_pos = np.amax(ref_end_pos_list)
                end_tok_pos = doc['doc_text'].char_to_token(end_pos)
                # check that end_tok_pos is not None, if it is go to the next character
                while end_tok_pos == None:
                    end_pos = end_pos - 1
                    end_tok_pos = doc['doc_text'].char_to_token(end_pos)
                end_tok_list.append(end_tok_pos)
                text_sp_2.append(text_sp)
            train_dict['train_text_sp'].append(text_sp_2)
            train_dict['train_start_tok'].append(start_tok_list)
            train_dict['train_end_tok'].append(end_tok_list)
            break
end = time.time()
print(f'Total time: {end-start}')

We store the `train_dict` dictionary as a numpy object.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

file_path = '/content/drive/My Drive/train_dict.npy'
# Save train_dict
np.save(file_path train_dict) 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


All the utility functions that we use to post-process the dictionary `train_dict`, creating the sliding windows, and the masking technique:

In [3]:
def text_mask(question, text):
    '''   
    text['input_ids'].view(-1)[1:] was on the line below where 'text' is now - 
    need to do this to text before sending it into this function
    
    input_ids: will be the question and the window of the document concat together
    segment_ids: is a mask that makes the two sentences distinct 1's for question 0 for document text
    '''
    input_ids=torch.cat((question, text), 0).to(device)
    SEP_token_id=102
    sep_idx = (input_ids == 102).nonzero(as_tuple=False)[0][0].item()
    num_seg_a = sep_idx+1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0]*num_seg_a + [1]*num_seg_b
    assert len(segment_ids) == len(input_ids)
    return input_ids, segment_ids

def add_sep_tokens(windows):
    tmp = []
    sep_token = 102
    for window in windows:
        end = len(window) - 1
        if window[end] != sep_token:
            # add a SEP token at the end
            tmp.append(torch.cat((window.to(device), torch.tensor([102]).to(device)),0))
        else:
            tmp.append(window.to(device))
    return tmp

def sliding_windows(question, document, start_token=None, end_token=None, stride=256):
    # tokenized input_ids is the document - remove [CLS] before sending through   
    # If start_token and end_token != None, check that the context contains the answer. 
    # This is only used during training 
    windows = []
    model_tok_limit = 511  # model can take 512 tokens maximum - -1 to add a sep token at end of each window
    start = 0
    end = model_tok_limit - len(question)
    doc_size = len(document)
    start_positions = []

    # handling edge case of documents smaller than models input (512 tokens)   
    if len(document) <= model_tok_limit - len(question):
        end = len(document)
    
    while(start <= doc_size):
        # print(start, end, doc_size)
        training = (start_token != None and end_token != None) 
        question_length=len(question) 

        if training:     # check that question is inside the context
            question_inside_context = (start_token >= start and end_token <= end)
        if (not training) or (training and question_inside_context):
            # if not training or answer inside the context
            window = document[start:end]
            windows.append(window)
            start_positions.append(start)
        
        if end == doc_size: 
            break
        
        start += stride
        # if there are less tokens than the slide amount
        #if (doc_size  - (start + stride)) < stride:
        if (doc_size - start) < (model_tok_limit - len(question)):
            end = doc_size
        else:
            end += stride
    
    windows = add_sep_tokens(windows)
    return windows, start_positions

# token id for '.' = 1012
def mask_start_end(input_ids_trunc, segment_ids_trunc, mode):
    """Returns a mask for the start and end logits. 
    input_ids_trunc = tokens (tensor)
    segment_ids_trunc = mask (question / text)
    mode = "start" or 'end'
    return tensor
    """
    a = torch.where(input_ids_trunc == 1012, 1, 0).to(device)   # mask=1 for '.'
    c = torch.where(input_ids_trunc == 1010, 1, 0).to(device)   # mask=1 for ','

    a = a + c

    a = a * torch.tensor(segment_ids_trunc).to(device)          # mask question - text 
    
    if mode=='start':
        b = torch.cat((torch.tensor([0]).to(device),a),0)[:-1].to(device)     # move the 1s one position to the right
    else:
        b = torch.cat((a, torch.tensor([0]).to(device)),0)[1:].to(device)
    assert len (a) == len(b)
    return b

def tensor_to_positive(tensor, mask):
    """ All the values need to be higher than 0, since 0s are values for the mask
    and we don't want to choose them when selecting the start or end token.
    Return torch.tensor """
    min_value = torch.amin(tensor) 
    tensor_positive = tensor + (mask * np.abs(min_value.cpu().detach().numpy()))
    return tensor_positive.to(device)

def convert_dictionary(train_dict):
    data = dict()
    data['input_ids'] = []
    data['attention_mask'] = []
    data['start_positions'] = []
    data['end_positions'] = []

    for idx_dialogue in tqdm(range(0, len(train_dict['train_document']))):      # num dialogues
        utterances = train_dict['train_user_utterance'][idx_dialogue]
        document = train_dict['train_document'][idx_dialogue]
        for idx_utterance, utterance in enumerate(utterances):
            utterance = utterance.to(device)
            start_token = train_dict['train_start_tok'][idx_dialogue][idx_utterance]
            end_token = train_dict['train_end_tok'][idx_dialogue][idx_utterance]
    #         print(f'Idx dialogue: {idx_dialogue}, Idx utterance: {idx_utterance} -------------------')
            windows, start_positions = sliding_windows(utterance, document['input_ids'][0][1:], start_token, end_token)
    #         print(f'{start_token}, {end_token}, {len(windows)}')
    #         print(f'Start token: {start_token}\nEnd token: {end_token}\nUtterance: {tokenizer.decode(utterance)}\nWindow length: {len(windows)}\n\n')
            for idx, window in enumerate(windows):
                input_ids, segment_ids = text_mask(utterance, window)
                input_ids, segment_ids = pad_input(input_ids, segment_ids)
                data['input_ids'].append(input_ids.to(torch.int64).to(device))
                data['attention_mask'].append(torch.tensor(segment_ids).to(torch.int64).to(device))
                data['start_positions'].append(start_token - start_positions[idx])
                data['end_positions'].append(end_token - start_positions[idx])
    return data

def pad_input(input_ids, segment_ids):
    if len(input_ids) < 512:
        pad_input = torch.tensor(np.zeros(512 - len(input_ids))).to(device)
        input_ids = torch.cat((input_ids, pad_input), 0)
        pad_segment = torch.tensor(np.ones(512 - len(segment_ids))).to(device)
        segment_ids = torch.cat((torch.tensor(segment_ids).to(device), pad_segment), 0).to(device).tolist()
    return input_ids, segment_ids


We define a PyTorch.Dataset:

In [4]:
class OurDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

In [5]:
tokenizer = DistilBertTokenizerFast.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizerFast'.


In [6]:
# Load validation dictionary
from google.colab import drive
drive.mount('/content/drive')

# Convert train dictionary into torch.Dataset
train_dict = np.load('/content/drive/My Drive/train_dict.npy', allow_pickle=True).item()
train_dict_proc = convert_dictionary(train_dict)
train_data = OurDataset(train_dict_proc)

# train_size = int(0.8 * len(train_data))
# val_size = len(train_data) - train_size
# train_set, val_set = torch.utils.data.random_split(train_data, [train_size, val_size])
# print(f'- {train_size} rows in the training set \n- {val_size} rows in the validation set')

# Convert train dictionary into torch.Dataset
val_dict = np.load('/content/drive/My Drive/val_dict.npy', allow_pickle=True).item()
val_dict_proc = convert_dictionary(val_dict)
val_data = OurDataset(val_dict_proc)

Mounted at /content/drive


100%|██████████| 3474/3474 [00:27<00:00, 124.19it/s]
100%|██████████| 661/661 [00:03<00:00, 202.51it/s]


In [10]:
#model = AutoModelForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
#model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased')
model = AutoModelForQuestionAnswering.from_pretrained('/content/drive/My Drive/model2/')

model = model.to(device)

# Freeze layers
for name, param in list(model.named_parameters()): 
    param.requires_grad = False

In [None]:
for name, param in list(model.named_parameters())[:-18]: 
    print('{}'.format(name)) 

bert.embeddings.word_embeddings.weight
bert.embeddings.position_embeddings.weight
bert.embeddings.token_type_embeddings.weight
bert.embeddings.LayerNorm.weight
bert.embeddings.LayerNorm.bias
bert.encoder.layer.0.attention.self.query.weight
bert.encoder.layer.0.attention.self.query.bias
bert.encoder.layer.0.attention.self.key.weight
bert.encoder.layer.0.attention.self.key.bias
bert.encoder.layer.0.attention.self.value.weight
bert.encoder.layer.0.attention.self.value.bias
bert.encoder.layer.0.attention.output.dense.weight
bert.encoder.layer.0.attention.output.dense.bias
bert.encoder.layer.0.attention.output.LayerNorm.weight
bert.encoder.layer.0.attention.output.LayerNorm.bias
bert.encoder.layer.0.intermediate.dense.weight
bert.encoder.layer.0.intermediate.dense.bias
bert.encoder.layer.0.output.dense.weight
bert.encoder.layer.0.output.dense.bias
bert.encoder.layer.0.output.LayerNorm.weight
bert.encoder.layer.0.output.LayerNorm.bias
bert.encoder.layer.1.attention.self.query.weight
bert.enc

In [None]:
from transformers import default_data_collator
data_collator = default_data_collator

training_args = TrainingArguments(
    output_dir='/content/drive/My Drive/results',
    evaluation_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
trainer.train()

***** Running training *****
  Num examples = 27203
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 13604


Epoch,Training Loss,Validation Loss


Saving model checkpoint to /content/drive/My Drive/results/checkpoint-500
Configuration saved in /content/drive/My Drive/results/checkpoint-500/config.json
Model weights saved in /content/drive/My Drive/results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/drive/My Drive/results/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/drive/My Drive/results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to /content/drive/My Drive/results/checkpoint-1000
Configuration saved in /content/drive/My Drive/results/checkpoint-1000/config.json
Model weights saved in /content/drive/My Drive/results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in /content/drive/My Drive/results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in /content/drive/My Drive/results/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to /content/drive/My Drive/results/checkpoint-1500
Configuration saved in /content/

## Prediction

In [None]:
# Load validation dictionary
from google.colab import drive
drive.mount('/content/drive')

# Convert validation dictionary into torch.Dataset
val_dict = np.load('/content/drive/My Drive/val_dict.npy', allow_pickle=True).item()

Mounted at /content/drive


In [10]:
def return_answer(models, question):
    model_joint_probability = 0
    sum_joint_prob = -1000.  # picked a low number 
    best_tokens = None
    answer_start = -5  # impossible value, higher than end
    answer_end = -10    # impossible value, lower than start

    for m in models:
        model_tokens = tokenizer.convert_ids_to_tokens(m[0][0])
        mask_start = mask_start_end(m[0][0], m[0][1], 'start') # [0][0] = input_ids [0][1] = segment_ids
        start_logits_positive = tensor_to_positive(m[1].start_logits * mask_start, mask_start)
        model_answer_start = torch.argmax(start_logits_positive)  # token index for the highest start token
        max_start_prob = m[1].start_logits[0][model_answer_start].item()
        
        mask_end = mask_start_end(m[0][0], m[0][1], 'end')
        end_logits_positive = tensor_to_positive(m[1].end_logits * mask_end, mask_end)
        model_answer_end = torch.argmax(end_logits_positive)
        max_end_prob = m[1].end_logits[0][model_answer_end].item() 

        model_joint_prob = max_start_prob + max_end_prob

        if (model_joint_prob > sum_joint_prob) and (model_answer_start < model_answer_end):
            sum_joint_prob = model_joint_prob
            tokens = m[0][0].to(torch.int64)
            answer_start = model_answer_start
            answer_end = model_answer_end

    answer = " "
    if answer_end >= answer_start:
        answer = tokenizer.decode(tokens[answer_start:answer_end+1])
        # print("\nQuestion:\n{}".format(tokenizer.decode(question)))
        # print("\nAnswer:\n{}.".format(answer))
    return answer

In [11]:
def return_answer_no_mask(models, question):
    model_joint_probability = 0
    sum_joint_prob = -1000.  # picked a low number 
    best_tokens = None
    answer_start = -5  # impossible value, higher than end
    answer_end = -10    # impossible value, lower than start

    for m in models:
        model_tokens = tokenizer.convert_ids_to_tokens(m[0][0])
        model_answer_start = torch.argmax(m[1].start_logits)  # token index for the highest start token
        max_start_prob = m[1].start_logits[0][model_answer_start].item()

        model_answer_end = torch.argmax(m[1].end_logits)
        max_end_prob = m[1].end_logits[0][model_answer_end].item() 

        model_joint_prob = max_start_prob + max_end_prob

        if (model_joint_prob > sum_joint_prob) and (model_answer_start < model_answer_end):
            sum_joint_prob = model_joint_prob
            answer_start = model_answer_start
            answer_end = model_answer_end
            tokens = m[0][0].to(torch.int64)

    answer = " "
    if answer_end >= answer_start:
        answer = tokenizer.decode(tokens[answer_start:answer_end+1])
        # print("\nQuestion:\n{}".format(tokenizer.decode(question)))
        # print("\nAnswer:\n{}.".format(answer))
    return answer

In [None]:
tokenizer = DistilBertTokenizerFast.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizerFast'.


In [None]:
model = AutoModelForQuestionAnswering.from_pretrained('/content/drive/My Drive/model2/')
model = model.to(device)
# Freeze layers
for name, param in list(model.named_parameters()): 
    param.requires_grad = False

predictions = []
model.eval() 
torch.no_grad()


for idx, dial_id_turn_ids in tqdm(enumerate(val_dict['train_dial_id_turn_id'])): 
    utterances = val_dict['train_user_utterance'][idx]
    assert len(dial_id_turn_ids) == len(utterances)
    document = val_dict['train_document'][idx]['input_ids'].view(-1).to(device)
    for idx_utterance, utterance in enumerate(utterances):
        results = dict()
        utterance = utterance.to(device)
        windows, start_positions = sliding_windows(utterance, document[1:])
        model_inputs = []
        models = []
        for idx, window in enumerate(windows):
            input_ids, segment_ids = text_mask(utterance, window)
            input_ids, segment_ids = pad_input(input_ids, segment_ids)
            model_inputs.append((input_ids.to(device), segment_ids))
            output = model(model_inputs[-1][0].view(1,-1).to(torch.int64).to(device), token_type_ids=torch.tensor(model_inputs[-1][1]).to(torch.int64).view(1,-1).to(device),attention_mask='')
            models.append([model_inputs[-1], output])
        answer = return_answer(models, utterance)
        results['id'] = dial_id_turn_ids[idx_utterance]
        results['prediction_text'] = answer
        results['no_answer_probability'] = 0
        predictions.append(results)

with open('/content/drive/My Drive/prediction_after_fine_tuning3.json', 'w') as f:
    json.dump(predictions, f)

In [None]:
predictions

[{'id': 'dea7174409afbfe0af0ace21e7f318ae_1',
  'no_answer_probability': 0,
  'prediction_text': ' '},
 {'id': 'dea7174409afbfe0af0ace21e7f318ae_3',
  'no_answer_probability': 0,
  'prediction_text': ' '},
 {'id': 'dea7174409afbfe0af0ace21e7f318ae_5',
  'no_answer_probability': 0,
  'prediction_text': ' '},
 {'id': 'dea7174409afbfe0af0ace21e7f318ae_7',
  'no_answer_probability': 0,
  'prediction_text': ' '},
 {'id': 'dea7174409afbfe0af0ace21e7f318ae_9',
  'no_answer_probability': 0,
  'prediction_text': ' '},
 {'id': 'dea7174409afbfe0af0ace21e7f318ae_12',
  'no_answer_probability': 0,
  'prediction_text': ' '},
 {'id': 'dea7174409afbfe0af0ace21e7f318ae_14',
  'no_answer_probability': 0,
  'prediction_text': ' '},
 {'id': 'dea7174409afbfe0af0ace21e7f318ae_16',
  'no_answer_probability': 0,
  'prediction_text': ' '},
 {'id': '83d090a234b905fa4f0cb8ed8f0b5535_1',
  'no_answer_probability': 0,
  'prediction_text': ' '},
 {'id': '83d090a234b905fa4f0cb8ed8f0b5535_5',
  'no_answer_probability

In [None]:
stringa = "Because we all pay indirectly for crashes involving uninsured motorists , New York State requires every motorist to maintain auto insurance every single day a vehicle is registered. DMV works with insurance companies to electronically monitor your insurance coverage , "
stringa_tok = tokenizer(stringa)
stringa_detok = tokenizer.decode(stringa_tok['input_ids'])
print(stringa_detok)

[CLS] because we all pay indirectly for crashes involving uninsured motorists, new york state requires every motorist to maintain auto insurance every single day a vehicle is registered. dmv works with insurance companies to electronically monitor your insurance coverage, [SEP]


In [None]:
# 9f44c1539efe6f7e79b02eb1b413aa43_1     for example
def id_to_doc_utterance(search_element, tokenizer):
    for idx_dialogue, dialogue in enumerate(train_dict['train_dial_id_turn_id']):
        for idx_turn, dial_id_turn_id in enumerate(dialogue):
            if dial_id_turn_id == search_element:
                print(f'Dialogue index: {idx_dialogue}, Turn id: {idx_turn}')
                doc = train_dict['train_document'][idx_dialogue]
                utterance = train_dict['train_all_utterances'][idx_dialogue][idx_turn]
                utterance = tokenizer(utterance, return_tensors='pt')
    return doc['input_ids'].view(-1), utterance['input_ids'].view(-1)

In [None]:
# Question and text
question=train_dict['train_user_utterance'][0][0].to(device)
print(f'Decoded question: {tokenizer.decode(question)}')
text=train_dict['train_document'][0]    # tokenized text

windows = []
windows, _ = sliding_windows(question, text['input_ids'][0][1:])

model_inputs = []
for window in windows:
    model_inputs.append(text_mask(question, window.to(device)))

model_joint_probability = 0

sum_joint_prob = -1000.  # picked a low number 
best_tokens = None
answer_start = None
answer_end = None

# calculate the best combined (start+end) probability from each window. Use the best probability as the output
# from the model
models = []
for model_input in model_inputs:
    output = model(model_input[0].view(1,-1).to(device), token_type_ids=torch.tensor([model_input[1]]).to(device))
    models.append([model_input, output])

for m in models:
    # TODO: check that the context is inside the window
    model_tokens = tokenizer.convert_ids_to_tokens(m[0][0])
    mask_start = mask_start_end(m[0][0], m[0][1], 'start') # [0][0] = input_ids [0][1] = segment_ids
    start_logits_positive = tensor_to_positive(m[1].start_logits * mask_start, mask_start)
    model_answer_start = torch.argmax(start_logits_positive)  # token index for the highest start token
    max_start_prob = m[1].start_logits[0][model_answer_start].item()
    
    mask_end = mask_start_end(m[0][0], m[0][1], 'end')
    end_logits_positive = tensor_to_positive(m[1].end_logits * mask_end, mask_end)
    model_answer_end = torch.argmax(end_logits_positive)
    max_end_prob = m[1].end_logits[0][model_answer_end].item()
    
    model_joint_prob = max_start_prob + max_end_prob
    
    print(max_start_prob, max_end_prob)

    if (model_joint_prob > sum_joint_prob) and (model_answer_start < model_answer_end):
        sum_joint_prob = model_joint_prob
        tokens = model_tokens
        answer_start = model_answer_start
        answer_end = model_answer_end

if answer_end >= answer_start:
    answer = " ".join(tokens[answer_start:answer_end+1])
    print("\nQuestion:\n{}".format(tokenizer.decode(question)))
    print("\nAnswer:\n{}.".format(answer))
else:
    print("I am unable to find the answer to this question. Can you please ask another question?")
    
