In [28]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import random_split
from datasets import load_dataset, Dataset
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
import evaluate
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from umap import UMAP
import seaborn as sns
import collections
import sys 
import os

In [2]:
# setting device as GPU if available
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Device in use: {} \n'.format(device))

TRAIN_PERCENTAGE = 0.9
BATCH_SIZE = 24
N_EPOCHS = 20
LOGGING_STEPS = 100 # roughly every 1/5 epoch (for this batch size)
WARMUP_STEPS = 500 # roughly 1 epoch of warmup (for this batch size)

Device in use: cuda 



In [3]:
data = pd.read_csv(r'.\SciQ\train.csv')

In [4]:
data = data.drop(['distractor1','distractor2','distractor3'], axis=1)
data.columns = ['question', 'answer', 'context']
data.head()

Unnamed: 0,question,answer,context
0,What type of organism is commonly used in prep...,mesophilic organisms,"Mesophiles grow best in moderate temperature, ..."
1,What phenomenon makes global winds blow northe...,coriolis effect,Without Coriolis Effect the global winds would...
2,Changes from a less-ordered state to a more-or...,exothermic,Summary Changes of state are examples of phase...
3,What is the least dangerous radioactive decay?,alpha decay,All radioactive decay is dangerous to living t...
4,Kilauea in hawaii is the world’s most continuo...,smoke and ash,Example 3.5 Calculating Projectile Motion: Hot...


In [5]:
## dropping all rows with Null values (only context columns have null values in this case)
print (data['context'].isnull().sum())
data = data.dropna()

## resetting index because deleting rows doesn't re-assing the indices
data = data.reset_index(drop=True)
data.head()

1198


Unnamed: 0,question,answer,context
0,What type of organism is commonly used in prep...,mesophilic organisms,"Mesophiles grow best in moderate temperature, ..."
1,What phenomenon makes global winds blow northe...,coriolis effect,Without Coriolis Effect the global winds would...
2,Changes from a less-ordered state to a more-or...,exothermic,Summary Changes of state are examples of phase...
3,What is the least dangerous radioactive decay?,alpha decay,All radioactive decay is dangerous to living t...
4,Kilauea in hawaii is the world’s most continuo...,smoke and ash,Example 3.5 Calculating Projectile Motion: Hot...


In [6]:
def add_answer_start(batch):
    ''' The data is missing the indices of the location 
    of the start of the answer inside the context,
    we find those here (this function takes a pandas dataframe
    and modifies the answer column to include this as a dict),
    later they can be used to find the end positions 
    (after splitting contexts below if necessary),
    then positions labels for prediction,
    need to lower case the context and answers first,
    because find() is case-senstitive'''
    
    def start_position(context, answer):
        pos = context.lower().find(answer.lower())
        
        ## find returns -1 if doesn't find
        if pos == -1:
            return {'text':[answer], 'answer_start':[0]}
        else:
            return {'text':[answer], 'answer_start':[pos]}
    
    batch['answer'] = batch.apply(lambda row : start_position(row['context'], row['answer']), axis = 1)
    
    return batch

data = add_answer_start(data)

In [7]:
## load pretrained model and it's tokenizer
MODEL_NAME = 'distilbert-base-cased-distilled-squad'
# MODEL_NAME = 'deepset/minilm-uncased-squad2'

model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
MAX_LENGTH = tokenizer.model_max_length

def check_context_lengths(batch):
    '''function to check if any context length (by counting tokens) 
    exceeds the max context length the model accepts'''

    encoded_data = tokenizer(list(batch['question']), list(batch['context']), truncation=False, padding="max_length")
    max_context_length = max([len(id) for id in encoded_data['input_ids']])
    print('max number of tokens in these contexts is: {}'.format(max_context_length))
    if max_context_length > MAX_LENGTH:
        print ('This exceeds tha model max length of {}'.format(MAX_LENGTH))
        return True
    else:
        print ("This doesn't exceed max model length of {}".format(MAX_LENGTH))
        return False

exceeds_model_length = check_context_lengths(data)

max number of tokens in these contexts is: 814
This exceeds tha model max length of 512


In [8]:
def tokenize(batch):
    '''Tokenize the context and question,
    becasue some context sizes exceed the model max length 
    (with trunction set to apply only to the second entry aka contexts)
    we will set a maximum then use a sliding sindow (stride) to guarantee
    that the cutoff will never cut the answer, and we use the same question
    for all the new contexts, only now some of them may not contain the answer,
    for those we set start & end positions to be 0 (aka the [CLS] token)
    return_overflowing_tokens so we get those overflowing tokens
    and return_offsets_mapping so we can use those to find the
    answer end position (provided we know the start position).
    
    In the case that none of the xontexts exceed model length we use a 
    simpler more familiar tokenizatin scheme'''
    
#     if exceeds_model_length:
#         return tokenizer(batch['question'], batch['context'], truncation='only_second', max_length = MAX_LENGTH,
#                               stride=int(MAX_LENGTH/2), return_overflowing_tokens=True, return_offsets_mapping=True)
#     elif not exceeds_model_length:
#         return tokenizer(batch['question'], batch['context'], truncation=True, padding="max_length")

In [9]:
def preprocess_data(batch):
    '''This function takes the data in a (HF) dataset,
    applies the tokenize function, then finds the start and end
    positions (treating the split up contexts case, using the 
    overflow_to_sample_mapping to find the original example
    (or as theey call it "sample" that the split up context came from)'''
    
    '''for the tokenizer: tokenize the context and question,
    becasue some context sizes exceed the model max length 
    (with trunction set to apply only to the second entry aka contexts)
    we will set a maximum then use a sliding sindow (stride) to guarantee
    that the cutoff will never cut the answer, and we use the same question
    for all the new contexts, only now some of them may not contain the answer,
    for those we set start & end positions to be 0 (aka the [CLS] token)
    return_overflowing_tokens so we get those overflowing tokens
    and return_offsets_mapping so we can use those to find the
    answer end position (provided we know the start position).
    
    In the case that none of the xontexts exceed model length we use a 
    simpler more familiar tokenizatin scheme'''
    
    
    ## removing whitespace before and after the question
    questions = [q.strip() for q in batch['question']]
    
    ## I Call it "tokenized" here, but equivalently "encoded", which is what I use below
    if exceeds_model_length:
        tokenized_batch = tokenizer(batch['question'], batch['context'], truncation='only_second', 
                                      max_length = MAX_LENGTH, stride=int(MAX_LENGTH/2), 
                                      return_overflowing_tokens=True, return_offsets_mapping=True)
    elif not exceeds_model_length:
        tokenized_batch = tokenizer(batch['question'], batch['context'],
                                      truncation=True, padding="max_length")
    
    
    
    offset_mapping = tokenized_batch.pop('offset_mapping')
    sample_map = tokenized_batch.pop('overflow_to_sample_mapping')
    answers = batch['answer']
    start_positions = []
    end_positions = []
    
    
    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer['answer_start'][0]
        end_char = answer['answer_start'][0] + len(answer['text'][0])
        sequence_ids = tokenized_batch.sequence_ids(i)

        '''Find the start and end of the context, 
        the input_ids contain the tokenized question and context,
        and (THIS IS MODEL DEPENDENT) the context has id 1 in
        the sequence ids''' 
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)
    
    tokenized_batch['start_positions'] = start_positions
    tokenized_batch['end_positions'] = end_positions
    
    return tokenized_batch

In [10]:
def preprocess_decorator(preprocess_function, dataframe):
    '''This function takes the data in a pandas dataframe, 
    converts it to a (HF) dataset (this step so we don't have 
    to worry about the list input to tokenizer and so on),
    removes the OG columns because they have different size
    (after splitting some examples), so there isnt a mismatch
    in the final dataset'''
    

    dataset = Dataset.from_pandas(dataframe)
    encoded = dataset.map(preprocess_function, batched=True, batch_size=BATCH_SIZE, 
                remove_columns=dataset.column_names)
    
    return encoded


encoded_dataset = preprocess_decorator(preprocess_data, data)

Map:   0%|          | 0/10481 [00:00<?, ? examples/s]

In [11]:
print (encoded_dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 10586
})


In [12]:
def split_data(encoded_full_data):
    train_dataset = encoded_full_data.select(range(int(TRAIN_PERCENTAGE*len(encoded_full_data))))
    test_dataset = encoded_full_data.select(range(int(TRAIN_PERCENTAGE*len(encoded_full_data)), len(encoded_full_data)))

    return train_dataset, test_dataset

train_dataset, test_dataset = split_data(encoded_dataset)

In [13]:
## we structured this like the SQuAD dataset so we can use it's metric
metric = evaluate.load("squad")

def compute_metrics(start_logits, end_logits, features, examples):
    '''This function only works after training is concluded, cant use it in the middle of training'''
    
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answer"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [14]:
training_args = TrainingArguments(
    output_dir=r'.\distilbertSQUAD_finteuned_SciQ', 
    overwrite_output_dir=True, 
    num_train_epochs=N_EPOCHS,
    weight_decay=0.01,
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE, 
    per_device_eval_batch_size=BATCH_SIZE, 
    load_best_model_at_end=True,
    evaluation_strategy='steps',
    disable_tqdm=False,
    logging_steps=LOGGING_STEPS,
    eval_steps=LOGGING_STEPS,
    save_steps=LOGGING_STEPS,
    warmup_steps=WARMUP_STEPS, # no. of warmup steps till it reaches set value for learning rate (default strat: linear)
    fp16=True,
    report_to='tensorboard')

'''no need for data collator, becausesamples are padded to max length.
    I disabled early stopping to try to reach the interploation regime (after overfitting regime)'''
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)])

Using cuda_amp half precision backend


In [15]:
trainer.train()

***** Running training *****
  Num examples = 9527
  Num Epochs = 20
  Instantaneous batch size per device = 24
  Total train batch size (w. parallel, distributed & accumulation) = 24
  Gradient Accumulation steps = 1
  Total optimization steps = 7940
  Number of trainable parameters = 64799234
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
100,1.6813,1.422232
200,1.2232,1.125617
300,1.0821,1.001087
400,0.9677,0.936279
500,0.8629,0.880835
600,0.7943,0.882369
700,0.7724,0.859374
800,0.7805,0.87981
900,0.5539,0.87767
1000,0.5479,0.89297


***** Running Evaluation *****
  Num examples = 1059
  Batch size = 24
Saving model checkpoint to .\distilbertSQUAD_finteuned_SciQ\checkpoint-100
Configuration saved in .\distilbertSQUAD_finteuned_SciQ\checkpoint-100\config.json
Model weights saved in .\distilbertSQUAD_finteuned_SciQ\checkpoint-100\pytorch_model.bin
tokenizer config file saved in .\distilbertSQUAD_finteuned_SciQ\checkpoint-100\tokenizer_config.json
Special tokens file saved in .\distilbertSQUAD_finteuned_SciQ\checkpoint-100\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1059
  Batch size = 24
Saving model checkpoint to .\distilbertSQUAD_finteuned_SciQ\checkpoint-200
Configuration saved in .\distilbertSQUAD_finteuned_SciQ\checkpoint-200\config.json
Model weights saved in .\distilbertSQUAD_finteuned_SciQ\checkpoint-200\pytorch_model.bin
tokenizer config file saved in .\distilbertSQUAD_finteuned_SciQ\checkpoint-200\tokenizer_config.json
Special tokens file saved in .\distilbertSQUAD_finteuned_SciQ

TrainOutput(global_step=1000, training_loss=0.9266230850219727, metrics={'train_runtime': 283.9749, 'train_samples_per_second': 670.975, 'train_steps_per_second': 27.96, 'total_flos': 2490839087742096.0, 'train_loss': 0.9266230850219727, 'epoch': 2.52})

In [16]:
trainer.save_model()

Saving model checkpoint to .\distilbertSQUAD_finteuned_SciQ
Configuration saved in .\distilbertSQUAD_finteuned_SciQ\config.json
Model weights saved in .\distilbertSQUAD_finteuned_SciQ\pytorch_model.bin
tokenizer config file saved in .\distilbertSQUAD_finteuned_SciQ\tokenizer_config.json
Special tokens file saved in .\distilbertSQUAD_finteuned_SciQ\special_tokens_map.json
