In [2]:
import numpy as np
import pandas as pd
import nltk
import os
import json
import transformers
import torch
import random
from torch import cuda
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader
from transformers import DebertaV2Tokenizer, DebertaV2Model, DebertaV2ForQuestionAnswering

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
# Setting the random seed for consistent results
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)

In [4]:
device = 'cuda' if cuda.is_available() else 'cpu'

In [5]:
device

'cuda'

### Loading the data

In [6]:
# Loading train data
train_file = open('/kaggle/input/squad-2/train-v2.0.json')
train_data = json.load(train_file)

# Loading validation data
val_file = open('/kaggle/input/squad-2/dev-v2.0.json')
val_data = json.load(val_file)

### Preprocessing the dataset

In [7]:
# Using the pretrained tokenizer for DEBERTA
tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-base')

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
def index_converter(context, context_tokenized):
    """
    Maps start str index to a tokenized index
    """
    mapper = {}
    curr = ''
    token_idx = 0
    for i, char in enumerate(context):
        if char != ' ' and char != '\n' and char != '\t' and char != '\r': 
            curr += char
            if curr == context_tokenized[token_idx]:
                start = i - len(curr) + 1
                for j in range(start, i+1):
                    mapper[j] = (curr, token_idx)                
                curr = ''
                token_idx += 1
    if token_idx != len(context_tokenized): 
        return None
    return mapper

def preprocess_data(dataset, is_training=True, tokenized=True):
    """
    Extracts the info from json_data object into a pandas readable data representation (list of dicts)
    """
    
    def _tokenize(seq):
        """
        Performing tokenization to minimize errors between tokenizers and encodings.
        """
        return [t.replace("``", '"').replace("''", '"') for t in seq.split()]
    
    examples = [] # store rows of data here for qa
    
    tokenization_errors = 0
    misaligned_ans_errors = 0
    num_impossibles = 0
    num_questions = 0
    
    for article_id in tqdm(range(len(dataset['data']))): # for each context
        paragraphs = dataset['data'][article_id]['paragraphs']
        for paragraph_id in range(len(paragraphs)):
            questions = dataset['data'][article_id]['paragraphs'][paragraph_id]['qas']
            
            context = paragraphs[paragraph_id]['context']
            context_tokenized = _tokenize(context)
                    
            for qid in range(len(questions)): # loop through questions
                num_questions += 1
                
                question = questions[qid]['question']
                question_tokenized = _tokenize(question)
                qas_id = questions[qid]['id']
                
                is_impossible = questions[qid]['is_impossible']
                
                if is_impossible: # check if question is impossible to answer
                    num_impossibles += 1
                    examples.append({'qas_id': qas_id, 
                                     'question':question_tokenized if tokenized else question, 
                                     'context': context_tokenized if tokenized else context, 
                                     'answer':'', 
                                     'is_impossible': is_impossible,
                                     'start_pos': -1, 
                                     'end_pos':-1,
                                    'santiy_check': context_tokenized[-1:0]})
                    continue
                    
                # question is not impossible, continue parsing
                answers = questions[qid]['answers']
                
                for ans_id in range(len(answers)): # for each answer
                    answer = answers[ans_id]['text']
                    start_pos = answers[ans_id]['answer_start'] # inclusive start index in raw context
                    end_pos = start_pos + len(answer) #exclusive end index in raw context
                          
                    if context[start_pos:end_pos] != answer:
                        misaligned_ans_errors += 1
                        continue
                        
                    if tokenized:
                        mapper = index_converter(context, context_tokenized)
                        if mapper is None:
                            tokenization_errors += 1
                            continue
                        
                        start_pos = mapper[start_pos][1]
                        end_pos = mapper[end_pos-1][1] # inclusive
                    
                    examples.append({'qas_id': qas_id, 
                                     'question':question_tokenized if tokenized else question, 
                                     'context': context_tokenized if tokenized else context, 
                                     'answer':answer, 
                                     'is_impossible': is_impossible,
                                     'start_pos': start_pos, 
                                     'end_pos':end_pos,
                                    'santiy_check': context_tokenized[start_pos:end_pos+1] if tokenized else context[start_pos:end_pos+1]})
            
                    
    print('No. of questions:{}'.format(num_questions))
    return examples


Preprocessing of Train data

In [9]:
train_processed = preprocess_data(train_data)

100%|██████████| 442/442 [00:41<00:00, 10.77it/s]

No. of questions:130319





In [10]:
train_df = pd.DataFrame(train_processed)
train_df[:5]

Unnamed: 0,qas_id,question,context,answer,is_impossible,start_pos,end_pos,santiy_check
0,56be85543aeaaa14008c9063,"[When, did, Beyonce, start, becoming, popular?]","[Beyoncé, Giselle, Knowles-Carter, (/biːˈjɒnse...",in the late 1990s,False,39,42,"[in, the, late, 1990s]"
1,56be85543aeaaa14008c9065,"[What, areas, did, Beyonce, compete, in, when,...","[Beyoncé, Giselle, Knowles-Carter, (/biːˈjɒnse...",singing and dancing,False,28,30,"[singing, and, dancing]"
2,56be85543aeaaa14008c9066,"[When, did, Beyonce, leave, Destiny's, Child, ...","[Beyoncé, Giselle, Knowles-Carter, (/biːˈjɒnse...",2003,False,82,82,"[(2003),]"
3,56bf6b0f3aeaaa14008c9601,"[In, what, city, and, state, did, Beyonce, gro...","[Beyoncé, Giselle, Knowles-Carter, (/biːˈjɒnse...","Houston, Texas",False,22,23,"[Houston,, Texas,]"
4,56bf6b0f3aeaaa14008c9602,"[In, which, decade, did, Beyonce, become, famo...","[Beyoncé, Giselle, Knowles-Carter, (/biːˈjɒnse...",late 1990s,False,41,42,"[late, 1990s]"


In [11]:
def encode_data(processed_data, tokenizer, max_len, max_query_len):
    """
    Converts examples of data into Deberta input format tensors.
    """
    context_length_errors = 0
    encoded_data = []
    for sample in tqdm(processed_data):
        question_raw = ' '.join(sample['question'])
        context_raw = ' '.join(sample['context'])
        if len(question_raw) > max_query_len:
            question_raw = question_raw[:max_query_len]
        
        # encode the data using the tokenizer
        encoded = tokenizer.encode_plus(question_raw, context_raw,
                                        max_length=max_len,
                                        padding='max_length',
                                        truncation='only_second',
                                        return_token_type_ids=True)
        if sample['is_impossible']:
            start = -1
            end = -1
        else: # Adjust the start_pos and end_pos 
            input_ids = encoded['input_ids']
            answer_ids = tokenizer.encode(sample['answer']) 
            start, end = 0, 0 
            for i in range(len(input_ids)):
                if input_ids[i: i+len(answer_ids[1:-1])] == answer_ids[1:-1]:
                    start = i
                    end = i + len(answer_ids[1:-1]) - 1
                    break
            
        ids = encoded['input_ids']
        token_type_ids = encoded['token_type_ids']
        mask = encoded['attention_mask']
        
        assert len(ids) == max_len
        assert len(token_type_ids) == max_len
        assert len(mask) == max_len
        
        encoded_data.append({'ids': ids,
                      'token_type_ids': token_type_ids,
                      'mask': mask,
                      'start_pos': start,
                      'end_pos': end})        
    return encoded_data

In [12]:
# Model Settings
MAX_SEQ_LEN = 512
MAX_QN_LEN = 128
NO_EPOCHS = 2
BATCH_SIZE = 8
LEARNING_RATE = 5e-05
OUT = 2

In [13]:
train_encoded = encode_data(train_processed, tokenizer, MAX_SEQ_LEN, MAX_QN_LEN)

100%|██████████| 130217/130217 [03:04<00:00, 704.26it/s]


In [14]:
# Creation of Dataloader
input_ids = torch.tensor([sample['ids'] for sample in train_encoded], dtype=torch.long)
input_masks = torch.tensor([sample['mask'] for sample in train_encoded], dtype=torch.long)
segment_ids = torch.tensor([sample['token_type_ids'] for sample in train_encoded], dtype=torch.long)

start_positions = torch.tensor([sample['start_pos'] for sample in train_encoded], dtype=torch.long)
end_positions = torch.tensor([sample['end_pos'] for sample in train_encoded], dtype=torch.long)
train_dataset = TensorDataset(input_ids, input_masks, segment_ids, start_positions, end_positions)

train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(train_dataset, **train_params)

Preprocessing of Validation Data

In [15]:
val_processed = preprocess_data(val_data)

100%|██████████| 35/35 [00:09<00:00,  3.68it/s]

No. of questions:11873





In [16]:
val_df = pd.DataFrame(val_processed)

In [17]:
val_df[:5]

Unnamed: 0,qas_id,question,context,answer,is_impossible,start_pos,end_pos,santiy_check
0,56ddde6b9a695914005b9628,"[In, what, country, is, Normandy, located?]","[The, Normans, (Norman:, Nourmands;, French:, ...",France,False,26,26,[France.]
1,56ddde6b9a695914005b9628,"[In, what, country, is, Normandy, located?]","[The, Normans, (Norman:, Nourmands;, French:, ...",France,False,26,26,[France.]
2,56ddde6b9a695914005b9628,"[In, what, country, is, Normandy, located?]","[The, Normans, (Norman:, Nourmands;, French:, ...",France,False,26,26,[France.]
3,56ddde6b9a695914005b9628,"[In, what, country, is, Normandy, located?]","[The, Normans, (Norman:, Nourmands;, French:, ...",France,False,26,26,[France.]
4,56ddde6b9a695914005b9629,"[When, were, the, Normans, in, Normandy?]","[The, Normans, (Norman:, Nourmands;, French:, ...",10th and 11th centuries,False,14,17,"[10th, and, 11th, centuries]"


In [18]:
val_encoded = encode_data(val_processed, tokenizer, MAX_SEQ_LEN, MAX_QN_LEN)

100%|██████████| 26232/26232 [00:41<00:00, 629.51it/s]


In [19]:
# Creation of Dataloader
input_ids = torch.tensor([sample['ids'] for sample in val_encoded], dtype=torch.long)
input_mask = torch.tensor([sample['mask'] for sample in val_encoded], dtype=torch.long)
segment_ids = torch.tensor([sample['token_type_ids'] for sample in val_encoded], dtype=torch.long)

start_positions = torch.tensor([sample['start_pos'] for sample in val_encoded], dtype=torch.long)
end_positions = torch.tensor([sample['end_pos'] for sample in val_encoded], dtype=torch.long)

val_dataset = TensorDataset(input_ids, input_mask, segment_ids, start_positions, end_positions)

val_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }    

val_loader = DataLoader(val_dataset, **val_params)

Evaluation Metrics for Prediction

In [20]:
# Functions to calculate Evaluation Metrics
def normalize_text(s):
    """Removing articles, punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re
    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_f1(prediction, truth):
    ''' Calculates the F1 score'''
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    common_tokens = set(pred_tokens) & set(truth_tokens)

    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0

    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)

def compute_exact_match(prediction, truth):
    ''' Computes the exact match score '''
    return int(normalize_text(prediction) == normalize_text(truth))

In [21]:
def train(model, training_loader, optimizer):
    ''' Training finetunes the DEBERTA model for Question Answering'''
    print('Starting training...')
    step = 0
    counter = 0
    loss_tracker = 0
    em_score = 0
    f1_score = 0
    model.zero_grad()
    model.train()
    counter = 0
    for data in tqdm(training_loader):
        data = tuple(d.to(device) for d in data)
        inputs = {'input_ids':     data[0],
                'attention_mask':  data[1], 
                'token_type_ids':  data[2],  
                'start_positions': data[3], 
                'end_positions':   data[4]}
        outputs = model(**inputs)
        loss = outputs[0]
        loss.backward() # back propagation
        optimizer.step()
        model.zero_grad()
        starts = outputs[1]
        ends = outputs[2]
        target_starts = data[3]
        target_ends = data[4]
        
        for i,(s,e) in enumerate(zip(starts, ends)):
            start_pred = torch.argmax(s)
            end_pred = torch.argmax(e)
            predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][i][start_pred : end_pred+1]))
            predicted_answer = predicted_answer if predicted_answer!= '[CLS]' else ''
            actual_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][i][target_starts[i] : target_ends[i]+1]))
            curr_em = compute_exact_match(predicted_answer, actual_answer)
            curr_f1 = compute_f1(predicted_answer, actual_answer)
            em_score += curr_em
            f1_score += curr_f1
            counter += 1
                
        loss_tracker += loss.item()
        step += 1
        if step % 1000 == 0:
            print("Train loss: {}, Exact Match: {}, F1 Score: {}".format(loss_tracker/step, em_score/counter, f1_score/counter))

    return loss_tracker/step, em_score/counter, f1_score/counter


def validator(model, testing_loader):
    ''' Performs Prediction of the Answers'''
    print('Starting validation...')
    model.eval()
    preds = []
    targs = []
    pred_answers = []
    target_answers = []
    val_loss = 0
    step = 0
    with torch.no_grad():
        for data in tqdm(testing_loader):
            data = tuple(d.to(device) for d in data)
            inputs = {'input_ids': data[0],
                'attention_mask':  data[1], 
                'token_type_ids':  data[2],
                'start_positions': data[3], 
                'end_positions':   data[4]}
            output = model(**inputs)
            valloss = output.loss
            val_loss += valloss.item()
            starts = output[1]
            ends = output[2]
            start_preds = []
            end_preds = []
            
            target_starts = data[3]
            target_ends = data[4]
            step += 1

            for s,e in zip(starts, ends):
                start_pred = torch.argmax(s)
                start_preds.append(start_pred)
                end_pred = torch.argmax(e)
                end_preds.append(end_pred)

            for i, (s,e) in enumerate(zip(start_preds, target_ends)):
                preds.append((s,e))
                predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][i][s : e+1]))
                pred_answers.append(predicted_answer)
            
            for i, (s,e) in enumerate(zip(target_starts, target_ends)):
                targs.append((s,e))
                actual_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][i][s : e+1]))
                target_answers.append(actual_answer)
                
        em_score = 0
        f1_score = 0
        pred_answers = [item if item != '[CLS]' else '' for item in pred_answers]
        for predicted_ans, target_ans in zip(pred_answers, target_answers):
            em_score += compute_exact_match(predicted_ans, target_ans)
            f1_score += compute_f1(predicted_ans, target_ans)
        em_score /= len(pred_answers)
        f1_score /= len(pred_answers)
        val_loss /= step
    return pred_answers, target_answers, val_loss, em_score, f1_score

### Finetuning the DEBERTA Model

In [23]:
model = DebertaV2ForQuestionAnswering.from_pretrained('microsoft/deberta-v3-base')
model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForQuestionAnswering: ['mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2ForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForS

DebertaV2ForQuestionAnswering(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True

In [24]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [25]:
for epoch in range(NO_EPOCHS): #TRAINING
    loss, em, f1 = train(model, training_loader, optimizer)
    print(f'Epoch: {epoch}, Loss:  {loss}, Em: {em}, F1: {f1}') 
    pred_answers, target_answers, val_loss, em_score, f1_score = validator(model, val_loader)
    print(f'Loss:  {val_loss}, Em: {em_score}, F1: {f1_score}')
    torch.save(model, '/kaggle/working/fine_tuned_deberta'+str(epoch)+'.model')

Starting training...


  6%|▌         | 1000/16278 [10:47<2:43:57,  1.55it/s]

Train loss: 1.5775389344990254, Exact Match: 0.484125, F1 Score: 0.5465711867852185


 12%|█▏        | 2000/16278 [21:32<2:33:19,  1.55it/s]

Train loss: 1.3480776539593935, Exact Match: 0.54125, F1 Score: 0.6090345104899915


 18%|█▊        | 3000/16278 [32:18<2:22:49,  1.55it/s]

Train loss: 1.2443933578381936, Exact Match: 0.5690416666666667, F1 Score: 0.6380313044408674


 25%|██▍       | 4000/16278 [43:03<2:11:45,  1.55it/s]

Train loss: 1.178520329132676, Exact Match: 0.58590625, F1 Score: 0.6565931593594634


 31%|███       | 5000/16278 [53:48<2:01:24,  1.55it/s]

Train loss: 1.1282489785075187, Exact Match: 0.59885, F1 Score: 0.6697735285497916


 37%|███▋      | 6000/16278 [1:04:34<1:50:39,  1.55it/s]

Train loss: 1.0937820962568123, Exact Match: 0.608375, F1 Score: 0.6799716276040616


 43%|████▎     | 7000/16278 [1:15:19<1:39:39,  1.55it/s]

Train loss: 1.0701450980539833, Exact Match: 0.6140714285714286, F1 Score: 0.6860899567707766


 49%|████▉     | 8000/16278 [1:26:06<1:28:59,  1.55it/s]

Train loss: 1.047037526583299, Exact Match: 0.620140625, F1 Score: 0.692078286236876


 55%|█████▌    | 9000/16278 [1:36:51<1:18:13,  1.55it/s]

Train loss: 1.0284284226699836, Exact Match: 0.6257222222222222, F1 Score: 0.697449326431178


 61%|██████▏   | 10000/16278 [1:47:35<1:07:42,  1.55it/s]

Train loss: 1.0141583908889442, Exact Match: 0.6295625, F1 Score: 0.7013366879780171


 68%|██████▊   | 11000/16278 [1:58:21<56:42,  1.55it/s]  

Train loss: 1.0011129779907113, Exact Match: 0.6332045454545454, F1 Score: 0.7051931567654401


 74%|███████▎  | 12000/16278 [2:09:06<45:59,  1.55it/s]

Train loss: 0.9910335133718327, Exact Match: 0.6360104166666667, F1 Score: 0.708631249847345


 80%|███████▉  | 13000/16278 [2:19:51<35:13,  1.55it/s]

Train loss: 0.9795596673405514, Exact Match: 0.6394807692307692, F1 Score: 0.7122270800872295


 86%|████████▌ | 14000/16278 [2:30:37<24:30,  1.55it/s]

Train loss: 0.9709443525419171, Exact Match: 0.6418392857142857, F1 Score: 0.714845568829898


 92%|█████████▏| 15000/16278 [2:41:22<13:44,  1.55it/s]

Train loss: 0.9649412616285185, Exact Match: 0.6432916666666667, F1 Score: 0.7165685192106197


 98%|█████████▊| 16000/16278 [2:52:07<02:59,  1.55it/s]

Train loss: 0.9571916833340656, Exact Match: 0.6455390625, F1 Score: 0.718891856700808


100%|██████████| 16278/16278 [2:55:08<00:00,  1.55it/s]


Epoch: 0, Loss:  0.9549916205970754, Em: 0.646252025465185, F1: 0.7195392697451615
Starting validation...


100%|██████████| 3279/3279 [11:45<00:00,  4.64it/s]


Loss:  1.0499374188980375, Em: 0.734065263799939, F1: 0.8023998221079864
Starting training...


  6%|▌         | 1000/16278 [10:45<2:44:16,  1.55it/s]

Train loss: 0.6321305536404253, Exact Match: 0.744125, F1 Score: 0.8125204424647726


 12%|█▏        | 2000/16278 [21:30<2:33:47,  1.55it/s]

Train loss: 0.6441771030277014, Exact Match: 0.7386875, F1 Score: 0.8097124674609452


 18%|█▊        | 3000/16278 [32:15<2:22:49,  1.55it/s]

Train loss: 0.6459446301137408, Exact Match: 0.7353333333333333, F1 Score: 0.8065125858600812


 25%|██▍       | 4000/16278 [43:01<2:12:28,  1.54it/s]

Train loss: 0.6474901300808414, Exact Match: 0.7340625, F1 Score: 0.8057105448399015


 31%|███       | 5000/16278 [53:46<2:01:13,  1.55it/s]

Train loss: 0.6575181175677106, Exact Match: 0.730675, F1 Score: 0.8016704982123622


 37%|███▋      | 6000/16278 [1:04:32<1:50:35,  1.55it/s]

Train loss: 0.6574962177132256, Exact Match: 0.730625, F1 Score: 0.8022263172596079


 43%|████▎     | 7000/16278 [1:15:19<1:39:49,  1.55it/s]

Train loss: 0.6599990542444534, Exact Match: 0.7303214285714286, F1 Score: 0.8021013579401784


 49%|████▉     | 8000/16278 [1:26:04<1:29:04,  1.55it/s]

Train loss: 0.6629003320023185, Exact Match: 0.72953125, F1 Score: 0.8013490723092497


 55%|█████▌    | 9000/16278 [1:36:49<1:18:12,  1.55it/s]

Train loss: 0.6690767378379694, Exact Match: 0.727875, F1 Score: 0.8001260086111853


 61%|██████▏   | 10000/16278 [1:47:35<1:07:27,  1.55it/s]

Train loss: 0.6700510902366601, Exact Match: 0.7282125, F1 Score: 0.8000978713915511


 68%|██████▊   | 11000/16278 [1:58:20<56:45,  1.55it/s]  

Train loss: 0.6700391308186915, Exact Match: 0.7279772727272728, F1 Score: 0.8004819919479669


 74%|███████▎  | 12000/16278 [2:09:06<45:57,  1.55it/s]

Train loss: 0.6723060722986702, Exact Match: 0.7278020833333333, F1 Score: 0.7999073446640621


 80%|███████▉  | 13000/16278 [2:19:51<35:15,  1.55it/s]

Train loss: 0.6722230460935343, Exact Match: 0.7282884615384615, F1 Score: 0.8003023157490832


 86%|████████▌ | 14000/16278 [2:30:36<24:28,  1.55it/s]

Train loss: 0.6726389190218678, Exact Match: 0.7285178571428571, F1 Score: 0.8005073962823732


 92%|█████████▏| 15000/16278 [2:41:22<13:44,  1.55it/s]

Train loss: 0.6723244343568261, Exact Match: 0.7286583333333333, F1 Score: 0.8006819621363055


 98%|█████████▊| 16000/16278 [2:52:09<02:59,  1.55it/s]

Train loss: 0.6733810710380204, Exact Match: 0.7283125, F1 Score: 0.8005694650656011


100%|██████████| 16278/16278 [2:55:08<00:00,  1.55it/s]


Epoch: 1, Loss:  0.6731386496693456, Em: 0.728430235683513, F1: 0.8006643839483354
Starting validation...


100%|██████████| 3279/3279 [11:45<00:00,  4.65it/s]


Loss:  1.056676895009168, Em: 0.7269746874046965, F1: 0.7969707472718022
