In [1]:
import numpy as np
import pandas as pd
import nltk
import os
import json
import transformers
import torch
import random
from torch import cuda
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertModel, BertForQuestionAnswering

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [2]:
# Setting the random seed for consistent results
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)

In [3]:
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
device

'cuda'

### Loading the data

In [5]:
# Loading train data
train_file = open('/kaggle/input/squad-2/train-v2.0.json')
train_data = json.load(train_file)

# Loading validation data
val_file = open('/kaggle/input/squad-2/dev-v2.0.json')
val_data = json.load(val_file)

### Preprocessing the dataset

In [6]:
# Using the pretrained tokenizer for BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
def index_converter(context, context_tokenized):
    """
    Maps start str index to a tokenized index
    """
    mapper = {}
    curr = ''
    token_idx = 0
    for i, char in enumerate(context):
        if char != ' ' and char != '\n' and char != '\t' and char != '\r': 
            curr += char
            if curr == context_tokenized[token_idx]:
                start = i - len(curr) + 1
                for j in range(start, i+1):
                    mapper[j] = (curr, token_idx)                
                curr = ''
                token_idx += 1
    if token_idx != len(context_tokenized):
        return None
    return mapper

def preprocess_data(dataset, is_training=True, tokenized=True):
    """
    Extracts the info from json_data object into a pandas readable data representation (list of dicts)
    """
    
    def _tokenize(seq):
        """
        Performing tokenization to minimize errors between tokenizers and encodings.
        """
        return [t.replace("``", '"').replace("''", '"') for t in seq.split()]
    
    examples = [] 
    
    tokenization_errors = 0
    misaligned_ans_errors = 0
    num_impossibles = 0
    num_questions = 0
    
    for article_id in tqdm(range(len(dataset['data']))): 
        paragraphs = dataset['data'][article_id]['paragraphs']
        for paragraph_id in range(len(paragraphs)):
            questions = dataset['data'][article_id]['paragraphs'][paragraph_id]['qas']
            
            context = paragraphs[paragraph_id]['context']
            context_tokenized = _tokenize(context)
                    
            for qid in range(len(questions)): 
                num_questions += 1
                
                question = questions[qid]['question']
                question_tokenized = _tokenize(question)
                qas_id = questions[qid]['id']
                
                is_impossible = questions[qid]['is_impossible']
                
                if is_impossible: 
                    num_impossibles += 1
                    examples.append({'qas_id': qas_id, 
                                     'question':question_tokenized if tokenized else question, 
                                     'context': context_tokenized if tokenized else context, 
                                     'answer':'', 
                                     'is_impossible': is_impossible,
                                     'start_pos': -1, 
                                     'end_pos':-1,
                                    'santiy_check': context_tokenized[-1:0]})
                    continue
                    
                answers = questions[qid]['answers']
                
                for ans_id in range(len(answers)):
                    answer = answers[ans_id]['text']
                    start_pos = answers[ans_id]['answer_start'] 
                    end_pos = start_pos + len(answer) 
                          
                    if context[start_pos:end_pos] != answer:
                        misaligned_ans_errors += 1
                        continue
                        
                    if tokenized:
                        mapper = index_converter(context, context_tokenized)
                        if mapper is None:
                            tokenization_errors += 1
                            continue
                        
                        start_pos = mapper[start_pos][1]
                        end_pos = mapper[end_pos-1][1] 
                    
                    examples.append({'qas_id': qas_id, 
                                     'question':question_tokenized if tokenized else question, 
                                     'context': context_tokenized if tokenized else context, 
                                     'answer':answer, 
                                     'is_impossible': is_impossible,
                                     'start_pos': start_pos, 
                                     'end_pos':end_pos,
                                    'santiy_check': context_tokenized[start_pos:end_pos+1] if tokenized else context[start_pos:end_pos+1]})
            
                    
    print('No. of questions:{}'.format(num_questions))
    return examples

Preprocessing of Train data

In [8]:
train_processed = preprocess_data(train_data)

100%|██████████| 442/442 [00:41<00:00, 10.69it/s]

No. of questions:130319





In [9]:
train_df = pd.DataFrame(train_processed)
train_df[:5]

Unnamed: 0,qas_id,question,context,answer,is_impossible,start_pos,end_pos,santiy_check
0,56be85543aeaaa14008c9063,"[When, did, Beyonce, start, becoming, popular?]","[Beyoncé, Giselle, Knowles-Carter, (/biːˈjɒnse...",in the late 1990s,False,39,42,"[in, the, late, 1990s]"
1,56be85543aeaaa14008c9065,"[What, areas, did, Beyonce, compete, in, when,...","[Beyoncé, Giselle, Knowles-Carter, (/biːˈjɒnse...",singing and dancing,False,28,30,"[singing, and, dancing]"
2,56be85543aeaaa14008c9066,"[When, did, Beyonce, leave, Destiny's, Child, ...","[Beyoncé, Giselle, Knowles-Carter, (/biːˈjɒnse...",2003,False,82,82,"[(2003),]"
3,56bf6b0f3aeaaa14008c9601,"[In, what, city, and, state, did, Beyonce, gro...","[Beyoncé, Giselle, Knowles-Carter, (/biːˈjɒnse...","Houston, Texas",False,22,23,"[Houston,, Texas,]"
4,56bf6b0f3aeaaa14008c9602,"[In, which, decade, did, Beyonce, become, famo...","[Beyoncé, Giselle, Knowles-Carter, (/biːˈjɒnse...",late 1990s,False,41,42,"[late, 1990s]"


In [11]:
def encode_data(processed_data, tokenizer, max_len, max_query_len):
    """
    Converts examples of data into BERT input format tensors.
    """
    context_length_errors = 0
    encoded_data = []
    for sample in tqdm(processed_data):
        question_raw = ' '.join(sample['question'])
        context_raw = ' '.join(sample['context'])
        if len(question_raw) > max_query_len:
            question_raw = question_raw[:max_query_len]
        
        # encode the data using the tokenizer
        encoded = tokenizer.encode_plus(question_raw, context_raw,
                                        max_length=max_len,
                                        padding='max_length',
                                        truncation='only_second',
                                        return_token_type_ids=True)
        if sample['is_impossible']:
            start = -1
            end = -1
        else:
            input_ids = encoded['input_ids']
            answer_ids = tokenizer.encode(sample['answer']) 
            start, end = 0, 0 
            for i in range(len(input_ids)):
                if input_ids[i: i+len(answer_ids[1:-1])] == answer_ids[1:-1]:
                    start = i
                    end = i + len(answer_ids[1:-1]) - 1
                    break
            
        ids = encoded['input_ids']
        token_type_ids = encoded['token_type_ids']
        mask = encoded['attention_mask']
        
        assert len(ids) == max_len
        assert len(token_type_ids) == max_len
        assert len(mask) == max_len
        
        encoded_data.append({'ids': ids,
                      'token_type_ids': token_type_ids,
                      'mask': mask,
                      'start_pos': start,
                      'end_pos': end})        
    return encoded_data

In [27]:
# Model Settings
MAX_SEQ_LEN = 512
MAX_QN_LEN = 128
NO_EPOCHS = 3
BATCH_SIZE = 8
LEARNING_RATE = 5e-05
OUT = 2

In [13]:
train_encoded = encode_data(train_processed, tokenizer, MAX_SEQ_LEN, MAX_QN_LEN)

100%|██████████| 130217/130217 [12:47<00:00, 169.63it/s]


In [14]:
# Creation of Dataloader
input_ids = torch.tensor([sample['ids'] for sample in train_encoded], dtype=torch.long)
input_masks = torch.tensor([sample['mask'] for sample in train_encoded], dtype=torch.long)
segment_ids = torch.tensor([sample['token_type_ids'] for sample in train_encoded], dtype=torch.long)

start_positions = torch.tensor([sample['start_pos'] for sample in train_encoded], dtype=torch.long)
end_positions = torch.tensor([sample['end_pos'] for sample in train_encoded], dtype=torch.long)
train_dataset = TensorDataset(input_ids, input_masks, segment_ids, start_positions, end_positions)

train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(train_dataset, **train_params)

Preprocessing of Validation Data

In [15]:
val_processed = preprocess_data(val_data)

100%|██████████| 35/35 [00:09<00:00,  3.63it/s]

No. of questions:11873





In [16]:
val_df = pd.DataFrame(val_processed)

In [17]:
val_df[:5]

Unnamed: 0,qas_id,question,context,answer,is_impossible,start_pos,end_pos,santiy_check
0,56ddde6b9a695914005b9628,"[In, what, country, is, Normandy, located?]","[The, Normans, (Norman:, Nourmands;, French:, ...",France,False,26,26,[France.]
1,56ddde6b9a695914005b9628,"[In, what, country, is, Normandy, located?]","[The, Normans, (Norman:, Nourmands;, French:, ...",France,False,26,26,[France.]
2,56ddde6b9a695914005b9628,"[In, what, country, is, Normandy, located?]","[The, Normans, (Norman:, Nourmands;, French:, ...",France,False,26,26,[France.]
3,56ddde6b9a695914005b9628,"[In, what, country, is, Normandy, located?]","[The, Normans, (Norman:, Nourmands;, French:, ...",France,False,26,26,[France.]
4,56ddde6b9a695914005b9629,"[When, were, the, Normans, in, Normandy?]","[The, Normans, (Norman:, Nourmands;, French:, ...",10th and 11th centuries,False,14,17,"[10th, and, 11th, centuries]"


In [18]:
val_encoded = encode_data(val_processed, tokenizer, MAX_SEQ_LEN, MAX_QN_LEN)

100%|██████████| 26232/26232 [02:46<00:00, 157.98it/s]


In [19]:
# Creation of Dataloader
input_ids = torch.tensor([sample['ids'] for sample in val_encoded], dtype=torch.long)
input_mask = torch.tensor([sample['mask'] for sample in val_encoded], dtype=torch.long)
segment_ids = torch.tensor([sample['token_type_ids'] for sample in val_encoded], dtype=torch.long)

start_positions = torch.tensor([sample['start_pos'] for sample in val_encoded], dtype=torch.long)
end_positions = torch.tensor([sample['end_pos'] for sample in val_encoded], dtype=torch.long)

val_dataset = TensorDataset(input_ids, input_mask, segment_ids, start_positions, end_positions)

val_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }    

val_loader = DataLoader(val_dataset, **val_params)

Evaluation Metrics for Prediction

In [20]:
# Functions to calculate Evaluation Metrics
def normalize_text(s):
    """Removing articles, punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re
    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_f1(prediction, truth):
    ''' Calculates the F1 score'''
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    common_tokens = set(pred_tokens) & set(truth_tokens)

    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0

    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)

def compute_exact_match(prediction, truth):
    ''' Computes the exact match score '''
    return int(normalize_text(prediction) == normalize_text(truth))

In [21]:
def train(model, training_loader, optimizer):
    ''' Training finetunes the BERT model for Question Answering'''
    print('Starting training...')
    step = 0
    counter = 0
    loss_tracker = 0
    em_score = 0
    f1_score = 0
    model.zero_grad()
    model.train()
    counter = 0
    for data in tqdm(training_loader):
        data = tuple(d.to(device) for d in data)
        inputs = {'input_ids':     data[0],
                'attention_mask':  data[1], 
                'token_type_ids':  data[2],  
                'start_positions': data[3], 
                'end_positions':   data[4]}
        outputs = model(**inputs)
        loss = outputs[0]
        loss.backward() # back propagation
        optimizer.step()
        model.zero_grad()
        
        starts = outputs[1]
        ends = outputs[2]
        target_starts = data[3]
        target_ends = data[4]
        
        for i,(s,e) in enumerate(zip(starts, ends)):
            start_pred = torch.argmax(s)
            end_pred = torch.argmax(e)
            predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][i][start_pred : end_pred+1]))
            predicted_answer = predicted_answer if predicted_answer!= '[CLS]' else ''
            actual_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][i][target_starts[i] : target_ends[i]+1]))
            curr_em = compute_exact_match(predicted_answer, actual_answer)
            curr_f1 = compute_f1(predicted_answer, actual_answer)
            em_score += curr_em
            f1_score += curr_f1
            counter += 1
                
        loss_tracker += loss.item()
        step += 1
        if step % 1000 == 0:
            print("Train loss: {}, Exact Match: {}, F1 Score: {}".format(loss_tracker/step, em_score/counter, f1_score/counter))

    return loss_tracker/step, em_score/counter, f1_score/counter


def validator(model, testing_loader):
    ''' Performs Prediction of the Answers'''
    print('Starting validation...')
    model.eval()
    preds = []
    targs = []
    pred_answers = []
    target_answers = []
    val_loss = 0
    step = 0
    with torch.no_grad():
        for data in tqdm(testing_loader):
            data = tuple(d.to(device) for d in data)
            inputs = {'input_ids': data[0],
                'attention_mask':  data[1], 
                'token_type_ids':  data[2],
                'start_positions': data[3], 
                'end_positions':   data[4]}
            output = model(**inputs)
            valloss = output.loss
            val_loss += valloss.item()
            starts = output[1]
            ends = output[2]
            start_preds = []
            end_preds = []
            
            target_starts = data[3]
            target_ends = data[4]
            step += 1

            for s,e in zip(starts, ends):
                start_pred = torch.argmax(s)
                start_preds.append(start_pred)
                end_pred = torch.argmax(e)
                end_preds.append(end_pred)

            for i, (s,e) in enumerate(zip(start_preds, target_ends)):
                preds.append((s,e))
                predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][i][s : e+1]))
                pred_answers.append(predicted_answer)
            
            for i, (s,e) in enumerate(zip(target_starts, target_ends)):
                targs.append((s,e))
                actual_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][i][s : e+1]))
                target_answers.append(actual_answer)
                
        em_score = 0
        f1_score = 0
        pred_answers = [item if item != '[CLS]' else '' for item in pred_answers]
        for predicted_ans, target_ans in zip(pred_answers, target_answers):
            em_score += compute_exact_match(predicted_ans, target_ans)
            f1_score += compute_f1(predicted_ans, target_ans)
        em_score /= len(pred_answers)
        f1_score /= len(pred_answers)
        val_loss /= step
    return pred_answers, target_answers, val_loss, em_score, f1_score

### Finetuning the BERT Model

In [28]:
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_out

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

In [29]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [30]:
for epoch in range(NO_EPOCHS): #TRAINING
    loss, em, f1 = train(model, training_loader, optimizer)
    print(f'Epoch: {epoch}, Loss:  {loss}, Em: {em}, F1: {f1}') 
    pred_answers, target_answers, val_loss, em_score, f1_score = validator(model, val_loader)
    print(f'Loss:  {val_loss}, Em: {em_score}, F1: {f1_score}')
    torch.save(model, '/kaggle/working/fine_tuned_bert'+str(epoch)+'.model')

Starting training...


  6%|▌         | 1000/16278 [07:08<1:49:44,  2.32it/s]

Train loss: 2.282231767475605, Exact Match: 0.34475, F1 Score: 0.37329433893801844


 12%|█▏        | 2000/16278 [14:16<1:41:43,  2.34it/s]

Train loss: 1.9946906406581402, Exact Match: 0.3859375, F1 Score: 0.42709518107251604


 18%|█▊        | 3000/16278 [21:24<1:34:31,  2.34it/s]

Train loss: 1.8378129691978295, Exact Match: 0.42075, F1 Score: 0.46884801590195196


 25%|██▍       | 4000/16278 [28:32<1:27:24,  2.34it/s]

Train loss: 1.7441270611435176, Exact Match: 0.44553125, F1 Score: 0.49669636994916966


 31%|███       | 5000/16278 [35:40<1:20:44,  2.33it/s]

Train loss: 1.6684530946552754, Exact Match: 0.463325, F1 Score: 0.5179631096392409


 37%|███▋      | 6000/16278 [42:48<1:13:38,  2.33it/s]

Train loss: 1.61426243190219, Exact Match: 0.4772708333333333, F1 Score: 0.5336061133430452


 43%|████▎     | 7000/16278 [49:56<1:06:00,  2.34it/s]

Train loss: 1.5703584958463908, Exact Match: 0.4875535714285714, F1 Score: 0.546194565218156


 49%|████▉     | 8000/16278 [57:03<59:01,  2.34it/s]  

Train loss: 1.5347517072912304, Exact Match: 0.496921875, F1 Score: 0.557066659578646


 55%|█████▌    | 9000/16278 [1:04:11<51:44,  2.34it/s]

Train loss: 1.5062061384816965, Exact Match: 0.5054444444444445, F1 Score: 0.5667057401208434


 61%|██████▏   | 10000/16278 [1:11:19<44:43,  2.34it/s]

Train loss: 1.479998749050498, Exact Match: 0.5133625, F1 Score: 0.5757322386751693


 68%|██████▊   | 11000/16278 [1:18:26<37:32,  2.34it/s]

Train loss: 1.4570404150485992, Exact Match: 0.5208409090909091, F1 Score: 0.5837494284224527


 74%|███████▎  | 12000/16278 [1:25:33<30:23,  2.35it/s]

Train loss: 1.4343435669491689, Exact Match: 0.5272916666666667, F1 Score: 0.5909573161453922


 80%|███████▉  | 13000/16278 [1:32:40<23:15,  2.35it/s]

Train loss: 1.4155058127143063, Exact Match: 0.5333173076923077, F1 Score: 0.597630029362828


 86%|████████▌ | 14000/16278 [1:39:47<16:13,  2.34it/s]

Train loss: 1.3985273746991795, Exact Match: 0.5385535714285714, F1 Score: 0.6030527320380964


 92%|█████████▏| 15000/16278 [1:46:54<09:05,  2.34it/s]

Train loss: 1.383756852464378, Exact Match: 0.5432666666666667, F1 Score: 0.6079431631221158


 98%|█████████▊| 16000/16278 [1:54:01<01:58,  2.34it/s]

Train loss: 1.3711132315299475, Exact Match: 0.547328125, F1 Score: 0.6123605643434512


100%|██████████| 16278/16278 [1:55:59<00:00,  2.34it/s]


Epoch: 0, Loss:  1.3668032011816134, Em: 0.5487916324289456, F1: 0.6139140230537155
Starting validation...


100%|██████████| 3279/3279 [07:30<00:00,  7.27it/s]


Loss:  1.406473960758128, Em: 0.7075709057639524, F1: 0.784692355549256
Starting training...


  6%|▌         | 1000/16278 [07:06<1:48:35,  2.34it/s]

Train loss: 0.8731408924013376, Exact Match: 0.691, F1 Score: 0.7610322948246152


 12%|█▏        | 2000/16278 [14:13<1:42:12,  2.33it/s]

Train loss: 0.8843289804905653, Exact Match: 0.6865625, F1 Score: 0.7552132015151011


 18%|█▊        | 3000/16278 [21:20<1:34:23,  2.34it/s]

Train loss: 0.8921370060046514, Exact Match: 0.6824583333333333, F1 Score: 0.7525893421843761


 25%|██▍       | 4000/16278 [28:27<1:27:43,  2.33it/s]

Train loss: 0.8933067723847926, Exact Match: 0.6809375, F1 Score: 0.7509995769358456


 31%|███       | 5000/16278 [35:34<1:20:01,  2.35it/s]

Train loss: 0.8988183406502008, Exact Match: 0.67855, F1 Score: 0.7489583515213385


 37%|███▋      | 6000/16278 [42:40<1:13:00,  2.35it/s]

Train loss: 0.9028703393985827, Exact Match: 0.6775833333333333, F1 Score: 0.7480476812290577


 43%|████▎     | 7000/16278 [49:47<1:06:01,  2.34it/s]

Train loss: 0.9051407993063331, Exact Match: 0.6765178571428572, F1 Score: 0.7472294889343474


 49%|████▉     | 8000/16278 [56:54<58:46,  2.35it/s]  

Train loss: 0.9065439669527113, Exact Match: 0.676234375, F1 Score: 0.746985553110275


 55%|█████▌    | 9000/16278 [1:04:01<51:46,  2.34it/s]

Train loss: 0.909068475385093, Exact Match: 0.6760555555555555, F1 Score: 0.7467502164045586


 61%|██████▏   | 10000/16278 [1:11:07<44:37,  2.34it/s]

Train loss: 0.9137957805242389, Exact Match: 0.675, F1 Score: 0.7457708947286688


 68%|██████▊   | 11000/16278 [1:18:14<37:36,  2.34it/s]

Train loss: 0.9176167923696339, Exact Match: 0.6737954545454545, F1 Score: 0.7451286687859338


 74%|███████▎  | 12000/16278 [1:25:21<30:22,  2.35it/s]

Train loss: 0.9190072591134036, Exact Match: 0.6731979166666666, F1 Score: 0.7448398079628076


 80%|███████▉  | 13000/16278 [1:32:28<23:31,  2.32it/s]

Train loss: 0.9205838582019966, Exact Match: 0.6726634615384616, F1 Score: 0.7446193709139647


 86%|████████▌ | 14000/16278 [1:39:35<16:12,  2.34it/s]

Train loss: 0.9211886426378042, Exact Match: 0.6719910714285714, F1 Score: 0.7441936250271451


 92%|█████████▏| 15000/16278 [1:46:41<09:04,  2.35it/s]

Train loss: 0.9199364594079554, Exact Match: 0.6727083333333334, F1 Score: 0.7445682491404805


 98%|█████████▊| 16000/16278 [1:53:48<01:58,  2.35it/s]

Train loss: 0.9208229562982451, Exact Match: 0.6726875, F1 Score: 0.7445304567503587


100%|██████████| 16278/16278 [1:55:47<00:00,  2.34it/s]


Epoch: 1, Loss:  0.9212553432388184, Em: 0.6726233901871491, F1: 0.7445111768903623
Starting validation...


100%|██████████| 3279/3279 [07:30<00:00,  7.28it/s]


Loss:  1.4807406994455943, Em: 0.7072659347362, F1: 0.7881763960798981
Starting training...


  6%|▌         | 1000/16278 [07:06<1:48:34,  2.35it/s]

Train loss: 0.6576093506366014, Exact Match: 0.749, F1 Score: 0.819017798982742


 12%|█▏        | 2000/16278 [14:13<1:41:32,  2.34it/s]

Train loss: 0.6650560670793056, Exact Match: 0.7508125, F1 Score: 0.8194763784716034


 18%|█▊        | 3000/16278 [21:19<1:34:23,  2.34it/s]

Train loss: 0.6732152291921277, Exact Match: 0.7499166666666667, F1 Score: 0.8185821247727584


 25%|██▍       | 4000/16278 [28:26<1:27:15,  2.34it/s]

Train loss: 0.6783229616740718, Exact Match: 0.74625, F1 Score: 0.8163967383779209


 31%|███       | 5000/16278 [35:33<1:20:03,  2.35it/s]

Train loss: 0.6811044192768634, Exact Match: 0.74485, F1 Score: 0.8156241759042109


 37%|███▋      | 6000/16278 [42:39<1:13:31,  2.33it/s]

Train loss: 0.6879984465048958, Exact Match: 0.7428125, F1 Score: 0.8133081355861201


 43%|████▎     | 7000/16278 [49:46<1:05:55,  2.35it/s]

Train loss: 0.6931227269491979, Exact Match: 0.7404464285714286, F1 Score: 0.8111117145159815


 49%|████▉     | 8000/16278 [56:53<58:58,  2.34it/s]  

Train loss: 0.6979584510289133, Exact Match: 0.738859375, F1 Score: 0.8092225085280147


 55%|█████▌    | 9000/16278 [1:04:00<51:54,  2.34it/s]

Train loss: 0.7041452924439476, Exact Match: 0.7378194444444445, F1 Score: 0.80802537723399


 61%|██████▏   | 10000/16278 [1:11:07<44:36,  2.35it/s]

Train loss: 0.7071864727382549, Exact Match: 0.7372, F1 Score: 0.807254296053915


 68%|██████▊   | 11000/16278 [1:18:13<37:27,  2.35it/s]

Train loss: 0.7106148911701854, Exact Match: 0.7364886363636364, F1 Score: 0.8066543952756381


 74%|███████▎  | 12000/16278 [1:25:20<30:21,  2.35it/s]

Train loss: 0.7133849491462266, Exact Match: 0.7354479166666666, F1 Score: 0.8055326833068611


 80%|███████▉  | 13000/16278 [1:32:27<23:20,  2.34it/s]

Train loss: 0.7158640698352828, Exact Match: 0.7338461538461538, F1 Score: 0.8041727523210459


 86%|████████▌ | 14000/16278 [1:39:34<16:11,  2.35it/s]

Train loss: 0.7192979649319313, Exact Match: 0.7325982142857143, F1 Score: 0.8029281418085091


 92%|█████████▏| 15000/16278 [1:46:40<09:04,  2.35it/s]

Train loss: 0.7229219135523464, Exact Match: 0.7316333333333334, F1 Score: 0.8020854109698002


 98%|█████████▊| 16000/16278 [1:53:47<01:58,  2.34it/s]

Train loss: 0.7263534528264427, Exact Match: 0.7307421875, F1 Score: 0.8011533117438701


100%|██████████| 16278/16278 [1:55:45<00:00,  2.34it/s]


Epoch: 2, Loss:  0.7267978676231602, Em: 0.7304883386961764, F1: 0.800965612503017
Starting validation...


100%|██████████| 3279/3279 [07:33<00:00,  7.23it/s]


Loss:  1.5756120255359538, Em: 0.6971637694419031, F1: 0.7748366773633778
