In [1]:
# 1. "AnswerPosition" shows where the first character of the answer is located in the passage. The position of the last character can be computed:

# 2. There are multiple questions for a given context. Make sure to split the questions and answers.
#Team - 12 : 2001 to 3000
# Validation data: Use the following indices

# 9400 to 9599

# Test data: Use the following indices

# 9600 to 9749


In [2]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import torch
import pandas as pd
import numpy as np
import re
from datasets import load_metric
import torch.nn as nn
from transformers import AutoTokenizer,AutoModel, TrainingArguments, Trainer, DefaultDataCollator
from tqdm.auto import tqdm
import collections




In [3]:
df = pd.read_csv("datasets0/Question-answering.csv")

train_df = df.iloc[1:9001]
val_df = df.iloc[9400:9600]
test_df = df.iloc[9600:9750]

def get_qa_context_pairs(qa_set):
    questions = []
    answers = []
    answers_start_id = []
    qa_set = qa_set.replace('"\\','')
    qa_set = qa_set.replace('\\"','')
    splitted_words  = qa_set.split('"')
    question_indices = [i for i, x in enumerate(splitted_words) if x == "Question"]
    answer_indices = [i for i, x in enumerate(splitted_words) if x == "Answers"]
    answer_start_indices = [i for i, x in enumerate(splitted_words) if x == "AnswerPositions"]

    questions = [splitted_words[i+2] for i in question_indices]
    answers = [splitted_words[i+2] for i in answer_indices]
    if "\\" in answers:
        print(qa_set)
    answers_start_id = [splitted_words[i+1] for i in answer_start_indices]
    answers_start_id = [''.join(i for i in s if i.isdigit()) for s in answers_start_id]
    
    return questions,answers,answers_start_id

In [4]:
def get_encodings(df):
    contexts = []
    questions_full = []
    answers_full = []
    for i in range(len(df)):
        questions,answers,answers_start_id = get_qa_context_pairs(df.iloc[i]['QuestionAnswerSets'])
        for j in range(len(questions)):
            contexts.append(df["Context"].iloc[i])
            questions_full.append(questions[j])
            answers_full.append({"text":answers[j],"answer_start":int(answers_start_id[j])})

    return contexts,questions_full,answers_full

In [5]:
train_contexts,train_questions,train_answers = get_encodings(train_df)
val_contexts,val_questions,val_answers = get_encodings(val_df)
test_contexts,test_questions,test_answers = get_encodings(test_df)

In [6]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # sometimes squad answers are off by a character or two – fix this
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx-1:end_idx-1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1     # When the gold label is off by one character
        elif context[start_idx-2:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2     # When the gold label is off by two characters
        else:
            answer['answer_end'] = end_idx
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)
add_end_idx(test_answers, test_contexts)

In [7]:
train_examples ={
    "question":train_questions,
    "context":train_contexts,
    "answers":train_answers
}
val_examples = {
    "question":val_questions,
    "context":val_contexts,
    "answers":val_answers
}
test_examples = {
    "question":test_questions,
    "context":test_contexts,
    "answers":test_answers
}

In [8]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [9]:
pad_on_right = tokenizer.padding_side == "right"
max_length = 384 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.

In [10]:
def prepare_train_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    print(len(tokenized_examples["input_ids"]))
    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]

        start_char = answers["answer_start"]
        end_char = start_char + len(answers["text"])

        # Start token index of the current span in the text.
        token_start_index = 0
        while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
            token_start_index += 1

        # End token index of the current span in the text.
        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
            token_end_index -= 1

        # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
        if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
            # Note: we could go after the last offset if the answer is the last word (edge case).
            while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                token_start_index += 1
            tokenized_examples["start_positions"].append(token_start_index - 1)
            while offsets[token_end_index][1] >= end_char:
                token_end_index -= 1
            tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [11]:
train_encodings = prepare_train_features(train_examples)
val_encodings = prepare_train_features(val_examples)
test_encodings = prepare_train_features(test_examples)

42058
1323
647


In [12]:
index = 31
start_position,end_position = train_encodings["start_positions"][index],train_encodings["end_positions"][index]
print(train_examples["question"][index],train_examples["answers"][index])
print(tokenizer.decode(train_encodings["input_ids"][index]))
tokenizer.decode(train_encodings["input_ids"][index][start_position:end_position+1])

In what year was the Theodore M. Hesburgh Library at Notre Dame finished? {'text': '1963', 'answer_start': 188, 'answer_end': 192}
[CLS] in what year was the theodore m. hesburgh library at notre dame finished? [SEP] the library system of the university is divided between the main library and each of the colleges and schools. the main building is the 14 - story theodore m. hesburgh library, completed in 1963, which is the third building to house the main collection of books. the front of the library is adorned with the word of life mural designed by artist millard sheets. this mural is popularly known as " touchdown jesus " because of its proximity to notre dame stadium and jesus'arms appearing to make the signal for a touchdown. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [P

'1963'

In [13]:
index = 67
start_position,end_position = val_encodings["start_positions"][index],val_encodings["end_positions"][index]
print(val_examples["question"][index],val_examples["answers"][index],val_examples["context"][index])
#print(tokenizer.decode(val_encodings["input_ids"][index]))
tokenizer.decode(val_encodings["input_ids"][index][start_position:end_position+1])

When did the second terminal open at the Mexico City Airport? {'text': '2007', 'answer_start': 168, 'answer_end': 172} In the Mexico City airport, the government engaged in an extensive restructuring program that includes the addition of a new second terminal, which began operations in 2007, and the enlargement of four other airports (at the nearby cities of Toluca, Querétaro, Puebla and Cuernavaca) that, along with Mexico City's airport, comprise the Grupo Aeroportuario del Valle de México, distributing traffic to different regions in Mexico. The city of Pachuca will also provide additional expansion to central Mexico's airport network. Mexico City's airport is the main hub for 11 of the 21 national airline companies.


'2007'

In [14]:
class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)
test_dataset = SquadDataset(test_encodings)

In [15]:
epochs = 3

In [16]:
# Create the BertClassfier class
class BertQA(nn.Module):
    """Bert Model for Classification Tasks.
    """
    def __init__(self, freeze_bert=False):
        """
        @param    bert: a BertModel object
        @param    classifier: a torch.nn.Module classifier
        @param    freeze_bert (bool): Set `False` to fine-tune the BERT model
        """
        super(BertQA, self).__init__()
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = 768, 50, 2

        # Instantiate BERT model
        self.bert = AutoModel.from_pretrained("bert-base-uncased")
        # Instantiate an one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            #nn.Dropout(0.5),
            nn.Linear(H, D_out)
        )

        # self.classifier = nn.Sequential(
        #     nn.Linear(D_in, D_out),
        # )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        
        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0]
        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)
        return logits

model = BertQA(freeze_bert=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
loss_fn = nn.CrossEntropyLoss()

In [18]:
from torch.utils.data import DataLoader
from transformers import AdamW
import time
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()
batch_size = 8

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=10)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=10)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=10)

optim = AdamW(model.parameters(), lr=2e-5)
whole_train_eval_time = time.time()

train_losses = []
val_losses = []

print_every = 100



In [20]:


for epoch in range(epochs):
  epoch_time = time.time()

  # Set model in train mode
  model.train()
    
  loss_of_epoch = 0

  print("############Train############")

  for batch_idx,batch in enumerate(train_loader): 
    optim.zero_grad()
    
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_positions = batch['start_positions'].to(device)
    end_positions = batch['end_positions'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask)
    start_logits, end_logits = outputs.split(1, dim=-1)
    
    start_logits = start_logits.squeeze(-1)
    end_logits = end_logits.squeeze(-1)
    
    start_loss = loss_fn(start_logits, start_positions)
    end_loss = loss_fn(end_logits, end_positions)
    loss = (start_loss + end_loss) / 2
    
    # do a backwards pass 
    loss.backward()
    # update the weights
    optim.step()
    # Find the total loss
    loss_of_epoch += loss.item()

    if (batch_idx+1) % print_every == 0:
      print("Batch {:} / {:}".format(batch_idx+1,len(train_loader)),"\nLoss:", round(loss.item(),1),"\n")

  loss_of_epoch /= len(train_loader)
  train_losses.append(loss_of_epoch)

  ##########Evaluation##################

  # Set model in evaluation mode
  model.eval()

  print("############Evaluate############")

  loss_of_epoch = 0

  for batch_idx,batch in enumerate(val_loader):
    
    with torch.no_grad():

      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      start_positions = batch['start_positions'].to(device)
      end_positions = batch['end_positions'].to(device)
      
      outputs = model(input_ids, attention_mask=attention_mask)
      start_logits, end_logits = outputs.split(1, dim=-1)

      start_logits = start_logits.squeeze(-1)
      end_logits = end_logits.squeeze(-1)
      start_loss = loss_fn(start_logits, start_positions)
      end_loss = loss_fn(end_logits, end_positions)
      loss = (start_loss + end_loss) / 2
    
      loss_of_epoch += loss.item()

    if (batch_idx+1) % print_every == 0:
       print("Batch {:} / {:}".format(batch_idx+1,len(val_loader)),"\nLoss:", round(loss.item(),1),"\n")

  loss_of_epoch /= len(val_loader)
  val_losses.append(loss_of_epoch)

  # Print each epoch's time and train/val loss 
  print("\n-------Epoch ", epoch+1,
        "-------"
        "\nTraining Loss:", train_losses[-1],
        "\nValidation Loss:", val_losses[-1],
        "\nTime: ",(time.time() - epoch_time),
        "\n-----------------------",
        "\n\n")

print("Total training and evaluation time: ", (time.time() - whole_train_eval_time))

############Train############
Batch 100 / 5258 
Loss: 1.8 

Batch 200 / 5258 
Loss: 1.0 

Batch 300 / 5258 
Loss: 1.7 



KeyboardInterrupt: 

In [63]:
loss_of_epoch = 0
start_logits_all = []
end_logits_all = []
for batch_idx,batch in enumerate(test_loader):
    with torch.no_grad():

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        start_logits, end_logits = outputs.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)
        start_logits_all.append(start_logits)
        end_logits_all.append(end_logits)


In [64]:
start_logits_all = torch.concat(start_logits_all, dim=0).cpu().numpy()
end_logits_all = torch.concat(end_logits_all, dim=0).cpu().numpy()

In [40]:
def prepare_validation_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # We keep the example_id that gave us this feature and we will store the offset mappings.
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if pad_on_right else 0

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
        # position is part of the context or not.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

In [41]:
test_examples["id"]= [str(i) for i in range(len(test_examples["question"]))]

In [43]:
test_features = prepare_validation_features(test_examples)

In [65]:
from tqdm.auto import tqdm

def postprocess_qa_predictions(examples, features, all_start_logits,all_end_logits, n_best_size = 20, max_answer_length = 30):
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionaries we have to fill.
    predictions = collections.OrderedDict()

    # Logging.
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        min_null_score = None # Only used if squad_v2 is True.
        valid_answers = []
        
        context = example["context"]
        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map some the positions in our logits to span of texts in the original
            # context.
            offset_mapping = features[feature_index]["offset_mapping"]

            # Update minimum null prediction.
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )
        
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
            # failure.
            best_answer = {"text": "", "score": 0.0}
        
        predictions[example["id"]] = best_answer["text"]
        
    return predictions

In [58]:
type(dict(test_features))

dict

In [61]:
from datasets import Dataset
test_features_ds = Dataset.from_dict(dict(test_features))
test_examples_ds = Dataset.from_dict(dict(test_examples))

In [66]:
predictions = postprocess_qa_predictions(test_examples_ds, test_features_ds, start_logits_all, end_logits_all)

Post-processing 642 example predictions split into 647 features.


  0%|          | 0/642 [00:00<?, ?it/s]

In [79]:
metric = load_metric("squad")
formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
references = []
for ex in test_examples_ds:
    ex["answers"]["text"] = [ex["answers"]["text"]]
    ex["answers"]["answer_start"] = [ex["answers"]["answer_start"]]
    ex["answers"].pop("answer_end", None)
    references.append({"id": ex["id"], "answers": ex["answers"]})

In [80]:
metric.compute(predictions=formatted_predictions, references=references)

{'exact_match': 42.52336448598131, 'f1': 57.89997035763593}