## Imports

In [1]:
%%capture
# Suppress output

# Whether the notebook is run within Google Colab or not
colab = 'google.colab' in str(get_ipython())

# General imports
import gc
import os

import numpy as np
import pandas as pd
import torch
# Install needed dependencies on Colab
if colab:
    !pip install transformers
from transformers import DistilBertTokenizerFast, DistilBertModel

# Automatically reimport modules at each execution
%reload_ext autoreload
%autoreload 2

In [2]:
if colab:
    !git clone https://github.com/michimichiamo/question-answering
    import os
    os.chdir('/content/question-answering')

Cloning into 'question-answering'...
remote: Enumerating objects: 117, done.[K
remote: Counting objects: 100% (104/104), done.[K
remote: Compressing objects: 100% (87/87), done.[K
remote: Total 117 (delta 51), reused 57 (delta 17), pack-reused 13[K
Receiving objects: 100% (117/117), 32.29 MiB | 4.97 MiB/s, done.
Resolving deltas: 100% (54/54), done.
Checking out files: 100% (19/19), done.


## Utils

In [13]:
## Load tokenizer (and transformers)
# Tokenizer for the 'distilbert-base-cased-distilled-squad' checkpoint.
# The model itself is not used by any visible cell, so loading it stays disabled.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased-distilled-squad')
#model = DistilBertModel.from_pretrained('distilbert-base-cased-distilled-squad')

# Move answer positions
# Project-local helper used in the "Fix answers' position" cells. It is imported
# here rather than in the first cell because, on Colab, the repository only
# becomes the working directory after the clone/chdir cell has run.
from util.modify_answers_values import move_answers_position

## Training data

### Read data

In [39]:
# Training CSV location: raw GitHub URL on Colab, repository-relative path elsewhere
if colab:
    train_filename = 'https://raw.githubusercontent.com/michimichiamo/question-answering/main/data/context_id/train_df.csv'
else:
    train_filename = './data/context_id/train_df.csv'

train_df = pd.read_csv(train_filename)

In [40]:
# Pull the question/context text out of the dataframe as plain Python lists
train_questions = list(train_df['question'].values)
train_contexts = list(train_df['context'].values)
#train_context_ids = list(train_df['context_id'].values)

# train_answers[0] holds every 'start' value, train_answers[1] every 'end' value
train_answers = [train_df[column].values for column in ('start', 'end')]

# Preview the first sample as a quick sanity check
print('question:', train_questions[0])
print('context:', train_contexts[0])
#print('context_id:', train_context_ids[0])
print(f'answer: ({train_answers[0][0]}, {train_answers[1][0]})')

question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
context: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
answer: (515, 541)


In [41]:
# The needed columns were copied into train_questions / train_contexts /
# train_answers above, so the dataframe can be dropped to free memory.
del train_df

# Force a collection pass; the returned count is shown as the cell output.
gc.collect()

332

### Tokenize

In [42]:
## Tokenize questions and contexts
doc_stride = 256
max_length = 512#model.config.max_position_embeddings

# Questions and contexts are encoded as pairs. Only the second sequence (the
# context) may be truncated; overflowing tokens are requested as well, together
# with offset mappings and attention masks, and every feature is padded to
# max_length.
train_tokenized = tokenizer(
    train_questions,
    train_contexts,
    truncation="only_second",
    max_length=max_length,
    stride=doc_stride,
    padding='max_length',
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    return_attention_mask=True,
)

In [43]:
# The raw text lists are not referenced again after tokenization; drop them to free memory.
del train_questions, train_contexts

# Force a collection pass; the returned count is shown as the cell output.
gc.collect()

88

### Fix answers' position

In [44]:
# Recompute the answer positions for the tokenized features using the project
# helper from util/modify_answers_values.py.
# NOTE(review): presumably this maps the 'start'/'end' values read from the CSV
# onto token indices of each tokenized feature — confirm against the helper.
train_new_answers = move_answers_position(train_tokenized, train_answers)

stats: (all, noth): (71270, 84)


In [45]:
# Transpose the sequence of (start, end) pairs into two parallel tuples
train_new_answer_start, train_new_answer_end = zip(*train_new_answers)

### Save

In [46]:
# Convert the tokenizer outputs and the remapped answer positions to NumPy arrays
train_input_ids, train_attention_mask = (
    np.array(train_tokenized[field]) for field in ('input_ids', 'attention_mask')
)
train_answer_start, train_answer_end = map(
    np.array, (train_new_answer_start, train_new_answer_end)
)

In [47]:
# One-hot encode the answer boundaries: row i gets a single 1 at the start
# (resp. end) position stored for tokenized feature i.
# The width comes from max_length (the padded sequence length passed to the
# tokenizer) instead of a repeated literal 512, so the two cannot drift apart.
# NOTE(review): float64 is kept so the saved arrays are unchanged; a smaller
# dtype would reduce memory — confirm what the training code expects.
answer_start = np.zeros(shape=(len(train_answer_start), max_length), dtype='float64')
answer_end = np.zeros(shape=(len(train_answer_end), max_length), dtype='float64')

# Integer-array indexing sets one cell per row, replacing the per-row Python loop.
answer_start[np.arange(len(train_answer_start)), train_answer_start] = 1
answer_end[np.arange(len(train_answer_end)), train_answer_end] = 1

In [48]:
# Persist the tokenized training set as one compressed archive.
# np.savez_compressed does not create missing parent directories, so make sure
# the output folder exists first (it may be absent in a fresh checkout).
os.makedirs('./data/tokenized', exist_ok=True)
np.savez_compressed(
    './data/tokenized/train.npz',
    input_ids=train_input_ids,
    attention_mask=train_attention_mask,
    answer_start=answer_start,
    answer_end=answer_end,
)

In [27]:
# Release every training-side intermediate now that the archive has been written.
# answer_start / answer_end (the dense one-hot matrices) are released too: they
# were previously left alive, and the validation section re-creates both names
# from scratch before using them.
del (
    train_answers,
    train_input_ids, train_attention_mask,
    train_answer_start, train_answer_end,
    train_new_answers, train_new_answer_start, train_new_answer_end,
    train_tokenized,
    answer_start, answer_end,
)

# Force a collection pass; the returned count is shown as the cell output.
gc.collect()

287

In [None]:
#train_data = np.load('./data/tokenized/train.npz')

## Validation data

### Read data

In [28]:
# Validation CSV location: raw GitHub URL on Colab, repository-relative path elsewhere
if colab:
    val_filename = 'https://raw.githubusercontent.com/michimichiamo/question-answering/main/data/context_id/val_df.csv'
else:
    val_filename = './data/context_id/val_df.csv'

val_df = pd.read_csv(val_filename)

In [29]:
# Pull the question/context text out of the dataframe as plain Python lists
val_questions = list(val_df['question'].values)
val_contexts = list(val_df['context'].values)
#val_context_ids = list(val_df['context_id'].values)

# val_answers[0] holds every 'start' value, val_answers[1] every 'end' value
val_answers = [val_df[column].values for column in ('start', 'end')]

# Preview the first sample as a quick sanity check
print('question:', val_questions[0])
print('context:', val_contexts[0])
#print('context_id:', val_context_ids[0])
print(f'answer: ({val_answers[0][0]}, {val_answers[1][0]})')

question: What happened to Joseph I in 1758?
context: Following the earthquake, Joseph I gave his Prime Minister even more power, and Sebastião de Melo became a powerful, progressive dictator. As his power grew, his enemies increased in number, and bitter disputes with the high nobility became frequent. In 1758 Joseph I was wounded in an attempted assassination. The Távora family and the Duke of Aveiro were implicated and executed after a quick trial. The Jesuits were expelled from the country and their assets confiscated by the crown. Sebastião de Melo prosecuted every person involved, even women and children. This was the final stroke that broke the power of the aristocracy. Joseph I made his loyal minister Count of Oeiras in 1759.
answer: (272, 309)


In [30]:
# The needed columns were copied into val_questions / val_contexts /
# val_answers above, so the dataframe can be dropped to free memory.
del val_df

# Force a collection pass; the returned count is shown as the cell output.
gc.collect()

88

### Tokenize

In [31]:
# Same tokenizer settings as for the training split; max_length and doc_stride
# are the values assigned in the training tokenization cell.
val_tokenized = tokenizer(
    val_questions,
    val_contexts,
    truncation="only_second",
    max_length=max_length,
    stride=doc_stride,
    padding='max_length',
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    return_attention_mask=True,
)

In [32]:
# The raw text lists are not referenced again after tokenization; drop them to free memory.
del val_questions, val_contexts

# Force a collection pass; the returned count is shown as the cell output.
gc.collect()

304

### Fix answers' position

In [33]:
# Recompute the answer positions for the tokenized features using the project
# helper from util/modify_answers_values.py.
# NOTE(review): presumably this maps the 'start'/'end' values read from the CSV
# onto token indices of each tokenized feature — confirm against the helper.
val_new_answers = move_answers_position(val_tokenized, val_answers)

stats: (all, noth): (16376, 18)


In [34]:
# Transpose the sequence of (start, end) pairs into two parallel tuples
val_new_answer_start, val_new_answer_end = zip(*val_new_answers)

### Save

In [35]:
# Convert the tokenizer outputs and the remapped answer positions to NumPy arrays
val_input_ids, val_attention_mask = (
    np.array(val_tokenized[field]) for field in ('input_ids', 'attention_mask')
)
val_answer_start, val_answer_end = map(
    np.array, (val_new_answer_start, val_new_answer_end)
)

In [36]:
# One-hot encode the answer boundaries: row i gets a single 1 at the start
# (resp. end) position stored for tokenized feature i.
# The width comes from max_length (the padded sequence length passed to the
# tokenizer) instead of a repeated literal 512, so the two cannot drift apart.
# NOTE(review): float64 is kept so the saved arrays are unchanged; a smaller
# dtype would reduce memory — confirm what the training code expects.
answer_start = np.zeros(shape=(len(val_answer_start), max_length), dtype='float64')
answer_end = np.zeros(shape=(len(val_answer_end), max_length), dtype='float64')

# Integer-array indexing sets one cell per row, replacing the per-row Python loop.
answer_start[np.arange(len(val_answer_start)), val_answer_start] = 1
answer_end[np.arange(len(val_answer_end)), val_answer_end] = 1

In [37]:
# Persist the tokenized validation set as one compressed archive.
# np.savez_compressed does not create missing parent directories, so make sure
# the output folder exists first (it may be absent in a fresh checkout).
os.makedirs('./data/tokenized', exist_ok=True)
np.savez_compressed(
    './data/tokenized/val.npz',
    input_ids=val_input_ids,
    attention_mask=val_attention_mask,
    answer_start=answer_start,
    answer_end=answer_end,
)

In [38]:
# Drop the validation intermediates that are no longer needed and reclaim memory
del (
    val_answers,
    val_new_answers, val_new_answer_start, val_new_answer_end,
    val_input_ids, val_attention_mask,
    val_answer_start, val_answer_end,
    val_tokenized,
)

gc.collect()

375

In [None]:
#val_data = np.load('./data/tokenized/val.npz')