## Imports

In [1]:
%%capture
# Suppress all output produced by this cell (e.g. the pip install log)

# True when the notebook runs inside Google Colab; this flag gates the
# Colab-only setup steps (dependency install here, repository clone below).
colab = 'google.colab' in str(get_ipython())

# General imports
import numpy as np
import pandas as pd
# Install needed dependencies on Colab
# NOTE(review): the package version is not pinned, so results may change over time.
if colab:
    !pip install transformers

# Reload edited modules automatically before each cell execution
%reload_ext autoreload
%autoreload 2

In [3]:
# On Colab, fetch the project repository and make it the working directory --
# presumably so that the `util` package and the relative `./data/...` paths
# used by the following cells resolve.
if colab:
    !git clone https://github.com/michimichiamo/question-answering
    %cd '/content/question-answering'

# Question Answering

## Convert JSON to CSV format

- Load nested JSON into linearized `pandas.DataFrame`

In [3]:
# `read_from_json` is a project helper; it is called without arguments, so the
# input location comes from its own defaults (not visible in this notebook).
from util.preprocessing import read_from_json

# The cells below rely on the resulting frame exposing at least a `title` column.
df = read_from_json()

### Save DataFrame

In [None]:
# Optional: uncomment to persist the full DataFrame to disk.
# NOTE(review): './data/raw/df.csv' is not read back by any cell visible in this notebook.
#df.to_csv('./data/raw/df.csv')

## Split training/validation

- Split based on titles, following the suggestion:
> all the questions/paragraphs regarding the same title should be in the same split

In [4]:
from sklearn.model_selection import train_test_split

# Split at the title level, so that every question/paragraph belonging to a
# given title ends up in exactly one of the two splits.
titles = df['title'].unique()
# Fixed seed -> reproducible 80/20 partition of the titles
train_titles, val_titles = train_test_split(titles, test_size=0.2, shuffle=True, random_state=42)

# Select rows through a vectorized membership mask. The previous row-wise
# `df.apply(lambda ..., axis=1)` executed a Python lambda (plus an array scan)
# for every single row, which is needlessly slow on large frames.
train_df = df[df['title'].isin(train_titles)]
val_df = df[df['title'].isin(val_titles)]

# The two title sets partition `titles`, hence the two frames must partition `df`
assert len(train_df) + len(val_df) == len(df), 'train/validation split lost or duplicated rows'

### Save DataFrames

In [None]:
# Optional: uncomment to (re)generate the split CSV files.
# These are the same paths that the "Read data" cells load (with `index_col=0`),
# so the files must already exist on disk while these lines stay commented out.
#train_df.to_csv('./data/raw/train_df.csv')
#val_df.to_csv('./data/raw/val_df.csv')

## Read data

- Load data from saved `.csv` files
- `keep_default_na=False` is needed so that an answer whose text is literally `'null'` is kept as a string instead of being parsed as `NaN`

In [None]:
# Location of the raw split files
directory = './data/raw/'
train_filename, val_filename = (
    directory + basename for basename in ('train_df.csv', 'val_df.csv')
)

# First CSV column is the saved index; default NA parsing is disabled so that
# literal strings such as 'null' survive as text.
train_df, val_df = (
    pd.read_csv(csv_path, index_col=0, keep_default_na=False)
    for csv_path in (train_filename, val_filename)
)

## Tokenize questions and contexts

- Use of `transformers.DistilBertTokenizerFast` to obtain tokenized questions and contexts

In [7]:
# Project helper for the Question Answering pipeline (implementation not
# visible in this notebook).
from util.preprocessing import tokenize

# Each call returns the frame used from here on (the names are rebound).
# The later save cells read the `input_ids`, `attention_mask`, `answer_start`,
# `answer_end` and `id` columns from these frames.
# NOTE(review): long contexts are reportedly split according to `max_length`, so
# the row count may differ from the input -- confirm in `util.preprocessing`.
train_df = tokenize(train_df)
val_df = tokenize(val_df)

Loading data...
Tokenization...(should take about 30 seconds)
Done.
Loading data...
Tokenization...(should take about 30 seconds)
Done.


## Fix answers' position

Answers are provided in the form of (char_start, char_end) with respect to the original context, however:
- After tokenization, **characters** have no meaning anymore, since we deal with words
  - Thus, we convert characters to **word indices**
- Tokenization splits long contexts (according to the parameter `max_length`), which results in some context splits not containing the answer.
  - To address this problem, we assign `answer_start, answer_end = (0,0)` whenever the answer is neither partially nor fully contained within the context.

In [9]:
from util.preprocessing import fix_answers

# The return value is discarded, so `fix_answers` presumably updates the
# answer position columns of each frame in place -- TODO confirm.
fix_answers(train_df)
fix_answers(val_df)

## Save

- Allow for one-hot encoding of answers. This does not seem useful, however, as both the loss function (`torch.nn.CrossEntropyLoss`) and the evaluation metrics (`torchmetrics.F1Score`, `torchmetrics.Accuracy`, `torchmetrics.AveragePrecision`) accept targets as 1D.
- Save data as a compressed `.npz` archive for later retrieval.

In [None]:
# One-hot encoding for answers (disabled by default; set `one_hot = True` to enable)
one_hot = False
if one_hot:
    from util.preprocessing import one_hot_answers
    # NOTE(review): only the validation frame is encoded, and `oh_data` is not
    # read by any later cell visible in this notebook.
    oh_data = one_hot_answers(val_df)

In [63]:
# Gather the integer-valued columns as dense int32 arrays, one dict per split
keys = ['input_ids', 'attention_mask', 'answer_start', 'answer_end']
train_data, val_data = (
    {key: np.stack(frame[key]).astype('int32') for key in keys}
    for frame in (train_df, val_df)
)

In [89]:
# Save id (string)
# `np.str_` replaces `np.unicode_`: the latter was merely an alias of `np.str_`
# and was removed in NumPy 2.0 (raising AttributeError), whereas `np.str_` is
# available on both NumPy 1.x and 2.x and produces the same unicode dtype.
train_data['id'] = train_df['id'].values.astype(np.str_)
val_data['id'] = val_df['id'].values.astype(np.str_)

In [90]:
import os

# `np.savez_compressed` does not create missing parent directories, so make
# sure the output folder exists first (no-op when it is already there).
os.makedirs('./data/tokenized', exist_ok=True)

# One compressed archive per split; every dict key becomes an array name in the file
np.savez_compressed('./data/tokenized/train.npz', **train_data)
np.savez_compressed('./data/tokenized/val.npz', **val_data)

## Sanity check

- Check data was correctly saved

In [96]:
# `np.load` on a `.npz` returns a lazily-read `NpzFile` that keeps the file open
# until it is closed. Read inside a context manager and copy every array into a
# plain dict, so the handle is released while `*_data_saved[key]` keeps working.
with np.load('./data/tokenized/train.npz') as archive:
    train_data_saved = {name: archive[name] for name in archive.files}
with np.load('./data/tokenized/val.npz') as archive:
    val_data_saved = {name: archive[name] for name in archive.files}

# `np.array_equal` also requires identical shapes, whereas `np.equal(...).all()`
# broadcasts its operands and could therefore accept arrays of different shapes.
for key in keys:
    assert np.array_equal(train_data[key], train_data_saved[key]), f'train: saved `{key}` differs'
    assert np.array_equal(val_data[key], val_data_saved[key]), f'val: saved `{key}` differs'

# Ids are compared modulo leading/trailing whitespace
assert np.array_equal(np.char.strip(train_data['id']), np.char.strip(train_data_saved['id'])), 'train: saved `id` differs'
assert np.array_equal(np.char.strip(val_data['id']), np.char.strip(val_data_saved['id'])), 'val: saved `id` differs'

# Question Generation

## Read data

- Load data from saved `.csv` files
- `keep_default_na=False` is needed so that an answer whose text is literally `'null'` is kept as a string instead of being parsed as `NaN`

In [2]:
# Location of the raw split files
directory = './data/raw/'
train_filename, val_filename = (
    directory + basename for basename in ('train_df.csv', 'val_df.csv')
)

# First CSV column is the saved index; default NA parsing is disabled so that
# literal strings such as 'null' survive as text.
train_df, val_df = (
    pd.read_csv(csv_path, index_col=0, keep_default_na=False)
    for csv_path in (train_filename, val_filename)
)

## Tokenize questions and contexts

- Use of `transformers.T5TokenizerFast` to obtain tokenized questions and contexts

In [3]:
# NOTE: this import rebinds the name `tokenize`, hiding the `util.preprocessing`
# version imported in the Question Answering section. Re-running a QA cell after
# this one would silently call the question-generation tokenizer instead.
from util.preprocessing_qg import tokenize

# Each call returns the frame used from here on (the names are rebound)
train_df = tokenize(train_df)
val_df = tokenize(val_df)

Loading data...
Tokenization...(should take about 30 seconds)
Done.
Loading data...
Tokenization...(should take about 30 seconds)
Done.


## Fix answers' position

Answers are provided in the form of (char_start, char_end) with respect to the original context, however:
- After tokenization, **characters** have no meaning anymore, since we deal with words
  - Thus, we convert characters to **word indices**
- Tokenization splits long contexts (according to the parameter `max_length`), which results in some context splits not containing the answer.
  - To address this problem, we assign `answer_start, answer_end = (0,0)` whenever the answer is neither partially nor fully contained within the context.

In [4]:
# NOTE: this import rebinds the name `fix_answers`, hiding the
# `util.preprocessing` version imported in the Question Answering section.
from util.preprocessing_qg import fix_answers

# The return value is discarded, so `fix_answers` presumably updates the
# `answer_start` / `answer_end` columns of each frame in place -- TODO confirm.
fix_answers(train_df)
fix_answers(val_df)

## Clear 0s

- Eliminate samples for which answers are not included in the context
- This is needed to ensure that each context has at least one target when it comes to generating questions

In [6]:
# A row is kept unless both answer positions are 0, i.e. the (0, 0) marker
# assigned to context splits that do not contain the answer.
train_has_answer = (train_df['answer_start'] != 0) | (train_df['answer_end'] != 0)
val_has_answer = (val_df['answer_start'] != 0) | (val_df['answer_end'] != 0)

train_df = train_df[train_has_answer]
val_df = val_df[val_has_answer]

## Save

- Allow for one-hot encoding of answers. This does not seem useful, however, as both the loss function (`torch.nn.CrossEntropyLoss`) and the evaluation metrics (`torchmetrics.F1Score`, `torchmetrics.Accuracy`, `torchmetrics.AveragePrecision`) accept targets as 1D.
- Save data as a compressed `.npz` archive for later retrieval.

In [None]:
# One-hot encoding for answers (disabled by default; set `one_hot = True` to enable)
one_hot = False
if one_hot:
    # NOTE(review): the helper comes from `util.preprocessing`, not from the
    # `util.preprocessing_qg` module used by the rest of this section -- confirm
    # that this is intended.
    from util.preprocessing import one_hot_answers
    # NOTE(review): only the validation frame is encoded, and `oh_data` is not
    # read by any later cell visible in this notebook.
    oh_data = one_hot_answers(val_df)

In [None]:
# Every column except the string `id` is exported as a dense int32 array
keys = [column for column in train_df.columns if column != 'id']
train_data, val_data = (
    {key: np.stack(frame[key]).astype('int32') for key in keys}
    for frame in (train_df, val_df)
)

In [10]:
# Save id (string)
# `np.str_` replaces `np.unicode_`: the latter was merely an alias of `np.str_`
# and was removed in NumPy 2.0 (raising AttributeError), whereas `np.str_` is
# available on both NumPy 1.x and 2.x and produces the same unicode dtype.
train_data['id'] = train_df['id'].values.astype(np.str_)
val_data['id'] = val_df['id'].values.astype(np.str_)

In [11]:
import os

# `np.savez_compressed` does not create missing parent directories, so make
# sure the output folder exists first (no-op when it is already there).
os.makedirs('./data/tokenized-qg', exist_ok=True)

# One compressed archive per split; every dict key becomes an array name in the file
np.savez_compressed('./data/tokenized-qg/train.npz', **train_data)
np.savez_compressed('./data/tokenized-qg/val.npz', **val_data)

## Sanity check

- Check data was correctly saved

In [12]:
# `np.load` on a `.npz` returns a lazily-read `NpzFile` that keeps the file open
# until it is closed. Read inside a context manager and copy every array into a
# plain dict, so the handle is released while `*_data_saved[key]` keeps working.
with np.load('./data/tokenized-qg/train.npz') as archive:
    train_data_saved = {name: archive[name] for name in archive.files}
with np.load('./data/tokenized-qg/val.npz') as archive:
    val_data_saved = {name: archive[name] for name in archive.files}

# `np.array_equal` also requires identical shapes, whereas `np.equal(...).all()`
# broadcasts its operands and could therefore accept arrays of different shapes.
for key in keys:
    assert np.array_equal(train_data[key], train_data_saved[key]), f'train: saved `{key}` differs'
    assert np.array_equal(val_data[key], val_data_saved[key]), f'val: saved `{key}` differs'

# Ids are compared modulo leading/trailing whitespace
assert np.array_equal(np.char.strip(train_data['id']), np.char.strip(train_data_saved['id'])), 'train: saved `id` differs'
assert np.array_equal(np.char.strip(val_data['id']), np.char.strip(val_data_saved['id'])), 'val: saved `id` differs'