## Imports

In [1]:
# Suppress output
%%capture

# Whether the notebook is run within Google Colab or not
colab = 'google.colab' in str(get_ipython())

# General imports
import pandas as pd
import torch
# Install needed dependencies on Colab
if colab:
    !pip install transformers
from transformers import DistilBertTokenizerFast, DistilBertModel

# Enable GPU acceleration, whenever available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Automatically reimport modules at each execution
%reload_ext autoreload
%autoreload 2

## Read data

In [2]:
# Execute this only to load the dataset in csv format if not already done
# from read_dataset import read_dataset

# dataset = read_dataset(path='training_set.json', validation_set_perc=20)
# train_df = pd.DataFrame(dataset[0], columns=['id', 'title', 'context_id', 'context', 'question', 'start', 'end'])
# train_df.to_csv('train_df.csv')
# val_df = pd.DataFrame(dataset[1], columns=['id', 'title', 'context_id', 'context', 'question', 'start', 'end'])
# val_df.to_csv('val_df.csv')

In [3]:
import os

# Remote folder holding the CSV splits; only used when running on Colab.
REMOTE_DATA_URL = 'https://raw.githubusercontent.com/michimichiamo/question-answering/main/data/Dataset'


def dataset_location(name, token_env):
    """Return where the CSV split `name` should be read from.

    Locally the file is expected under ./data; on Colab it is fetched from
    REMOTE_DATA_URL. Access tokens are never stored in the notebook: when the
    environment variable `token_env` is set, its value is appended to the URL
    as the `token` query parameter, otherwise the plain URL is returned.
    """
    if not colab:
        return f'./data/{name}'
    url = f'{REMOTE_DATA_URL}/{name}'
    token = os.environ.get(token_env)
    return f'{url}?token={token}' if token else url


train_filename = dataset_location('train_df.csv', 'TRAIN_DF_TOKEN')
val_filename = dataset_location('val_df.csv', 'VAL_DF_TOKEN')

train_df = pd.read_csv(train_filename)
val_df = pd.read_csv(val_filename)

### Read questions and contexts

In [4]:
def _unpack_split(frame):
    """Extract questions, contexts, context ids and [starts, ends] from one split."""
    questions, contexts, context_ids = (
        list(frame[column].values) for column in ('question', 'context', 'context_id')
    )
    answers = [frame['start'].values, frame['end'].values]
    return questions, contexts, context_ids, answers


def _preview(questions, contexts, context_ids, answers, row=0):
    """Print the question, context, context id and answer span of one row."""
    print('question:', questions[row])
    print('context:', contexts[row])
    print('context_id:', context_ids[row])
    print(f'answer: ({answers[0][row]}, {answers[1][row]})')


# Training split, followed by a look at its first sample
train_questions, train_contexts, train_context_ids, train_answers = _unpack_split(train_df)
_preview(train_questions, train_contexts, train_context_ids, train_answers)

# Validation split, followed by a look at its first sample
val_questions, val_contexts, val_context_ids, val_answers = _unpack_split(val_df)
_preview(val_questions, val_contexts, val_context_ids, val_answers)

question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
context: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
context_id: 0
answer: (515, 541)
question: What happened to Joseph I in 1758?
context: Following the earthquake, Joseph I gave his Prime Minister even more power, and Sebastião de Melo became a powerful, progressi

## Embedding

In [5]:
## Load tokenizer and transformers

# The tokenizer and the encoder are loaded from the same pretrained checkpoint
checkpoint = 'distilbert-base-cased-distilled-squad'

tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)
model = DistilBertModel.from_pretrained(checkpoint)

In [6]:
## Tokenize questions and contexts

# Window length is taken from the model configuration; doc_stride is the
# `stride` handed to the tokenizer when a context overflows one window.
max_length = model.config.max_position_embeddings
doc_stride = 256

# Both splits are tokenized with exactly the same options
tokenizer_options = dict(
    max_length=max_length,
    truncation="only_second",
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    stride=doc_stride,
    return_attention_mask=True,
    padding='max_length',
)

train_tokenized = tokenizer(train_questions, train_contexts, **tokenizer_options)
val_tokenized = tokenizer(val_questions, val_contexts, **tokenizer_options)

In [7]:
def move_answers_position(tokenized, answers, sep_token_id=102):
    """Translate character-level answer spans into token indices, one pair per window.

    Parameters
    ----------
    tokenized : mapping
        Must provide 'input_ids', 'offset_mapping' and
        'overflow_to_sample_mapping', each indexable by window position.
    answers : pair of sequences
        ``answers[0][k]`` / ``answers[1][k]`` are the answer start / end
        character positions of original sample ``k``.
    sep_token_id : int, optional
        Id of the separator token closing the question. The search for the
        answer starts at its first occurrence. Defaults to 102, the value
        previously hard-coded here.

    Returns
    -------
    list of tuple
        One ``(start_token, end_token)`` pair per window. Windows in which
        the start and the end are not both found get ``(0, 0)``.

    Raises
    ------
    ValueError
        If a window's input ids do not contain ``sep_token_id``.

    Notes
    -----
    A token matches a character position when ``tok_start <= pos <= tok_end``
    (both bounds inclusive). Zero-width offsets are skipped: they carry no
    text, and an offset of (0, 0) would otherwise match any answer starting
    at character 0, even in a window that does not contain it.
    A summary line with the number of matched / unmatched windows is printed.
    """
    input_ids = tokenized['input_ids']
    offsets = tokenized['offset_mapping']
    mappings = tokenized['overflow_to_sample_mapping']

    new_answers = []
    all_inclusive = 0
    not_all_inclusive = 0
    for post_index, pre_index in enumerate(mappings):
        ans_start = answers[0][pre_index]
        ans_end = answers[1][pre_index]
        offset = offsets[post_index]

        # The search starts after the question, i.e. at the first separator
        first_sep = input_ids[post_index].index(sep_token_id)

        start_index_word = None
        end_index_word = None
        for i in range(first_sep, len(offset)):
            tok_start, tok_end = offset[i]
            if tok_start == tok_end:
                # Zero-width offset: nothing here can hold the answer
                continue

            # A later matching token overrides an earlier one, so when the
            # start sits on a token boundary the token beginning there wins.
            if tok_start <= ans_start <= tok_end:
                start_index_word = i

            if tok_start <= ans_end <= tok_end:
                end_index_word = i

            if start_index_word is not None and end_index_word is not None:
                break

        if start_index_word is not None and end_index_word is not None:
            # The answer is completely included in this window
            all_inclusive += 1
            new_answers.append((start_index_word, end_index_word))
        else:
            # The answer is missing or only partially inside this window
            not_all_inclusive += 1
            new_answers.append((0, 0))

    print("stats: (all, noth): {}".format((all_inclusive, not_all_inclusive)))
    return new_answers


In [9]:
# Convert the character-level answer spans of both splits into token indices,
# one (start, end) pair per tokenized window
train_new_answers = move_answers_position(train_tokenized, train_answers)
val_new_answers = move_answers_position(val_tokenized, val_answers)

stats: (all, noth): (71270, 84)
stats: (all, noth): (16376, 18)


In [10]:
# Split the (start, end) pairs into one tuple of starts and one tuple of ends
train_new_start, train_new_end = zip(*train_new_answers)
val_new_start, val_new_end = zip(*val_new_answers)

In [11]:
def _windows_frame(tokenized, starts, ends):
    """Build a frame with the input ids and answer token indices of each window."""
    frame = pd.DataFrame()
    frame['input_ids'] = tokenized['input_ids']
    frame['start'] = starts
    frame['end'] = ends
    return frame


new_train_df = _windows_frame(train_tokenized, train_new_start, train_new_end)
new_val_df = _windows_frame(val_tokenized, val_new_start, val_new_end)

In [12]:
# Persist the preprocessed training windows as CSV in the working directory
new_train_df.to_csv('pp_train_df.csv')

In [13]:
# Persist the preprocessed validation windows as CSV in the working directory
new_val_df.to_csv('pp_val_df.csv')

In [None]:
## Move to device
#bert_dict = {}
#
#bert_dict['input_ids'] = torch.IntTensor(tokenized['input_ids']).to(device)
#bert_dict['attention_mask'] = torch.IntTensor(tokenized['attention_mask']).to(device)
#
#model = model.to(device)

In [None]:
## Network structure

#transformed = model(**bert_dict)
#dropped = torch.nn.Dropout(0.3)(transformed[0])
#logits = torch.nn.Linear(768, 2, device=device)(dropped)
#start_logits, end_logits = logits.split(1, dim=-1)
#start_logits = start_logits.squeeze(-1)
#end_logits = end_logits.squeeze(-1)
#outputs = (start_logits, end_logits)

In [None]:
#tokenizer.decode(tokenized['input_ids'][0])

In [None]:
#train_df['question'].apply(lambda x: len(x.strip().split(' '))).max()

## TODO

- `tokenized['offset_mapping'][0]` restituisce le tuple (start,end) di ogni parola dell'input (query, context)

- Problema: splittare i contesti online (nel Dataloader) produce batch di lunghezza variabile
    - Prima soluzione: eliminare gli split che non contengono la domanda
    - Seconda soluzione: creare dataframe con contesti già splittati usando il tokenizer (lunghezza 512, overlapping 256) invece che farlo online nel Dataloader

- Problema: risposte sono presenti in un solo split, cosa fare con gli altri?
    - Una [soluzione](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/question_answering.ipynb#scrollTo=iLekL6Un9D70&line=24&uniqifier=1): riscalare tuple di contesti tagliati (invece che 0, allinearli alla risposta)

    - Un'altra soluzione: 
        - riscalare answer_start e answer_end per ogni contesto
        - lo split che contiene la risposta mantiene answer_start e answer_end, gli altri split dello stesso contesto vanno trattati (assegniamo (0,0)? oppure scartiamo)
        - Possibile [soluzione](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/question_answering.ipynb#scrollTo=v86c_RApFdNG) (in grassetto parte interessante): *Now let's put everything together in one function we will apply to our training set. In the case of impossible answers (the answer is in another feature given by an example with a long context), **we set the cls index for both the start and end position**. We could also simply discard those examples from the training set if the flag allow_impossible_answers is False. Since the preprocessing is already complex enough as it is, we've kept is simple for this part.*

- N.B. sul token `[CLS]` preso da [qui](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/question_answering.ipynb#scrollTo=kv1iD9E6FdND):
> *The very first token ([CLS]) has (0, 0) because it doesn't correspond to any part of the question/answer, then the second token is the same as the characters 0 to 3 of the question*

## Network

In [None]:
class QA(torch.nn.Module):
    """Question-answering head on top of a pretrained DistilBERT encoder.

    The module tokenizes raw (questions, contexts) batches itself, runs them
    through the encoder, applies dropout and a linear layer, and returns one
    start logit and one end logit per token.

    Parameters
    ----------
    hidden_size : int
        Input size of the final linear layer.
    num_labels : int
        Output size of the final linear layer. ``forward`` unpacks the logits
        into exactly two tensors, so it only works with ``num_labels == 2``.
    dropout_rate : float
        Dropout probability applied to the encoder output.
    model_name : str
        Pretrained checkpoint used for both the tokenizer and the encoder.
    doc_stride : int
        ``stride`` handed to the tokenizer when a context overflows a window.
    """

    def __init__(self, hidden_size=768, num_labels=2, dropout_rate=0.5,
                 model_name='distilbert-base-cased-distilled-squad', doc_stride=128):
        super().__init__()
        # Device
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # Parameters
        self.hidden_size = hidden_size
        self.num_labels = num_labels
        self.doc_stride = doc_stride

        # Layers
        self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
        self.transformers = DistilBertModel.from_pretrained(model_name).to(self.device)
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.dense = torch.nn.Linear(self.hidden_size, self.num_labels, device=self.device)

    def forward(self, inputs):
        """Compute start/end logits for a batch of raw text.

        Parameters
        ----------
        inputs : pair
            ``(questions, contexts)``, passed to the tokenizer as they are.

        Returns
        -------
        tuple of torch.Tensor
            ``(start_logits, end_logits)``, each with the last (label)
            dimension of the linear layer's output squeezed away.
        """
        # Unpack inputs
        questions, contexts = inputs

        # Tokenizer: window length comes from the encoder configuration
        tokenized = self.tokenizer(
            questions,
            contexts,
            max_length=self.transformers.config.max_position_embeddings,
            truncation="only_second",
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            stride=self.doc_stride,
            return_attention_mask=True,
            padding='max_length'
        )

        # Build the encoder inputs directly on the target device
        bert_dict = {
            key: torch.tensor(tokenized[key], dtype=torch.long, device=self.device)
            for key in ('input_ids', 'attention_mask')
        }

        # Transformers: only the first element of the encoder output is used
        transformed = self.transformers(**bert_dict)
        # Dropout
        dropped = self.dropout(transformed[0])
        # Obtain logits: the last dimension goes from hidden_size to num_labels
        logits = self.dense(dropped)
        # One chunk of size 1 per label, then drop that singleton dimension
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return (start_logits, end_logits)

In [None]:
# Instantiate the network with its default hyper-parameters
net = QA()

In [None]:
# `questions` / `contexts` are never defined in this notebook; the training
# split loaded above is used instead.
# NOTE(review): confirm the training split is the intended input here.
inputs = (train_questions, train_contexts)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """PyTorch dataset serving (question, context) pairs by position."""

    def __init__(self, questions, contexts):
        """Keep references to the parallel question and context sequences."""
        self.questions, self.contexts = questions, contexts

    def __len__(self):
        """Return the number of samples, i.e. the number of questions."""
        return len(self.questions)

    def __getitem__(self, index):
        """Return the (question, context) tuple stored at `index`."""
        return (self.questions[index], self.contexts[index])

In [None]:
# `questions` / `contexts` are never defined in this notebook; the training
# split loaded above is used instead.
# NOTE(review): confirm the training split is the intended input here.
data = Dataset(train_questions, train_contexts)

In [None]:
# Batches of 32 samples, loaded by 2 worker processes into pinned memory
loader_options = dict(batch_size=32, num_workers=2, pin_memory=True)
generator = torch.utils.data.DataLoader(data, **loader_options)

In [None]:
# Smoke test: run the network on the first batch only
for inputs in generator:
    print(inputs)
    # Call the module itself rather than .forward() so registered hooks run
    outputs = net(inputs)
    break

In [None]:
# Display the (start_logits, end_logits) tuple computed for the first batch
outputs