## Imports

In [2]:
colab = 'google.colab' in str(get_ipython())

import pandas as pd
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
if colab:
    !pip install transformers



In [4]:
from transformers import DistilBertTokenizerFast, DistilBertModel

In [5]:
%reload_ext autoreload
%autoreload 2

## Read data

In [69]:
#from read_dataset import read_dataset

In [70]:
#dataset = read_dataset(path='SQUAD MATERIAL/training_set.json', validation_set_perc=20)

In [71]:
#train_df = pd.DataFrame(dataset[0], columns=['id', 'title', 'context', 'question', 'start', 'end'])
#train_df.to_csv('./data/train_df.csv')

In [72]:
#val_df = pd.DataFrame(dataset[1], columns=['id', 'title', 'context', 'question', 'start', 'end'])
#val_df.to_csv('./data/val_df.csv')

In [6]:
train_filename = './data/train_df.csv' if not colab else 'https://raw.githubusercontent.com/michimichiamo/question-answering/main/data/train_df.csv?token=AIA5FIEICIT5A5YK7SA735DB323QY'
val_filename = './data/val_df.csv' if not colab else 'https://raw.githubusercontent.com/michimichiamo/question-answering/main/data/val_df.csv?token=AIA5FICMZXY4V26OTB7NVR3B323OI'

train_df = pd.read_csv(train_filename)
val_df = pd.read_csv(val_filename)

### Read questions and contexts

In [7]:
questions = list(train_df['question'].values)
contexts = list(train_df['context'].values)

## Embedding

In [75]:
## Load tokenizer and transformers

#tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased-distilled-squad')
#model = DistilBertModel.from_pretrained('distilbert-base-cased-distilled-squad')

Some weights of the model checkpoint at distilbert-base-cased-distilled-squad were not used when initializing DistilBertModel: ['qa_outputs.bias', 'qa_outputs.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [76]:
## Tokenize questions and contexts
#max_length = model.config.max_position_embeddings
#doc_stride = 128
#
#tokenized = tokenizer(
#    questions[:10],
#    contexts[:10],
#    max_length=max_length,
#    truncation="only_second",
#    return_overflowing_tokens=True,
#    return_offsets_mapping=True,
#    stride=doc_stride,
#    return_attention_mask=True,
#    padding='max_length'
#)

In [None]:
## Move to device
#bert_dict = {}
#
#bert_dict['input_ids'] = torch.IntTensor(tokenized['input_ids']).to(device)
#bert_dict['attention_mask'] = torch.IntTensor(tokenized['attention_mask']).to(device)
#
#model = model.to(device)

In [None]:
## Network structure

#transformed = model(**bert_dict)
#dropped = torch.nn.Dropout(0.3)(transformed[0])
#logits = torch.nn.Linear(768, 2, device=device)(dropped)
#start_logits, end_logits = logits.split(1, dim=-1)
#start_logits = start_logits.squeeze(-1)
#end_logits = end_logits.squeeze(-1)
#outputs = (start_logits, end_logits)

In [77]:
#tokenizer.decode(tokenized['input_ids'][0])

In [78]:
#train_df['question'].apply(lambda x: len(x.strip().split(' '))).max()

## TODO

- `tokenized['offset_mapping'][0]` returna le tuple (start,end) di ogni parola dell'input (query, context)
  - Problema: risposte si accavallano attorno a max_length
  - [Soluzione](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/question_answering.ipynb#scrollTo=iLekL6Un9D70&line=24&uniqifier=1): riscalare tuple di contesti tagliati (invece che 0, allinearli alla risposta)

- creare dataframe con contesti splittati (lunghezza 512, overlapping 256)
    - riscalare answer_start e answer_end per ogni contesto
    - lo split che contiene la risposta mantiene answer_start e answer_end, gli altri split dello stesso contesto vanno trattati (assegniamo (0,0)? oppure scartiamo)

## Network

In [25]:
class QA(torch.nn.Module):

    def __init__(self, hidden_size=768, num_labels=2, dropout_rate=0.5):
        super(QA, self).__init__()
        # Device
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        
        # Parameters
        self.hidden_size = hidden_size
        self.num_labels = num_labels
        
        # Layers
        self.tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased-distilled-squad')
        self.transformers = DistilBertModel.from_pretrained('distilbert-base-cased-distilled-squad').to(self.device)
        self.dropout = torch.nn.Dropout(dropout_rate)
        #self.extra_linear = torch.nn.Linear(self.hidden_size, self.hidden_size)
        #self.extra_linear_tanh = torch.nn.Tanh()
        self.dense = torch.nn.Linear(self.hidden_size, self.num_labels, device=self.device)

    def forward(self, inputs):
        # Unpack inputs
        questions, contexts = inputs
        # Tokenizer
        max_length = self.transformers.config.max_position_embeddings
        doc_stride = 128
        tokenized = self.tokenizer(
            questions,
            contexts,
            max_length=max_length,
            truncation="only_second",
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            stride=doc_stride,
            return_attention_mask=True,
            padding='max_length'
        )
        # Put to device
        bert_dict = {}

        bert_dict['input_ids'] = torch.IntTensor(tokenized['input_ids']).to(self.device)
        bert_dict['attention_mask'] = torch.IntTensor(tokenized['attention_mask']).to(self.device)
        # Transformers 
        transformed = self.transformers(**bert_dict)
        # Dropout
        dropped = self.dropout(transformed[0])
        # Obtain logits
        logits = self.dense(dropped) #(None, seq_len, hidden_size)*(hidden_size, 2)=(None, seq_len, 2)
        start_logits, end_logits = logits.split(1, dim=-1)    #(None, seq_len, 1), (None, seq_len, 1)
        start_logits = start_logits.squeeze(-1)  #(None, seq_len)
        end_logits = end_logits.squeeze(-1)    #(None, seq_len)
        # --- 4) Prepare output tuple
        outputs = (start_logits, end_logits)
        
        return outputs

In [27]:
net = QA()

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/queues.py", line 232, in _feed
    close()
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.7/multiprocessing/queues.py", line 263, in _feed
    queue_sem.release()
ValueError: semaphore or lock released too many times

Some weights of the model checkpoint at distilbert-base-cased-distilled-squad were not used when initializing DistilBertModel: ['qa_outputs.bias', 'qa_

In [10]:
inputs = (questions, contexts)

In [30]:
class Dataset(torch.utils.data.Dataset):
  'Characterizes a dataset for PyTorch'
  def __init__(self, questions, contexts):
        'Initialization'
        self.questions = questions
        self.contexts = contexts

  def __len__(self):
        'Denotes the total number of samples'
        return len(self.questions)

  def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        question = self.questions[index]
        context = self.contexts[index]

        # Load data and get label
        X = (question, context)
        #y = answer

        return X#, y

In [31]:
data = Dataset(questions, contexts)

In [32]:
generator = torch.utils.data.DataLoader(data, batch_size=32, num_workers=2, pin_memory=True)

In [34]:
for inputs in generator:
    print(inputs)
    outputs = net.forward(inputs)
    break

[['To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'What is in front of the Notre Dame Main Building?', 'The Basilica of the Sacred heart at Notre Dame is beside to which structure?', 'What is the Grotto at Notre Dame?', 'What sits on top of the Main Building at Notre Dame?', 'When did the Scholastic Magazine of Notre dame begin publishing?', "How often is Notre Dame's the Juggler published?", 'What is the daily student paper at Notre Dame called?', 'How many student news papers are found at Notre Dame?', 'In what year did the student paper Common Sense begin publication at Notre Dame?', 'Where is the headquarters of the Congregation of the Holy Cross?', 'What is the primary seminary of the Congregation of the Holy Cross?', 'What is the oldest structure at Notre Dame?', 'What individuals live at Fatima House at Notre Dame?', 'Which prize did Frederick Buechner create?', 'How many BS level degrees are offered in the College of Engineering at Notre Dame?', 'In w

In [35]:
outputs

(tensor([[ 0.4342,  0.0733, -0.0475,  ..., -0.2960,  0.1097, -0.0904],
         [-0.4771, -1.3238,  0.1047,  ..., -0.2838, -0.4498, -0.8579],
         [ 0.0358, -1.0890, -0.8980,  ..., -1.0202,  0.0600, -0.2759],
         ...,
         [-0.7880, -0.1207, -0.3248,  ...,  0.7129,  0.3810, -0.3489],
         [ 0.1539, -0.5027,  0.2483,  ...,  0.0085, -0.3860,  0.1066],
         [-0.0382, -0.4316,  0.2223,  ...,  0.8928, -0.1665,  0.3745]],
        device='cuda:0', grad_fn=<SqueezeBackward1>),
 tensor([[-0.2273, -0.2517,  0.5381,  ..., -0.1428,  0.1750, -0.1129],
         [-0.3737,  0.3421,  0.0112,  ...,  0.3464,  0.1567,  0.1738],
         [ 0.1011,  0.0220, -0.6760,  ...,  0.0618, -0.2218, -0.2397],
         ...,
         [-0.4341, -0.4212, -1.0017,  ..., -0.2668,  0.6228,  0.2970],
         [ 0.2590,  0.2825, -0.0592,  ..., -0.0066,  0.2497, -0.7250],
         [-0.0498, -0.7950, -0.3621,  ...,  0.1067,  0.2344, -0.4733]],
        device='cuda:0', grad_fn=<SqueezeBackward1>))