In [None]:
# IF USING GOOGLE COLABORATORY -> RUN FIRST!!!
# OTHERWISE -> IGNORE ;-)

from google.colab import drive

drive.mount('/content/gdrive')

!pip install pymysql
!pip install transformers

# <font color="#003660">Applied Machine Learning for Text Analysis (M.184.5331)</font>
# <font color="#003660">Lesson 9: How Much do Machines Truly Understand? Putting BERT's Reading Comprehension Skills to the Test</font>

<center><br><img width=256 src="https://git.uni-paderborn.de/data.analytics.teaching/aml4ta-2020/-/raw/master/resources/dag.png"/><br></center>

<p>
<center>
<div>
    <font color="#085986"><b>By the end of this lesson, you will be able to...</b><br><br>
        ... recognize the benefits of the Transformer approach; and<br>
        ... implement and train a question answering model using BERT.<br>
    </font>
</div>
</center>
</p>

# 1. Transformers & Question Answering: What's All That Hype About?

<table class="image">
<center>
<caption align="bottom">(Vaswani et al., 2017,  p.3)</caption>
<tr><td><img width=384 src='https://git.uni-paderborn.de/data.analytics.teaching/aml4ta-2020/-/raw/master/week_7/images/transformer_architecture.png'></td></tr>
</center>
</table>

<table class="image">
<center>
<caption align="bottom">(Devlin et al., 2018)</caption>
<tr><td><img width=640 src='https://git.uni-paderborn.de/data.analytics.teaching/aml4ta-2020/-/raw/master/week_7/images/bert.png'></td></tr>
</center>
</table>

# 2. Dataset

<p>For this tutorial, we will use the so-called <a href="https://rajpurkar.github.io/SQuAD-explorer/">Stanford Question Answering Dataset (SQuAD)</a>. As revealed by its name, this benchmark dataset is used for training and evaluating models on the task of question answering. Because of the sheer size of the original dataset and the complexity of the task at hand, we will focus on a small sample of the corpus (SQuAD1.1).</p>

In [None]:
%matplotlib inline

################
# Load dataset #
################

# Import
import collections
import getpass
import re
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

# Get credentials
# (getpass reads the password without echoing it into the notebook output)
user = input("Username: ")
host = input("Host: ")
db = input("Database: ")
passwd = getpass.getpass("Password: ")

# Create an engine instance (SQLAlchemy)
# User name and password are percent-encoded: characters such as '@', ':' or '/'
# would otherwise be read as URL delimiters and corrupt the connection string.
engine = create_engine(
    "mysql+pymysql://{}:{}@{}/{}".format(quote_plus(user), quote_plus(passwd), host, db)
)

# Define SQL query
sql_query = "SELECT * FROM SQuAD_V1_Sample"

# Query dataset (pandas); the table's 'index' column becomes the DataFrame index
data = pd.read_sql(sql=sql_query, con=engine, index_col='index')

# Sample
data.head()

In [None]:
###################
# Explore dataset #
###################

# Print pandas' summary of the table that was just loaded into `data`
data.info()

In [None]:
######################
# Train / Val./ Test #
######################

# Import
from sklearn.model_selection import train_test_split

# Step 1: 90 % of the rows go to the training set, the remainder is held out
train, test = train_test_split(data, random_state=42, train_size=0.9)

# Step 2: the held-out rows are halved into a test and a validation set
test, val = train_test_split(test, random_state=42, test_size=0.5)

# Shapes of the three splits, one per line
print(train.shape, val.shape, test.shape, sep='\n')

In [None]:
################
# Print sample #
################

# First training example: context paragraph, then its question and reference answer
print(
    train.iloc[0]['context'],
    f"\n>>> Question: {train.iloc[0]['question']}",
    f">>> Answer: {train.iloc[0]['answer_text']}",
    sep='\n'
)

In [None]:
#################
# Preprocessing #
#################

def preprocessing(row):
    """Normalise the text fields of a single dataset row.

    The ``context``, ``question`` and ``answer_text`` attributes are lower-cased,
    every character that is not a lower-case letter, a digit, a hyphen, an
    apostrophe or whitespace is removed, and runs of spaces are collapsed into
    a single space.

    Args:
        row: Object exposing the string attributes ``context``, ``question``
            and ``answer_text`` (e.g. a DataFrame row handed over by
            ``DataFrame.apply(..., axis=1)``).

    Returns:
        The same ``row`` object, with the three attributes overwritten.
    """
    # Everything outside this whitelist counts as a special character
    unwanted = re.compile(r'[^a-z0-9\-\'\s]')

    for field in ('context', 'question', 'answer_text'):

        # Lower-case first, then drop the special characters
        stripped = unwanted.sub('', getattr(row, field).lower())

        # Collapse multiple spaces
        setattr(row, field, re.sub(r' +', ' ', stripped))

    return row

# Apply preprocessing
# (every split is cleaned row by row and then re-indexed from 0)
train, val, test = (
    split.apply(preprocessing, axis=1).reset_index(drop=True)
    for split in (train, val, test)
)

train.head()

# 3. Question Answering (Almost) From Scratch

## 3.1 Encoding

<table class="image">
<center>
<caption align="bottom">(Devlin et al., 2018)</caption>
<tr><td><img width=384 src='https://git.uni-paderborn.de/data.analytics.teaching/aml4ta-2020/-/raw/master/week_7/images/bert_qa.png'></td></tr>
</center>
</table>

<p>As can be seen above, we need to encode our data in a specific manner in order to train a question answering model &mdash; i.e., for every entry in our dataset, we need to encode a single sequence (starting with the so-called <code>[CLS]</code> token) composed of the question and the context divided by a <code>[SEP]</code> token. Keep in mind that different Transformer architectures may require different encodings!</p><br>

<center><a href="https://huggingface.co/transformers/"><img width=512 src="https://git.uni-paderborn.de/data.analytics.teaching/aml4ta-2020/-/raw/master/week_7/images/huggingface_transformers_logo.png"/></a></center>

<p>For the rest of this tutorial, we will be using models, implementations, and tokenizers contained within the <a href="https://huggingface.co/transformers/">Transformers</a> library provided by <a href="https://huggingface.co">huggingface</a> (Wolf et al., 2019). This library offers one of the most extensive collections of NLP resources and is, without a doubt, the reference when it comes to Transformer models.</p>

<br>

<p><center><a href="https://huggingface.co/transformers/pretrained_models.html">https://huggingface.co/transformers/pretrained_models.html</a></center></p>

In [None]:
####################
# Encoding example #
# >>> Question     #
####################

# Import
from transformers import AutoTokenizer

# Define model
# BERT -> 'bert-base-uncased'
# MiniLM (12 layers) -> 'microsoft/MiniLM-L12-H384-uncased'
# MiniLM (6 layers) -> 'nreimers/MiniLM-L6-H384-uncased'
# MiniLM -> https://arxiv.org/pdf/2002.10957.pdf

# Checkpoint identifier; reused further below when the model is built
# (QuestionAnswering_Model(PATH)) and when answers are generated (get_answer)
PATH = 'nreimers/MiniLM-L6-H384-uncased'

# Initialize tokenizer
# (this module-level `tokenizer` is also the one read by `dataloader_generator`)
tokenizer = AutoTokenizer.from_pretrained(PATH, use_fast=True)

# Question
sample_question = train.iloc[0]['question']

# Tokenizer
sample_question_encoded = tokenizer(
    # TODO (exercise): pass `sample_question` to the tokenizer.
    # The prints below read `input_ids[0]`, i.e. they expect a batched encoding;
    # presumably obtained with return_tensors='pt', as in the "Debug model" cell.
)

# Show the question, its tokens and the three encoded fields
print(f'Question:\n{sample_question}\n')
print(f'Token:\n{tokenizer.convert_ids_to_tokens(sample_question_encoded["input_ids"][0])}\n')
print(f'input_ids:\n{sample_question_encoded["input_ids"]}\n')
print(f'token_type_ids:\n{sample_question_encoded["token_type_ids"]}\n')
print(f'attention_mask:\n{sample_question_encoded["attention_mask"]}')

In [None]:
##########################
# Encoding example       #
# >>> Question + Context #
##########################

# Question + Context
# (both taken from the first row of the preprocessed training split)
sample_question = train.iloc[0]['question']
sample_context = train.iloc[0]['context']

# Tokenizer
sample_sequence_encoded = tokenizer(
    # TODO (exercise): encode the question together with its context as one sequence pair.
    # The "Debug model" cell further below shows one such call
    # (question, context, add_special_tokens=True, return_tensors='pt').
)

# Show both inputs and the three encoded fields
print(f'Question:\n{sample_question}\n')
print(f'Context:\n{sample_context}\n')
print(f'input_ids:\n{sample_sequence_encoded["input_ids"]}\n')
print(f'token_type_ids:\n{sample_sequence_encoded["token_type_ids"]}\n')
print(f'attention_mask:\n{sample_sequence_encoded["attention_mask"]}')

<center><img width=100 src="https://git.uni-paderborn.de/data.analytics.teaching/aml4ta-2020/-/raw/master/resources/question.png"></center>

<p><center><b>What can we observe here?<br>What is the tensor <code>token_type_ids</code> good for?</b><br><b>What is the purpose of the <code>attention_mask</code> tensor?</b></center></p>

In [None]:
########################
# Generate DataLoaders #
########################

# Import
import torch

def dataloader_generator(dataframe, batch_size):
    """Encode question/context pairs and bundle them with their answer spans.

    Every row is encoded as one ``question [SEP] context`` sequence by the
    module-level ``tokenizer`` (padded to the longest sequence, truncated to
    512 tokens). The answer span is located inside the context part by
    matching the first and the last answer token.

    Rows whose answer cannot be located (e.g. because it was truncated away)
    are left out of the dataset, so inputs and labels always stay aligned;
    the number of skipped rows is printed.

    Args:
        dataframe: DataFrame with the string columns ``question``, ``context``
            and ``answer_text``.
        batch_size: Number of examples per batch.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding
        ``(input_ids, token_type_ids, attention_mask, start_token, end_token)``.
    """
    # Maximum sequence length
    MAX_LENGTH = 512

    # Tokenizer
    sequences = tokenizer(
        dataframe.question.tolist(),
        dataframe.context.tolist(),
        add_special_tokens=True,
        padding=True,
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors='pt'
    )

    # Separator id of the tokenizer in use (instead of a hard-coded 102)
    sep_token_id = tokenizer.sep_token_id

    # Initialize
    kept_rows = list()
    start_tokens = list()
    end_tokens = list()

    for index, answer in enumerate(dataframe.answer_text.tolist()):

        input_ids = sequences['input_ids'][index].tolist()

        # Define context start (position of the first separator token)
        if sep_token_id not in input_ids:
            continue
        context_start = input_ids.index(sep_token_id)

        # Encode answer; tokens that do not occur in the encoded context are dropped
        context_ids = set(input_ids[context_start:])
        answer_ids = [
            token
            for token in tokenizer.encode(answer, add_special_tokens=False)
            if token in context_ids
        ]

        # Nothing left to search for
        if not answer_ids:
            continue

        # Find start/end tokens (answer)
        # The upper bound keeps `start + last_offset` inside the sequence.
        last_offset = len(answer_ids) - 1

        for start in range(context_start + 1, len(input_ids) - last_offset):

            if input_ids[start] == answer_ids[0] and input_ids[start + last_offset] == answer_ids[-1]:

                kept_rows.append(index)
                start_tokens.append(start)
                end_tokens.append(start + last_offset)

                break

    # Report rows without a usable answer span
    skipped = len(input_ids_all := sequences['input_ids']) - len(kept_rows)

    if skipped:
        print(f'>>> dataloader_generator: skipped {skipped} example(s) without a locatable answer span')

    # Generate dataset (only rows with labels are kept)
    rows = torch.tensor(kept_rows, dtype=torch.long)

    dataset = torch.utils.data.TensorDataset(
        input_ids_all[rows],
        sequences['token_type_ids'][rows],
        sequences['attention_mask'][rows],
        torch.tensor(start_tokens, dtype=torch.long),
        torch.tensor(end_tokens, dtype=torch.long)
    )

    return torch.utils.data.DataLoader(dataset, batch_size=batch_size)

# Define batch size
# (approximate GPU memory footprint per setting)

# (bert-base-uncased)
# BATCH_SIZE = 6  ~ 8 GB RAM (GPU) required! 
# BATCH_SIZE = 12 ~ 12 GB RAM (GPU) required! 
# BATCH_SIZE = 24 ~ 22 GB RAM (GPU) required!

# (MiniLM / 6 layers)
# BATCH_SIZE = 6  ~  3 GB RAM (GPU) required! 
# BATCH_SIZE = 12 ~  5 GB RAM (GPU) required! 
# BATCH_SIZE = 24 ~  8 GB RAM (GPU) required! 
# BATCH_SIZE = 48 ~ 14 GB RAM (GPU) required! 

BATCH_SIZE = 48

# One DataLoader for the training split, one for the validation split
train_dataloader, val_dataloader = (
    dataloader_generator(split, BATCH_SIZE) for split in (train, val)
)

## 3.2 Model

<center><img width=100 src="https://git.uni-paderborn.de/data.analytics.teaching/aml4ta-2020/-/raw/master/resources/tip.png"></center>

<p><center><font color="#003660"><strong><i>The following model is based on the implementation of the <a href="https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertForQuestionAnswering">BertForQuestionAnswering</a> module by huggingface (Wolf et al., 2019).</i></strong></font></center></p>

In [None]:
#########
# Model #
#########

# Import
from torch import nn
from transformers import AutoModel, logging

# Disable warnings
# (sets the verbosity of the transformers logger to the error level)
logging.set_verbosity_error()


# Define model
# Exercise skeleton: a pre-trained language model followed by a fully connected layer
# that produces two values per token (read downstream as start and end logits).
class QuestionAnswering_Model(nn.Module):

    # `model` is the checkpoint identifier; the notebook passes PATH here
    def __init__(self, model):

        super(QuestionAnswering_Model, self).__init__()
        
        # Define model
        self.lm = # TODO (exercise): load the pre-trained language model identified by `model`
                  # (presumably via AutoModel, which is imported above)
        
        # Define FCL
        self.linear = # TODO (exercise): linear layer mapping hidden_size -> 2 (see shapes in forward)
                
    # Parameter order matters: evaluate() and the training loop call the model
    # positionally as (input_ids, attention_mask, token_type_ids)
    def forward(self, input_ids, attention_mask, token_type_ids):
        
        # Transformer
        # Output[0] -> (batch_size, seq_len, hidden_size)
        outputs = self.lm(
            # TODO
        )
        
        # FCL
        # Output -> (batch_size, seq_len, 2)
        # NOTE(review): `ouputs` is a misspelling of `outputs`; harmless, because the
        # same name is returned below.
        ouputs = # TODO
        
        return ouputs

In [None]:
###############
# Debug model #
###############

sample_question = train.iloc[0]['question']
sample_context = train.iloc[0]['context']

# Tokenizer
sample_sequence_encoded = tokenizer(
    sample_question,
    sample_context, 
    add_special_tokens=True,
    return_tensors='pt'
)

model = QuestionAnswering_Model(PATH)

# Call the module itself rather than `.forward(...)`: only `__call__` runs the
# hooks registered on the module. Gradients are not needed for a shape check.
with torch.no_grad():
    outputs = model(**sample_sequence_encoded)

# Expected: (1, seq_len, 2)
print(outputs.shape)

# Debugging
# (if required!)

In [None]:
###########
# Explore #
# Part 1  #
###########

# Split
# (first example of the batch; the last axis is cut into two slices of width 1)
start_logits, end_logits = torch.split(outputs[0], 1, dim=-1)

print(start_logits.shape)
print(end_logits.shape)

In [None]:
###########
# Explore #
# Part 2  #
###########

# Get start_position / end_position
# (index of the largest logit in each tensor)
start_position = start_logits.argmax()
end_position = end_logits.argmax()

print(start_position)
print(end_position)

## 3.3 Training

In [None]:
####################
# Initialize model #
####################

# Define device (GPU vs CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

#  Model
# (a fresh instance, moved to the selected device)
model = QuestionAnswering_Model(PATH).to(device)

# Optimizer
optimizer = torch.optim.AdamW(params=model.parameters(), lr=4e-5)

# Loss function (a.k.a. criterion)
# (read as a module-level name by `compute_loss`)
criterion = torch.nn.CrossEntropyLoss()

print(model)

In [None]:
#################
# Loss function #
#################

def compute_loss(outputs, true_start, true_end):
    """Mean of the start-position and end-position classification losses.

    The module-level ``criterion`` is applied once to the start logits and once
    to the end logits; both results are averaged.

    Args:
        outputs: Tensor of shape (batch_size, seq_len, 2); the last axis holds
            the start logit and the end logit of every token.
        true_start: Integer tensor with the true start token index of every
            example, shaped (batch_size,) or (batch_size, 1).
        true_end: Integer tensor with the true end token index, same shape
            convention as ``true_start``.

    Returns:
        Scalar tensor with the averaged loss.
    """
    # Split logits, i.e. 1x tensor for start_logits, 1x tensor for end_logits
    # Output: (batch_size, seq_len, 1)
    pred_start, pred_end = outputs.split(1, dim=-1)

    # Squeeze last dimension
    # Output -> (batch_size, seq_len)
    pred_start = pred_start.squeeze(-1)
    pred_end = pred_end.squeeze(-1)

    # Flatten targets
    # Output -> (batch_size,)
    # reshape(-1) instead of squeeze(-1): squeezing a (1,) target, i.e. a batch holding
    # a single example, would yield a 0-dim tensor that no longer matches the logits.
    # Targets follow the logits' device, so no global device is required here.
    true_start = true_start.reshape(-1).to(pred_start.device)
    true_end = true_end.reshape(-1).to(pred_end.device)

    # Compute loss 
    # loss -> mean(start_loss, end_loss)
    loss_start = criterion(pred_start, true_start)
    loss_end = criterion(pred_end, true_end)
    loss = torch.mean(torch.stack([loss_start, loss_end]))

    return loss

In [None]:
#######################
# Evaluation function #
#######################

def evaluate(model, dataloader, criterion):
    """Average loss of ``model`` over all batches of ``dataloader``.

    The model is switched to evaluation mode and every batch is passed through
    it without gradient tracking. The loss of each batch is obtained from
    ``compute_loss``.

    Args:
        model: Module called as ``model(input_ids, attention_mask, token_type_ids)``.
        dataloader: Iterable of batches
            ``(input_ids, token_type_ids, attention_mask, true_start, true_end)``.
        criterion: Accepted for interface compatibility; not read in this
            function (``compute_loss`` uses the module-level ``criterion``).

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.eval()

    batch_losses = []

    for input_ids, token_type_ids, attention_mask, true_start, true_end in dataloader:

        # Feedforward (no gradients during validation/testing)
        with torch.no_grad():
            outputs = model(
                input_ids.to(device),
                attention_mask.to(device),
                token_type_ids.to(device)
            )

        # Loss of the current batch as a plain Python float
        batch_losses.append(compute_loss(outputs, true_start, true_end).item())

    return sum(batch_losses) / len(dataloader)

In [None]:
%%time

###############
# Train model #
###############

EPOCHS = 5

# Per-epoch loss curves (plotted in the next cell)
train_history = list()
validation_history = list()

# Lowest validation loss seen so far; decides when a checkpoint is written
best_validation_loss = np.inf

# Training loop
for epoch in range(EPOCHS):
    
    model.train()
        
    total_loss = 0
    
    for batch_id, batch in enumerate(train_dataloader):
        
        # Get training data
        input_ids, token_type_ids, attention_mask, true_start, true_end = batch
        
        # Clear gradients
        optimizer.zero_grad()
        
        # Feedforward
        outputs = model(
            input_ids.to(device),
            attention_mask.to(device),
            token_type_ids.to(device)
        )
        
        # Compute loss
        loss = compute_loss(outputs, true_start, true_end)
        
        # Backpropagate errors
        loss.backward()
        
        # Clip gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.)
        
        # Update weights
        optimizer.step()
        
        # Update loss
        # (`+=` accumulates; `=+` would overwrite the sum with the last batch loss)
        total_loss += loss.item()
    
    # Mean loss per batch, i.e. the same scale as the value returned by `evaluate`
    training_loss = total_loss / max(len(train_dataloader), 1)
        
    # Validation
    validation_loss = evaluate(model, val_dataloader, criterion)
    
    # Keep the checkpoint with the lowest validation loss
    if validation_loss < best_validation_loss:

        best_validation_loss = validation_loss
        torch.save(model, 'best_QA_model.pt')
    
    train_history.append(training_loss)
    validation_history.append(validation_loss)
    
    print({ 'epoch': epoch, 'training loss': training_loss, 'validation loss': validation_loss})
    
print('\n>>> DONE!')
print(f'>>> BEST MODEL (VALIDATION): {best_validation_loss}')

In [None]:
################
# Plot history #
################

# One panel per loss curve; the x-axis is the epoch index
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

axs[0].plot(train_history)
axs[0].set_title('Training Loss')
axs[0].set_xlabel('Epoch')
axs[0].set_ylabel('Loss')

axs[1].plot(validation_history)
axs[1].set_title('Validation Loss')
axs[1].set_xlabel('Epoch')
axs[1].set_ylabel('Loss')

fig.tight_layout()

# plt.show() instead of fig.show(): the latter emits a warning on non-GUI
# backends such as the inline backend used by notebooks
plt.show()

In [None]:
###############
# Clear cache #
###############

import gc

# The optimizer keeps references to the model parameters (plus its own state),
# so it has to be deleted together with the model for the memory to be released.
del model, optimizer, train, train_dataloader, val, val_dataloader

# Collect the now unreachable objects and hand cached GPU memory back
gc.collect()
torch.cuda.empty_cache()

## 3.4 Evaluation

In [None]:
#########################
# Evaluate model (test) #
#########################

# Load model
# The checkpoint holds the whole pickled module (written by `torch.save(model, ...)`
# in the training loop), so weights-only loading has to be switched off explicitly;
# recent PyTorch releases enable it by default. Only do this for files you trust.
# `map_location` places the weights on the device selected for this session.
best_model = torch.load('best_QA_model.pt', map_location=device, weights_only=False)

# Evaluate
test_dataloader = dataloader_generator(test, BATCH_SIZE)
print(evaluate(best_model, test_dataloader, criterion))

In [None]:
###########################
# Test model (best model) #
###########################

def get_answer(model, model_path, question, context):
    """Extract the answer to ``question`` from ``context`` with ``model``.

    Question and context are encoded as one sequence pair, passed through the
    model, and the tokens between the highest-scoring start position and the
    highest-scoring end position are joined into the answer string.

    Args:
        model: Either a module returning one tensor of shape
            (batch_size, seq_len, 2), or a model whose output provides the
            start logits at index 0 and the end logits at index 1.
        model_path: Checkpoint identifier used to load the matching tokenizer.
        question: Question text.
        context: Text in which the answer is searched.

    Returns:
        The answer text, or an apology message when the predicted span is
        empty (e.g. the end position lies before the start position).
    """
    model.eval()

    # Tokenizer / Encoder
    # Inputs longer than 512 tokens are truncated (same limit as in
    # `dataloader_generator`) instead of making the model fail.
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
    sequence = tokenizer(
        question,
        context,
        add_special_tokens=True,
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )
    sequence = sequence.to(device)

    # Feedforward
    with torch.no_grad():
        outputs = model(**sequence)

    # Logits
    # A plain tensor carries both logits on its last axis; any other output
    # type is indexed as (start_logits, end_logits).
    if isinstance(outputs, torch.Tensor):
        start_logits, end_logits = outputs.split(1, dim=-1)
    else:
        start_logits = outputs[0]
        end_logits = outputs[1]

    # Get start_position / end_position
    start_position = torch.argmax(start_logits.cpu()).item()
    end_position = torch.argmax(end_logits.cpu()).item()

    # Convert sequence to tokens
    tokens = tokenizer.convert_ids_to_tokens(sequence['input_ids'][0].cpu())
    answer = ' '.join(tokens[start_position:end_position+1])

    # Glue word pieces ("##...") back onto the preceding token
    return re.sub(r' \#\#', '', answer) if answer else 'Sorry... I could not find an answer :-('
 
# Question ID
# (row of the test split that is shown)
QUESTION_ID = 0

# Answer (Model)
question, context = test.iloc[QUESTION_ID][['question', 'context']]
model_answer = get_answer(model=best_model, model_path=PATH, question=question, context=context)

# Question / Context / Answer (Model) / Answer (Truth) 
print(
    f">>> Context: {context}",
    f"\n>>> Question: {question}",
    f">>> Answer (Model): {model_answer}",
    f">>> Answer (Truth): {test.iloc[QUESTION_ID]['answer_text']}",
    sep='\n'
)

# 4. The Easy Approach &rarr; <code>BertForQuestionAnswering(...)</code>

In [None]:
################
# Define model #
################

# Import
from transformers import BertForQuestionAnswering

# Path
# (BERT-large checkpoint that is already fine-tuned on SQuAD)
MODEL_PATH = 'bert-large-uncased-whole-word-masking-finetuned-squad'

# Load model
# (loaded and moved to the selected device in one step)
finetuned_model = BertForQuestionAnswering.from_pretrained(MODEL_PATH).to(device)

print(finetuned_model)

In [None]:
###########################
# Test model (fine-tuned) #
###########################

# Question ID 
# (row of the test split that is shown)
QUESTION_ID = 0

# Answer (Model)
question, context = test.iloc[QUESTION_ID][['question', 'context']]
model_answer = get_answer(model=finetuned_model, model_path=MODEL_PATH, question=question, context=context)

# Question / Context / Answer (Model) / Answer (Truth) 
print(
    f">>> Question: {question}",
    f"\n>>> Context: {context}",
    f"\n>>> Answer (Model): {model_answer}",
    f">>> Answer (Truth): {test.iloc[QUESTION_ID]['answer_text']}",
    sep='\n'
)

In [None]:
######################################################
# Example (UPB)                                      #
# https://en.wikipedia.org/wiki/Paderborn_University #
######################################################

# Answer (Model)
# (free-text example; the context is lower-cased and stripped of special characters
#  before it is handed to the model)
question = "where is paderborn university located"
context = """Paderborn University is one of the fourteen public research universities in the state of
North Rhine-Westphalia in Germany. It was founded in 1972 and 20,308 students were enrolled at the university 
in the wintersemester 2016/2017.[1] It offers 62 different degree programmes. The university has several winners of the Gottfried Wilhelm Leibniz Prize awarded by the German Research Foundation 
(DFG) and ERC grant recipients of the European Research Council. In 2002, the Romanian mathematician Preda Mihailescu proved the Catalan conjecture, a number-theoretical conjecture, formulated by the French and 
Belgian mathematician Eugène Charles Catalan, which had stood unresolved for 158 years. The University Closely 
Collaborates with the Heinz Nixdorf Institute, Paderborn Center for Parallel Computing and two Fraunhofer Institutes
for research in Computer Science, Mathematics, Electrical Engineering and Quantum Photonics."""
model_answer = get_answer(
    model=finetuned_model,
    model_path=MODEL_PATH,
    question=question,
    context=re.sub(pattern=r'[^a-z0-9\-\'\s]', repl='', string=context.lower())
)

# Question / Answer (Model)
print(
    f">>> Question: {question}",
    f">>> Answer (Model): {model_answer}",
    sep='\n'
)

<ul style="list-style-type:disc">
<i>
    <li>Bahdanau, D., Cho, K., &amp; Bengio, Y. (2014). Neural machine translation by jointly learning to align and translate.</li>
    <li>Devlin, J., Chang, M. W., Lee, K., &amp; Toutanova, K. (2018). Bert: Pre-training of deep bidirectional transformers for language understanding.</li>
    <li>Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., &amp; Polosukhin, I. (2017). Attention is all you need. Advances in neural information processing systems, 30, 5998-6008.</li>
    <li>Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., &amp; Brew, J. (2019). HuggingFace's Transformers: State-of-the-art Natural Language Processing.</li>
</i>
</ul>