## Source Cited: <br> Natural Language Processing with TensorFlow - Second Edition <br> by Packt Publishing, Ch 10 (Transformers)

Load the SQuAD dataset, which we will use for modeling.

In [1]:
from datasets import load_dataset
# Load the SQuAD question-answering dataset through the `datasets` library
dataset = load_dataset(path="squad")

In [2]:
# Preview the first five question/answer pairs of the training split.
# Slicing the rows first (`dataset["train"][:5]`) only materializes those five
# rows, instead of loading both full columns and discarding almost everything.
train_preview = dataset["train"][:5]
for q, a in zip(train_preview["question"], train_preview["answers"]):
    print(f"{q} -> {a}")

To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? -> {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}
What is in front of the Notre Dame Main Building? -> {'text': ['a copper statue of Christ'], 'answer_start': [188]}
The Basilica of the Sacred heart at Notre Dame is beside to which structure? -> {'text': ['the Main Building'], 'answer_start': [279]}
What is the Grotto at Notre Dame? -> {'text': ['a Marian place of prayer and reflection'], 'answer_start': [381]}
What sits on top of the Main Building at Notre Dame? -> {'text': ['a golden statue of the Virgin Mary'], 'answer_start': [92]}


In [3]:
def compute_end_index(answers, contexts):
    """Flatten raw SQuAD answers and add the (exclusive) end character index.

    Each input answer is expected in the raw SQuAD layout, e.g.
    ``{'text': ['some answer'], 'answer_start': [42]}``. Only the first answer
    text and the first start index of every entry are used.

    The input dicts are NOT modified: every returned answer is a new dict, so
    the function can safely be called more than once on the same data.

    Args:
        answers: Iterable of answer dicts with list-valued ``'text'`` and
            ``'answer_start'`` entries.
        contexts: Iterable of context strings, aligned with ``answers``.

    Returns:
        A tuple ``(fixed_answers, contexts)``. ``fixed_answers`` is a list of
        dicts whose ``'text'`` and ``'answer_start'`` are scalars and which
        carry an extra ``'answer_end'`` key equal to
        ``answer_start + len(text)``. ``contexts`` is returned as passed in.

    Raises:
        AssertionError: If a start index is negative, an answer is blank, or
            ``context[answer_start:answer_end]`` differs from the answer text.
    """
    fixed_answers = []
    for answer, context in zip(answers, contexts):
        gold_text = answer['text'][0]
        start_idx = answer['answer_start'][0]

        # Make sure the starting index is valid and there is an answer
        assert start_idx >= 0 and len(gold_text.strip()) > 0, (
            "invalid answer: start index {} / text {!r}".format(start_idx, gold_text)
        )

        end_idx = start_idx + len(gold_text)

        # Make sure the corresponding context matches the actual answer
        assert context[start_idx:end_idx] == gold_text, (
            "context[{}:{}] does not match answer text {!r}".format(
                start_idx, end_idx, gold_text
            )
        )

        # Work on a shallow copy so the caller's dict keeps its raw layout;
        # any additional keys present on the answer are carried over as-is.
        fixed = dict(answer)
        fixed['text'] = gold_text
        fixed['answer_start'] = start_idx
        fixed['answer_end'] = end_idx
        fixed_answers.append(fixed)

    return fixed_answers, contexts
# Training split: questions, plus answers annotated with their end index
train_split = dataset["train"]
train_questions = train_split["question"]
print("Training data corrections")
train_answers, train_contexts = compute_end_index(
    answers=train_split["answers"],
    contexts=train_split["context"],
)

# The "validation" split of the dataset is used as the test data here
validation_split = dataset["validation"]
test_questions = validation_split["question"]
print("\nValidation data correction")
test_answers, test_contexts = compute_end_index(
    answers=validation_split["answers"],
    contexts=validation_split["context"],
)

Training data corrections

Validation data correction


In [4]:
from transformers import BertTokenizerFast
# Tokenizer of the uncased BERT base checkpoint; the encodings it produces are
# queried with `char_to_token` further down in this notebook.
tokenizer = BertTokenizerFast.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased'
)

In [5]:
# Toy example: encode a (context, question) pair into a single model input
context = "This is the context"
question = "This is the question"
token_ids = tokenizer(context, question, padding=False, return_tensors='tf')

In [6]:
# Both splits are tokenized as (context, question) pairs with the same settings
_encode_kwargs = dict(truncation=True, padding=True, return_tensors='tf')

# Encode train data
train_encodings = tokenizer(train_contexts, train_questions, **_encode_kwargs)
# Encode test data
test_encodings = tokenizer(test_contexts, test_questions, **_encode_kwargs)

In [7]:
def replace_char_with_token_indices(encodings, answers):
    """Convert character-level answer spans to token-level positions.

    For every answer, the start character and the last character
    (``answer_end - 1``) are mapped to token indices through
    ``encodings.char_to_token``. The resulting lists are stored on
    ``encodings`` under ``'start_positions'`` and ``'end_positions'``
    (``encodings`` is updated in place; nothing is returned).

    Args:
        encodings: Tokenizer output exposing ``char_to_token(batch_index,
            char_index)`` and ``update(dict)``.
        answers: Sequence of dicts holding integer ``'answer_start'`` and
            ``'answer_end'`` character offsets, aligned with ``encodings``.

    Note:
        Reads the module-level ``tokenizer`` for ``model_max_length`` whenever
        a position cannot be mapped to a token.
    """
    start_positions, end_positions = [], []
    n_updates = 0

    for idx, answer in enumerate(answers):
        # Token positions of the first and the last character of the answer
        start_tok = encodings.char_to_token(idx, answer['answer_start'])
        end_tok = encodings.char_to_token(idx, answer['answer_end'] - 1)

        if start_tok is None or end_tok is None:
            n_updates += 1

        # A position of None means the answer passage has been truncated.
        # The guide at
        # https://huggingface.co/transformers/custom_datasets.html#qa-squad
        # uses model_max_length here, but that results in NaN losses because
        # the last available label is model_max_length - 1 (zero-indexed).
        if start_tok is None:
            start_tok = tokenizer.model_max_length - 1
        if end_tok is None:
            end_tok = tokenizer.model_max_length - 1

        start_positions.append(start_tok)
        end_positions.append(end_tok)

    print("{}/{} had answers truncated".format(n_updates, len(answers)))

    token_positions = {
        'start_positions': start_positions,
        'end_positions': end_positions,
    }
    encodings.update(token_positions)
    
# Attach token-level start/end positions to the train and test encodings
for _encodings, _answers in (
    (train_encodings, train_answers),
    (test_encodings, test_answers),
):
    replace_char_with_token_indices(_encodings, _answers)

10/87599 had answers truncated
8/10570 had answers truncated


In [8]:
def data_gen(input_ids, attention_mask, start_positions, end_positions):
    """Yield one ``(features, labels)`` pair per example.

    The four iterables are walked in lockstep (stopping at the shortest one).

    Args:
        input_ids: Iterable of per-example token ids.
        attention_mask: Iterable of per-example attention masks.
        start_positions: Iterable of per-example answer start positions.
        end_positions: Iterable of per-example answer end positions.

    Yields:
        ``((input_ids_i, attention_mask_i), (start_position_i, end_position_i))``
    """
    examples = zip(input_ids, attention_mask, start_positions, end_positions)
    for token_ids, mask, answer_start, answer_end in examples:
        features = (token_ids, mask)
        labels = (answer_start, answer_end)
        yield features, labels

In [9]:
from functools import partial
# Bind the training encodings to `data_gen`, giving a callable that takes no
# arguments and produces a fresh generator every time it is called
train_data_gen = partial(
    data_gen,
    **{
        field: train_encodings[field]
        for field in (
            'input_ids', 'attention_mask', 'start_positions', 'end_positions'
        )
    },
)

In [10]:
import tensorflow as tf
# Build a tf.data pipeline on top of the generator. Each element has the layout
# ((input_ids, attention_mask), (start_position, end_position)).
# `output_signature` replaces the deprecated `output_types` argument and also
# gives the pipeline static shape information: the two model inputs are 1-D
# token sequences, the two labels are scalars.
train_dataset = tf.data.Dataset.from_generator(
    train_data_gen,
    output_signature=(
        (
            tf.TensorSpec(shape=(None,), dtype=tf.int32),
            tf.TensorSpec(shape=(None,), dtype=tf.int32),
        ),
        (
            tf.TensorSpec(shape=(), dtype=tf.int32),
            tf.TensorSpec(shape=(), dtype=tf.int32),
        ),
    ),
)

In [11]:
# Shuffling the data.
# `reshuffle_each_iteration=False` keeps one fixed order for every pass over
# the dataset. With the default (a new order on each iteration), any
# take()/skip() applied to this dataset would pick different samples each time
# it is iterated, so a train/validation split built that way would not be stable.
train_dataset = train_dataset.shuffle(5, reshuffle_each_iteration=False)

In [12]:
# Deliberately tiny subsets so that the notebook runs quickly
N_VALID_SAMPLES = 5
N_TRAIN_SAMPLES = 5

# Valid set is taken as the first N_VALID_SAMPLES samples in the shuffled set
valid_dataset = train_dataset.take(N_VALID_SAMPLES).batch(1)

# Training data is taken from the samples AFTER the validation samples.
# Calling take() again without skip() would read the same leading samples
# and leak validation examples into training.
# NOTE(review): the two subsets are only strictly disjoint if the shuffle
# upstream yields the same order on every iteration — confirm.
train_dataset = (
    train_dataset
    .skip(N_VALID_SAMPLES)
    .take(N_TRAIN_SAMPLES)
    .batch(1)
)

In [35]:
# Creating test data
print("Creating test data")
# Define the generator as a callable
test_data_gen = partial(data_gen,
    input_ids=test_encodings['input_ids'],
    attention_mask=test_encodings['attention_mask'],
    start_positions=test_encodings['start_positions'],
    end_positions=test_encodings['end_positions']
)
# Each element has the layout
# ((input_ids, attention_mask), (start_position, end_position)).
# `output_signature` replaces the deprecated `output_types` argument and gives
# the pipeline static shapes: 1-D token sequences for the inputs, scalars for
# the labels.
test_dataset = tf.data.Dataset.from_generator(
    test_data_gen,
    output_signature=(
        (
            tf.TensorSpec(shape=(None,), dtype=tf.int32),
            tf.TensorSpec(shape=(None,), dtype=tf.int32),
        ),
        (
            tf.TensorSpec(shape=(), dtype=tf.int32),
            tf.TensorSpec(shape=(), dtype=tf.int32),
        ),
    ),
)
test_dataset = test_dataset.batch(1)

Creating test data


In [14]:
from transformers import BertConfig, TFBertForQuestionAnswering

In [15]:
# Configuration of the pretrained checkpoint. `return_dict=False` is set here;
# the model output is later read positionally (out[0], out[1]) in tf_wrap_model.
config = BertConfig.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
    return_dict=False,
)

In [16]:
# Pretrained BERT body with a question-answering head, built from `config`
model = TFBertForQuestionAnswering.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
    config=config,
)

All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

Some weights or buffers of the TF 2.0 model TFBertForQuestionAnswering were not initialized from the PyTorch model and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
def tf_wrap_model(model):
    """Wrap a Hugging Face model in a Keras functional-API model.

    Args:
        model: Callable model that accepts ``[input_ids, attention_mask]`` and
            whose output can be indexed with ``[0]`` and ``[1]``.

    Returns:
        A ``tf.keras.models.Model`` with two int32 inputs of variable length,
        named ``"input_ids"`` and ``"attention_mask"``, and the output tuple
        ``(out[0], out[1])`` of the wrapped model.
    """
    # Symbolic inputs: variable-length integer sequences
    token_ids_in = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name="input_ids"
    )
    mask_in = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name="attention_mask"
    )

    # Run the wrapped model on the symbolic inputs
    qa_out = model([token_ids_in, mask_in])

    # The output is indexed positionally; presumably these are the start and
    # end logits of the question-answering head — verify against the model.
    first_out, second_out = qa_out[0], qa_out[1]

    return tf.keras.models.Model(
        inputs=[token_ids_in, mask_in],
        outputs=(first_out, second_out),
    )

In [18]:
# Fine-tuning a pretrained BERT needs a small learning rate (values in the
# 2e-5 .. 5e-5 range are the usual choice). A rate such as 0.01 makes the
# updates so large that the pretrained weights are destroyed and training
# does not converge.
LEARNING_RATE = 3e-5

# The model outputs logits, hence from_logits=True
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
acc = tf.keras.metrics.SparseCategoricalAccuracy()
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

# Wrap the Hugging Face model in a Keras model and compile it
model_v2 = tf_wrap_model(model)
model_v2.compile(optimizer=optimizer, loss=loss, metrics=[acc])

In [19]:
# Train for three epochs, passing the validation set to Keras
model_v2.fit(x=train_dataset, validation_data=valid_dataset, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2c33d5f74f0>

In [None]:
import os

# Create the output folders (no error if they already exist)
os.makedirs('models', exist_ok=True)
os.makedirs('tokenizers', exist_ok=True)

# Save the model.
# The Hugging Face model sits inside the Keras wrapper as a layer. Look it up
# through `model.name` instead of a hard-coded, auto-generated name: Keras
# appends a numeric suffix (e.g. "..._1") when the model is instantiated more
# than once in the same kernel, and a fixed string would then not be found.
model_v2.get_layer(model.name).save_pretrained(os.path.join('models', 'bert_qa'))
# Save the tokenizer
tokenizer.save_pretrained(os.path.join('tokenizers', 'bert_qa'))