In [1]:
import tensorflow as tf
from tensorflow import keras
import keras.layers as layers
import transformers
import json
import tokenizers
from tqdm import tqdm
import datetime
import os
import numpy as np


# Download and Load Data

In [2]:
# SQuAD v2.0 train and dev splits, plus the WordPiece vocabulary file.
train_url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json'
test_url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json'
# Use the Hub's `/resolve/` endpoint: a `/blob/` URL serves the HTML file-viewer
# page rather than the raw vocabulary file.
vocab_url = 'https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt'

# get_file downloads each origin into the Keras cache and returns the local path.
# NOTE: a copy already cached under the same fname is reused as-is, so a file
# fetched from an earlier, wrong URL has to be deleted by hand.
train_load = keras.utils.get_file(fname='squad_train.json',
                                  origin=train_url)
test_load = keras.utils.get_file(fname='squad_test.json',
                                 origin=test_url)
vocab_load = keras.utils.get_file(fname='distilbert-base-uncased-vocab.txt',
                                  origin=vocab_url)

In [3]:
# Read the downloaded SQuAD files and keep the list stored under the top-level
# 'data' key. The encoding is stated explicitly so decoding does not depend on
# the platform's default locale encoding.
with open(train_load, encoding='utf-8') as f:
    train_data = json.load(f)['data']

with open(test_load, encoding='utf-8') as f:
    test_data = json.load(f)['data']

In [4]:
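# Data structure:
# data (list of articles) --> article['paragraphs'] --> paragraph['context'] and paragraph['qas']
# Each qa holds a 'question' and a list of 'answers' (each with 'text' and 'answer_start').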

In [5]:
# For transformer we need
# input_ids
# token_type_ids
# attention_mask

# Write Functions to Process Input Data
* Make input_ids
    * These are inputs in a form that BERT can process.
    * Input should be in form [CLS] + Question + [SEP] + Context.
    * This is tokenized to get input_ids, which are then padded to a fixed length.
* Token_type_ids
    * Lets BERT know to what segment each part of the input corresponds.
    * [CLS] through [SEP] are 0, the context is 1.
* Attention_mask
    * Prevent BERT from performing attention on padding tokens.
    * Padding tokens are 0, everything else is 1.

## Initialize Tokenizer

In [6]:
# Load tokenizer
# NOTE(review): the vocabulary is read from a relative path in the working
# directory, not from `vocab_load` (the path returned by get_file above) --
# confirm that this file exists and is the downloaded vocabulary.
tokenizer = tokenizers.BertWordPieceTokenizer('distilbert-base-uncased-vocab.txt')

## Make input IDs

In [7]:
def make_input_ids(question, context):
    """Encode a question and a context and concatenate their token ids.

    The two strings are encoded separately with the module-level `tokenizer`,
    so each half keeps the special tokens the tokenizer adds (see the example
    output further down, where both halves start with 101 and end with 102).

    Args:
        question: question text as a string.
        context: context text as a string.

    Returns:
        A 4-tuple ``(input_ids, question_length, context_length,
        encoded_context)``: the concatenated id list, the number of tokens in
        the question encoding, the number of ids in the context encoding, and
        the context encoding object itself.
    """
    # Question first, then context: same call order as the ids are joined in.
    question_encoding, context_encoding = (
        tokenizer.encode(text) for text in (question, context)
    )
    question_ids = question_encoding.ids
    context_ids = context_encoding.ids
    return (
        question_ids + context_ids,
        len(question_encoding),
        len(context_ids),
        context_encoding,
    )

## Make Token Type IDs

In [8]:
def make_token_id(question_length, context_length):
    """Build the token type (segment) ids for a question/context pair.

    Args:
        question_length: number of positions belonging to the question part.
        context_length: number of positions belonging to the context part.

    Returns:
        A list with `question_length` zeros followed by `context_length` ones.
    """
    segment_ids = [0 for _ in range(question_length)]
    segment_ids.extend(1 for _ in range(context_length))
    return segment_ids

In [9]:
# Example of input_ids and token_ids
# The fourth return value (the context encoding) is not needed here, hence `_`.
ids, question_length, context_length, _ = make_input_ids('how are you?', 'I am great')

# One segment id per input id: 0 over the question part, 1 over the context part.
token_id = make_token_id(question_length, context_length)

# Prints the question length, the total length and the segment ids, then the ids.
print(question_length, len(token_id), token_id)
print(ids)

6 11 [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
[101, 2129, 2024, 2017, 1029, 102, 101, 1045, 2572, 2307, 102]


## Pad sequences and Create Attention Masks

In [10]:
def pad_and_mask_sequences(input_ids, token_ids, max_length):
    """Right-pad ids and segment ids with zeros and build the attention mask.

    The inputs are left untouched: new lists are returned, so calling the
    function again with the same arguments gives the same result.

    Args:
        input_ids: list of token ids.
        token_ids: list of token type ids, expected to be as long as
            `input_ids`.
        max_length: target length. Sequences already at or above this length
            are returned unpadded (they are not truncated), so callers can
            still filter on the returned length.

    Returns:
        ``(attention_mask, token_ids, input_ids)`` where the mask is 1 for
        every original position and 0 for every padding position.
    """
    # A negative difference means "nothing to pad", never "remove items".
    padding = [0] * max(max_length - len(input_ids), 0)
    attention_mask = [1] * len(input_ids) + padding
    # `+` builds new lists; the original `+=` extended the caller's lists in place.
    return attention_mask, token_ids + padding, input_ids + padding

In [11]:
# Example of padding with max_length = 20
# The padded ids are bound to `padded_ids` rather than `id`, so the built-in
# id() is not shadowed for the rest of the notebook.
attn, tok, padded_ids = pad_and_mask_sequences(ids, token_id, 20)
len(attn), len(tok), len(padded_ids)

(20, 20, 20)

# Process Targets
Targets are in the form of (start_index, end_index)

In [12]:
# Encode a short string to inspect `offsets`: one (start, end) character span
# per token. In the output below the first and last entries are (0, 0).
a = tokenizer.encode('dogs and cats')
a.offsets

[(0, 0), (0, 4), (5, 8), (9, 13), (0, 0)]

In [13]:
def process_answers(answer, encoded_context):
    """Map a character-level answer span onto token indices of the context.

    Args:
        answer: dict with 'answer_start' (character offset at which the
            answer begins in the context string) and 'text' (the answer
            string, whose length gives the span width).
        encoded_context: encoding of the context; only its `offsets` (one
            (start, end) character span per token) are read.

    Returns:
        ``(start_index, end_index)``: positions in `encoded_context.offsets`
        of the first and the last token that overlap the answer characters.

    Raises:
        ValueError: if no token overlaps the answer span (for example when
            the answer text is empty).
    """
    answer_start = answer['answer_start']
    answer_end = answer_start + len(answer['text'])

    # Token offsets and 'answer_start' are both character positions in the
    # context string, so the spans can be compared directly. Rebuilding the
    # context from its tokens (as was done before) gives a string of a
    # different length, so a mask sized from it can be too short.
    # A token belongs to the answer when its half-open span [start, end)
    # intersects [answer_start, answer_end). Zero-width spans such as (0, 0)
    # never intersect anything and are skipped.
    answer_tokens = [
        index
        for index, (start, end) in enumerate(encoded_context.offsets)
        if max(start, answer_start) < min(end, answer_end)
    ]

    if not answer_tokens:
        raise ValueError(
            f'No token overlaps the answer span [{answer_start}, {answer_end})'
        )

    return answer_tokens[0], answer_tokens[-1]
        

## Put Preprocessing Functions Together

In [14]:
def process_text(question, answer, context, max_length):
    """Turn one (question, answer, context) triple into model inputs and targets.

    Args:
        question: question text.
        answer: answer dict, passed on to process_answers.
        context: context text.
        max_length: length the sequences are padded to.

    Returns:
        ``(attention_mask, token_type_ids, input_ids, start_index, end_index)``.
        The two indices are positions in `input_ids` (question followed by
        context) of the first and last answer token.
    """
    input_ids, question_length, context_length, encoded_context = make_input_ids(question, context)
    token_type_ids = make_token_id(question_length, context_length)
    attention_mask, token_type_ids, input_ids = pad_and_mask_sequences(input_ids, token_type_ids, max_length)
    start_index, end_index = process_answers(answer, encoded_context)

    # process_answers counts tokens from the start of the context encoding,
    # but the context begins `question_length` positions into input_ids.
    # Shift the targets so they index the sequence the model actually sees.
    # NOTE: targets already cached on disk by load_data keep their old,
    # unshifted values until the cache files are removed.
    start_index += question_length
    end_index += question_length

    return attention_mask, token_type_ids, input_ids, start_index, end_index

# Preprocess the Text

In [15]:
def process_data(data, max_length):
    """Run process_text over every answerable question in a SQuAD-style list.

    Args:
        data: list of articles, each with a 'paragraphs' list whose items
            hold a 'context' string and a 'qas' list.
        max_length: padding length; examples whose processed input is longer
            than this are dropped.

    Returns:
        ``(x_output, y_output)``: dicts of parallel lists keyed by
        'attention_masks', 'type_ids', 'input_ids' and by 'start_index',
        'end_index' respectively.
    """
    x_output = {'attention_masks': [], 'type_ids': [], 'input_ids': []}
    y_output = {'start_index': [], 'end_index': []}

    for article in tqdm(data):
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                # Questions without an answer contribute nothing.
                if len(qa['answers']) == 0:
                    continue

                # Only the first listed answer is used.
                (attention_mask, token_type_ids, input_ids,
                 start_index, end_index) = process_text(
                    question, qa['answers'][0], context, max_length)

                # Sequences longer than max_length are discarded, not truncated.
                if len(input_ids) > max_length:
                    continue

                x_output['attention_masks'].append(attention_mask)
                x_output['type_ids'].append(token_type_ids)
                x_output['input_ids'].append(input_ids)
                y_output['start_index'].append(start_index)
                y_output['end_index'].append(end_index)

    return x_output, y_output

# Load Data
For the sake of convenience, the function below checks whether the processed data is already stored in JSON files before doing any processing. If it is, the stored data is loaded instead of being recomputed. If it is not, the data is processed and then saved to JSON files. Because numpy arrays are not JSON serializable, the files hold plain Python lists, and the function converts them to numpy arrays before returning them.

In [16]:
def _load_or_process_split(split, get_raw_data, maxlength):
    """Return the processed (x, y) dicts of one split as numpy arrays, using the JSON cache when present."""
    x_path = f'x{split}_squad{maxlength}.json'
    y_path = f'y{split}_squad{maxlength}.json'

    if os.path.exists(x_path) and os.path.exists(y_path):
        print(f'{split.capitalize()} data found on disk!')
        with open(x_path, 'r') as f:
            x = json.load(f)
        with open(y_path, 'r') as f:
            y = json.load(f)
    else:
        # The raw data is only looked up when it actually has to be processed.
        x, y = process_data(get_raw_data(), maxlength)
        print(f'Writing x{split}')
        with open(x_path, 'w') as f:
            json.dump(x, f)
        print(f'Writing y{split}')
        with open(y_path, 'w') as f:
            json.dump(y, f)

    print('performing numpy conversion')
    # Features become float32 arrays, one row per example.
    # NOTE(review): the model's Input layers are declared as int32 -- consider
    # storing these as int32 as well.
    x = {key: np.asarray(values, dtype='float32') for key, values in x.items()}
    # Targets become column vectors: one single-element row per example.
    y = {key: np.array([[index] for index in values]) for key, values in y.items()}
    return x, y


def load_data(maxlength, load_test=False):
    """Load the processed SQuAD data, building and caching it on first use.

    For each split the function looks for `x<split>_squad<maxlength>.json` and
    `y<split>_squad<maxlength>.json` in the working directory. If both exist
    they are loaded; otherwise the downloaded data is run through
    process_data and the result is written to those files.

    Args:
        maxlength: sequence length; part of the cache file names and passed
            to process_data.
        load_test: when true, the test split is loaded as well.

    Returns:
        ``(xtrain, ytrain)``, or ``(xtrain, ytrain, xtest, ytest)`` when
        `load_test` is true. Each is a dict of numpy arrays.
    """
    xtrain, ytrain = _load_or_process_split('train', lambda: train_data, maxlength)
    if not load_test:
        return xtrain, ytrain

    xtest, ytest = _load_or_process_split('test', lambda: test_data, maxlength)
    return xtrain, ytrain, xtest, ytest
    

In [17]:
# Sequence length: passed to load_data and used as the model's input shape.
maxlength=256
# Mini-batch size passed to model.fit.
batch_size= 20

In [18]:
# Load the processed train and test splits (read from the JSON cache when it
# exists, otherwise processed and cached by load_data).
xtrain, ytrain, xtest, ytest = load_data(maxlength, load_test=True)

Train data found on disk!
performing numpy conversion
Test data found on disk!
performing numpy conversion


In [19]:
# Optional cap on the number of examples kept in every array, for quick
# experiments. None keeps everything (a `[:None]` slice is the whole array);
# set an int such as 1000 for a fast run.
data_len = None
data = [xtrain, ytrain, xtest, ytest]
for dataset in data:
    for key in dataset:
        dataset[key] = dataset[key][:data_len]
xtrain['input_ids'].shape


(78318, 256)

# Build Model

## Model Parameters

## Model

In [20]:
# Inputs
# Every shape is written as a 1-tuple: `(maxlength)` without the trailing
# comma is just the int `maxlength`, not a tuple.
# The layer names match the keys of the feature dicts produced by process_data.
input_ids = layers.Input(shape=(maxlength,),
                         name='input_ids',
                         dtype=tf.int32)
token_type_ids = layers.Input(shape=(maxlength,),
                              name='type_ids',
                              dtype=tf.int32)
attention_mask = layers.Input(shape=(maxlength,),
                              name='attention_masks',
                              dtype=tf.int32)

# Transformer
bert = transformers.TFDistilBertModel.from_pretrained("distilbert-base-uncased")

# The first item of the transformer output is the last hidden state.
# Only input_ids and attention_mask are fed to the transformer; token_type_ids
# is declared as a model input but is deliberately not connected to it.
hidden_state = bert(input_ids=input_ids,
                    attention_mask=attention_mask)[0]

# Start head: one score per position, flattened to (batch, maxlength) and
# normalised with a softmax over the positions.
start = layers.Dense(1)(hidden_state)
start = layers.Flatten()(start)
p_start = layers.Activation(keras.activations.softmax)(start)

# End head: same structure with its own Dense layer.
end = layers.Dense(1)(hidden_state)
end = layers.Flatten()(end)
p_end = layers.Activation(keras.activations.softmax)(end)

model = keras.Model(inputs=[input_ids, token_type_ids, attention_mask],
                    outputs=[p_start, p_end])

model.summary()

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'vocab_transform', 'vocab_layer_norm', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_masks (InputLayer)   [(None, 256)]        0           []                               
                                                                                                  
 tf_distil_bert_model (TFDistil  TFBaseModelOutput(l  66362880   ['input_ids[0][0]',              
 BertModel)                     ast_hidden_state=(N               'attention_masks[0][0]']        
                                one, 256, 768),                                                   
                                 hidden_states=None                                           

In [21]:
# The heads already apply a softmax, so the loss takes probabilities (the
# default from_logits=False) and integer position labels.
scce = keras.losses.SparseCategoricalCrossentropy()
opt = keras.optimizers.Adam(learning_rate=5e-5)
# Stop when val_loss has not improved by at least 0.001 for 10 epochs and
# roll back to the best weights seen.
callback = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=0.001,
    patience=10,
    verbose=0,
    mode="auto",
    baseline=None,
    restore_best_weights=True,
)

# The capitalised name 'Accuracy' selects the exact-match Accuracy metric,
# which compares predicted values with labels element-wise and is meaningless
# for probability vectors. Sparse categorical accuracy compares the arg-max
# position with the integer label instead.
model.compile(optimizer=opt,
              loss=[scce, scce],
              metrics=['sparse_categorical_accuracy'])

In [22]:
# Train on the feature dict with the start and end targets as the two outputs.
# validation_split=0.2 holds out 20% of the training data; its loss is the
# val_loss monitored by the EarlyStopping callback, which can end training
# before the 50-epoch limit.
model.fit(xtrain,
         [ytrain['start_index'],ytrain['end_index']],
          validation_split=0.2,
          callbacks = [callback],
         epochs=50,
         batch_size=batch_size)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50


<keras.callbacks.History at 0x1b024eff760>

In [23]:
# Inspect the start target of the first training example (a one-element array,
# see the output below).
ytrain['start_index'][0]

array([67])