# Stage 1: Importing dependencies

In [0]:
# Install the TensorFlow Model Garden package (presumably the source of the
# `official.*` modules imported below) and a nightly TensorFlow build.
# NOTE(review): neither install is version-pinned, so a fresh run may pull
# different code; the recorded run reports TensorFlow 2.1.0-dev20191217.
!pip install tf-models-official
!pip install tf-nightly

In [0]:
import tensorflow as tf

In [0]:
tf.__version__

'2.1.0-dev20191217'

In [0]:
import tensorflow_hub as hub

from official.nlp.bert.tokenization import FullTokenizer
from official.nlp.bert.input_pipeline import create_squad_dataset
from official.nlp.bert.squad_lib import generate_tf_record_from_json_file

from official.nlp import optimization

from official.nlp.bert.squad_lib import read_squad_examples
from official.nlp.bert.squad_lib import FeatureWriter
from official.nlp.bert.squad_lib import convert_examples_to_features
from official.nlp.bert.squad_lib import write_predictions

In [0]:
import numpy as np
import math
import random
import time
import json
import collections
import os

from google.colab import drive

# Stage 2: Data preprocessing

In [0]:
# Mount Google Drive at /content/drive; the data paths used by the cells
# below all start with this mount point.
drive.mount("/content/drive")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Turn the SQuAD v1.1 training JSON into a TFRecord file, using the given
# vocabulary file. The returned metadata is reused by later cells.
squad_train_json = "/content/drive/My Drive/projects/BERT/data/squad/train-v1.1.json"
squad_vocab_txt = "/content/drive/My Drive/projects/BERT/data/squad/vocab.txt"
squad_train_record = "/content/drive/My Drive/projects/BERT/data/squad/train-v1.1.tf_record"

input_meta_data = generate_tf_record_from_json_file(
    squad_train_json, squad_vocab_txt, squad_train_record)

In [0]:
# Persist the preprocessing metadata as indented JSON, newline-terminated.
train_meta_data_path = "/content/drive/My Drive/projects/BERT/data/squad/train_meta_data"
with tf.io.gfile.GFile(train_meta_data_path, "w") as meta_file:
    meta_file.write("{}\n".format(json.dumps(input_meta_data, indent=4)))

In [0]:
BATCH_SIZE = 4

# Build the batched training dataset from the TFRecord file generated earlier.
train_record_path = "/content/drive/My Drive/projects/BERT/data/squad/train-v1.1.tf_record"
max_seq_length = input_meta_data['max_seq_length']  # 384

train_dataset = create_squad_dataset(
    train_record_path, max_seq_length, BATCH_SIZE, is_training=True)

# Stage 3: Model building

## Squad layer

In [0]:
class BertSquadLayer(tf.keras.layers.Layer):
  """Maps every token representation to one start logit and one end logit."""

  def __init__(self):
    super().__init__()
    # A single Dense layer with two output units per token position.
    kernel_init = tf.keras.initializers.TruncatedNormal(stddev=0.02)
    self.final_dense = tf.keras.layers.Dense(
        units=2, kernel_initializer=kernel_init)

  def call(self, inputs):
    """Returns (start_logits, end_logits), each of shape (batch_size, seq_len)."""
    token_logits = self.final_dense(inputs)  # (batch_size, seq_len, 2)

    # Bring the size-2 axis to the front, then split along it.
    start_logits, end_logits = tf.unstack(
        tf.transpose(token_logits, perm=[2, 0, 1]),  # (2, batch_size, seq_len)
        axis=0)
    return start_logits, end_logits

## Whole model

In [0]:
class BERTSquad(tf.keras.Model):
    """BERT encoder loaded from TF Hub, followed by the SQuAD span head.

    The model takes a dict with "input_ids", "input_mask" and "segment_ids"
    entries and returns a (start_logits, end_logits) pair.
    """

    def __init__(self,
                 name="bert_squad"):
        super(BERTSquad, self).__init__(name=name)

        # trainable=True: the encoder weights are fine-tuned, not frozen.
        self.bert_layer = hub.KerasLayer(
            "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
            trainable=True)

        self.squad_layer = BertSquadLayer()

    def apply_bert(self, inputs, training=None):
        """Runs the encoder and returns its per-token (sequence) output.

        Args:
            inputs: dict with "input_ids", "input_mask" and "segment_ids".
            training: optional bool forwarded to the hub layer; None leaves
                the decision to Keras, as before.
        """
        # The first output of the hub layer is not used by this model.
        _, sequence_output = self.bert_layer(
            [inputs["input_ids"],
             inputs["input_mask"],
             inputs["segment_ids"]],
            training=training)
        return sequence_output

    def call(self, inputs, training=None):
        """Returns (start_logits, end_logits) for a batch of inputs.

        Args:
            inputs: dict with "input_ids", "input_mask" and "segment_ids".
            training: optional bool; forwarded explicitly to the encoder so
                its training/inference mode follows the caller's request.
        """
        seq_output = self.apply_bert(inputs, training=training)

        start_logits, end_logits = self.squad_layer(seq_output)

        return start_logits, end_logits

# Stage 4: Training

## Creating the model, optimizer and loss

In [0]:
# Hyper-parameters for the fine-tuning run.
TRAIN_DATA_SIZE = 88641  # NOTE(review): not referenced by any cell visible in this notebook
NB_BATCHES_TRAIN = 2000  # number of batches taken from train_dataset for each epoch
BATCH_SIZE = 4  # same value as the BATCH_SIZE set when train_dataset was created
NB_EPOCHS = 3
INIT_LR = 5e-5
WARMUP_STEPS = int(NB_BATCHES_TRAIN * 0.1)  # 10% of the batches of one epoch

In [0]:
# Limit training to NB_BATCHES_TRAIN batches drawn from train_dataset.
train_dataset_light = train_dataset.take(NB_BATCHES_TRAIN)

In [0]:
# Instantiate the model; BERTSquad.__init__ creates the hub.KerasLayer encoder.
bert_squad = BERTSquad()

In [0]:
# The learning-rate schedule has to span every optimizer step the training
# loop takes: NB_EPOCHS passes over NB_BATCHES_TRAIN batches. Sizing it for a
# single epoch would let the decay reach zero after the first epoch, leaving
# the remaining epochs with a learning rate of 0.
optimizer = optimization.create_optimizer(
    init_lr=INIT_LR,
    num_train_steps=NB_BATCHES_TRAIN * NB_EPOCHS,
    num_warmup_steps=WARMUP_STEPS)

In [0]:
def squad_loss_fn(labels, model_outputs):
    """Average of the start-position and end-position cross-entropy losses.

    Args:
        labels: mapping with 'start_positions' and 'end_positions' entries.
        model_outputs: pair (start_logits, end_logits).

    Returns:
        (mean start loss + mean end loss) / 2.
    """
    true_starts = labels['start_positions']
    true_ends = labels['end_positions']
    pred_starts, pred_ends = model_outputs

    cross_entropy = tf.keras.backend.sparse_categorical_crossentropy
    mean_start_loss = tf.reduce_mean(
        cross_entropy(true_starts, pred_starts, from_logits=True))
    mean_end_loss = tf.reduce_mean(
        cross_entropy(true_ends, pred_ends, from_logits=True))

    return (mean_start_loss + mean_end_loss) / 2


# Running mean of the loss, reset and reported by the training loop.
train_loss = tf.keras.metrics.Mean(name="train_loss")

In [0]:
# Attach the optimizer and the SQuAD loss to the Keras model.
# NOTE(review): the custom training loop in this notebook calls `optimizer`
# and `squad_loss_fn` directly, so it does not appear to rely on this step.
bert_squad.compile(optimizer,
                   squad_loss_fn)

In [0]:
# Checkpoints live on the mounted Drive. The path is absolute, like every
# other Drive path in this notebook, so it does not depend on the current
# working directory.
checkpoint_path = "/content/drive/My Drive/projects/BERT/ckpt/"

# Only the model is tracked by this checkpoint object.
ckpt = tf.train.Checkpoint(bert_squad=bert_squad)

# max_to_keep=1: each save replaces the previously kept checkpoint.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

# Resume from the most recent checkpoint when one exists.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

Latest checkpoint restored!!


## Custom training

In [0]:
# Custom training loop: NB_EPOCHS passes over train_dataset_light, one
# optimizer step per batch.
for epoch in range(NB_EPOCHS):
    print("Start of epoch {}".format(epoch+1))
    start = time.time()

    # Report the running mean loss per epoch, not across epochs.
    train_loss.reset_states()

    for (batch, (inputs, targets)) in enumerate(train_dataset_light):
        with tf.GradientTape() as tape:
            # training=True runs the model in training mode; without it the
            # forward pass falls back to the default (inference) behaviour.
            model_outputs = bert_squad(inputs, training=True)
            loss = squad_loss_fn(targets, model_outputs)

        gradients = tape.gradient(loss, bert_squad.trainable_variables)
        optimizer.apply_gradients(zip(gradients, bert_squad.trainable_variables))

        train_loss(loss)

        if batch % 50 == 0:
            print("Epoch {} Batch {} Loss {:.4f}".format(
                epoch+1, batch, train_loss.result()))

        # Save after every 500th completed batch. Counting completed batches
        # (batch + 1) avoids a save right after the very first step and makes
        # the last save coincide with the end of a 2000-batch epoch, so the
        # final weights are checkpointed too.
        if (batch + 1) % 500 == 0:
            ckpt_save_path = ckpt_manager.save()
            print("Saving checkpoint for epoch {} at {}".format(epoch+1,
                                                                ckpt_save_path))
    print("Time taken for 1 epoch: {} secs\n".format(time.time() - start))

# Stage 5: Evaluation

## Prepare evaluation

Load the SQuAD dev set examples into the session

In [0]:
# Read the SQuAD v1.1 dev examples (evaluation mode, no unanswerable questions).
dev_json_path = "/content/drive/My Drive/projects/BERT/data/squad/dev-v1.1.json"
eval_examples = read_squad_examples(
    dev_json_path, is_training=False, version_2_with_negative=False)

Create the writer object that will write the tf_record file for the dev set

In [0]:
# Writer for the dev-set TFRecord file; it is fed by _append_feature and must
# be closed once all features have been processed.
eval_record_path = os.path.join(
    "/content/drive/My Drive/projects/BERT/data/squad/", "eval.tf_record")
eval_writer = FeatureWriter(filename=eval_record_path, is_training=False)

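Create a tokenizer from the vocabulary file and lower-casing setting shipped with the BERT hub module; it is needed to convert the dev examples into features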

In [0]:
# Load the BERT hub module (frozen) to read the tokenizer settings it carries:
# the vocabulary file path and the lower-casing flag.
my_bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=False)
bert_assets = my_bert_layer.resolved_object
vocab_file = bert_assets.vocab_file.asset_path.numpy()
do_lower_case = bert_assets.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

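Define the function that adds each feature (the model-ready representation of an example produced by the conversion step) to our eval_features list and writes it to the tf_record file; padding features are written to the file but not kept in the list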

In [0]:
def _append_feature(feature, is_padding):
    """Callback for the feature conversion step.

    Every feature is handed to eval_writer; only non-padding features are
    also kept in the eval_features list.
    """
    keep_in_memory = not is_padding
    if keep_in_memory:
        eval_features.append(feature)
    eval_writer.process_feature(feature)

Create the eval features and write the tf_record file

In [0]:
# Filled by _append_feature while the examples are converted.
eval_features = []

# Conversion settings, gathered in one place and passed as keyword arguments.
feature_conversion_args = dict(
    examples=eval_examples,
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=False,
    output_fn=_append_feature,
    batch_size=4,
)
dataset_size = convert_examples_to_features(**feature_conversion_args)

In [0]:
# Close the writer now that every feature has been passed to it, before the
# tf_record file is read back in the next cell.
eval_writer.close()

Load the ready-to-be-used dataset to our session

In [0]:
BATCH_SIZE = 4

# Sequence length is hard-coded to 384 here; input_meta_data['max_seq_length']
# is the training-side equivalent.
eval_record_file = "/content/drive/My Drive/projects/BERT/data/squad/eval.tf_record"
eval_seq_length = 384
eval_dataset = create_squad_dataset(
    eval_record_file, eval_seq_length, BATCH_SIZE, is_training=False)

## Making the predictions

Define a lightweight record type (a named tuple, i.e. a tuple whose fields can be accessed by name) that holds the raw model outputs for one feature

In [0]:
# One record per evaluated feature: its id plus the start and end logits.
RawResult = collections.namedtuple(
    "RawResult", "unique_id start_logits end_logits")

Yield the elements of a batched output one at a time

In [0]:
def get_raw_results(predictions):
    """Yields one RawResult per element of a batched prediction dict.

    Args:
        predictions: mapping with 'unique_ids', 'start_logits' and
            'end_logits' entries that are iterated in parallel.

    Yields:
        RawResult with the id converted via .numpy() and both logit rows
        converted to plain Python lists.
    """
    per_element = zip(predictions['unique_ids'],
                      predictions['start_logits'],
                      predictions['end_logits'])
    for uid, start_row, end_row in per_element:
        yield RawResult(
            unique_id=uid.numpy(),
            start_logits=start_row.numpy().tolist(),
            end_logits=end_row.numpy().tolist())

Let's make our predictions!

In [0]:
# Run the model over the whole dev set and collect one RawResult per feature.
all_results = []

# Total number of batches for the progress display, derived from the feature
# count returned by the conversion step instead of a hard-coded figure.
nb_eval_batches = math.ceil(dataset_size / BATCH_SIZE)

for count, inputs in enumerate(eval_dataset):
    x, _ = inputs
    # The ids are not model inputs: remove them before the forward pass and
    # keep them to label the results.
    unique_ids = x.pop("unique_ids")
    start_logits, end_logits = bert_squad(x, training=False)
    output_dict = dict(
        unique_ids=unique_ids,
        start_logits=start_logits,
        end_logits=end_logits)
    all_results.extend(get_raw_results(output_dict))
    if count % 100 == 0:
        print("{}/{}".format(count, nb_eval_batches))

Write the predictions in a json file that will work with the evaluation script

In [0]:
# Destination files for the final answers, the n-best lists and the null odds.
output_prediction_file = "/content/drive/My Drive/projects/BERT/data/squad/predictions.json"
output_nbest_file = "/content/drive/My Drive/projects/BERT/data/squad/nbest_predictions.json"
output_null_log_odds_file = "/content/drive/My Drive/projects/BERT/data/squad/null_odds.json"

# Named instead of bare positional literals.
# NOTE(review): names follow the write_predictions signature in the Model
# Garden squad_lib (n_best_size, max_answer_length) - confirm against the
# installed version.
N_BEST_SIZE = 20
MAX_ANSWER_LENGTH = 30

write_predictions(
    eval_examples,
    eval_features,
    all_results,
    N_BEST_SIZE,
    MAX_ANSWER_LENGTH,
    # Reuse the lower-casing flag read from the BERT hub module when the
    # tokenizer was created, so decoding matches the tokenization.
    do_lower_case,
    output_prediction_file,
    output_nbest_file,
    output_null_log_odds_file,
    verbose=False)