In [1]:
import gc
import tensorflow as tf
import tensorflow_datasets
import numpy as np
import tensorflow.keras as keras
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Dense, Input
from utils import downconvert_tf_dataset

from transformers import (TFBertModel, 
                          BertTokenizer,
                          glue_convert_examples_to_features)

In [3]:
import json

def get_pairs(word):
    """Return set of symbol pairs in a word.
    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs

print(get_pairs("something"))

{('m', 'e'), ('t', 'h'), ('i', 'n'), ('o', 'm'), ('s', 'o'), ('n', 'g'), ('e', 't'), ('h', 'i')}


In [2]:
# Constants
BATCH_SIZE = 32
MAX_SEQ_LEN = 128
EPOCHS = 3

# FP16 settings
fp16 = True
if fp16:
    tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
    BATCH_SIZE = 48

In [3]:
# Fetch pre-trained models
bert_base_model = TFBertModel.from_pretrained("bert-base-cased")
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [4]:
def create_new_classification_head(dataset_name, base_model_cls_head, dense_config=[256,2]):
    # Fetch the data.
    data = tensorflow_datasets.load(dataset_name)
    train_x, train_y = downconvert_tf_dataset(data["train"], tokenizer, MAX_SEQ_LEN)
    val_x, val_y = downconvert_tf_dataset(data["validation"], tokenizer, MAX_SEQ_LEN)
    print("Dataset %s train_sz=%i val_sz=%i" % \
          (dataset_name, train_y.shape[0], val_y.shape[0]))
    
    # Create the head.
    tensor = base_model_cls_head
    for layer_units in dense_config[0:-1]:
        tensor = Dense(units=layer_units, activation="relu", name="%s_%i" % (dataset_name, layer_units))(tensor)
    tensor = Dense(units=dense_config[-1], activation="softmax", name="final_%s" % (dataset_name))(tensor)
    
    return train_x, train_y, val_x, val_y, tensor

def join_np_arrays_with_starting_none(l_a, l_b):
    # l_a can be None.
    if l_a is None:
        return l_b
    else:
        return np.concatenate((l_a, l_b), axis=0)

def create_joint_classification_heads(datasets, base_model_cls_head, cls_task_selector_input,\
                                      dense_config=[256,2]):
    cls_heads = []
    train_x, train_y, val_x, val_y = None, None, None, None
    for (i, dataset) in enumerate(datasets):
        # tx and vx are lists of inputs which include the unique transformer inputs for the underlying
        #  model.
        tx, ty, vx, vy, h = create_new_classification_head(dataset, base_model_cls_head, dense_config)
        
        # need to append in a signal to tx and vx which indicates which classification task the data
        #  comes from. This will be a one-hot array which the multi-head output will be multiplied against.
        cls_task_id = np.zeros(len(datasets))
        cls_task_id[i] = 1
        tx.append(np.broadcast_to(cls_task_id, (ty.shape[0], len(datasets))))
        vx.append(np.broadcast_to(cls_task_id, (vy.shape[0], len(datasets))))
        
        # next, append the datasets and new head into the final produced list.
        if train_x is None:
            train_x = [None for i in range(len(tx))]
            val_x = [None for i in range(len(tx))]
        for i, _ in enumerate(train_x):
            train_x[i] = join_np_arrays_with_starting_none(train_x[i], tx[i])
            val_x[i] = join_np_arrays_with_starting_none(val_x[i], vx[i])
        train_y = join_np_arrays_with_starting_none(train_y, ty)
        val_y = join_np_arrays_with_starting_none(val_y, vy)
        cls_heads.append(h)
    
    # shuffle the joined training datasets
    randomize = np.arange(len(train_y))
    np.random.shuffle(randomize)
    for i in range(len(train_x)):
        train_x[i] = train_x[i][randomize]
    train_y = train_y[randomize]
    
    # Reshape classification heads to all have a concatenation dimension
    cls_heads_shaped = []
    for head in cls_heads:
        # TODO: TEST changing "2" (hardcoded) to "-1"
        cls_heads_shaped.append(keras.layers.Reshape((1,2))(head))
    # join the classification heads into a single output.
    cls_heads_cat = None
    if len(cls_heads) > 1:
        cls_heads_cat = keras.layers.Concatenate(axis=1)(cls_heads_shaped)
    else:
        cls_heads_cat = cls_heads_shaped[0]
    print("cls_heads_cat:",K.int_shape(cls_heads_cat))
    print("cls_selector:",K.int_shape(cls_task_selector_input))
    cls_output = keras.layers.Dot(name="cls_head_join", axes=(1,1))([cls_task_selector_input, cls_heads_cat])
    
    return train_x,train_y,val_x,val_y,cls_output
    

In [5]:
# Configure and compile model.

dataset_ids=["glue/sst2", "glue/cola"]

# Later cells might set trainable=False; which we don't necessarily want here.
inputs = [Input(shape=(128,), dtype='int32', name='input_ids'),
          Input(shape=(128,), dtype='int32', name='attention_mask'), 
          Input(shape=(128,), dtype='int32', name='token_type_ids'),
          Input(shape=(len(dataset_ids),), dtype='float32', name='classification_task_selector')]

# Fetch the CLS head of the BERT model; index 1.
cls_head = bert_base_model(inputs[0:2])[1]

# Fetch and format dataset and classification head.
sst_train_x, sst_train_y, sst_val_x, sst_val_y, sst_tensor = \
    create_joint_classification_heads(dataset_ids, cls_head, inputs[3], dense_config=[256,2])

sst_bert_model = keras.Model(inputs=inputs, outputs=sst_tensor)
print(sst_bert_model.summary())

# Configure optimizer, loss function and metrics.
sst_optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
if fp16:
    tf.train.experimental.enable_mixed_precision_graph_rewrite(sst_optimizer)
sst_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
sst_metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

sst_bert_model.compile(optimizer=sst_optimizer, loss=sst_loss, metrics=[sst_metric])

INFO:absl:Overwrite dataset info from restored data version.
INFO:absl:Reusing dataset glue (C:\Users\jbetk\tensorflow_datasets\glue\sst2\0.0.2)
INFO:absl:Constructing tf.data.Dataset for split None, from C:\Users\jbetk\tensorflow_datasets\glue\sst2\0.0.2
INFO:absl:Overwrite dataset info from restored data version.
INFO:absl:Reusing dataset glue (C:\Users\jbetk\tensorflow_datasets\glue\cola\0.0.2)
INFO:absl:Constructing tf.data.Dataset for split None, from C:\Users\jbetk\tensorflow_datasets\glue\cola\0.0.2


Dataset glue/sst2 train_sz=67349 val_sz=872
Dataset glue/cola train_sz=8551 val_sz=1043
cls_heads_cat: (None, 2, 2)
cls_selector: (None, 2)
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 128, 768), ( 108310272   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
_____________________________________________________

In [7]:
# Train model.
sst_bert_history = sst_bert_model.fit(sst_train_x, sst_train_y, batch_size=BATCH_SIZE, epochs=EPOCHS, \
                                      validation_data=(sst_val_x, sst_val_y))


Train on 75900 samples, validate on 1915 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [28]:
#phrase = "I was disappointed to see the credits roll, the film really had me."
phrase = "A human there was she walked it"

def pad_zero(inputs, seq_len):
    for k in inputs: 
        output = np.zeros(seq_len+1, dtype='int32')
        output[:len(inputs[k])] = np.asarray(inputs[k])
        inputs[k] = output
    return inputs
 
phrase_encoded = pad_zero(tokenizer.encode_plus(phrase, add_special_tokens=True, max_length=128), 128)

phrase_encoded_formatted = \
    [np.resize(phrase_encoded['input_ids'], (1,-1)),
    np.resize(phrase_encoded['token_type_ids'], (1,-1)),
    np.resize(phrase_encoded['attention_mask'], (1,-1)),
    np.asarray([[1,0]], dtype='float32')]
print(sst_bert_model.predict(phrase_encoded_formatted))

[[1.0000000e+00 7.8082085e-06]]
