In [1]:
import gc
import tensorflow as tf
import tensorflow_datasets
import numpy as np
import tensorflow.keras as keras
import orjson
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Dense, Input
from utils import downconvert_tf_dataset
import wandb
from wandb.keras import WandbCallback

from transformers import (TFBertModel, TFDistilBertModel,
                          BertTokenizer, DistilBertTokenizer,
                          glue_convert_examples_to_features)

In [2]:
# Hyperparameters defined once for the rest of the notebook.
BATCH_SIZE = 32
MAX_SEQ_LEN = 128
EPOCHS = 3

# Mixed-precision switch: when enabled, ask TensorFlow's graph optimizer to
# turn on its "auto_mixed_precision" option.
fp16 = True
if fp16:
    tf.config.optimizer.set_experimental_options(dict(auto_mixed_precision=True))

In [3]:
# Fetch the pre-trained tokenizer ("bert-base-cased"); it is used by the
# dataset conversion and inference cells below.
# DistilBERT alternative, currently disabled:
#tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [4]:
def load_json_dataset(filename):
    """Read a JSON file from disk and return the parsed object.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The value produced by `orjson.loads` for the file's contents.
    """
    # Read raw bytes and close the handle deterministically instead of
    # leaving an open file for the garbage collector to clean up.
    with open(filename, "rb") as json_file:
        return orjson.loads(json_file.read())

def process_cls_row(sentence, tok, pad_token, max_seq_len, _out_xs):
    """Tokenize one sentence and append its fixed-length features to `_out_xs`.

    Args:
        sentence: Text handed to `tok.encode_plus`.
        tok: Tokenizer object exposing `encode_plus`; its result must provide
            "input_ids" and "token_type_ids".
        pad_token: Id used to right-pad `input_ids`.
        max_seq_len: Exact length every produced row must have.
        _out_xs: Sequence of three lists, mutated in place. They receive the
            input_ids, attention_mask and token_type_ids rows respectively,
            each as a numpy array.

    Raises:
        AssertionError: If any row does not end up exactly `max_seq_len` long
            (e.g. the tokenizer returned more than `max_seq_len` ids).
    """
    encoded = tok.encode_plus(sentence, add_special_tokens=True, max_length=max_seq_len,)
    input_ids, token_type_ids = encoded["input_ids"], encoded["token_type_ids"]
    # Real tokens are marked with 1; only the padding appended below gets 0,
    # so the model can tell the two apart.
    attention_mask = [1] * len(input_ids)

    # Pad every row to exactly max_seq_len.
    padding_length = max_seq_len - len(input_ids)
    input_ids = input_ids + ([pad_token] * padding_length)
    attention_mask = attention_mask + ([0] * padding_length)
    token_type_ids = token_type_ids + ([0] * padding_length)

    # Double-check results before anything is appended, so a bad row never
    # leaves the three output lists with different lengths.
    for row in (input_ids, attention_mask, token_type_ids):
        assert len(row) == max_seq_len, "Error with input length {} vs {}".format(
            len(row), max_seq_len
        )

    # Push the finished rows onto the caller's lists.
    _out_xs[0].append(np.asarray(input_ids))
    _out_xs[1].append(np.asarray(attention_mask))
    _out_xs[2].append(np.asarray(token_type_ids))

def downconvert_tf_nlp_cls_dataset(dataset, tok, pad_token=0, max_seq_len=128):
    """Convert an iterable of records into model-ready numpy arrays.

    Each record is indexed with "sentence" and "label". The sentence is
    tokenized through `process_cls_row`; the label is collected unchanged.

    Args:
        dataset: Iterable of records supporting `["sentence"]` and `["label"]`.
        tok: Tokenizer forwarded to `process_cls_row`.
        pad_token: Padding id forwarded to `process_cls_row`.
        max_seq_len: Row length forwarded to `process_cls_row`.

    Returns:
        A pair `(features, labels)`: `features` is a list of three numpy
        arrays (one per list filled by `process_cls_row`) and `labels` is a
        numpy array of the collected labels.
    """
    ids_rows, mask_rows, type_rows = [], [], []
    feature_columns = [ids_rows, mask_rows, type_rows]
    labels = []
    for record in dataset:
        # process_cls_row appends one row to each of the three column lists.
        process_cls_row(record["sentence"], tok, pad_token, max_seq_len, feature_columns)
        labels.append(record['label'])
    features = list(map(np.asarray, feature_columns))
    return features, np.asarray(labels)

In [5]:
# Converted datasets keyed by training-file name, so repeated calls skip the
# load-and-tokenize step.
dataset_cache = {}

def create_new_classification_head(train_dataset_name, val_dataset_name, base_model_cls_head, dense_config=(256, 2)):
    """Load (or restore) the train/validation arrays and build a dense head.

    Args:
        train_dataset_name: Path of the training JSON file; also the cache key.
        val_dataset_name: Path of the validation JSON file.
        base_model_cls_head: Tensor the dense layers are stacked on.
        dense_config: Unit counts of the dense layers. All but the last use
            ReLU; the last one uses softmax, so the head outputs
            probabilities rather than logits.

    Returns:
        `(train_x, train_y, val_x, val_y, tensor)` where `tensor` is the
        output of the final dense layer.
    """
    # Fetch the data. A cache entry is only reused when it was built with the
    # same validation file; entries without a recorded "val_name" are accepted
    # as-is.
    cached = dataset_cache.get(train_dataset_name)
    if cached is not None and cached.get("val_name", val_dataset_name) == val_dataset_name:
        train_x = cached["tx"]
        train_y = cached["ty"]
        val_x = cached["vx"]
        val_y = cached["vy"]
        print("Restored dataset from cache.")
    else:
        # max_seq_len must be passed by keyword: the third positional
        # parameter of downconvert_tf_nlp_cls_dataset is pad_token.
        train_data = load_json_dataset(train_dataset_name)
        train_x, train_y = downconvert_tf_nlp_cls_dataset(train_data, tokenizer, max_seq_len=MAX_SEQ_LEN)
        val_data = load_json_dataset(val_dataset_name)
        val_x, val_y = downconvert_tf_nlp_cls_dataset(val_data, tokenizer, max_seq_len=MAX_SEQ_LEN)
        dataset_cache[train_dataset_name] = {
            "tx": train_x, "ty": train_y, "vx": val_x, "vy": val_y,
            "val_name": val_dataset_name,
        }
        print("Dataset %s train_sz=%i val_sz=%i" % \
              (train_dataset_name, train_y.shape[0], val_y.shape[0]))

    # Create the head.
    tensor = base_model_cls_head
    for layer_units in dense_config[0:-1]:
        tensor = Dense(units=layer_units, activation="relu")(tensor)
    tensor = Dense(units=dense_config[-1], activation="softmax")(tensor)

    return train_x, train_y, val_x, val_y, tensor

In [None]:
def fine_tune_task(train_dataset_path, val_dataset_path, optimizer, batch_sz=32, epochs=4):
    """Build a BERT classifier, fine-tune it on the given dataset and return it.

    Args:
        train_dataset_path: Path of the training JSON file.
        val_dataset_path: Path of the validation JSON file.
        optimizer: Keras optimizer used for training. When the notebook-level
            `fp16` flag is set it is wrapped for mixed precision first.
        batch_sz: Batch size passed to `model.fit`.
        epochs: Number of epochs passed to `model.fit`.

    Returns:
        `(model, history)`: the trained Keras model and the object returned by
        `model.fit`.
    """
    # Re-load base model weights so every call starts from the pre-trained state.
    # DistilBERT alternative: TFDistilBertModel.from_pretrained("distilbert-base-uncased")
    bert_base_model = TFBertModel.from_pretrained("bert-base-cased")

    # Input order matters: the arrays passed to fit()/predict() must follow it.
    inputs = [Input(shape=(MAX_SEQ_LEN,), dtype='int32', name='input_ids'),
              Input(shape=(MAX_SEQ_LEN,), dtype='int32', name='attention_mask'),
              Input(shape=(MAX_SEQ_LEN,), dtype='int32', name='token_type_ids'),
             ]

    # Use the flattened per-token hidden states (output index 0) as the
    # feature vector. Output index 1 would be used instead via:
    #cls_head = bert_base_model(inputs)[1]
    cls_head = keras.layers.Flatten()(bert_base_model(inputs)[0])

    # Fetch and format dataset and classification head.
    train_x, train_y, val_x, val_y, tensor = \
        create_new_classification_head(train_dataset_path, val_dataset_path, cls_head, dense_config=[6])
    model = keras.Model(inputs=inputs, outputs=tensor)
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()

    # Configure loss function and metrics.
    if fp16:
        # The rewrite returns a wrapped optimizer; that wrapper is the one
        # that has to be handed to compile().
        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer)
    # The classification head ends in a softmax layer, i.e. it emits
    # probabilities, so the loss must not treat its output as logits.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    # Train model. Log roughly every 1024 samples, but never with a frequency
    # of 0 batches (which large batch sizes would otherwise produce).
    log_every_n_batches = max(1, 1024 // batch_sz)
    history = model.fit(train_x, train_y, batch_size=batch_sz, epochs=epochs,
                        validation_data=(val_x, val_y),
                        callbacks=[WandbCallback(log_batch_frequency=log_every_n_batches)])
    return model, history

# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs where these files exist.
dataset = "C:/Users/jbetk/Documents/data/ml/sentiment_analysis/amazon_reviews/processed.json"
val = "C:/Users/jbetk/Documents/data/ml/sentiment_analysis/amazon_reviews/validation.json"
name = "amazon_reviews_sentiment_first_pass"

# Single source for the hyperparameters, so the values logged to wandb are
# exactly the ones the optimizer and the training call receive.
hparams = {"learning_rate": 1e-5, "epsilon": 1e-08, "batch_sz": 32}
wandb.init(project="nonint-transformers",
           name=name,
           config={"dataset": dataset, **hparams})
optimizer = tf.keras.optimizers.Adam(learning_rate=hparams["learning_rate"],
                                     epsilon=hparams["epsilon"])
# Mixed-precision handling of the optimizer happens inside fine_tune_task.
model, history = fine_tune_task(dataset, val, optimizer, hparams["batch_sz"], epochs=9)

Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable


Dataset C:/Users/jbetk/Documents/data/ml/sentiment_analysis/amazon_reviews/processed.json train_sz=470295 val_sz=500
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
token_type_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 128, 768), ( 108310272   input_ids[0

wandb: ERROR Can't save model, h5py returned error: 


Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9

In [None]:
phrase = "I was walking to the park on a sunny day."

def pad_zero(inputs, seq_len):
    """Right-pad every entry of `inputs` with zeros to exactly `seq_len` items.

    Args:
        inputs: Mutable mapping from feature name to a sequence of ints. It is
            modified in place: each value is replaced by an int32 numpy array.
        seq_len: Length of every resulting array. Each original sequence must
            be at most this long, otherwise the slice assignment raises.

    Returns:
        The same `inputs` object, after modification.
    """
    for k in inputs:
        # Allocate exactly seq_len slots (not seq_len + 1) so the result
        # matches the fixed sequence length the model was built with.
        output = np.zeros(seq_len, dtype='int32')
        output[:len(inputs[k])] = np.asarray(inputs[k])
        inputs[k] = output
    return inputs
 
phrase_encoded = pad_zero(tokenizer.encode_plus(phrase, add_special_tokens=True, max_length=128), 128)

# Feed the features in the order the model's Input layers are declared in
# fine_tune_task (input_ids, attention_mask, token_type_ids). reshape adds the
# leading batch axis, turning each vector into a (1, seq_len) array.
phrase_encoded_formatted = [
    np.reshape(phrase_encoded[feature_name], (1, -1))
    for feature_name in ('input_ids', 'attention_mask', 'token_type_ids')
]
print(model.predict(phrase_encoded_formatted))