In [1]:
import gc
import tensorflow as tf
import tensorflow_datasets
import numpy as np
import tensorflow.keras as keras
import orjson
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Dense, Input
from utils import downconvert_tf_dataset
import wandb
from wandb.keras import WandbCallback

from transformers import (TFBertModel, TFDistilBertModel,
                          BertTokenizer, DistilBertTokenizer,
                          glue_convert_examples_to_features)

In [2]:
# Constants
BATCH_SIZE = 32
MAX_SEQ_LEN = 128
EPOCHS = 3

# FP16 settings
fp16 = True
if fp16:
    tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})

In [3]:
# Fetch pre-trained models
#tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [4]:
def split_inputs_and_outputs(data_map):
    return [np.asarray(data_map['input_id']),
           np.asarray(data_map['attention_mask']),
           np.asarray(data_map['token_type_id'])], np.asarray(data_map['label'])

def load_data(train_filename, val_filename):
    training_data = orjson.loads(open(train_filename, "rb").read())
    train_x, train_y = split_inputs_and_outputs(training_data)
    val_data = orjson.loads(open(val_filename, "rb").read())
    val_x, val_y = split_inputs_and_outputs(val_data)
    return train_x, train_y, val_x, val_y

In [5]:
def create_new_classification_head(base_model_cls_head, dense_config=[256,2], final_activation="softmax"):   
    # Create the head.
    tensor = base_model_cls_head
    for layer_units in dense_config[0:-1]:
        tensor = Dense(units=layer_units, activation="relu")(tensor)
    tensor = Dense(units=dense_config[-1], activation=final_activation)(tensor)
    
    return tensor

In [None]:
def fine_tune_task(train_dataset_path, val_dataset_path, optimizer, batch_sz=32, epochs=4):
    # Load data.
    train_x, train_y, val_x, val_y = load_data(train_dataset_path, val_dataset_path)
    
    # Re-load base model weights.
    #bert_base_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")
    bert_base_model = TFBertModel.from_pretrained("bert-base-cased")

    inputs = [Input(shape=(128,), dtype='int32', name='input_ids'),
              Input(shape=(128,), dtype='int32', name='attention_mask'), 
              Input(shape=(128,), dtype='int32', name='token_type_ids'),
             ]

    # Fetch the CLS head of the BERT model; index 1.
    #cls_head = bert_base_model(inputs)[1]
    # Classification head for DistilBERT is the flattened hidden state.
    cls_head = keras.layers.Flatten()(bert_base_model(inputs)[0])
    
    tensor = create_new_classification_head(cls_head, dense_config=[1], final_activation="linear")
    model = keras.Model(inputs=inputs, outputs=tensor)
    print(model.summary())

    # Configure loss function and metrics.
    if fp16:
        tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer)
    #loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    loss = tf.keras.losses.MeanSquaredError()
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
    
    # Train model.
    wandb.init(project="nonint-transformers",\
               name="amazon_reviews_sentiment_first_pass",\
               config={"dataset": dataset, "learning_rate": 1e-5, "epsilon": 1e-08, "batch_sz": 32})
    history = model.fit(train_x, train_y, batch_size=batch_sz, epochs=epochs, \
                                          validation_data=(val_x, val_y),\
                                          callbacks=[WandbCallback(log_batch_frequency=int(1024/batch_sz))])
    return model, history

dataset = "C:/Users/jbetk/Documents/data/ml/sentiment_analysis/amazon_reviews/outputs/processed.json"
val = "C:/Users/jbetk/Documents/data/ml/sentiment_analysis/amazon_reviews/outputs/validation.json"
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, epsilon=1e-08)
# Todo - configure optimizer in mixed precision mode.
model, history = fine_tune_task(dataset, val, optimizer, 32, epochs=9)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
token_type_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 128, 768), ( 108310272   input_ids[0][0]                  
                                                                 attention_mask[0][0]         

Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable


Train on 466795 samples, validate on 4000 samples
Epoch 1/9

wandb: ERROR Can't save model, h5py returned error: 


Epoch 2/9

In [None]:
phrases = [
    "I was walking to the park on a sunny day.",
    # 5 star:
    "This product has been the standard for woodworking pros for decades. Before the 800lb gorilla swaggered into the room, taking the market by storm and strong-arming his way to happily humming registers all over the country, Titebond was THE glue to beat.",
    # 4 star:
    "good glue, but had hardened glue stuck in the spout and is clogged have to unscrew and use a popsicle stick to use, i will try and use isopropyl to dissolve it so i can use the spout, kinda annoying",
    # 3 star:
    "Strong as can be, but stain will discolor. Make sure you wipe it off completely before it dries.",
    # 2 star:
    "Did not use this glue immediately. About 4 weeks later, I could not get any glue to squeeze out thru the dispenser tip. I removed the cap and discovered lumps in the glue.",
]

def pad_zero(inputs, seq_len):
    for k in inputs: 
        output = np.zeros(seq_len+1, dtype='int32')
        output[:len(inputs[k])] = np.asarray(inputs[k])
        inputs[k] = output
    return inputs

phrases_encoded = [[], [], []]
for phrase in phrases:
    enc = pad_zero(tokenizer.encode_plus(phrase, add_special_tokens=True, max_length=128), 128)
    

In [None]:
phrases = [
    "This product ruined my day. Not figuratively, I stuck my fingers together and couldn't get them apart! I would never recommend this to anyone! Luckily I could return it..",
    # 5 star:
    "This product has been the standard for woodworking pros for decades. Before the 800lb gorilla swaggered into the room, taking the market by storm and strong-arming his way to happily humming registers all over the country, Titebond was THE glue to beat.",
    # 4 star:
    "good glue, but had hardened glue stuck in the spout and is clogged have to unscrew and use a popsicle stick to use, i will try and use isopropyl to dissolve it so i can use the spout, kinda annoying",
    # 3 star:
    "Strong as can be, but stain will discolor. Make sure you wipe it off completely before it dries.",
    # 2 star:
    "Did not use this glue immediately. About 4 weeks later, I could not get any glue to squeeze out thru the dispenser tip. I removed the cap and discovered lumps in the glue.",
]

def pad_zero(inputs, seq_len):
    for k in inputs: 
        output = np.zeros(seq_len, dtype='int32')
        output[:len(inputs[k])] = np.asarray(inputs[k])
        inputs[k] = output
    return inputs

phrases_encoded = [[], [], []]
for phrase in phrases:
    enc = pad_zero(tokenizer.encode_plus(phrase, add_special_tokens=True, max_length=128), 128)
    for (i,k) in enumerate(enc.keys()):
        phrases_encoded[i].append(enc[k])

inputs = []
for e in phrases_encoded:
    inputs.append(np.asarray(e))
    
print(inputs[0].shape, inputs[1].shape, inputs[2].shape)
    
results = model.predict(inputs)

for p,r in zip(phrases, results):
    print(p, r)

In [None]:
tf.saved_model.save(model,"saved/")