In [1]:
import gc
import tensorflow as tf
import tensorflow_datasets
import numpy as np
import tensorflow.keras as keras
import orjson
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Dense, Input
from utils import downconvert_tf_dataset
import wandb
from wandb.keras import WandbCallback

from transformers import (TFBertModel, TFDistilBertModel, TFGPT2Model,
                          BertTokenizer, DistilBertTokenizer, GPT2Tokenizer)

# FP16 settings
fp16 = True
if fp16:
    tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})

In [2]:
def split_inputs_and_outputs(data_map):
    return [np.asarray(data_map['input_id']),
           np.asarray(data_map['attention_mask']),
           np.asarray(data_map['token_type_id'])], np.asarray(data_map['label'])

def split_inputs_and_outputs_distil(data_map):
    return [np.asarray(data_map['input_id']),
           np.asarray(data_map['attention_mask'])], np.asarray(data_map['label'])

def split_inputs_and_outputs_gpt2(data_map):
    return np.asarray(data_map['input_id']), np.asarray(data_map['label'])

def load_data(train_filename, val_filename, ldr_fn):
    training_data = orjson.loads(open(train_filename, "rb").read())
    train_x, train_y = ldr_fn(training_data)
    val_data = orjson.loads(open(val_filename, "rb").read())
    val_x, val_y = ldr_fn(val_data)
    return train_x, train_y, val_x, val_y

# Load data.
train_dataset_path = "C:/Users/jbetk/Documents/data/ml/sentiment_analysis/outputs/gpt2/processed.json"
val_dataset_path = "C:/Users/jbetk/Documents/data/ml/sentiment_analysis/outputs/gpt2/validation.json"
train_x, train_y, val_x, val_y = load_data(train_dataset_path, val_dataset_path, split_inputs_and_outputs_gpt2)

In [3]:
def create_new_classification_head(base_model_cls_head, dense_config=[256,2], final_activation="softmax"):   
    # Create the head.
    tensor = base_model_cls_head
    for layer_units in dense_config[0:-1]:
        tensor = Dense(units=layer_units, activation="relu")(tensor)
    tensor = Dense(units=dense_config[-1], activation=final_activation, name="final_linear")(tensor)
    
    return tensor

def fine_tune_task(optimizer):
    # Re-load base model weights.
    base_model = TFGPT2Model.from_pretrained("gpt2")
    #base_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")
    #base_model = TFBertModel.from_pretrained("bert-base-cased")

    inputs = [Input(shape=(128,), dtype='int32', name='input_ids'),
              Input(shape=(128,), dtype='int32', name='attention_mask'), 
              #Input(shape=(128,), dtype='int32', name='token_type_ids'),
             ]

    # Fetch the CLS head of the BERT model; index 1.
    #cls_head = base_model(inputs)[1]
    # Classification head for DistilBERT is the flattened hidden state.
    cls_head = keras.layers.Flatten()(base_model(inputs[0])[0])
    
    tensor = create_new_classification_head(cls_head, dense_config=[1], final_activation="linear")
    model = keras.Model(inputs=inputs[0], outputs=tensor)
    print(model.summary())

    # Configure loss function and metrics.
    if fp16:
        tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer)
    #loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    loss = tf.keras.losses.MeanSquaredError()
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
    
    return model

In [12]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
#tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
#tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, epsilon=1e-08)

batch_sz = 32
epochs = 2

# Todo - configure optimizer in mixed precision mode.
model = fine_tune_task(optimizer)

# Train model.
wandb.init(project="nonint-transformers",\
           name="gpt2_1024_sentiment_mse_amazon_yelp",\
           config={"dataset": "amazon_yelp_dataset", "learning_rate": 1e-5, "epsilon": 1e-08, "batch_sz": batch_sz})

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 128)]             0         
_________________________________________________________________
tfgp_t2model_3 (TFGPT2Model) ((None, 128, 768), ((None 124439808 
_________________________________________________________________
flatten_3 (Flatten)          (None, 98304)             0         
_________________________________________________________________
final_linear (Dense)         (None, 1)                 98305     
Total params: 124,538,113
Trainable params: 124,538,113
Non-trainable params: 0
_________________________________________________________________
None


wandb: Wandb version 0.8.27 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


W&B Run: https://app.wandb.ai/neonbjb/nonint-transformers/runs/b16ds22e

In [13]:

history = model.fit(train_x, train_y, batch_size=batch_sz, epochs=epochs, \
                                      validation_data=(val_x, val_y),\
                                      callbacks=[WandbCallback(log_batch_frequency=int(2048/batch_sz))])

Train on 2625860 samples, validate on 4000 samples
Epoch 1/2


wandb: Wandb version 0.8.27 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


    768/2625860 [..............................] - ETA: 14:49:10 - loss: nan - accuracy: 0.0000e+00

KeyboardInterrupt: 

In [9]:
phrases = [
    "This product ruined my day. Not figuratively, I stuck my fingers together and couldn't get them apart! I would never recommend this to anyone! Luckily I could return it..",
    # 5 star:
    "This product has been the standard for woodworking pros for decades. Before the 800lb gorilla swaggered into the room, taking the market by storm and strong-arming his way to happily humming registers all over the country, Titebond was THE glue to beat.",
    # 4 star:
    "good glue, but had hardened glue stuck in the spout and is clogged have to unscrew and use a popsicle stick to use, i will try and use isopropyl to dissolve it so i can use the spout, kinda annoying",
    # 3 star:
    "Strong as can be, but stain will discolor. Make sure you wipe it off completely before it dries.",
    # 2 star:
    "Did not use this glue immediately. About 4 weeks later, I could not get any glue to squeeze out thru the dispenser tip. I removed the cap and discovered lumps in the glue.",
]

def pad_zero(inputs, seq_len):
    for k in inputs: 
        output = np.zeros(seq_len, dtype='int32')
        output[:len(inputs[k])] = np.asarray(inputs[k])
        inputs[k] = output
    return inputs

phrases_encoded = [[], [], []]
for phrase in phrases:
    enc = pad_zero(tokenizer.encode_plus(phrase, add_special_tokens=True, max_length=128), 128)
    for (i,k) in enumerate(enc.keys()):
        phrases_encoded[i].append(enc[k])

inputs = []
for e in phrases_encoded:
    inputs.append(np.asarray(e))
    
results = model.predict(inputs[0])

for p,r in zip(phrases, results):
    print(p, r)

This product ruined my day. Not figuratively, I stuck my fingers together and couldn't get them apart! I would never recommend this to anyone! Luckily I could return it.. [40.738373]
This product has been the standard for woodworking pros for decades. Before the 800lb gorilla swaggered into the room, taking the market by storm and strong-arming his way to happily humming registers all over the country, Titebond was THE glue to beat. [35.158142]
good glue, but had hardened glue stuck in the spout and is clogged have to unscrew and use a popsicle stick to use, i will try and use isopropyl to dissolve it so i can use the spout, kinda annoying [39.00329]
Strong as can be, but stain will discolor. Make sure you wipe it off completely before it dries. [38.45906]
Did not use this glue immediately. About 4 weeks later, I could not get any glue to squeeze out thru the dispenser tip. I removed the cap and discovered lumps in the glue. [41.917717]


In [7]:
tf.saved_model.save(model,"saved/")


Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: saved/assets
