In [None]:
# Mount Google Drive: the log, checkpoint and model paths used by this
# notebook all live under /content/drive.
from google.colab import drive
drive.mount(mountpoint="/content/drive")

In [None]:
!pip install -q -U "tensorflow-text"
!pip install -q -U tf-models-official
!pip install -U tfds-nightly
!pip install -U tensorflow-addons
!pip install -q tf-models-official
!pip install -U tensorboard_plugin_profile
!pip install tensorflow pandas

In [None]:
import os
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import tensorflow_text as text
import tensorflow_addons as tfa
from official.nlp import optimization
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping
from datetime import datetime
import tensorboard as tb

In [None]:
# os.environ["TFHUB_MODEL_LOAD_FORMAT"] = "UNCOMPRESSED"

In [None]:
# if os.environ["COLAB_TPU_ADDR"] :
#     cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="")
#     tf.config.experimental_connect_to_cluster(cluster_resolver)
#     tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
#     strategy = tf.distribute.TPUStrategy(cluster_resolver)
#     print("Using TPU")

# Build the distribution strategy from whatever GPUs TensorFlow exposes.
gpus = tf.config.list_logical_devices("GPU")
strategy = tf.distribute.MirroredStrategy(gpus)
# Only claim GPU usage when at least one GPU was actually found; an empty
# list means this run is not GPU-accelerated.
if gpus:
    print(f"Using GPU ({len(gpus)} device(s))")
else:
    print("WARNING: no GPU detected; training will not run on a GPU.")

# else:
#     raise ValueError("Avoid running on CPU.")

In [None]:
# --- Run configuration --------------------------------------------------------
# TF-Hub handles: the text preprocessor and the BERT encoder that is fine-tuned.
PREPROCESSOR_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_URL = "https://tfhub.dev/google/experts/bert/wiki_books/sst2/2"
# TensorBoard log directory; the timestamp suffix gives every run its own folder.
TB_LOGS_DIR = "/content/drive/MyDrive/Project/Models50k/sentiment140_expertbert/logs/" + datetime.now().strftime("%Y%m%d-%H%M%S")
# Despite the name this is the checkpoint *file path*: it is passed as the
# ModelCheckpoint filepath, and its parent directory is searched for the
# latest checkpoint before training.
CHKP_DIR = "/content/drive/MyDrive/Project/Models50k/sentiment140_expertbert/cp.ckpt"
# Let tf.data choose the prefetch buffer size.
AUTOTUNE = tf.data.AUTOTUNE
# Training hyper-parameters.
EPOCHS = 5
BATCH_SIZE = 32
INIT_LR = 2e-5
# Name of the TensorFlow Datasets dataset to load.
TFDS_NAME = "sentiment140"
# Early-stopping patience, in epochs.
PATIENCE = 2
# NOTE(review): only referenced from commented-out code in this notebook; the
# active subset size is the literal 50000 passed to take() - confirm which
# value is intended.
TAKE_SIZE = 120000

In [None]:
# with tf.device("/job:localhost"):
# Load every split of the dataset together with its metadata object.
in_memory_ds, info = tfds.load(name=TFDS_NAME, with_info=True)

In [None]:
import tensorboard as tb

In [None]:
# Show the dataset metadata returned by tfds.load(..., with_info=True).
print(info)

In [None]:
# in_memory_ds["train"]

In [None]:
# tfds_info = tfds.builder(TFDS_NAME).info
# print(tfds_info)
# sentence_features = list(tfds_info.features.keys())
# print(f"Sentence features: {sentence_features}")

# sentence_features.remove("date")
# sentence_features.remove("user")
# sentence_features.remove("query")
# sentence_features.remove("polarity")

# available_splits = list(tfds_info.splits.keys())
# print(f"Splits: {available_splits}")
# train_split = "train"
# validation_split = "validation"
# test_split = "test"

# num_classes = 2
# num_examples = tfds_info.splits.total_num_examples
# print(num_examples)

In [None]:
# Work on the first 50000 examples of the "train" split only.
# (Despite its name, `data_dict` is a tf.data dataset, not a dict.)
data_dict = in_memory_ds.get("train").take(count=50000)

In [None]:
# Number of examples in the working subset.
TOTAL_SAMPLES = tf.data.experimental.cardinality(data_dict).numpy()

In [None]:
# Hold out 10% of the samples for testing and another 10% for validation;
# everything that remains is used for training.
test_val_size = int(TOTAL_SAMPLES * 0.1)
train_size = int(TOTAL_SAMPLES - test_val_size * 2)

In [None]:
# Carve the subset into contiguous slices, in order: train, then test, then
# validation (validation receives whatever is left after the first two).
train_data = data_dict.take(train_size)
_after_train = data_dict.skip(train_size)
test_data = _after_train.take(test_val_size)
val_data = _after_train.skip(test_val_size)

In [None]:
# A notebook cell only displays its last expression, so both sizes are put in
# one tuple: (test size, validation size).
test_data.cardinality().numpy(), val_data.cardinality().numpy()

In [None]:
def optimize_dataset(dataset):
    """Batch, cache and prefetch `dataset`, in that order.

    Args:
        dataset: object exposing the tf.data `batch` / `cache` / `prefetch` methods.

    Returns:
        The pipeline obtained by grouping examples into batches of BATCH_SIZE,
        caching the batched result, and prefetching with an AUTOTUNE buffer.
    """
    batched = dataset.batch(BATCH_SIZE)
    cached = batched.cache()
    return cached.prefetch(buffer_size=AUTOTUNE)

In [None]:
def to_text_and_label(example):
    """Turn one dataset example into a `(text, label)` pair.

    The label is 0 when the example's "polarity" equals 0 and 1 for any other
    value. It is computed with tensor ops instead of a Python `if`, because
    `Dataset.map` traces this function into a graph where `polarity` is a
    symbolic tensor rather than a Python number.

    Args:
        example: feature dict with at least the "text" and "polarity" entries.

    Returns:
        Tuple `(text, label)` where `label` is an int32 tensor holding 0 or 1.
    """
    label = tf.cast(tf.not_equal(example["polarity"], 0), tf.int32)
    return example["text"], label


# Same conversion for all three splits; element order is preserved by map().
train_data = optimize_dataset(train_data.map(to_text_and_label, num_parallel_calls=AUTOTUNE))
test_data = optimize_dataset(test_data.map(to_text_and_label, num_parallel_calls=AUTOTUNE))
val_data = optimize_dataset(val_data.map(to_text_and_label, num_parallel_calls=AUTOTUNE))

In [None]:
def create_model():
    """Assemble the classifier: raw text -> preprocessing -> BERT -> dense head.

    The BERT layer is loaded with `trainable=True`, so its weights are updated
    during training together with the new head.

    Returns:
        An uncompiled `tf.keras.Model` that takes a batch of scalar strings
        ("input_layer") and emits a single sigmoid unit ("output_layer").
    """
    encoder = hub.KerasLayer(BERT_URL, trainable=True)
    preprocess = hub.KerasLayer(PREPROCESSOR_URL)

    text_in = tf.keras.layers.Input(shape=(), dtype=tf.string, name="input_layer")
    encoded = encoder(preprocess(text_in))
    pooled = encoded.get("pooled_output")

    # Classification head on top of the pooled BERT output.
    hidden = tf.keras.layers.Dense(units=256, activation="relu")(pooled)
    hidden = tf.keras.layers.Dropout(0.3)(hidden)
    sigmoid_out = tf.keras.layers.Dense(1, activation="sigmoid", name="output_layer")(hidden)

    return tf.keras.Model(inputs=text_in, outputs=sigmoid_out)

In [None]:
# def optimize_ds(dataset, batch_size):
#     dataset = tf.data.Dataset.from_tensor_slices(dataset["train"]).take(TAKE_SIZE)
#     total = dataset.cardinality().numpy()
#     dataset = dataset.batch(batch_size)
#     dataset = dataset.map(lambda x: (x[0], 0 if x[1]==0 else 1), num_parallel_calls=tf.data.AUTOTUNE)
#     dataset = dataset.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
#     return dataset, total

In [None]:
# Build, compile and fine-tune the classifier. The model, optimizer and metric
# are created inside the distribution strategy's scope.
with strategy.scope():
  metrics = tf.metrics.BinaryAccuracy()
  # The model ends in a sigmoid, so the loss receives probabilities, not logits.
  loss_function = tf.keras.losses.BinaryCrossentropy(from_logits=False)

  # train_data is already batched, so its cardinality is the number of batches
  # (= optimizer steps) per epoch.
  steps_per_epoch = tf.data.experimental.cardinality(train_data).numpy()
  num_train_steps = steps_per_epoch * EPOCHS
  # Use the first 10% of all training steps as learning-rate warm-up.
  num_warmup_steps = int(0.1 * num_train_steps)

  model = create_model()

  optimizer = optimization.create_optimizer(
      init_lr=INIT_LR,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      optimizer_type="adamw"
  )

  model.compile(optimizer=optimizer, loss=loss_function, metrics=[metrics])

  # Stop once val_loss has not improved for PATIENCE epochs and restore the
  # best weights seen so far.
  early_stop = EarlyStopping(monitor="val_loss", patience=PATIENCE, restore_best_weights=True)

  # Weights-only checkpoints, written only when the monitored value improves.
  # (The earlier TPU setup additionally used
  # tf.train.CheckpointOptions(experimental_io_device="/job:localhost").)
  checkpoint_dir = os.path.dirname(CHKP_DIR)
  cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=CHKP_DIR,
                                                   save_weights_only=True,
                                                   save_best_only=True,
                                                   verbose=1)
  # Resume from the most recent checkpoint in the checkpoint directory, if any.
  latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
  if latest_checkpoint is not None:
      print(latest_checkpoint)
      model.load_weights(latest_checkpoint)

  tboard_callback = tf.keras.callbacks.TensorBoard(log_dir=TB_LOGS_DIR,
                                                   histogram_freq=1)

  # `batch_size` is intentionally not passed: train_data is a tf.data dataset
  # that already yields batches of BATCH_SIZE. `early_stop` goes last so the
  # checkpoint and TensorBoard callbacks run before it at the end of an epoch.
  history = model.fit(train_data, epochs=EPOCHS,
                      validation_data=val_data,
                      callbacks=[cp_callback, tboard_callback, early_stop])


In [None]:
# Directory prefix under which the final model exports are written; the
# trailing slash matters because file names are appended with `+`.
SAVE_PATH = "/content/drive/MyDrive/Project/Models50k/"

In [None]:
# Export the trained model under SAVE_PATH (path has no file extension).
# NOTE(review): the name says "120k" while the active subset is 50000 examples - confirm.
model.save(os.path.join(SAVE_PATH, "expertbert_120k"))

In [None]:
# Second export of the same model, this time to a path ending in ".h5".
# NOTE(review): confirm this format can serialize the TF-Hub layers and the custom optimizer.
model.save(os.path.join(SAVE_PATH, "expertbert_120k.h5"))

In [None]:
# Third export, through the low-level tf.saved_model API instead of Keras.
tf.saved_model.save(model, os.path.join(SAVE_PATH, "low_level_savedmodel/"))

In [None]:
%tensorboard --logdir /content/drive/MyDrive/Project/Models50k/sentiment140_expertbert/logs/20230322-200125/train

In [None]:
# Upload the training logs of one hard-coded run (20230322-200125) with the
# `tensorboard dev upload` command.
# NOTE(review): --name and --description still contain the template placeholder text.
# NOTE(review): the hosted TensorBoard.dev service has reportedly been shut
# down - confirm this command still works before relying on it.
!tensorboard dev upload \
  --logdir "/content/drive/MyDrive/Project/Models50k/sentiment140_expertbert/logs/20230322-200125/train" \
  --name "(optional) My latest experiment" \
  --description "(optional) Simple comparison of several hyperparameters" \
  --one_shot