
# CS4248 Project - Truth of Varying Shades

In [1]:
import re
import os
import unidecode
import string
import csv
import random
import pickle
import numpy as np
import pandas as pd
pd.set_option('display.width', 100)
pd.set_option('max_colwidth', None)
from scipy.stats import linregress
import matplotlib.pyplot as plt
import seaborn as sns


import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, classification_report, make_scorer

import tensorflow_datasets as tfds
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow_addons.metrics import F1Score

## Utilities

In [2]:
def plot_graphs(history, metric):
    """Plot the training and validation curves of one metric against epochs.

    params:
        history: object whose `.history` dict holds the per-epoch values under
                 the keys `metric` and `'val_' + metric`
        metric: str, name of the training metric to plot
    """
    val_metric = 'val_' + metric
    plt.figure(figsize=(8, 6), dpi=100)
    # training curve in red, validation curve in blue
    for key, colour in ((metric, 'r'), (val_metric, 'b')):
        plt.plot(history.history[key], colour)
    plt.title(f'Training and validation {metric}')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, val_metric])

## Hyperparameters

In [3]:
# model
EMBEDDING_DIM = 100  # width of each word-embedding vector (matches the 100d GloVe file)
MAXLEN = 1000  # number of token ids the tokenizer outputs per text

# training
BATCH_SIZE = 64  # examples per batch; the shuffle buffer is 10x this value

## Data pipeline

The dataset is provided in a csv file. Each row of this file contains the following values separated by commas:
- label: the class label of the text
- text: the text content

In [4]:
def preprocess_text(text):
    """Normalise a raw document to ASCII text without punctuation.

    params:
        text: str, the raw text content
    return:
        str, the text transliterated to ASCII with every character of
        `string.punctuation` removed
    """
    # Transliterate first, e.g. café -> cafe. unidecode also turns non-ASCII
    # punctuation (curly quotes, long dashes, ...) into ASCII punctuation, so it
    # has to run before the ASCII-only punctuation filter below; otherwise that
    # punctuation would be re-introduced after the filter has already run.
    text = unidecode.unidecode(text)
    # remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # some other preprocessing steps

    return text

In [5]:
def prepare_dataset(file_path):
    """Load the labelled csv file and build the training / validation pipelines.

    params:
        file_path: str, path of a header-less csv file whose rows are `label,text`
    return:
        (train, val): two batched `tf.data.Dataset`s yielding
        (text, one-hot label) pairs; 80% of the rows go to `train`, 20% to `val`
    """
    # load data from the path that was passed in (not from a notebook global)
    fulltrain = pd.read_csv(file_path, names=['label', 'text'])
    sentences, labels = fulltrain.text, fulltrain.label - 1  # -1 to make labels 0-index
    print(f"Found {len(sentences)} examples.\n")

    sentences = sentences.apply(preprocess_text)
    labels = pd.get_dummies(labels)  # one-hot encode the class labels
    print(f"Label of first example: {labels.iloc[0].tolist()}")
    # positional access, so this does not rely on the index starting at 0
    print(f"Text of first example:\n'{sentences.iloc[0]}'\n")

    # split dataset into train and validation, keeping the class proportions
    train_sentences, val_sentences, train_labels, val_labels = train_test_split(
        sentences, labels, test_size=0.2, shuffle=True, stratify=labels, random_state=42)
    print('After train val split:')
    print(f"There are {len(train_sentences)} examples for training.")
    print(f"There are {len(val_sentences)} examples for validation.\n")

    # data pipeline
    train = tf.data.Dataset.from_tensor_slices((train_sentences, train_labels))\
        .cache()\
        .shuffle(10 * BATCH_SIZE)\
        .batch(BATCH_SIZE)\
        .prefetch(buffer_size=tf.data.AUTOTUNE)
    # The validation set is only evaluated, never trained on, so shuffling it
    # would just add work without changing any metric.
    val = tf.data.Dataset.from_tensor_slices((val_sentences, val_labels))\
        .cache()\
        .batch(BATCH_SIZE)\
        .prefetch(buffer_size=tf.data.AUTOTUNE)
    return train, val

In [8]:
# Location of the labelled training csv, relative to the notebook.
FULLTRAIN_CSV = './raw_data/fulltrain.csv'
# Build the batched training / validation pipelines from that file.
train_dataset, val_dataset = prepare_dataset(file_path=FULLTRAIN_CSV)

Found 48854 examples.

Label of first example: [1, 0, 0, 0]
Text of first example:
'A little less than a decade ago hockey fans were blessed with a slate of games every night but on Thursday sources confirmed that for the ninth consecutive year NHL players have been locked out with very slim hopes of an agreement in sight It seems like just yesterday Martin St Louis and his Lightning teammates were raising the Stanley Cup high school hockey coach and onetime ESPN analyst Barry Melrose said Obviously Im still hoping the two sides can come together and reach an agreement but Im starting to think nobody really misses hockey anymore Nope Nobody but old Barry Id still love to catch an Atlanta Thrashers game Observers have noted that when arena doors do reopen the NHL will face the perhaps greater challenge of convincing fans to return to hockey instead of watching more popular sports like football basketball baseball and SlamBall '

After train val split:
There are 39083 examples for traini

2023-03-19 17:11:15.394197: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-03-19 17:11:15.395146: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


## Tokenization

In [9]:
# Text vectorization layer: lower-cases, strips punctuation and maps every text
# to exactly MAXLEN integer token ids. The vocabulary size is not capped.
tokenizer = tf.keras.layers.TextVectorization(
    standardize='lower_and_strip_punctuation',
    output_mode='int',
    output_sequence_length=MAXLEN,
    max_tokens=None,
    ngrams=None,
)
# Learn the vocabulary from the training texts only; the labels are dropped.
tokenizer.adapt(train_dataset.map(lambda text, _label: text))

VOCAB_SIZE = tokenizer.vocabulary_size()
vocab = tokenizer.get_vocabulary(include_special_tokens=True)

print(f"Vocabulary contains {VOCAB_SIZE} words.")
print(f"First 20 words in the vocabulary: {vocab[:20]}")
print(f"Index of unknown token is {vocab.index('[UNK]')}.")

2023-03-19 17:11:19.040456: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-03-19 17:11:19.101101: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Vocabulary contains 279409 words.
First 20 words in the vocabulary: ['', '[UNK]', 'the', 'to', 'of', 'and', 'a', 'in', 'that', 'is', 'for', 'on', 'it', 'as', 'with', 'are', 'be', 'this', 'was', 'have']
Index of unknown token is 1.


## Prepare pre-defined Embeddings

Here we use the 6B-token, 100-dimensional version of [GloVe](https://nlp.stanford.edu/projects/glove/) from Stanford.

In [10]:
GLOVE_FILE = './glove.6B.100d.txt'

GLOVE_EMBEDDINGS = {}  # maps each GloVe word to its float32 vector

# Read file and fill GLOVE_EMBEDDINGS with its contents.
# The encoding is given explicitly so that parsing does not depend on the
# platform's default encoding (the GloVe text files contain non-ASCII tokens).
with open(GLOVE_FILE, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        GLOVE_EMBEDDINGS[word] = coefs

# One row per vocabulary entry; rows of words without a GloVe vector stay zero
EMBEDDINGS_MATRIX = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
# Marks the rows that were actually filled from GloVe
has_pretrained = np.zeros(VOCAB_SIZE, dtype=bool)

for i, word in enumerate(vocab):  # iterate over each word in the vocabulary
    embedding_vector = GLOVE_EMBEDDINGS.get(word)
    if embedding_vector is not None:
        EMBEDDINGS_MATRIX[i] = embedding_vector
        has_pretrained[i] = True

# Since the original GLOVE doesn't have an embedding for [UNK], let's make it the mean of the known embeddings.
# According to Pennington: "I've found that just taking an average of all or a subset of the word vectors produces a good unknown vector."
# Only rows filled from GloVe are averaged: the all-zero rows (padding token and
# out-of-vocabulary words) would otherwise pull the mean towards zero.
if has_pretrained.any():
    EMBEDDINGS_MATRIX[vocab.index('[UNK]')] = EMBEDDINGS_MATRIX[has_pretrained].mean(axis=0)

## Modeling

In [11]:
def create_model(embedding_dim, maxlen, embeddings_matrix=None, unit=None, finetune_embedding=False):
    """create a basic bidirectional rnn with specified rnn/lstm/gru unit and whether to finetune embeddings or not
       params:
           embedding_dim: int, dimension of the word embeddings
           maxlen: int, sequence length; currently unused here because the global
                   `tokenizer` already fixes the length of its output sequences
           embeddings_matrix: list holding the initial embedding weight array
                   (e.g. `[EMBEDDINGS_MATRIX]`), or None for random initialisation
           unit: str, one of 'rnn', 'lstm', 'gru', default rnn
           finetune_embedding: bool, whether to finetune pretrained embedding, default False
       return:
           the compiled text classifier model (outputs 4 logits per text)
    """
    if unit == 'lstm':
        cell = tf.keras.layers.LSTM(64)
    elif unit == 'gru':
        cell = tf.keras.layers.GRU(64)
    else:
        # `tf.keras.layers.RNN` wraps a cell object and cannot be created from a
        # unit count; `SimpleRNN` is the plain recurrent layer with 64 units.
        cell = tf.keras.layers.SimpleRNN(64)

    model = tf.keras.Sequential([
        # Declaring the string input up front builds the model immediately, so
        # `model.summary()` and `load_weights` work before the first `fit` call.
        tf.keras.Input(shape=(), dtype=tf.string),
        tokenizer,  # text vectorization layer
        tf.keras.layers.Embedding(
            input_dim=tokenizer.vocabulary_size(),  # one embedding row per vocabulary entry
            output_dim=embedding_dim,
            weights=embeddings_matrix,
            trainable=finetune_embedding,
            #mask_zero=True  # to handle variable sequence lengths
        ),
        tf.keras.layers.Bidirectional(cell),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(4)  # no activation: the loss below expects logits
    ])

    model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                  optimizer=tf.keras.optimizers.Adam(),
                  metrics=['accuracy', F1Score(num_classes=4, average='macro')])

    return model

In [None]:
# basic lstm, without finetuning pretrained embeddings
lstm_base = create_model(EMBEDDING_DIM, MAXLEN, [EMBEDDINGS_MATRIX], 'lstm', False)
lstm_base_checkpoint_path = "./checkpoints/my_checkpoint"
lstm_base.summary()  # summary() prints by itself and returns None

early_stopping = tf.keras.callbacks.EarlyStopping('val_loss', patience=5, restore_best_weights=True)
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=lstm_base_checkpoint_path,
                                                save_weights_only=True,
                                                save_best_only=True,
                                                verbose=1)
# Reuse the weights of an earlier run when a checkpoint exists, otherwise train.
# The existence check replaces a bare `except:`, which also hid real errors
# (and Ctrl-C) behind a silent retrain.
# NOTE(review): assumes the weights-only checkpoint is written in TensorFlow
# format, i.e. as `<path>.index` plus data shards - confirm for your TF version.
if os.path.exists(lstm_base_checkpoint_path + '.index'):
    lstm_base.load_weights(lstm_base_checkpoint_path)
    history = None  # nothing was trained in this run, so there are no curves to plot
else:
    history = lstm_base.fit(train_dataset,
                            epochs=100,
                            validation_data=val_dataset,
                            callbacks=[early_stopping, checkpoint])

# evaluate
loss, accuracy, f1 = lstm_base.evaluate(val_dataset)
print(f'Loss: {loss}, Accuracy: {accuracy}, F1: {f1}')

In [None]:
# finetuned lstm: same architecture, but the pretrained embeddings are trainable
lstm_finetune = create_model(EMBEDDING_DIM, MAXLEN, [EMBEDDINGS_MATRIX], 'lstm', True)
lstm_finetune_checkpoint_path = "./checkpoints/lstm_finetune_checkpoint"
lstm_finetune.summary()  # summary() prints by itself and returns None

early_stopping = tf.keras.callbacks.EarlyStopping('val_loss', patience=5, restore_best_weights=True)
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=lstm_finetune_checkpoint_path,
                                                save_weights_only=True,
                                                save_best_only=True,
                                                verbose=1)
# Reuse the weights of an earlier run when a checkpoint exists, otherwise train.
# The existence check replaces a bare `except:`, which also hid real errors
# (and Ctrl-C) behind a silent retrain.
# NOTE(review): assumes the weights-only checkpoint is written in TensorFlow
# format, i.e. as `<path>.index` plus data shards - confirm for your TF version.
if os.path.exists(lstm_finetune_checkpoint_path + '.index'):
    lstm_finetune.load_weights(lstm_finetune_checkpoint_path)
    # Nothing was trained in this run; resetting `history` keeps the plotting
    # cells from silently showing the curves of a different model.
    history = None
else:
    history = lstm_finetune.fit(train_dataset,
                                epochs=100,
                                validation_data=val_dataset,
                                callbacks=[early_stopping, checkpoint])
# evaluate
loss, accuracy, f1 = lstm_finetune.evaluate(val_dataset)
print(f'Loss: {loss}, Accuracy: {accuracy}, F1: {f1}')

In [None]:
# Training vs. validation loss of the most recent `fit` run.
plot_graphs(history, metric='loss')

In [None]:
# Training vs. validation accuracy of the most recent `fit` run.
plot_graphs(history, metric='accuracy')

**Check the slope of the `val_loss` curve; it should be <= 0.0005.**

In [None]:
# Fit a straight line to the per-epoch validation loss and report its slope.
val_loss = history.history['val_loss']
slope = linregress(range(len(val_loss)), val_loss).slope
print(f"The slope of validation loss curve is {slope:.5f}")

In [None]:
# Persist the per-epoch metrics of the fine-tuned LSTM run.
history_path = './history/lstm_finetune_history.pkl'
# `open` does not create missing directories, so make sure the folder exists
os.makedirs(os.path.dirname(history_path), exist_ok=True)
with open(history_path, 'wb') as f:
    pickle.dump(history.history, f)

## BERT

In [28]:
!pip install -q -U "tensorflow-text==2.11.*"

[31mERROR: Could not find a version that satisfies the requirement tensorflow-text==2.11.* (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for tensorflow-text==2.11.*[0m[31m
[0m

In [23]:
!pip install tf-models-official==2.11.0  # to use AdamW optimizer

Collecting tf-models-official==2.11.0
  Using cached tf_models_official-2.11.0-py2.py3-none-any.whl (2.3 MB)
Collecting gin-config
  Using cached gin_config-0.5.0-py3-none-any.whl (61 kB)
Collecting py-cpuinfo>=3.3.0
  Downloading py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)
Collecting kaggle>=1.3.9
  Using cached kaggle-1.5.13.tar.gz (63 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting oauth2client
  Using cached oauth2client-4.1.3-py2.py3-none-any.whl (98 kB)
Collecting tensorflow-model-optimization>=0.4.1
  Using cached tensorflow_model_optimization-0.7.3-py2.py3-none-any.whl (238 kB)
Collecting tensorflow-datasets
  Using cached tensorflow_datasets-4.8.3-py3-none-any.whl (5.4 MB)
Collecting pyyaml<6.0,>=5.1
  Using cached PyYAML-5.4.1.tar.gz (175 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[31mERROR: Could not find a version that satisfies th

In [26]:
!pip install tensorflow_hub



In [27]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

ModuleNotFoundError: No module named 'tensorflow_hub'

In [None]:
# There are many pretrained BERT models available from TF Hub.
# Small BERT: same general architecture but fewer Transformer blocks, a tradeoff between speed and quality.
# The name chosen here is used as the key into both handle tables below.
bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8' 

# Maps each selectable model name to the TF Hub handle (URL) of its encoder.
# `bert_model_name` is looked up in this table, so it must be one of these keys.
map_name_to_handle = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/1',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_base/2',
    'electra_small':
        'https://tfhub.dev/google/electra_small/2',
    'electra_base':
        'https://tfhub.dev/google/electra_base/2',
    'experts_pubmed':
        'https://tfhub.dev/google/experts/bert/pubmed/2',
    'experts_wiki_books':
        'https://tfhub.dev/google/experts/bert/wiki_books/2',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
}

# Maps each model name to the TF Hub handle (URL) of the preprocessing model
# that is looked up alongside its encoder. Every small_bert entry, as well as
# the electra, experts and talking-heads entries, points to the same
# bert_en_uncased_preprocess/3 handle.
map_model_to_preprocess = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
    'electra_small':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'electra_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_pubmed':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_wiki_books':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
}

# Resolve the encoder and its preprocessing model for the selected BERT variant.
tfhub_handle_encoder, tfhub_handle_preprocess = (
    map_name_to_handle[bert_model_name],
    map_model_to_preprocess[bert_model_name],
)

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

In [None]:
def build_classifier_model():
    """Assemble the BERT text classifier with the functional Model API.

    Raw strings pass through the TF Hub preprocessing layer and the trainable
    BERT encoder selected by `tfhub_handle_preprocess` / `tfhub_handle_encoder`;
    the encoder's `pooled_output` then goes through dropout and a 4-unit dense
    layer without activation.

    return:
        tf.keras.Model mapping a batch of strings to 4 logits per text
    """
    # TF Hub layers
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')

    # scalar string input -> preprocessing -> BERT encoder
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    bert_outputs = encoder(preprocessing_layer(text_input))

    # (B, H) embedding of the entire news text
    pooled = bert_outputs['pooled_output']
    regularised = tf.keras.layers.Dropout(0.1)(pooled)  # prevent overfitting
    logits = tf.keras.layers.Dense(4, name='classifier')(regularised)
    return tf.keras.Model(text_input, logits)

In [None]:
# Instantiate the BERT classifier and render its layer graph.
bert_model = build_classifier_model()
tf.keras.utils.plot_model(model=bert_model)

In [None]:
# Fine-tuning schedule for BERT
epochs = 5
init_lr = 3e-5

# Total number of optimizer steps = batches per epoch * epochs;
# the first 10% of them are used for learning-rate warm-up.
steps_per_epoch = tf.data.cardinality(train_dataset).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

# AdamW optimizer from the TF model garden
optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)

In [None]:
# The classifier head outputs logits and the labels are one-hot encoded,
# hence categorical cross-entropy with from_logits=True.
bert_model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy', F1Score(num_classes=4, average='macro')],
    optimizer=optimizer)

In [None]:
print(f'Training model with {tfhub_handle_encoder}')
# Train on the datasets built by `prepare_dataset`; `train_dataset` is also the
# dataset the optimizer's step counts were derived from.
history = bert_model.fit(x=train_dataset,
                         validation_data=val_dataset,
                         epochs=epochs,
                         callbacks=[early_stopping])

In [None]:
# The model is compiled with two metrics (accuracy and macro F1), so `evaluate`
# returns three values: loss, accuracy and F1.
# NOTE(review): this notebook defines no held-out test dataset, so the model is
# evaluated on the validation split like the LSTM models - swap in a real test
# set once one is available.
loss, accuracy, f1 = bert_model.evaluate(val_dataset)
print(f'Loss: {loss}, Accuracy: {accuracy}, F1: {f1}')