# BERT Model Training

In [1]:
%load_ext tensorboard

In [2]:
import warnings
import os
import silence_tensorflow
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_hub as hub
from tensorboard.plugins.hparams import api as hp
import numpy as np
import pandas as pd
from datetime import datetime
import pickle
import skopt

from skopt import gbrt_minimize, gp_minimize
from skopt.utils import use_named_args
from skopt.space import Real, Categorical, Integer  

In [3]:
# Importing Google Bert Libraries
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

In [4]:
# removes logs from previous runs
!rm -rf ./logs/
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'

## Specify Output directory and GPU Settings

In [5]:
# Set output directory for saving model data during training
OUTPUT_DIR = "model/OUTPUT_DIR_BERT"

print("GPU Available: ", tf.test.is_gpu_available())
#tf.debugging.set_log_device_placement(True)

GPU Available:  True


In [6]:
# Define Data Coloumn, Label Column
DATA_COLUMN = 'tweet'
LABEL_COLUMN = 'subtask_a'

## Load and split the data

In [7]:
X = pd.read_csv('data/X_processed.csv')
y_tmp = pd.read_csv('data/y_processed.csv')
X.head()

Unnamed: 0,id,tweet,subtask_a
0,86426,she should ask a few native americans what the...,OFF
1,90194,go home your drunk!!! magna trump2020,OFF
2,16820,amazon is investigation chinese employees who ...,NOT
3,62688,someone should havetaken this piece of shit to...,OFF
4,43605,drama wanted liberals a illegal to move into r...,NOT


In [8]:
#transform binary string labels to integers [0, 1]
y = pd.DataFrame(y_tmp['subtask_a'].apply(lambda x: 1 if x=="OFF" else 0))
X['subtask_a'] = X['subtask_a'].apply(lambda x: 1 if x=="OFF" else 0)
y.head()

Unnamed: 0,subtask_a
0,1
1,1
2,0
3,1
4,0


In [9]:
# Splitting train and test set with sklearn
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=1)

## Convert Input data to BERT-Embedding features

In [10]:
# Use the InputExample class from BERT's run_classifier code to create examples from the data
InputExamples_train = X_train.apply(lambda x: bert.run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1).to_frame()
InputExamples_test = X_test.apply(lambda x: bert.run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1).to_frame()
InputExamples_val = X_val.apply(lambda x: bert.run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1).to_frame()

In [11]:
# Convert list to values for computation purposes
input_list_train = InputExamples_train.values.tolist()
for i in range(len(input_list_train)):
    input_list_train[i] = input_list_train[i][0]

input_list_test = InputExamples_test.values.tolist()
for i in range(len(input_list_test)):
    input_list_test[i] = input_list_test[i][0]

input_list_val = InputExamples_val.values.tolist()
for i in range(len(input_list_val)):
    input_list_val[i] = input_list_val[i][0]

In [12]:
# Load pretrained BERT Model form TF hub
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

In [13]:
# Import BERT Tokenizer and Bert vocab file
def create_tokenizer_from_hub_module():
  """Get the vocab file and casing info from the Hub module."""
  with tf.Graph().as_default():
    bert_module = hub.Module(BERT_MODEL_HUB)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    with tf.Session() as sess:
      vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                            tokenization_info["do_lower_case"]])
      
  return bert.tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=do_lower_case)

tokenizer = create_tokenizer_from_hub_module()

In [14]:
# We'll set sequences to be at most 128 tokens long (mor ethan sufficient for Twitter posts)
max_seq_length = 128
label_list = [0, 1]

### Finally convert Input to Embeddings

In [15]:
# Convert our train and test features to InputFeatures that BERT understands.
feature_train = bert.run_classifier.convert_examples_to_features(input_list_train, label_list, max_seq_length, tokenizer)
feature_test = bert.run_classifier.convert_examples_to_features(input_list_test, label_list, max_seq_length, tokenizer)
feature_val = bert.run_classifier.convert_examples_to_features(input_list_val, label_list, max_seq_length, tokenizer)



## Define architecture of DL model

### Specify Input and Output layers of the model

In [16]:
# Creating a classification model
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):


    bert_module = hub.Module(
      BERT_MODEL_HUB,
      trainable=True)
    bert_inputs = dict(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids)
    bert_outputs = bert_module(
      inputs=bert_inputs,
      signature="tokens",
      as_dict=True)

    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_outputs" for token-level output.
    output_layer = bert_outputs["pooled_output"]
    print(output_layer.shape)  
    print(output_layer.shape[-1].value)
    print(type(output_layer.shape[-1]))

    hidden_size = output_layer.shape[-1].value

    # Create our own layer to tune for politeness data.
    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
    initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())
    
    # Specify training on GPU resource
    with tf.device('/gpu:0'):
        with tf.variable_scope("loss"):

            # Dropout helps prevent overfitting
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            log_probs = tf.nn.log_softmax(logits, axis=-1)

            # Convert labels into one-hot encoding
            one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

            predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))
            # If we're predicting, we want predicted labels and the probabiltiies.
            if is_predicting:
                return (predicted_labels, log_probs)

            # If we're train/eval, compute loss between predicted and actual label
            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
            loss = tf.reduce_mean(per_example_loss)
            return (loss, predicted_labels, log_probs)

### Define function that builds and returns model

In [17]:
#model_fn_builder actually creates our model function
# using the passed parameters for num_labels, learning_rate, etc.
def model_fn_builder(num_labels, learning_rate, num_train_steps,
                     num_warmup_steps):
  """Returns `model_fn` closure for TPUEstimator."""
  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]

    is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)
    
    # TRAIN and EVAL
    if not is_predicting:

      (loss, predicted_labels, log_probs) = create_model(
        is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

      train_op = bert.optimization.create_optimizer(
          loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)

      # Calculate evaluation metrics. 
      def metric_fn(label_ids, predicted_labels):
        accuracy = tf.metrics.accuracy(label_ids, predicted_labels)
        f1_score = tf.contrib.metrics.f1_score(
            label_ids,
            predicted_labels)
        auc = tf.metrics.auc(
            label_ids,
            predicted_labels)
        recall = tf.metrics.recall(
            label_ids,
            predicted_labels)
        precision = tf.metrics.precision(
            label_ids,
            predicted_labels) 
        true_pos = tf.metrics.true_positives(
            label_ids,
            predicted_labels)
        true_neg = tf.metrics.true_negatives(
            label_ids,
            predicted_labels)   
        false_pos = tf.metrics.false_positives(
            label_ids,
            predicted_labels)  
        false_neg = tf.metrics.false_negatives(
            label_ids,
            predicted_labels)
        return {
            "eval_accuracy": accuracy,
            "f1_score": f1_score,
            "auc": auc,
            "precision": precision,
            "recall": recall,
            "true_positives": true_pos,
            "true_negatives": true_neg,
            "false_positives": false_pos,
            "false_negatives": false_neg
        }

      eval_metrics = metric_fn(label_ids, predicted_labels)

      if mode == tf.estimator.ModeKeys.TRAIN:        
        return tf.estimator.EstimatorSpec(mode=mode,
          loss=loss,
          train_op=train_op)
      else:
          return tf.estimator.EstimatorSpec(mode=mode,
            loss=loss,
            eval_metric_ops=eval_metrics)
    else:
      (predicted_labels, log_probs) = create_model(
        is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

      predictions = {
          'probabilities': log_probs,
          'labels': predicted_labels
      }
      return tf.estimator.EstimatorSpec(mode, predictions=predictions)

  # Return the actual model function in the closure
  return model_fn

## Bayesian Hyperparameter Tuning

In [18]:
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 100

# HYPERPARAMTER
dim_learning_rate = Real(low=1e-6, high=1e-1, prior='log-uniform',
                         name='learning_rate')
dim_batch_size = Categorical(categories=[16, 32, 64], 
                             name='batch_size')
dim_num_epochs = Integer(low=3, high=15, name='num_epochs')
dim_warmup_proportion = Real(low=0.05, high=0.6, name='warmup_proportion')
dim_max_seq_length = Integer(low=20, high=140, name='max_seq_length')

# Define list that spans Hyperparamter Space
dimensions = [dim_learning_rate,
              dim_batch_size,
              dim_num_epochs,
              dim_warmup_proportion,
              dim_max_seq_length
             ]
# Specify initial parameters that optimizer will try first (x0)
default_parameters = [2e-5, 32, 3, 0.1, 140]

In [19]:
# Specify GPU Settings and input to RunConfig
config=tf.ConfigProto()
config.log_device_placement=True
config.gpu_options.allow_growth = True
config.gpu_options.per_process_gpu_memory_fraction = 0.9

# Establish Run Configuration 
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,                            # directory to which models and checkpoints will be saved
    save_summary_steps=SAVE_SUMMARY_STEPS,          
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,  
    session_config=config)

#tf.get_logger().setLevel('INFO')

### Define function that gets called by Bayesian HP Optimizer

In [20]:
def fitness(dimensions):
    print(f'\nBeginning Training!\n')
    learning_rate = dimensions[0]
    batch_size = dimensions[1]
    num_epochs = dimensions[2]
    warmup_proportion = dimensions[3]
    max_seq_length = 128
    print('Learning Rate:     {}'.format(learning_rate))
    print('Batch Size:        {}'.format(batch_size))
    print('Num Epochs:        {}'.format(num_epochs))
    print('Warmup Proportion: {}'.format(warmup_proportion))

    num_train_steps = int(len(feature_train) / batch_size * num_epochs)
    num_warmup_steps = int(num_train_steps * warmup_proportion)
    

    
    # ACTUAL MODEL TRAINING STARTS HERE
    with tf.Session() as sess: 
        model_fn = model_fn_builder(
            num_labels=len(label_list),
            learning_rate=learning_rate,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps)

        estimator = tf.estimator.Estimator(
            model_fn=model_fn,
            config=run_config,
            params={"batch_size": batch_size})

        train_input_fn = bert.run_classifier.input_fn_builder(
            features=feature_train,
            seq_length=max_seq_length,
            is_training=True,
            drop_remainder=False)

        current_time = datetime.now()
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

        val_input_fn = run_classifier.input_fn_builder(
        features=feature_val,
        seq_length=max_seq_length,
        is_training=False,
        drop_remainder=False)

        results = estimator.evaluate(input_fn=val_input_fn, steps=None)
    for acc in results:
        print(acc, ':', results[acc])
    print("\nTraining took time ", datetime.now() - current_time)
    return -results["eval_accuracy"]

## Hyperparameter Tuning

In [None]:
# Function calls fitness function n_calls times and 
# samples each run from the defined Hyperparamter space (dimensions).
gp_result = gp_minimize(func=fitness, 
                            dimensions=dimensions,
                            n_calls=12,
                            noise= 0.01,
                            n_jobs=-1,
                            kappa = 5,
                            x0=default_parameters)


Beginning Training!

Learning Rate:     2e-05
Batch Size:        32
Num Epochs:        3
Warmup Proportion: 0.1
(?, 768)
768
<class 'tensorflow.python.framework.tensor_shape.Dimension'>
auc : 0.5016499
eval_accuracy : 0.6603774
f1_score : 0.49645385
false_negatives : 338.0
false_positives : 22.0
loss : 0.6545831
precision : 0.3529412
recall : 0.034285713
true_negatives : 688.0
true_positives : 12.0
global_step : 7943

Training took time  0:08:48.701154

Beginning Training!

Learning Rate:     6.3467721326444465e-06
Batch Size:        64
Num Epochs:        4
Warmup Proportion: 0.4693355342621294
(?, 768)
768
<class 'tensorflow.python.framework.tensor_shape.Dimension'>
auc : 0.499497
eval_accuracy : 0.65943396
f1_score : 0.49645385
false_negatives : 340.0
false_positives : 21.0
loss : 0.6619486
precision : 0.32258064
recall : 0.028571429
true_negatives : 689.0
true_positives : 10.0
global_step : 7943

Training took time  0:05:27.197134

Beginning Training!

Learning Rate:     0.05445584

### Pass the set of best performing hyperparameters

In [None]:

learning_rate = gp_result.x[0]
batch_size = gp_result.x[1]
num_epochs = gp_result.x[2]
warmup_proportion = gp_result.x[3]
max_seq_length = 128

## Evaluate best performing model on the test set

In [None]:
print('Learning Rate:     {}'.format(learning_rate))
print('Batch Size:        {}'.format(batch_size))
print('Num Epochs:        {}'.format(num_epochs))
print('Warmup Proportion: {}'.format(warmup_proportion))

num_train_steps = int(len(feature_train) / batch_size * num_epochs)
num_warmup_steps = int(num_train_steps * warmup_proportion)

#tf.logging.set_verbosity(tf.logging.INFO)
with tf.Session() as sess:
    model_fn = model_fn_builder(
        num_labels=len(label_list),
        learning_rate=learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params={"batch_size": batch_size})

    train_input_fn = bert.run_classifier.input_fn_builder(
        features=feature_train,
        seq_length=max_seq_length,
        is_training=True,
        drop_remainder=False)
    print(f'Beginning Training!')
    current_time = datetime.now()
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    print("Training took time ", datetime.now() - current_time)

In [None]:
test_input_fn = run_classifier.input_fn_builder(
    features=feature_test,
    seq_length=max_seq_length,
    is_training=False,
    drop_remainder=False)

In [None]:
estimator.evaluate(input_fn=test_input_fn, steps=None)

## Test on examples

In [None]:
def getPrediction(in_sentences):
    labels = [0, 1]
    input_examples = [run_classifier.InputExample(guid="", text_a = x, text_b = None, label = 0) for x in in_sentences] # here, "" is just a dummy label
    print(input_examples[i].label for i in range(len(in_sentences)))
    input_features = run_classifier.convert_examples_to_features(input_examples, label_list, max_seq_length, tokenizer)
    predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=max_seq_length, is_training=False, drop_remainder=False)
    predictions = estimator.predict(predict_input_fn)
    return [(sentence, prediction['probabilities'], labels[prediction['labels']]) for sentence, prediction in zip(in_sentences, predictions)]

In [None]:
pred_sentences = [
  "That asshole was absolutely awful",
  "The Democrat Caucus is an unmitigated disaster. Nothing works, just like they ran the Country. Remember the  5 Billion Dollar Obamacare Website, that should have cost 2% of that. The only person that can claim a very big victory in Iowa last night is “Trump”.",
  "The film was creative and surprising",
  "Absolutely fantastic!"
]

In [None]:
predictions = getPrediction(pred_sentences)

In [None]:
predictions