<a href="https://colab.research.google.com/github/ostamand/bert-classifier/blob/master/Toxic_BERT_(TPU).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Toxic BERT (TPU)
References:

- https://colab.research.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb
- https://github.com/google-research/bert

## Install

In [0]:
!python --version

In [0]:
!pip uninstall -y tensorflow

In [0]:
!pip install "tensorflow-gpu>=1.11.0"

In [0]:
!git clone https://github.com/google-research/bert bert

## Setup TensorBoard

In [0]:
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip

Start TensorBoard

In [0]:
# Launch TensorBoard in the background, pointed at the estimator's model dir.
# NOTE: this reads `config.output_dir`, and `config` is only created in the
# "Config" section further down - run that cell first, otherwise this raises a
# NameError on a fresh kernel.
get_ipython().system_raw(
    f'tensorboard --logdir {config.output_dir} --host 0.0.0.0 --port 6006 &'
)

In [0]:
get_ipython().system_raw('./ngrok http 6006 &')

In [0]:
!curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

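To restart TensorBoard, find its PID with `ps` and pass that PID to `kill` (replace the PID in the cell below with the one `ps` reports):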

In [0]:
!ps

In [0]:
!kill 3025

Tensorboard debugging commands:

`!find ./tmp | grep tfevents`

`!tensorboard --inspect --logdir tmp`

In [0]:
%load_ext tensorboard

## Setup

In [0]:
import sys

In [0]:
sys.path.insert(0, 'bert')

In [0]:
import os
import json
import pprint
import pdb

import modeling
import optimization
import tensorflow_hub as hub
import tensorflow as tf

from run_classifier_with_tfhub import create_tokenizer_from_hub_module
from run_classifier import convert_examples_to_features, DataProcessor, InputExample
import pandas as pd
import numpy as np

In [0]:
tf.VERSION

In [0]:
tf.logging.set_verbosity(tf.logging.INFO)

## Setup gdrive


In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

## Config

In [0]:
class Config:
  """Hyper-parameters and runtime settings for the TPU fine-tuning run.

  Every default below can be overridden by keyword, e.g.
  ``Config(lr=1e-5, epochs=2)``. Positional arguments are still accepted, and
  ignored, so existing ``Config()`` call sites keep working unchanged.
  """

  def __init__(self, *args, **kwargs):
    # --- model / optimisation ---
    self.lr = 2e-5                  # initial learning rate handed to the optimizer
    self.aux_targets = 6            # number of auxiliary output units
    self.module_handle = 'https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1'
    self.maxlen = 220               # max sequence length used when building features
    self.bs = 8                     # batch size passed to build_input_fn by the train/eval cells
    self.epochs = 1

    # --- estimator ---
    self.output_dir = 'gs://bert-train-logs/toxic'  # model_dir; also reused as TF-Hub cache
    self.train_batch_size = 32
    self.eval_batch_size = 8
    self.predict_batch_size = 8
    self.save_checkpoints_steps = 10
    self.save_summary_steps = 10

    # --- tpu stuff ---
    self.num_tpu_cores = 8
    self.iterations_per_loop = 1000

    # Apply keyword overrides last so they win over the defaults above
    # (previously **kwargs was accepted but silently discarded).
    for name, value in kwargs.items():
      setattr(self, name, value)
    
config = Config()

## Setup TPU

In [0]:
# Fail fast when the Colab runtime has no TPU attached: COLAB_TPU_ADDR is read
# from the environment to build the gRPC address used by everything below.
assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

# Authenticate the Colab user, then open a session against the TPU to list its
# devices and hand it the credentials it needs for GCS access.
from google.colab import auth
auth.authenticate_user()
with tf.Session(TPU_ADDRESS) as session:
  print('TPU devices:')
  pprint.pprint(session.list_devices())

  # Upload credentials to TPU.
  # NOTE(review): presumably /content/adc.json is written by
  # auth.authenticate_user() above - confirm; open() raises if it is missing.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.

In [0]:
# TPU cluster resolver shared by every RunConfig built in this notebook.
tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)

def get_run_config(config):
  """Assembles the TPU `RunConfig` for the estimator.

  Args:
    config: settings object; reads `output_dir`, `save_checkpoints_steps`,
      `iterations_per_loop` and `num_tpu_cores`.

  Returns:
    A `tf.contrib.tpu.RunConfig` bound to the module-level
    `tpu_cluster_resolver`, using PER_HOST_V2 input pipelines.
  """
  tpu_settings = tf.contrib.tpu.TPUConfig(
      iterations_per_loop=config.iterations_per_loop,
      num_shards=config.num_tpu_cores,
      per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2)

  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=config.output_dir,
      save_checkpoints_steps=config.save_checkpoints_steps,
      tpu_config=tpu_settings)
  return run_config


## BERT

In [0]:
class ToxicProcessor(DataProcessor):
  """Reads the toxicity `train.csv` and yields binary-labelled `InputExample`s."""

  def __init__(self, num_to_load=None):
    """
    Args:
      num_to_load: when truthy, only the first `num_to_load` rows of the CSV
        are used; `None` (or 0) keeps every row.
    """
    self.num_to_load = num_to_load
    self.label_list = ['0', '1']

  def get_labels(self):
    """Returns the class labels (same list as `self.label_list`)."""
    return self.label_list

  def get_train_examples(self, data_dir):
    """Builds one `InputExample` per row of `<data_dir>/train.csv`.

    The label is '1' when the row's `target` is >= 0.5 and '0' otherwise; the
    guid is `train-<DataFrame index>`.

    Args:
      data_dir: directory containing `train.csv` with `comment_text` and
        `target` columns.

    Returns:
      A list of `InputExample` with `text_b=None`.
    """
    df = self._read_csv(os.path.join(data_dir, 'train.csv'))
    if self.num_to_load:
      df = df.iloc[:self.num_to_load]

    # `astype` returns a new Series - the original discarded it, so keep the
    # result here instead of relying on a per-row str() call.
    texts = df['comment_text'].astype(str)
    targets = df['target']

    # Column-wise zip avoids the per-row Series construction of iterrows().
    return [
        InputExample(
            guid=f"train-{idx}",
            text_a=text,
            text_b=None,
            label='1' if target >= 0.5 else '0')
        for idx, text, target in zip(df.index, texts, targets)
    ]

  @classmethod
  def _read_csv(cls, file_path):
    """Loads `file_path` into a pandas DataFrame."""
    return pd.read_csv(file_path)

In [0]:
def build_model(config, input_ids, input_mask, segment_ids, label_ids):
  """Builds the BERT encoder (TF-Hub) plus the toxicity classification head.

  Args:
    config: settings object; reads `module_handle`, `aux_targets`, `lr`,
      `num_train_steps` and `num_warmup_steps` (the last two are attached to
      the config in the training cell before the estimator is built).
    input_ids: token ids fed to the hub module's "tokens" signature.
    input_mask: attention mask fed to the hub module.
    segment_ids: segment ids fed to the hub module.
    label_ids: binary labels; cast to float32 for the loss and the metrics.

  Returns:
    A tuple `(loss, train_op, eval_op, accuracy)` where `loss` is the mean
    sigmoid cross-entropy on the main target, `train_op` comes from
    `optimization.create_optimizer`, `eval_op` is the
    `tf.metrics.accuracy` pair and `accuracy` is the in-batch accuracy.
  """
  # NOTE(review): the 'train' tag and the dropout below are applied regardless
  # of the estimator mode, because this function is not told the mode.
  tags = {'train'}

  with tf.variable_scope('bert'):
    bert_module = hub.Module(config.module_handle, tags=tags, trainable=True)
    bert_inputs = dict(
          input_ids=input_ids,
          input_mask=input_mask,
          segment_ids=segment_ids
    )
    bert_outputs = bert_module(
          inputs=bert_inputs,
          signature="tokens",
          as_dict=True
    )
    output_layer = bert_outputs["pooled_output"]
    hidden_size = output_layer.shape[-1].value

  # classifier layers on top of BERT

  # Size of the auxiliary head comes from the config instead of a literal 6 so
  # the two cannot drift apart.
  num_aux_targets = config.aux_targets

  with tf.variable_scope("classifier"):
    fc_weights = tf.get_variable(
        "fc_weights", [1, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02)
    )

    fc_aux_weights = tf.get_variable(
        "fc_aux_weights", [num_aux_targets, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02)
    )

    fc_bias = tf.get_variable(
        "fc_bias", [1], initializer=tf.zeros_initializer()
    )

    fc_aux_bias = tf.get_variable(
        "fc_aux_bias", [num_aux_targets], initializer=tf.zeros_initializer()
    )

    output_layer = tf.nn.dropout(output_layer, rate=0.4)
    logits_fc = tf.matmul(output_layer, fc_weights, transpose_b=True)
    logits_fc = tf.nn.bias_add(logits_fc, fc_bias)

    # not used for now, for custom loss calculation
    logits_fc_aux = tf.matmul(output_layer, fc_aux_weights, transpose_b=True)
    logits_fc_aux = tf.nn.bias_add(logits_fc_aux, fc_aux_bias)

    logits = tf.concat([logits_fc, logits_fc_aux], axis=1, name='logits')

    # for now, use only labels
    probs = tf.nn.sigmoid(logits_fc, name='probs')

    # Squeeze only the trailing unit dimension: a bare squeeze would also drop
    # the batch dimension whenever the (per-core) batch size is 1.
    preds = tf.squeeze(tf.cast((probs >= 0.5), tf.float32), axis=-1, name='preds')

    # First column of the concatenated logits is the main-target logit.
    logits_for_loss = tf.reshape(tf.slice(logits, [0, 0] , [-1, 1]), [-1])

    labels = tf.cast(label_ids, tf.float32, name='labels')

    eval_op = tf.metrics.accuracy(labels=labels, predictions=preds)
    accuracy = tf.reduce_mean(tf.cast(tf.math.equal(preds, labels), tf.float32), name='accuracy')

  with tf.variable_scope("optimizer"):
    loss = tf.nn.sigmoid_cross_entropy_with_logits(
      labels=labels,
      logits=logits_for_loss
    )
    loss = tf.reduce_mean(loss, name='loss')
    train_op = optimization.create_optimizer(loss, config.lr, config.num_train_steps, config.num_warmup_steps, True) # use tpu

  return loss, train_op, eval_op, accuracy

In [0]:
def build_input_fn(features, seq_length, batch_size, mode):
  """Creates an `input_fn` closure over in-memory features for TPUEstimator.

  All features are embedded in the graph as constants, so this is only suitable
  for data sets that comfortably fit in memory.

  Args:
    features: sequence of objects exposing `input_ids`, `input_mask`,
      `segment_ids` and `label_id`.
    seq_length: length of every id/mask/segment list.
    batch_size: fallback batch size, used only when the estimator does not
      supply `params['batch_size']`.
    mode: a `tf.estimator.ModeKeys` value; TRAIN additionally repeats and
      shuffles the data.

  Returns:
    A function `input_fn(params)` returning a batched `tf.data.Dataset` of
    dicts with keys `input_ids`, `input_mask`, `segment_ids`, `label_ids`.
  """

  def input_fn(params):
    # TPUEstimator derives the batch size this function must produce from the
    # global batch size given to its constructor and passes it in `params`;
    # honour it so the pipeline matches what the estimator expects.
    effective_batch_size = (params or {}).get('batch_size', batch_size)

    num_examples = len(features)

    def as_int32_constant(values, shape):
      return tf.constant(values, shape=shape, dtype=tf.int32)

    dataset = tf.data.Dataset.from_tensor_slices({
        "input_ids": as_int32_constant(
            [feature.input_ids for feature in features],
            [num_examples, seq_length]),
        "input_mask": as_int32_constant(
            [feature.input_mask for feature in features],
            [num_examples, seq_length]),
        "segment_ids": as_int32_constant(
            [feature.segment_ids for feature in features],
            [num_examples, seq_length]),
        "label_ids": as_int32_constant(
            [feature.label_id for feature in features],
            [num_examples]),
    })

    if mode == tf.estimator.ModeKeys.TRAIN:
      dataset = dataset.repeat()
      dataset = dataset.shuffle(buffer_size=100)

    # Batch in every mode (previously any mode other than TRAIN/EVAL returned
    # an unbatched dataset). drop_remainder keeps the batch dimension static,
    # which the TPU requires.
    dataset = dataset.batch(batch_size=effective_batch_size, drop_remainder=True)

    return dataset

  return input_fn

In [0]:
# ref: https://www.tensorflow.org/guide/custom_estimators

def model_fn(features, labels, mode, params):
  """`model_fn` for `TPUEstimator`: wires `build_model` into an estimator spec.

  Args:
    features: dict with `input_ids`, `input_mask`, `segment_ids`, `label_ids`.
    labels: unused; the labels travel inside `features["label_ids"]`.
    mode: a `tf.estimator.ModeKeys` value; only TRAIN and EVAL are supported.
    params: estimator params; `params['config']` is forwarded to `build_model`.

  Returns:
    A `tf.contrib.tpu.TPUEstimatorSpec` for the requested mode.

  Raises:
    ValueError: if `mode` is neither TRAIN nor EVAL.
  """
  loss, train_op, _, accuracy = build_model(
      params['config'],
      features["input_ids"],
      features["input_mask"],
      features["segment_ids"],
      features["label_ids"]
  )

  if mode == tf.estimator.ModeKeys.TRAIN:
    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op
    )

  if mode == tf.estimator.ModeKeys.EVAL:

    def metric_fn(batch_loss, batch_accuracy):
      # Runs on the host: each argument holds one value per TPU shard, and
      # every metric must be a (value, update_op) pair.
      return {
          "eval_accuracy": tf.metrics.mean(batch_accuracy),
          "eval_loss": tf.metrics.mean(batch_loss)
      }

    # Tensors shipped from the TPU to metric_fn must have a leading dimension,
    # so the per-shard scalars are reshaped to [1] before being handed over.
    eval_metrics = (
        metric_fn,
        [tf.reshape(loss, [1]), tf.reshape(accuracy, [1])]
    )

    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        eval_metrics=eval_metrics
    )

  # Previously any other mode fell through to an unbound `spec`.
  raise ValueError(
      f"Unsupported mode {mode!r}: only TRAIN and EVAL are implemented")

## Data

In [0]:
processor = ToxicProcessor(num_to_load=100000)

In [0]:
examples = processor.get_train_examples('gdrive/My Drive/code/jigsaw-bias-toxicity/data/')

In [0]:
tokenizer = create_tokenizer_from_hub_module(config.module_handle)

In [0]:
features = convert_examples_to_features(examples, processor.label_list, config.maxlen, tokenizer)

## Train

In [0]:
# Derive the step counts from the number of features and attach them to the
# config; build_model reads num_train_steps / num_warmup_steps from there.
config.num_train_steps =  int(len(features) / config.train_batch_size * config.epochs)
config.num_eval_steps = int(len(features) / config.eval_batch_size)
# Warm up over the first 5% of the training steps.
config.num_warmup_steps = int(config.num_train_steps * 0.05)

# Force TF Hub writes to the GS bucket we provide.
# To run with TPU all files need to be accessible on Google Storage
os.environ['TFHUB_CACHE_DIR'] = config.output_dir

# The batch sizes given here are the ones the estimator is configured with;
# the config object itself is forwarded to model_fn through `params`.
classifier = tf.contrib.tpu.TPUEstimator(
  use_tpu=True,
  model_fn=model_fn,
  config=get_run_config(config),
  train_batch_size=config.train_batch_size,
  eval_batch_size=config.eval_batch_size,
  predict_batch_size=config.predict_batch_size,
  params = {'config': config }
)

`%tensorboard --logdir tmp/`

Train model

In [0]:
%%time 
classifier.train(
    input_fn= build_input_fn(features, config.maxlen, config.bs, tf.estimator.ModeKeys.TRAIN),
    max_steps=config.num_train_steps
)

In [0]:
eval_result = classifier.evaluate(
    input_fn=build_input_fn(features, config.maxlen, config.bs, tf.estimator.ModeKeys.EVAL),
    steps=config.num_eval_steps
)

In [0]:
eval_result