https://towardsdatascience.com/pre-training-bert-from-scratch-with-cloud-tpu-6e2f71028379

**Note:** The training code in this notebook is adapted from the article linked above.

# Setting up the environment


In [1]:
!pip install sentencepiece
!git clone https://github.com/google-research/bert

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 3.5MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.85
Cloning into 'bert'...
remote: Enumerating objects: 336, done.[K
remote: Total 336 (delta 0), reused 0 (delta 0), pack-reused 336
Receiving objects: 100% (336/336), 297.11 KiB | 4.01 MiB/s, done.
Resolving deltas: 100% (183/183), done.


In [2]:
import os
import sys
import json
import nltk
import random
import logging
import tensorflow as tf
import sentencepiece as spm

from glob import glob
from google.colab import auth, drive
from tensorflow.keras.utils import Progbar

sys.path.append("bert")

from bert import modeling, optimization, tokenization
from bert.run_pretraining import input_fn_builder, model_fn_builder

# Authenticate the Colab user before anything touches cloud storage.
auth.authenticate_user()

# Send TensorFlow's log records through one timestamped stream handler.
formatter = logging.Formatter('%(asctime)s :  %(message)s')
sh = logging.StreamHandler()
sh.setFormatter(formatter)
sh.setLevel(logging.INFO)

log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)
log.handlers = [sh]

# The presence of COLAB_TPU_ADDR is treated as the signal that a TPU runtime
# is attached.
USE_TPU = 'COLAB_TPU_ADDR' in os.environ

if not USE_TPU:
  log.warning('Not connected to TPU runtime')
else:
  log.info("Using TPU runtime")
  TPU_ADDRESS = 'grpc://{}'.format(os.environ['COLAB_TPU_ADDR'])

  with tf.Session(TPU_ADDRESS) as session:
    log.info('TPU address is {}'.format(TPU_ADDRESS))
    # Upload credentials to TPU.
    # NOTE(review): /content/adc.json is presumably written by
    # auth.authenticate_user() above -- confirm.
    with open('/content/adc.json', 'r') as f:
      auth_info = json.load(f)
    tf.contrib.cloud.configure_gcs(session, credentials=auth_info)


The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



2020-03-01 16:01:53,534 :  Using TPU runtime
2020-03-01 16:01:53,540 :  TPU address is grpc://10.48.25.18:8470


# Loading the corpus

In [0]:
# Mount Google Drive at /content/drive/; the following cells change into it
# and address the corpus and generated files by relative path.
drive.mount('/content/drive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/


In [0]:
# Work from the Drive root so the relative paths used below resolve there;
# print the directory to confirm the change took effect.
%cd /content/drive/My\ Drive
!pwd

/content/drive/My Drive
/content/drive/My Drive


# Building the Vocab

In [0]:
# Corpus file, addressed relative to the current working directory.
PRC_DATA_FPATH = 'urdu_filtered.txt'

# Prefix handed to SentencePiece as --model_prefix; the learnt vocabulary is
# later read back from "<MODEL_PREFIX>.vocab".
MODEL_PREFIX = "tokenizer"
#VOC_SIZE = 42927 # For Roman-Urdu
# Size of the final BERT vocabulary; also used as vocab_size in the BERT config.
VOC_SIZE = 43783 # For urdu
# Passed to SentencePiece as --input_sentence_size.
SUBSAMPLE_SIZE = 12800000
# Subtracted from VOC_SIZE when requesting the SentencePiece vocabulary size.
NUM_PLACEHOLDERS = 256

In [0]:
# Build the SentencePiece trainer flags one per entry, then join them into the
# single space-separated argument string that Train() receives.
SPM_COMMAND = " ".join([
    '--input={}'.format(PRC_DATA_FPATH),
    '--model_prefix={}'.format(MODEL_PREFIX),
    # Request NUM_PLACEHOLDERS fewer pieces than the final vocabulary size.
    '--vocab_size={}'.format(VOC_SIZE - NUM_PLACEHOLDERS),
    '--input_sentence_size={}'.format(SUBSAMPLE_SIZE),
    '--shuffle_input_sentence=true',
    '--bos_id=-1',
    '--eos_id=-1',
    '--hard_vocab_limit=false',
])

spm.SentencePieceTrainer.Train(SPM_COMMAND)

True

In [0]:
def read_sentencepiece_vocab(filepath):
  """Return the pieces listed in a SentencePiece ``.vocab`` file.

  Each line contributes the text before its first tab character. The very
  first entry (the <unk> token) is dropped from the result.

  Args:
    filepath: path of the UTF-8 encoded vocab file.

  Returns:
    A list of piece strings in file order, without the first entry.
  """
  with open(filepath, encoding='utf-8') as vocab_file:
    pieces = [row.partition("\t")[0] for row in vocab_file]
  # everything after the leading <unk> entry
  return pieces[1:]

# Load the pieces SentencePiece learnt, then report how many there are and
# show a random handful as a sanity check.
snt_vocab = read_sentencepiece_vocab(MODEL_PREFIX + ".vocab")
print("Learnt vocab size: {}".format(len(snt_vocab)))
print("Sample tokens: {}".format(random.sample(snt_vocab, 10)))

Learnt vocab size: 29251
Sample tokens: ['سینٹ', 'الفاظ', '▁الحمرا', '▁پارسائی', 'وئل', '▁شائستہ', '▁ڈنکا', '▁ھجری', 'ارب', '▁نزیر']


In [0]:
def parse_sentencepiece_token(token):
    """Convert one SentencePiece piece into a WordPiece-style token.

    A piece that begins with the "▁" marker is returned with that marker
    removed; any other piece is returned with a "##" prefix.
    """
    return token[1:] if token.startswith("▁") else "##" + token
        
# Control symbols come first, followed by the converted SentencePiece pieces.
ctrl_symbols = ["[PAD]","[UNK]","[CLS]","[SEP]","[MASK]"]
bert_vocab = ctrl_symbols + [
    parse_sentencepiece_token(piece) for piece in snt_vocab]

# Fill the remaining slots (up to VOC_SIZE entries) with placeholder tokens.
bert_vocab.extend(
    "[UNUSED_{}]".format(i) for i in range(VOC_SIZE - len(bert_vocab)))
print(len(bert_vocab))

43783


In [0]:
VOC_FNAME = "vocab.txt"

# Write one token per line. The encoding is pinned to UTF-8, matching how the
# SentencePiece vocab file is read, so the Urdu tokens do not depend on the
# platform's default locale encoding.
with open(VOC_FNAME, "w", encoding="utf-8") as fo:
  for token in bert_vocab:
    fo.write(token+"\n")

In [0]:
cd /content/drive/My\ Drive/

/content/drive/My Drive


# Generating Pretraining Data

In [0]:
!mkdir ./shards
!split -a 4 -l 256000 -d $PRC_DATA_FPATH ./shards/shard_

In [0]:
# Settings forwarded to create_pretraining_data.py as command-line flags.
MAX_SEQ_LENGTH = 128
MASKED_LM_PROB = 0.15 
MAX_PREDICTIONS = 20 
DO_LOWER_CASE = True

# Directory that receives one .tfrecord file per shard.
PRETRAINING_DIR = "pretraining_data"
# controls how many parallel processes xargs can create
PROCESSES = 2

In [0]:
XARGS_CMD = ("ls ./shards/ | "
             "xargs -n 1 -P {} -I{} "
             "python3 /content/bert/create_pretraining_data.py "
             "--input_file=./shards/{} "
             "--output_file={}/{}.tfrecord "
             "--vocab_file={} "
             "--do_lower_case={} "
             "--max_predictions_per_seq={} "
             "--max_seq_length={} "
             "--masked_lm_prob={} "
             "--random_seed=34 "
             "--dupe_factor=5")

XARGS_CMD = XARGS_CMD.format(PROCESSES, '{}', '{}', PRETRAINING_DIR, '{}', 
                             VOC_FNAME, DO_LOWER_CASE, 
                             MAX_PREDICTIONS, MAX_SEQ_LENGTH, MASKED_LM_PROB)
                             
tf.gfile.MkDir(PRETRAINING_DIR)
!$XARGS_CMD



W0227 14:37:43.945695 140097273104256 module_wrapper.py:139] From /content/bert/create_pretraining_data.py:437: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.


W0227 14:37:43.946086 140097273104256 module_wrapper.py:139] From /content/bert/create_pretraining_data.py:437: The name tf.logging.INFO is deprecated. Please use tf.compat.v1.logging.INFO instead.


W0227 14:37:43.946351 140097273104256 module_wrapper.py:139] From /content/bert/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.



W0227 14:37:43.949395 140523838797696 module_wrapper.py:139] From /content/bert/create_pretraining_data.py:437: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.


W0227 14:37:43.949831 140523838797696 module_wrapper.py:139] From /content/bert/create_pretraining_data.py:437: The name tf.logging.INFO is deprecated. Please use tf.compat.v1.loggi

# BERT-base Config

In [0]:
# Local directory that holds the model artifacts (created right below).
MODEL_DIR = "bert_model"
tf.gfile.MkDir(MODEL_DIR)

# Name of the GCS bucket that the model directory and training data go to.
BUCKET_NAME = "bert_urdu_bucket"

if not BUCKET_NAME:
  log.warning(
      "WARNING: BUCKET_NAME is not set. You will not be able to train the model.")

In [0]:
# Hyper-parameters of the model; vocab_size is tied to the vocabulary built
# earlier in this notebook.
bert_base_config = {
  "attention_probs_dropout_prob": 0.1, 
  "directionality": "bidi", 
  "hidden_act": "gelu", 
  "hidden_dropout_prob": 0.1, 
  "hidden_size": 768, 
  "initializer_range": 0.02, 
  "intermediate_size": 3072, 
  "max_position_embeddings": 512, 
  "num_attention_heads": 12, 
  "num_hidden_layers": 12, 
  "pooler_fc_size": 768, 
  "pooler_num_attention_heads": 12, 
  "pooler_num_fc_layers": 3, 
  "pooler_size_per_head": 128, 
  "pooler_type": "first_token_transform", 
  "type_vocab_size": 2, 
  "vocab_size": VOC_SIZE
}

# The training cells load bert_config.json and the vocab from MODEL_DIR (after
# it is copied to the bucket), so both files have to exist there. They are
# written only when missing, so files that belong to an already-running
# training job are never overwritten.
config_path = os.path.join(MODEL_DIR, "bert_config.json")
if not os.path.exists(config_path):
  with open(config_path, "w") as fo:
    json.dump(bert_base_config, fo, indent=2)

model_vocab_path = os.path.join(MODEL_DIR, VOC_FNAME)
if not os.path.exists(model_vocab_path):
  # UTF-8 is pinned so the Urdu tokens do not depend on the locale default.
  with open(model_vocab_path, "w", encoding="utf-8") as fo:
    for token in bert_vocab:
      fo.write(token+"\n")

# Training the Model

In [0]:
# Copy the model directory and the generated pretraining data to the GCS
# bucket; nothing is uploaded when no bucket name is configured.
if BUCKET_NAME:
  !gsutil -m cp -r $MODEL_DIR $PRETRAINING_DIR gs://$BUCKET_NAME

Copying file://bert_model/vocab.txt [Content-Type=text/plain]...
Copying file://bert_model/bert_config.json [Content-Type=application/json]...
/ [0/6 files][    0.0 B/388.3 MiB]   0% Done                                    / [0/6 files][    0.0 B/388.3 MiB]   0% Done                                    Copying file://pretraining_data/shard_0001.tfrecord [Content-Type=application/octet-stream]...
/ [0/6 files][    0.0 B/388.3 MiB]   0% Done                                    Copying file://pretraining_data/shard_0000.tfrecord [Content-Type=application/octet-stream]...
Copying file://pretraining_data/shard_0002.tfrecord [Content-Type=application/octet-stream]...
Copying file://pretraining_data/shard_0003.tfrecord [Content-Type=application/octet-stream]...
| [6/6 files][388.3 MiB/388.3 MiB] 100% Done                                    
Operation completed over 6 objects/388.3 MiB.                                    


In [8]:
# Storage locations. These repeat values from earlier cells -- presumably so
# training can be resumed without re-running data preparation.
BUCKET_NAME = "bert_urdu_bucket"
MODEL_DIR = "bert_model"
PRETRAINING_DIR = "pretraining_data"
VOC_FNAME = "vocab.txt"

# Input data pipeline config
TRAIN_BATCH_SIZE = 64
MAX_PREDICTIONS = 20
MAX_SEQ_LENGTH = 128
MASKED_LM_PROB = 0.15

# Training procedure config
EVAL_BATCH_SIZE = 64
LEARNING_RATE = 2e-5
TRAIN_STEPS = 1000000
SAVE_CHECKPOINTS_STEPS = 2500
NUM_TPU_CORES = 8

# Without a bucket name, fall back to paths under the current directory.
BUCKET_PATH = "gs://{}".format(BUCKET_NAME) if BUCKET_NAME else "."

BERT_GCS_DIR = BUCKET_PATH + "/" + MODEL_DIR
DATA_GCS_DIR = BUCKET_PATH + "/" + PRETRAINING_DIR

VOCAB_FILE = os.path.join(BERT_GCS_DIR, VOC_FNAME)
CONFIG_FILE = os.path.join(BERT_GCS_DIR, "bert_config.json")

# Latest checkpoint found in the model directory.
INIT_CHECKPOINT = tf.train.latest_checkpoint(BERT_GCS_DIR)

bert_config = modeling.BertConfig.from_json_file(CONFIG_FILE)
# Every file in the data directory whose name ends in "tfrecord".
input_files = tf.gfile.Glob(os.path.join(DATA_GCS_DIR,'*tfrecord'))

log.info("Using checkpoint: {}".format(INIT_CHECKPOINT))
log.info("Using {} data shards".format(len(input_files)))

2020-03-01 16:02:20,758 :  From /content/bert/modeling.py:93: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.

2020-03-01 16:02:21,844 :  Using checkpoint: gs://bert_urdu_bucket/bert_model/model.ckpt-825000
2020-03-01 16:02:21,845 :  Using 4 data shards


In [9]:
# Model function from the BERT pretraining script, configured with the
# hyper-parameters defined in the training config cell.
model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=INIT_CHECKPOINT,
      learning_rate=LEARNING_RATE,
      num_train_steps=TRAIN_STEPS,
      num_warmup_steps=10,
      use_tpu=USE_TPU,
      use_one_hot_embeddings=True)

# TPU_ADDRESS is only defined when a TPU runtime was detected, so the cluster
# resolver is created only in that case. Without a TPU the run config gets
# cluster=None and the estimator runs with use_tpu=False.
tpu_cluster_resolver = None
if USE_TPU:
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)

run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    model_dir=BERT_GCS_DIR,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        # One device loop per checkpoint interval.
        iterations_per_loop=SAVE_CHECKPOINTS_STEPS,
        num_shards=NUM_TPU_CORES,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=USE_TPU,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE)
  
# Input pipeline over the TFRecord shards found in the data directory.
train_input_fn = input_fn_builder(
        input_files=input_files,
        max_seq_length=MAX_SEQ_LENGTH,
        max_predictions_per_seq=MAX_PREDICTIONS,
        is_training=True)

2020-03-01 16:02:22,869 :  Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x7ff668477158>) includes params argument, but params are not passed to Estimator.
2020-03-01 16:02:22,871 :  Using config: {'_model_dir': 'gs://bert_urdu_bucket/bert_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 2500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.48.25.18:8470"
    }
  }
}
isolate_session_state: true
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff667169c50>, '

In [0]:
# Start (or resume) training with TRAIN_STEPS as the step limit.
# NOTE(review): max_steps is presumably a cap on the global step, so a run
# restored from a checkpoint only trains the remaining steps -- confirm
# against the Estimator documentation.
estimator.train(input_fn=train_input_fn, max_steps=TRAIN_STEPS)

2020-03-01 16:02:29,092 :  Querying Tensorflow master (grpc://10.48.25.18:8470) for TPU system metadata.
2020-03-01 16:02:29,107 :  Found TPU system:
2020-03-01 16:02:29,108 :  *** Num TPU Cores: 8
2020-03-01 16:02:29,109 :  *** Num TPU Workers: 1
2020-03-01 16:02:29,110 :  *** Num TPU Cores Per Worker: 8
2020-03-01 16:02:29,111 :  *** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, -1, 2231307234061926342)
2020-03-01 16:02:29,114 :  *** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 4265091849796749888)
2020-03-01 16:02:29,116 :  *** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 4206219071163314961)
2020-03-01 16:02:29,117 :  *** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 13323203557327540114)
2020-03-01 16:02:29,119 :  *** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TP