https://towardsdatascience.com/pre-training-bert-from-scratch-with-cloud-tpu-6e2f71028379

In [2]:
!git clone https://github.com/google-research/bert

Cloning into 'bert'...
remote: Enumerating objects: 340, done.[K
remote: Total 340 (delta 0), reused 0 (delta 0), pack-reused 340[K
Receiving objects: 100% (340/340), 328.28 KiB | 4.50 MiB/s, done.
Resolving deltas: 100% (182/182), done.


In [3]:
import json
from google.oauth2 import service_account
from google.cloud import tpu_v1
#auth_file="lemmingsinthewind-6f1cdbe0f4b8.json"
# Zone that hosts the TPU node; it is interpolated into the resource name below.
zone = "us-central1-a"

# Fully qualified resource name of the TPU node used throughout the notebook.
tpuname = "projects/lemmingsinthewind/locations/{zone}/nodes/bertme".format(
    zone=zone
)

# Cloud TPU v1 API client; no explicit credentials are passed to it here.
client = tpu_v1.TpuClient()

In [4]:
# Look up the TPU node by its full resource name. `response` is left as None
# when the lookup fails, so the node-creation cell can test for that.
try:
    request = tpu_v1.GetNodeRequest(
        name=tpuname,
    )

    # Make the request
    response = client.get_node(request=request)
except Exception as exc:
    # Catch Exception rather than using a bare `except:` so that Ctrl-C and
    # SystemExit still propagate, and report why the lookup failed instead of
    # silently discarding the error.
    print("get_node failed ({}): {}".format(type(exc).__name__, exc))
    response = None

# Show the node description (or None when the lookup failed).
print(response)

name: "projects/lemmingsinthewind/locations/us-central1-a/nodes/bertme"
accelerator_type: "v3-8"
ip_address: "10.110.1.2"
state: READY
tensorflow_version: "1.15.5"
network: "projects/1053790116294/global/networks/default"
cidr_block: "10.110.1.0/29"
port: "8470"
service_account: "service-501306953789@cloud-tpu.iam.gserviceaccount.com"
create_time {
  seconds: 1660455199
  nanos: 286475120
}
scheduling_config {
}
network_endpoints {
  ip_address: "10.110.1.2"
  port: 8470
}
health: HEALTHY
use_service_networking: true
api_version: V1



In [5]:
# Create the TPU node only when the preceding lookup left `response` as None.
if response is None:
    # Derive the parent location and node id from `tpuname` so the create
    # request always targets the same zone/node that the lookup used.
    parent, _, node_id = tpuname.partition("/nodes/")

    # Initialize request argument(s)
    sc = tpu_v1.SchedulingConfig()
    sc.preemptible = False

    node = tpu_v1.Node()
    node.accelerator_type = "v3-8"
    node.tensorflow_version = "1.15.5"
    node.scheduling_config = sc
    node.use_service_networking = True
    # NOTE(review): this network lives in project "hazel-goal-319318" while the
    # node is created under "lemmingsinthewind" -- confirm this is intended.
    node.network = "projects/hazel-goal-319318/global/networks/default"

    request = tpu_v1.CreateNodeRequest(
        parent=parent,
        node_id=node_id,
        node=node,
    )

    # Make the request
    operation = client.create_node(request=request)

    print("Waiting for operation to complete...")

    # result() is expected to block until the create operation finishes.
    response = operation.result()

    # Handle the response
    print(response)

In [6]:
import os
import sys
import nltk
import random
import logging
import tensorflow as tf
import sentencepiece as spm

from glob import glob
from tensorflow.keras.utils import Progbar

sys.path.append("bert")

from bert import modeling, optimization, tokenization
from bert.run_pretraining import input_fn_builder, model_fn_builder

  
# Build one stream handler that prefixes every record with a timestamp.
sh = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s :  %(message)s')
sh.setFormatter(formatter)
sh.setLevel(logging.INFO)

# Route the 'tensorflow' logger through that handler only, at INFO level.
log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)
log.handlers = [sh]

log.info("Using TPU runtime")
USE_TPU = True

# gRPC endpoint assembled from the node description held in `response`.
TPU_ADDRESS = ''.join(['grpc://', response.ip_address, ':', response.port])

# Open a session against that endpoint, log the address and show the session
# object; nothing else is done with the session in this cell.
with tf.Session(TPU_ADDRESS) as session:
    log.info('TPU address is ' + TPU_ADDRESS)
    print(session)
    
    
    

2022-08-23 15:44:39.225410: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.11.0





2022-08-23 15:44:41,638 :  Using TPU runtime


In [7]:
# Cluster resolver for the TPU node; passed to the estimator's RunConfig later.
tpur = tf.distribute.cluster_resolver.TPUClusterResolver(
    tpu='bertme',
    zone=zone,
    project='lemmingsinthewind',
    job_name='worker',
)

# Bare last expression so the notebook displays the resolved ClusterSpec.
tpur.cluster_spec()

<tensorflow.python.training.server_lib.ClusterSpec at 0x7f9b065ef2d0>

In [8]:
# Bucket name plus the directory / file names used for this run.
# (`#@param` trailers are form annotations and are kept as written.)
BUCKET_NAME = "chunkbert_training" #@param {type:"string"}
MODEL_DIR = "bert_model3" #@param {type:"string"}
PRETRAINING_DIR = "pretraining_data_docker4" #@param {type:"string"}
VOC_FNAME = 'bert-wordpiecev3-vocab.txt'

# Local directory that receives bert_config.json before it is copied to GCS.
tf.gfile.MkDir(MODEL_DIR)

# Without a bucket there is nowhere to read data from or write checkpoints to.
if not BUCKET_NAME:
    log.warning(
        "WARNING: BUCKET_NAME is not set. "
        "You will not be able to train the model."
    )

2022-08-23 15:44:49,354 :  From /tmp/ipykernel_1/751807384.py:6: The name tf.gfile.MkDir is deprecated. Please use tf.io.gfile.mkdir instead.



In [9]:
# Model hyper-parameters, serialised below as bert_config.json inside MODEL_DIR.
bert_base_config = {
    "attention_probs_dropout_prob": 0.1,
    "directionality": "bidi",
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "max_position_embeddings": 512,
    "num_attention_heads": 12,
    "num_hidden_layers": 24,
    "pooler_fc_size": 768,
    "pooler_num_attention_heads": 12,
    "pooler_num_fc_layers": 3,
    "pooler_size_per_head": 128,
    "pooler_type": "first_token_transform",
    "type_vocab_size": 2,
    "vocab_size": 128000,
}

# Render the JSON text first, then write it out in one go.
config_path = "{}/bert_config.json".format(MODEL_DIR)
with open(config_path, "w") as fo:
    fo.write(json.dumps(bert_base_config, indent=2))
  

In [10]:
if BUCKET_NAME:
  !gsutil -m cp -r $MODEL_DIR gs://$BUCKET_NAME

Copying file://bert_model3/bert_config.json [Content-Type=application/json]...
Copying file://bert_model3/.ipynb_checkpoints/bert_config-checkpoint.json [Content-Type=application/json]...
/ [2/2 files][   1008 B/   1008 B] 100% Done                                    
Operation completed over 2 objects/1008.0 B.                                     


In [11]:
# ---- Input data pipeline ----
TRAIN_BATCH_SIZE = 256 #@param {type:"integer"}
MAX_SEQ_LENGTH = 512 #@param {type:"integer"}
MAX_PREDICTIONS = 77 #@param {type:"integer"}
MASKED_LM_PROB = 0.15 #@param

# ---- Training procedure ----
LEARNING_RATE = 1e-4
TRAIN_STEPS = 1000000 #@param {type:"integer"}
SAVE_CHECKPOINTS_STEPS = 2500 #@param {type:"integer"}
EVAL_BATCH_SIZE = 64
NUM_TPU_CORES = 8

# ---- Storage layout ----
# Use the bucket when one is configured, otherwise the current directory.
BUCKET_PATH = "gs://{}".format(BUCKET_NAME) if BUCKET_NAME else "."

BERT_GCS_DIR = "{}/{}".format(BUCKET_PATH, MODEL_DIR)
DATA_GCS_DIR = "{}/{}".format(BUCKET_PATH, PRETRAINING_DIR)

CONFIG_FILE = os.path.join(BERT_GCS_DIR, "bert_config.json")
VOCAB_FILE = os.path.join(BERT_GCS_DIR, VOC_FNAME)

# Most recent checkpoint in the model directory (the recorded run logged None).
INIT_CHECKPOINT = tf.train.latest_checkpoint(BERT_GCS_DIR)

# Model configuration parsed from the JSON file in the model directory.
bert_config = modeling.BertConfig.from_json_file(CONFIG_FILE)

# Every file in the data directory whose name ends in "tfrecord".
shard_pattern = os.path.join(DATA_GCS_DIR,'*tfrecord')
input_files = tf.gfile.Glob(shard_pattern)

log.info("Using checkpoint: {}".format(INIT_CHECKPOINT))
log.info("Using {} data shards".format(len(input_files)))

2022-08-23 15:45:38,182 :  From /home/jupyter/bert/modeling.py:93: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.

2022-08-23 15:45:38,363 :  From /tmp/ipykernel_1/2499664015.py:28: The name tf.gfile.Glob is deprecated. Please use tf.io.gfile.glob instead.

2022-08-23 15:45:38,420 :  Using checkpoint: None
2022-08-23 15:45:38,422 :  Using 29 data shards


In [12]:
# Model function for BERT pre-training. Both TPU switches follow the single
# USE_TPU flag (as run_pretraining.py does with its own use_tpu flag), so the
# model function and the estimator below can never disagree about the target.
model_fn = model_fn_builder(
    bert_config=bert_config,
    init_checkpoint=INIT_CHECKPOINT,
    learning_rate=LEARNING_RATE,
    num_train_steps=TRAIN_STEPS,
    num_warmup_steps=10000,
    use_tpu=USE_TPU,
    use_one_hot_embeddings=USE_TPU)

# Run configuration: checkpoints go to the GCS model directory, and the number
# of iterations per TPU loop is set equal to the checkpoint interval.
run_config = tf.contrib.tpu.RunConfig(
    cluster=tpur,
    model_dir=BERT_GCS_DIR,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=SAVE_CHECKPOINTS_STEPS,
        num_shards=NUM_TPU_CORES,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

# Estimator that ties the model function to the run configuration.
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=USE_TPU,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE)

# Input function over the pre-training shards, built in training mode.
train_input_fn = input_fn_builder(
    input_files=input_files,
    max_seq_length=MAX_SEQ_LENGTH,
    max_predictions_per_seq=MAX_PREDICTIONS,
    is_training=True)

2022-08-23 15:46:20,434 :  
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

2022-08-23 15:46:26,926 :  Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x7f9b06618200>) includes params argument, but params are not passed to Estimator.
2022-08-23 15:46:26,928 :  Using config: {'_model_dir': 'gs://chunkbert_training/bert_model3', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 2500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.110.1.2:8470"
    }
  }
}
isolate_session_state: true
, '_keep_checkpoint_max': 5, '_k

In [None]:
# Start pre-training; max_steps is set to the configured TRAIN_STEPS budget.
estimator.train(
    input_fn=train_input_fn,
    max_steps=TRAIN_STEPS,
)

2022-08-23 15:46:27,033 :  Querying Tensorflow master (grpc://10.110.1.2:8470) for TPU system metadata.
2022-08-23 15:46:27.246282: W tensorflow/core/distributed_runtime/rpc/grpc_session.cc:370] GrpcSession::ListDevices will initialize the session with an empty graph and other defaults because the session has not yet been created.
2022-08-23 15:46:27,284 :  Found TPU system:
2022-08-23 15:46:27,285 :  *** Num TPU Cores: 8
2022-08-23 15:46:27,287 :  *** Num TPU Workers: 1
2022-08-23 15:46:27,289 :  *** Num TPU Cores Per Worker: 8
2022-08-23 15:46:27,291 :  *** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, -1, 7448239727632855550)
2022-08-23 15:46:27,293 :  *** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 17791925804359808359)
2022-08-23 15:46:27,295 :  *** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 2447259900016301126)
2022-08-23 15:46:27,296 : 