In [None]:
# Install the SentencePiece tokenizer library (no version is pinned here).
!pip install sentencepiece
# Pin TensorFlow to 1.15: later cells use the TF 1.x-only `tf.contrib` and
# `tf.gfile` namespaces.
!pip install -U tensorflow==1.15

In [None]:
import os
import sys
import json
import nltk
import random

import tensorflow as tf
import sentencepiece as spm

from glob import glob

In [None]:
# Display the TensorFlow version that was actually imported, as a quick check
# against the 1.15 pin requested in the install cell.
tf.__version__

In [None]:
# Detect TPU: the runtime is treated as TPU-backed when the TPU_NAME
# environment variable is set to a non-empty value.
# `.get()` is used instead of indexing so that a machine without the variable
# ends up with USE_TPU = False rather than raising KeyError.
USE_TPU = bool(os.environ.get("TPU_NAME"))

In [None]:
# Download a zip snapshot of the google-research/bert master branch, unpack
# it, and rename the extracted folder to `bert` (the package name the import
# cell below relies on).
!wget https://github.com/google-research/bert/archive/refs/heads/master.zip
!unzip master.zip
!mv bert-master/ bert

In [None]:
# Put the unpacked BERT sources on the module search path.
# NOTE(review): this is a hard-coded absolute path (looks like the Kaggle
# working directory) -- it must be adjusted when running anywhere else.
# NOTE(review): presumably needed because the BERT scripts import their
# sibling modules by bare name -- confirm against the downloaded sources.
sys.path.append("/kaggle/working/bert/")

# Model definition, optimizer and tokenizer modules from the BERT repo.
from bert import modeling, optimization, tokenization
# Builders for the pre-training input pipeline and the Estimator model_fn.
from bert.run_pretraining import input_fn_builder, model_fn_builder

In [None]:
# Fetch the gzip-compressed English OpenSubtitles v2016 monolingual corpus
# from OPUS and store it as dataset.txt.gz.
!wget http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2016/mono/OpenSubtitles.raw.en.gz -O dataset.txt.gz
# Decompress in place, which leaves dataset.txt behind.
!gunzip dataset.txt.gz
# List the working directory with human-readable sizes to verify the download.
! ls -lh
# !tailx -5f dataset.txt

In [None]:
# Switch for a quick smoke test that works on a small slice of the corpus.
TRIAL_RUN = True
if TRIAL_RUN:
    # Number of leading lines of dataset.txt kept for the trial file.
    LINE_COUNT = 1000
    # The trial file name embeds the line count, e.g. dataset_1000.txt.
    FNAME = "dataset_{}.txt".format(LINE_COUNT)
    # Copy the first LINE_COUNT lines of the corpus into the trial file.
    os.system("head -n {} dataset.txt > {}".format(LINE_COUNT, FNAME))


In [None]:
# Corpus fed to the tokenizer trainer: the trial slice when TRIAL_RUN is on,
# otherwise the full dataset.
DATA_PATH = FNAME if TRIAL_RUN else "dataset.txt"
print("Data Path: {}".format(DATA_PATH))

# Prefix of the files SentencePiece writes; a later cell reads
# "<MODEL_PREFIX>.vocab".
MODEL_PREFIX = "tokenizer"
# Final size of the BERT vocabulary.
VOC_SIZE = 800
# Slots held back from SentencePiece; a later cell fills the remainder of the
# vocabulary with control symbols and [UNUSED_i] entries.
NUM_PLACEHOLDERS = 50

# Space-separated flag string handed to the SentencePiece trainer.
SPM_COMMAND = " ".join([
    "--input={}".format(DATA_PATH),
    "--model_prefix={}".format(MODEL_PREFIX),
    "--vocab_size={}".format(VOC_SIZE - NUM_PLACEHOLDERS),
    "--shuffle_input_sentence=true",
    "--bos_id=-1",
    "--eos_id=-1",
])

spm.SentencePieceTrainer.Train(SPM_COMMAND)

In [None]:
def read_sentencepiece_vocab(filepath):
    """Load the token column of a SentencePiece ``.vocab`` file.

    Every line is split on tab characters and only its first field is kept.
    The entry from the first line is dropped before returning.

    Args:
        filepath: Path to the UTF-8 encoded vocabulary file.

    Returns:
        List of token strings in file order, without the first entry.
    """
    with open(filepath, encoding='utf-8') as vocab_file:
        tokens = [row.split("\t")[0] for row in vocab_file]
    # skip the first <unk> token
    return tokens[1:]

# Load the vocabulary written by the SentencePiece trainer, then report its
# size together with ten randomly drawn tokens as a sanity check.
snt_vocab = read_sentencepiece_vocab(MODEL_PREFIX + ".vocab")
print("Learnt vocab size:", len(snt_vocab))
print("Sample tokens:", random.sample(snt_vocab, 10))

In [None]:
def parse_sentencepiece_token(token):
    """Rewrite one SentencePiece token in the "##" continuation notation.

    A token that starts with the "▁" marker is returned with that first
    character removed; any other token is returned with "##" prepended.

    Args:
        token: A single SentencePiece vocabulary entry.

    Returns:
        The converted token string.
    """
    starts_with_marker = token.startswith("▁")
    return token[1:] if starts_with_marker else "##" + token
        
# Control symbols occupy the first vocabulary ids, followed by the learnt
# tokens converted to "##" notation.
ctrl_symbols = ["[PAD]","[UNK]","[CLS]","[SEP]","[MASK]"]
bert_vocab = ctrl_symbols + [parse_sentencepiece_token(piece) for piece in snt_vocab]

# Pad with placeholder entries until the vocabulary holds VOC_SIZE tokens.
# The range is evaluated once, before any placeholder is appended.
bert_vocab.extend(
    "[UNUSED_{}]".format(i) for i in range(VOC_SIZE - len(bert_vocab))
)
print(len(bert_vocab))

In [None]:
# File name of the BERT vocabulary written to the working directory.
VOC_FNAME = "en-vocab.txt"

# Write one token per line. The encoding is fixed to UTF-8 to mirror how the
# SentencePiece vocabulary was read: the tokens can contain non-ASCII
# characters, and the platform default encoding is not guaranteed to be UTF-8.
with open(VOC_FNAME, "w", encoding="utf-8") as fo:
    for token in bert_vocab:
        fo.write(token + "\n")

In [None]:
# Sanity check: tokenize one sentence with the freshly written vocabulary and
# display the resulting pieces as the cell output.
sample_text = "legal jurisdictions exercise their right to determine who is recognized as being a lawyer"
tokenizer = tokenization.FullTokenizer(vocab_file=VOC_FNAME)
tokenizer.tokenize(sample_text)

In [None]:
!mkdir data_splits/
if TRIAL_RUN:
    !split -l 1000 -d dataset_1000.txt data_splits/data_
else:
    !split -l 10000 -d dataset.txt data_splits/data_

In [None]:
# Settings forwarded to bert/create_pretraining_data.py by the next cell.

# Directory that receives the generated .tfrecord files.
PRETRAINING_DIR = "pretraining_data"

# Number of create_pretraining_data.py jobs run in parallel.
PROCESSES = 4

# Values passed as --do_lower_case, --max_seq_length,
# --max_predictions_per_seq and --masked_lm_prob respectively.
DO_LOWER_CASE = True
MAX_SEQ_LENGTH = 128
MAX_PREDICTIONS = 20
MASKED_LM_PROB = 0.15

In [None]:

XARGS_CMD = ("ls data_splits/ | "
             "xargs -n 1 -P {} -I{} "
             "python bert/create_pretraining_data.py "
             "--input_file=data_splits/{} "
             "--output_file={}/{}.tfrecord "
             "--vocab_file={} "
             "--do_lower_case={} "
             "--max_predictions_per_seq={} "
             "--max_seq_length={} "
             "--masked_lm_prob={} "
             "--random_seed=42 "
             "--dupe_factor=5")

XARGS_CMD = XARGS_CMD.format(PROCESSES, '{}', '{}', PRETRAINING_DIR, '{}', 
                             VOC_FNAME, DO_LOWER_CASE, 
                             MAX_PREDICTIONS, MAX_SEQ_LENGTH, MASKED_LM_PROB)
                             
tf.gfile.MkDir(PRETRAINING_DIR)
!$XARGS_CMD

In [None]:
# Name of the GCS bucket the artifacts are copied to; the upload cell further
# down is skipped when this is empty.
BUCKET_NAME = "bert_resourses"
# Local directory that collects the BERT config file and the vocabulary.
MODEL_DIR = "bert_model"
tf.io.gfile.mkdir(MODEL_DIR)


In [None]:
# Model hyper-parameters serialized to bert_config.json. Only "vocab_size" is
# derived from this notebook (VOC_SIZE); the rest are fixed literals.
bert_base_config = {
    "attention_probs_dropout_prob": 0.1,
    "directionality": "bidi",
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "max_position_embeddings": 512,
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "pooler_fc_size": 768,
    "pooler_num_attention_heads": 12,
    "pooler_num_fc_layers": 3,
    "pooler_size_per_head": 128,
    "pooler_type": "first_token_transform",
    "type_vocab_size": 2,
    "vocab_size": VOC_SIZE
}

# Store the configuration next to the model artifacts.
with open(os.path.join(MODEL_DIR, "bert_config.json"), "w") as fo:
    json.dump(bert_base_config, fo, indent=4)

# Keep a copy of the vocabulary inside the model directory, one token per
# line. UTF-8 is set explicitly because the tokens can contain non-ASCII
# characters and the platform default encoding is not guaranteed to be UTF-8.
with open(os.path.join(MODEL_DIR, VOC_FNAME), "w", encoding="utf-8") as fo:
    for token in bert_vocab:
        fo.write(token + "\n")

In [None]:
## Upload the model and pre-training data directories to GCS. Either
## authenticate in this session and upload from here, or download both
## directories and upload them to GCS separately, so training can run on a TPU.
## The copy is skipped when BUCKET_NAME is empty.
if BUCKET_NAME:
  !gsutil -m cp -r $MODEL_DIR $PRETRAINING_DIR gs://$BUCKET_NAME

In [None]:
# Training-session configuration. Several names from earlier cells are
# repeated here (presumably so this part can be run in a fresh session --
# confirm).
BUCKET_NAME = "bert_resourses"
MODEL_DIR = "bert_model"
PRETRAINING_DIR = "pretraining_data"
# Must match the name the vocabulary was written under inside MODEL_DIR.
# The previous value "vocab.txt" pointed at a file that is never created.
VOC_FNAME = "en-vocab.txt"

# Input data pipeline config
TRAIN_BATCH_SIZE = 128
MAX_PREDICTIONS = 20
MAX_SEQ_LENGTH = 128
MASKED_LM_PROB = 0.15

# Training procedure config
EVAL_BATCH_SIZE = 64
LEARNING_RATE = 2e-5
TRAIN_STEPS = 100
SAVE_CHECKPOINTS_STEPS = 25
NUM_TPU_CORES = 8

# Read from the bucket when one is configured, otherwise from the local
# working directory.
if BUCKET_NAME:
    BUCKET_PATH = "gs://{}".format(BUCKET_NAME)
else:
    BUCKET_PATH = "."

BERT_GCS_DIR = "{}/{}".format(BUCKET_PATH, MODEL_DIR)
DATA_GCS_DIR = "{}/{}".format(BUCKET_PATH, PRETRAINING_DIR)

VOCAB_FILE = os.path.join(BERT_GCS_DIR, VOC_FNAME)
CONFIG_FILE = os.path.join(BERT_GCS_DIR, "bert_config.json")

# Latest checkpoint found in the model directory; None when there is none yet.
INIT_CHECKPOINT = tf.train.latest_checkpoint(BERT_GCS_DIR)

bert_config = modeling.BertConfig.from_json_file(CONFIG_FILE)
input_files = tf.gfile.Glob(os.path.join(DATA_GCS_DIR,'*tfrecord'))

# Fail early with a clear message instead of deep inside the input pipeline.
if not input_files:
    raise ValueError(
        "No *tfrecord files found in {}".format(DATA_GCS_DIR))


In [None]:
# Build the Estimator model function for BERT pre-training.
model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=INIT_CHECKPOINT,
      learning_rate=LEARNING_RATE,
      num_train_steps=TRAIN_STEPS,
      num_warmup_steps=10,
      use_tpu=USE_TPU,
      use_one_hot_embeddings=True)

# TPU_ADDRESS was previously referenced without ever being defined
# (NameError). It is taken from the same TPU_NAME environment variable that
# drives USE_TPU, and the resolver is only created when a TPU is available;
# otherwise the run config gets cluster=None.
TPU_ADDRESS = os.environ.get("TPU_NAME")
tpu_cluster_resolver = None
if USE_TPU and TPU_ADDRESS:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)

# Checkpoint every SAVE_CHECKPOINTS_STEPS steps, with the same number of steps
# per TPU loop iteration.
run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    model_dir=BERT_GCS_DIR,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=SAVE_CHECKPOINTS_STEPS,
        num_shards=NUM_TPU_CORES,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=USE_TPU,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE)

# Input pipeline over the generated TFRecord files, in training mode.
train_input_fn = input_fn_builder(
        input_files=input_files,
        max_seq_length=MAX_SEQ_LENGTH,
        max_predictions_per_seq=MAX_PREDICTIONS,
        is_training=True)

In [None]:
# Launch pre-training on the TFRecord input pipeline, capped at TRAIN_STEPS.
estimator.train(input_fn=train_input_fn, max_steps=TRAIN_STEPS)