In [1]:
import os
import time
import math
import logging
import yaml

import tensorflow_text as text
import tensorflow as tf

import dataset
import model
import traintest
import telemetry
import common.training as training

2023-06-17 01:11:27.465770: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-06-17 01:11:27.465794: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-06-17 01:11:29.107575: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-06-17 01:11:29.107604: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-06-17 01:11:29.107626: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (mingkaichen-Macmini): /proc/driver/nvidia/version does not exist


# Setup

In [2]:
logging.basicConfig(filename="tmp/4_persentence_pretrain_mlm_15p.log", 
                    format='%(asctime)s %(message)s', 
                    filemode='w')
logger=logging.getLogger() 
logger.setLevel(logging.DEBUG) 

In [3]:
# Pretraining configuration: tokenizer model path plus the training, model and
# training-loop settings used by the cells below.
with open("configs/pretrain_1.yaml") as f:
    _args = yaml.safe_load(f)
tokenizer_filename = _args["tokenizer_model"]
training_args = _args["training_args"]
model_args = _args["model_args"]
training_settings = _args["training_settings"]

# Dataset configuration: dataset location, shard directory and tokenizer flags.
with open("configs/ps_dataset_1.yaml") as f:
    _args = yaml.safe_load(f)
dataset_path = _args["dataset"]
dataset_shard_dirpath = _args["shard_directory"]
tokenizer_setup = _args["tokenizer_args"]

# The SentencePiece model is read as raw bytes and handed to the tokenizer.
with open(tokenizer_filename, 'rb') as f:
    tokenizer = text.SentencepieceTokenizer(
        model=f.read(),
        out_type=tf.int32,
        add_bos=tokenizer_setup["add_bos"],
        add_eos=tokenizer_setup["add_eos"],
    )

# generate or retrieve cached values
# NOTE(review): each generator is presumably called with (dataset_path, tokenizer);
# the vocab_size generator ignores its first argument.
_cache_generators = {
    "dataset_width": dataset.generate_dataset_width,
    "vocab_size": lambda _, tokenizer: int(tokenizer.vocab_size().numpy()),
}
cached_args = dataset.cache_values(
    "configs/pretrain_cache.yaml", _cache_generators, dataset_path, tokenizer)

# Dataset Arguments

In [4]:
# Values shared by both shapes below.
_batch_size = training_args["batch_size"]
_dataset_width = cached_args["dataset_width"]

# (batch_size, dataset_width)
batch_shape = (_batch_size, _dataset_width)
# (batch_size, latent_dim, dataset_width)
latent_shape = (_batch_size, model_args["latent_dim"], _dataset_width)

training_shards, training_keys = dataset.setup_shards(
    dataset_shard_dirpath, training_args, tokenizer, logger=logger)

# Model and Training Arguments

In [5]:
# Adam driven by the project's custom learning-rate schedule, which is
# constructed from the model dimension.
learning_rate = training.CustomSchedule(model_args["model_dim"])
optimizer = tf.keras.optimizers.Adam(
    learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

# Feed the cached values first, then the model settings, into the builder
# that produces the model's init parameters.
_builder = model.InitParamBuilder()
for _source in (cached_args, model_args):
    for param in _source:
        _builder.add_param(_source[param], param)
_args = _builder.build()

# Metadata attached to the pretrainer: the tokenizer model path and the
# optimizer's iteration counter.
_metadata = (
    model.PretrainerMLMMetadataBuilder()
    .tokenizer_meta(tokenizer_filename)
    .optimizer_iter(optimizer.iterations)
    .build()
)
pretrainer = model.PretrainerMLM(tokenizer=tokenizer, params=_args, metadata=_metadata)

# Alternating Portuguese / English sentences handed to the tester.
_test_samples = [
    "este é um problema que temos que resolver.",
    "this is a problem we have to solve .",
    "os meus vizinhos ouviram sobre esta ideia.",
    "and my neighboring homes heard about this idea .",
    "vou então muito rapidamente partilhar convosco algumas histórias de algumas coisas mágicas que aconteceram.",
    "so i \'ll just share with you some stories very quickly of some magical things that have happened .",
    "este é o primeiro livro que eu fiz.",
    "this is the first book i've ever done.",
]
run_test = traintest.build_tester(
    pretrainer,
    samples=_test_samples,
    mask_sizes=batch_shape,
    logger=logger,
)

# Training step callable plus the loss / accuracy objects it is paired with.
train_batch, train_loss, train_accuracy = traintest.build_pretrainer(
    pretrainer, optimizer, batch_shape)

# Checkpoints track both the pretrainer and the optimizer; the manager keeps
# at most 50 of them on disk.
ckpt = tf.train.Checkpoint(pretrainer=pretrainer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(
    ckpt,
    directory="tmp/checkpoints/train_15p_ps",
    max_to_keep=50,
)

# Resume from the most recent checkpoint when one is available.
_latest_checkpoint = ckpt_manager.latest_checkpoint
if _latest_checkpoint:
    ckpt.restore(_latest_checkpoint)
    logger.info('Latest checkpoint restored!!')

# Settings for the quota bucket and the loss window built below.
QUOTA_BUCKET_CAPACITY = 50
QUOTA_BUCKET_RECOVER = 10
QUOTA_BUCKET_RECOVER_RATE = 50
SKIP_LOSS_WINDOW = 200

# The bucket's first argument is the configured "skip_bad_loss" warmup value.
_skip_warmup = training_settings["skip_bad_loss"]["warmup"]
bucket = training.QuotaBucket(
    _skip_warmup,
    bucket_capacity=QUOTA_BUCKET_CAPACITY,
    bucket_recover=QUOTA_BUCKET_RECOVER,
    bucket_recover_rate=QUOTA_BUCKET_RECOVER_RATE,
)
prev_losses = training.LossWindow(capacity=SKIP_LOSS_WINDOW)
# Reporter built from the logger and tokenizer; passed to run_epoch as nan_reporter.
nan_reporter = telemetry.tokens_reporter(logger, tokenizer)

def run_epochs(epochs, skip_shards=None):
    """Train for `epochs` epochs, periodically checkpointing and testing.

    One `traintest.EpochPretrainer` is built from the notebook-level training
    state (settings, loss/accuracy objects, shards, checkpoint manager, quota
    bucket and loss window) and reused for every epoch.

    Args:
        epochs: number of epochs to run.
        skip_shards: forwarded unchanged to `trainer.run_epoch` on every epoch
            (presumably the shards to skip -- confirm against
            `EpochPretrainer.run_epoch`).
    """
    def _train_step(batch, lengths, loss_check):
        # Bind the configured masking/context rates to the training step.
        return train_batch(
            batch, lengths, loss_check,
            mask_rate=training_args["mask_rate"],
            context_rate=training_args["context_rate"])

    init_args = (
        traintest.EpochPretrainerInitBuilder()
        .training_settings(training_settings)
        .training_loss(train_loss)
        .training_accuracy(train_accuracy)
        .training_shards(training_shards)
        .training_cb(_train_step)
        .ckpt_manager(ckpt_manager)
        .bucket(bucket)
        .prev_losses(prev_losses)
        .build()
    )
    trainer = traintest.EpochPretrainer(init_args)

    for epoch in range(1, epochs + 1):
        # Monotonic clock: the measured duration is unaffected by wall-clock changes.
        epoch_start = time.perf_counter()
        sublogger = telemetry.PrefixAdapter(logger, 'Epoch {}'.format(epoch))
        trainer.run_epoch(skip_shards, logger=sublogger, nan_reporter=nan_reporter)
        logger.info('Epoch %d finished in %.2f secs',
                    epoch, time.perf_counter() - epoch_start)

        if epoch % training_settings["epochs_per_save"] == 0:
            logger.info('Saving checkpoint for epoch %d at %s', epoch, ckpt_manager.save())

        # Test only on every `epochs_per_test`-th epoch.
        if epoch % training_settings["epochs_per_test"] == 0:
            run_test()

# Training

In [6]:
# Run the tester built above (traintest.build_tester over the sample sentences)
# once before any epochs are run in this session.
run_test()

In [None]:
# Directory the trained pretrainer is exported to.
save_name = 'export/berte_pretrain_mlm_15p'

# Train for a single epoch, then write the pretrainer out as a SavedModel.
run_epochs(1)
tf.saved_model.save(pretrainer, save_name)