In [1]:
import os

# Hide every GPU from TensorFlow so the whole notebook runs on CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = ''
# Fall back to the author's service-account key only when the caller has not
# already configured credentials; an externally provided path is respected.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/home/husein/t5/prepare/mesolitica-tpu.json',
)

In [2]:
# !wget https://f000.backblazeb2.com/file/malaya-model/bert-bahasa/seq2seq-base-500k-09-09-2020.tar.gz
# !tar -xf seq2seq-base-500k-09-09-2020.tar.gz
# !rm seq2seq-base-500k-09-09-2020.tar.gz

In [3]:
from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_problems
from tensor2tensor.data_generators import translate
from tensor2tensor.utils import registry
from tensor2tensor import problems
import tensorflow as tf
import os
import logging

# Handle on the root logger; no later cell in this notebook reads it.
logger = logging.getLogger()
# Emit TensorFlow log messages at DEBUG level and above.
tf.logging.set_verbosity(tf.logging.DEBUG)

In [4]:
import sentencepiece as spm

# SentencePiece model file, resolved relative to the working directory.
vocab = 'sp10m.cased.t5.model'
sp = spm.SentencePieceProcessor()
# Load the model into the processor that the Encoder instances wrap.
sp.Load(vocab)

class Encoder:
    """Thin text-encoder adapter around a SentencePiece processor.

    Exposes the ``encode`` / ``decode`` / ``vocab_size`` surface that
    ``Seq2Seq.feature_encoders`` hands to tensor2tensor.

    Args:
        sp: an object providing ``GetPieceSize``, ``EncodeAsIds`` and
            ``DecodeIds`` (a loaded ``SentencePieceProcessor``).
    """

    def __init__(self, sp):
        self.sp = sp
        # 100 ids are reserved on top of the SentencePiece vocabulary
        # (presumably T5-style sentinel tokens -- confirm against the vocab).
        self.vocab_size = sp.GetPieceSize() + 100

    def encode(self, s):
        """Return the list of piece ids for the string ``s``."""
        return self.sp.EncodeAsIds(s)

    def decode(self, ids, strip_extraneous=False):
        """Return the text for an iterable of piece ids.

        ``ids`` may be a list, tuple or NumPy array. Every element is coerced
        to a built-in ``int`` because the SentencePiece binding may reject
        NumPy integer scalars. ``strip_extraneous`` is accepted for interface
        compatibility and ignored.
        """
        return self.sp.DecodeIds([int(i) for i in ids])
    
# Module-level encoder wrapping the loaded SentencePiece processor.
encoder = Encoder(sp)

In [5]:
from tqdm import tqdm
from glob import glob

@registry.register_problem
class Seq2Seq(text_problems.Text2TextProblem):
    """Text-to-text problem whose features are encoded with SentencePiece."""

    @property
    def approx_vocab_size(self):
        """Approximate vocabulary size reported for this problem."""
        return 32100

    @property
    def is_generate_per_split(self):
        """Data is not generated separately for each split."""
        return False

    def feature_encoders(self, data_dir):
        """Return one shared encoder for both features; ``data_dir`` is ignored."""
        shared_encoder = Encoder(sp)
        return dict.fromkeys(("inputs", "targets"), shared_encoder)


In [6]:
# Registry name looked up below; presumably the snake_case form that
# tensor2tensor derives from the registered Seq2Seq class -- confirm.
PROBLEM = 'seq2_seq'
t2t_problem = problems.problem(PROBLEM)

In [7]:
import tensorflow as tf
import os

# Local checkpoint prefix that the non-student variables are restored from
# once the session is created.
ckpt_path = 'base/model.ckpt-500000'
# Alternative: resolve the newest checkpoint from the GCS training directory.
# ckpt_path = tf.train.latest_checkpoint('gs://mesolitica-tpu-general/t2t-base/')
# ckpt_path

In [8]:
from tensor2tensor import models
from tensor2tensor import problems
from tensor2tensor.layers import common_layers
from tensor2tensor.utils import trainer_lib
from tensor2tensor.utils import t2t_model
from tensor2tensor.utils import registry
from tensor2tensor.utils import metrics
from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_problems
from tensor2tensor.data_generators import translate
from tensor2tensor.utils import registry







In [14]:
def input_fn_builder(
    input_files,
    max_seq_length_encoder,
    max_seq_length_decoder,
    is_training,
    num_cpu_threads = 4,
):
    """Build an ``input_fn`` that reads fixed-length inputs/targets from TFRecords.

    Args:
        input_files: list of TFRecord paths. Each record is parsed as two
            variable-length int64 features, ``inputs`` and ``targets``.
        max_seq_length_encoder: length every ``inputs`` row is truncated or
            zero-padded to.
        max_seq_length_decoder: length every ``targets`` row is truncated or
            zero-padded to.
        is_training: when true, file names are shuffled, read through a sloppy
            parallel interleave and records are shuffled again; otherwise the
            files are read sequentially.
        num_cpu_threads: upper bound on the interleave cycle length and the
            number of parallel batches.

    Returns:
        ``input_fn(params)``, which reads ``params['batch_size']`` and returns
        an endlessly repeating ``tf.data.Dataset`` of feature dicts whose
        int64 tensors have been cast to int32.
    """

    data_fields = {
        'inputs': tf.VarLenFeature(tf.int64),
        'targets': tf.VarLenFeature(tf.int64),
    }
    data_len = {
        'inputs': max_seq_length_encoder,
        'targets': max_seq_length_decoder,
    }

    def parse(serialized_example):
        """Parse one record into dense id vectors of the configured lengths."""
        parsed = tf.parse_single_example(
            serialized_example, features = data_fields
        )
        fixed = {}
        for key, sparse in parsed.items():
            max_len = data_len[key]
            # Truncate before padding: tf.pad fails at runtime on a negative
            # padding amount, which is what an over-long record would produce.
            values = sparse.values[:max_len]
            values = tf.pad(values, [[0, max_len - tf.shape(values)[0]]])
            values.set_shape([max_len])
            fixed[key] = values

        return fixed

    def input_fn(params):
        """Assemble the dataset for the batch size given in ``params``."""
        batch_size = params['batch_size']

        if is_training:
            d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
            d = d.repeat()
            # Shuffle at file level first, then at record level below.
            d = d.shuffle(buffer_size = len(input_files))
            cycle_length = min(num_cpu_threads, len(input_files))
            d = d.apply(
                tf.contrib.data.parallel_interleave(
                    tf.data.TFRecordDataset,
                    sloppy = is_training,
                    cycle_length = cycle_length,
                )
            )
            d = d.shuffle(buffer_size = 100)
        else:
            d = tf.data.TFRecordDataset(input_files)
            d = d.repeat()
        d = d.map(parse, num_parallel_calls = 32)
        d = d.apply(
            tf.contrib.data.map_and_batch(
                lambda record: _decode_record(record, data_fields),
                batch_size = batch_size,
                num_parallel_batches = num_cpu_threads,
                drop_remainder = True,
            )
        )
        return d

    return input_fn

def _decode_record(example, name_to_features):
    for name in list(example.keys()):
        t = example[name]
        if t.dtype == tf.int64:
            t = tf.to_int32(t)
        example[name] = t

    return example

In [11]:
# Every file under the GCS data directory whose name starts with "seq2";
# the list is later handed to input_fn_builder as TFRecord inputs.
input_files = tf.gfile.Glob('gs://mesolitica-tpu-general/t2t/data/seq2*')
len(input_files)

251

In [19]:
train_input_fn = input_fn_builder(
    input_files = input_files,
    max_seq_length_encoder = 1024,
    max_seq_length_decoder = 1024,
    is_training = True,
)
# Keep the Dataset and the fetched batch under separate names, and go through
# the public one-shot-iterator helper instead of the private Dataset method.
train_dataset = train_input_fn({'batch_size': 2})
dataset = tf.compat.v1.data.make_one_shot_iterator(train_dataset).get_next()
X = dataset['inputs']
Y = dataset['targets']
X, Y

(<tf.Tensor 'IteratorGetNext_1:0' shape=(2, 1024) dtype=int32>,
 <tf.Tensor 'IteratorGetNext_1:1' shape=(2, 1024) dtype=int32>)

In [20]:
class Model:
    """Base-size tensor2tensor transformer built over a batch of token ids.

    In this notebook the instance built outside any variable scope is the
    one whose weights are restored from ``ckpt_path`` and whose logits
    provide the distillation targets.

    Args:
        X: integer tensor of input ids (presumably ``(batch, length)``);
            non-zero entries are counted to form ``X_seq_len``.
        Y: integer tensor of target ids with the same layout.
        HPARAMS: name of the registered hparams set used as a starting point.
        DATA_DIR: data directory forwarded to ``trainer_lib.create_hparams``.
        MODE: optional ``tf.estimator.ModeKeys`` value for the underlying
            model. ``None`` (the default) means ``TRAIN``, which was
            previously hard-coded.
            NOTE(review): under ``TRAIN`` the 0.1 dropout set below is
            presumably active; when this model acts as a frozen teacher,
            consider passing ``tf.estimator.ModeKeys.EVAL`` -- confirm
            against tensor2tensor's mode handling.
    """

    def __init__(self, X, Y, HPARAMS = "transformer_base", DATA_DIR = 't2t/data', MODE = None):
        if MODE is None:
            MODE = tf.estimator.ModeKeys.TRAIN

        self.X = X
        self.Y = Y

        # Number of non-zero ids in each row of X.
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)

        # Two trailing axes are appended to the id tensors before they are
        # handed to the tensor2tensor model.
        features = {
            "inputs": tf.expand_dims(tf.expand_dims(self.X, -1), -1),
            "targets": tf.expand_dims(tf.expand_dims(self.Y, -1), -1),
            "target_space_id": tf.constant(1, dtype=tf.int32),
        }
        self.features = features

        hparams = trainer_lib.create_hparams(HPARAMS, data_dir=DATA_DIR, problem_name=PROBLEM)
        # Architecture overrides on top of the named hparams set.
        hparams.filter_size = 3072
        hparams.hidden_size = 768
        hparams.num_heads = 12
        hparams.num_hidden_layers = 8
        hparams.vocab_divisor = 128
        hparams.label_smoothing = 0.0
        hparams.shared_embedding_and_softmax_weights = False
        hparams.dropout = 0.1
        hparams.max_length = 1024
        hparams.multiproblem_mixing_schedule = "pretrain"

        hparams.optimizer = "Adafactor"
        hparams.learning_rate_warmup_steps = 10000
        hparams.learning_rate_schedule = "rsqrt_decay"

        translate_model = registry.model('transformer')(hparams, MODE)
        self.translate_model = translate_model
        # The second return value of the model call is discarded.
        logits, _ = translate_model(features)
        self.logits = logits

In [21]:
class StudentModel:
    """Smaller tensor2tensor transformer built under the ``student`` scope.

    Every variable is created inside ``variable_scope('student')``, which is
    what lets the rest of the notebook separate student variables from the
    others by the ``'student/'`` name prefix.

    Args:
        X: integer tensor of input ids (presumably ``(batch, length)``);
            non-zero entries are counted to form ``X_seq_len``.
        Y: integer tensor of target ids with the same layout.
        HPARAMS: name of the registered hparams set used as a starting point.
        DATA_DIR: data directory forwarded to ``trainer_lib.create_hparams``.
        MODE: optional ``tf.estimator.ModeKeys`` value for the underlying
            model. ``None`` (the default) means ``TRAIN``, which was
            previously hard-coded.
    """

    def __init__(self, X, Y, HPARAMS = "transformer_base", DATA_DIR = 't2t/data', MODE = None):
        if MODE is None:
            MODE = tf.estimator.ModeKeys.TRAIN

        with tf.compat.v1.variable_scope('student'):

            self.X = X
            self.Y = Y

            # Number of non-zero ids in each row of X.
            self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)

            # Two trailing axes are appended to the id tensors before they
            # are handed to the tensor2tensor model.
            features = {
                "inputs": tf.expand_dims(tf.expand_dims(self.X, -1), -1),
                "targets": tf.expand_dims(tf.expand_dims(self.Y, -1), -1),
                "target_space_id": tf.constant(1, dtype=tf.int32),
            }
            self.features = features

            hparams = trainer_lib.create_hparams(HPARAMS, data_dir=DATA_DIR, problem_name=PROBLEM)
            # Narrower and shallower than Model: hidden size 312, 4 layers.
            hparams.filter_size = 1200
            hparams.hidden_size = 312
            hparams.num_heads = 12
            hparams.num_hidden_layers = 4
            hparams.vocab_divisor = 128
            hparams.label_smoothing = 0.0
            hparams.shared_embedding_and_softmax_weights = False
            hparams.dropout = 0.1
            hparams.max_length = 1024
            hparams.multiproblem_mixing_schedule = "pretrain"

            hparams.optimizer = "Adafactor"
            hparams.learning_rate_warmup_steps = 10000
            hparams.learning_rate_schedule = "rsqrt_decay"

            translate_model = registry.model('transformer')(hparams, MODE)
            self.translate_model = translate_model
            # The second return value of the model call is discarded.
            logits, _ = translate_model(features)
            self.logits = logits

In [22]:
# Build both networks on the same batch tensors: the base-size model outside
# any scope and the smaller one under the 'student' variable scope.
model = Model(X, Y)
student = StudentModel(X, Y)

Instructions for updating:
reduction_indices is deprecated, use axis instead


Instructions for updating:
reduction_indices is deprecated, use axis instead


INFO:tensorflow:Setting T2TModel mode to 'train'


INFO:tensorflow:Setting T2TModel mode to 'train'


INFO:tensorflow:Using variable initializer: uniform_unit_scaling


INFO:tensorflow:Using variable initializer: uniform_unit_scaling






Instructions for updating:
Use `tf.cast` instead.


Instructions for updating:
Use `tf.cast` instead.


INFO:tensorflow:Transforming feature 'inputs' with symbol_modality_32128_768.bottom


INFO:tensorflow:Transforming feature 'inputs' with symbol_modality_32128_768.bottom


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


INFO:tensorflow:Transforming feature 'targets' with symbol_modality_32128_768.targets_bottom


INFO:tensorflow:Transforming feature 'targets' with symbol_modality_32128_768.targets_bottom


INFO:tensorflow:Building model body


INFO:tensorflow:Building model body


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where






INFO:tensorflow:Transforming body output with symbol_modality_32128_768.top


INFO:tensorflow:Transforming body output with symbol_modality_32128_768.top


INFO:tensorflow:Setting T2TModel mode to 'train'


INFO:tensorflow:Setting T2TModel mode to 'train'


INFO:tensorflow:Using variable initializer: uniform_unit_scaling


INFO:tensorflow:Using variable initializer: uniform_unit_scaling


INFO:tensorflow:Transforming feature 'inputs' with symbol_modality_32128_312.bottom


INFO:tensorflow:Transforming feature 'inputs' with symbol_modality_32128_312.bottom


INFO:tensorflow:Transforming feature 'targets' with symbol_modality_32128_312.targets_bottom


INFO:tensorflow:Transforming feature 'targets' with symbol_modality_32128_312.targets_bottom


INFO:tensorflow:Building model body


INFO:tensorflow:Building model body


INFO:tensorflow:Transforming body output with symbol_modality_32128_312.top


INFO:tensorflow:Transforming body output with symbol_modality_32128_312.top


In [23]:
def padded_cross_entropy_loss(logits, labels, smoothing = 0.0, vocab_size = 32128):
    """Label-smoothed softmax cross entropy, summed over non-padding positions.

    Args:
        logits: float tensor whose last axis has ``vocab_size`` entries.
        labels: integer tensor of target ids, or ``None``. Positions whose
            label is 0 are treated as padding and get weight 0.
        smoothing: probability mass moved from the true class and spread
            evenly over the remaining ``vocab_size - 1`` classes.
        vocab_size: depth of the one-hot target distribution.

    Returns:
        With labels: a tuple ``(summed_xent, weights)``. ``summed_xent`` is
        the scalar *sum* (not mean) of the per-position cross entropy over
        positions with a non-zero label, and ``weights`` is the float32 0/1
        mask with the same shape as ``labels``.
        With ``labels is None``: a single scalar ``0.0`` constant.
        NOTE(review): this branch returns a bare tensor rather than a pair,
        so callers that unpack two values must always pass labels. Kept
        as-is for compatibility.
    """
    with tf.name_scope('loss'):

        if labels is None:
            return tf.constant(0.0)

        with tf.name_scope('smoothing_cross_entropy'):
            confidence = 1.0 - smoothing
            vocab_float = tf.cast(vocab_size - 1, tf.float32)
            low_confidence = (1.0 - confidence) / vocab_float
            soft_targets = tf.one_hot(
                labels,
                depth = vocab_size,
                on_value = confidence,
                off_value = low_confidence,
            )
            # The v2 op replaces the deprecated v1 call; stop_gradient on the
            # labels reproduces the v1 behaviour of not back-propagating
            # into them.
            xentropy = tf.nn.softmax_cross_entropy_with_logits_v2(
                labels = tf.stop_gradient(soft_targets), logits = logits
            )

            # Subtract the entropy of the smoothed target distribution, so
            # that a perfect prediction scores 0. The 1e-20 keeps the log
            # finite when smoothing is 0.
            normalizing_constant = -(
                confidence * tf.math.log(confidence)
                + vocab_float
                * low_confidence
                * tf.math.log(low_confidence + 1e-20)
            )
            xentropy -= normalizing_constant

        weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
        return tf.reduce_sum(xentropy * weights), weights

In [24]:
# Take index 0 on axes 2 and 3 of the student's logits, leaving a rank-3
# tensor (the recorded output shows shape (2, 1024, 32128)).
student_logits = student.logits[:,:,0,0]
student_logits

<tf.Tensor 'strided_slice:0' shape=(2, 1024, 32128) dtype=float32>

In [25]:
# Hard-label cross entropy summed over target positions with a non-zero id,
# plus the 0/1 mask of those positions, which later cells reuse.
student_task_xent, weights = padded_cross_entropy_loss(student_logits, student.Y)
student_task_xent, weights

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



(<tf.Tensor 'loss/Sum:0' shape=() dtype=float32>,
 <tf.Tensor 'loss/Cast:0' shape=(2, 1024) dtype=float32>)

In [26]:
# Softmax temperature; the student logits are divided by the same value
# when the distillation cross entropy is computed.
distill_temperature = 1.0
teacher_logits = model.logits[:, :, 0, 0]
teacher_targets = tf.nn.softmax(teacher_logits / distill_temperature)

In [27]:
# Cross entropy of the student against the teacher distribution;
# stop_gradient keeps the teacher out of back-propagation.
soft_labels = tf.stop_gradient(teacher_targets)
tempered_student_logits = student_logits / distill_temperature
per_token_distill_xent = tf.nn.softmax_cross_entropy_with_logits_v2(
    labels = soft_labels, logits = tempered_student_logits
)
# Mask out padding positions and sum what remains.
student_distill_xent = tf.reduce_sum(per_token_distill_xent * weights)
student_distill_xent

<tf.Tensor 'Sum:0' shape=() dtype=float32>

In [28]:
# Rescale the distillation term by the squared temperature.
# NOTE: this rebinds student_distill_xent in place, so re-running the cell
# applies the factor again (harmless while distill_temperature is 1.0).
student_distill_xent *= distill_temperature**2

In [29]:
# Share of the hard-label task loss in the combined objective; the
# distillation loss gets the remainder.
task_balance = 0.5
phase_loss = (
    task_balance * student_task_xent
    + (1 - task_balance) * student_distill_xent
)

In [35]:
# Normalise the summed losses by the number of non-padding target tokens.
# The count is built once and clamped to 1 so that a batch made only of
# padding yields 0 instead of NaN.
num_target_tokens = tf.maximum(tf.reduce_sum(weights), 1.0)
loss = phase_loss / num_target_tokens
task_loss = student_task_xent / num_target_tokens
distill_loss = student_distill_xent / num_target_tokens

In [31]:
# get_or_create keeps this cell re-runnable: create_global_step() raises once
# the graph already holds a global step.
global_step = tf.train.get_or_create_global_step()
learning_rate_warmup_steps = 10000
# Inverse-square-root schedule, held at rsqrt(learning_rate_warmup_steps)
# until the step count exceeds the warm-up length.
learning_rate = tf.rsqrt(
    tf.maximum(
        tf.cast(global_step, tf.float32), float(learning_rate_warmup_steps)
    )
)

In [32]:
from tensor2tensor.utils import adafactor

optimizer = adafactor.AdafactorOptimizer(
    learning_rate = learning_rate,
    decay_rate = adafactor.adafactor_decay_rate_pow(0.8),
    beta1 = 0.0,
)
# Optimise the student only. Naming its variables explicitly means freezing
# the teacher no longer depends solely on the stop_gradient applied to the
# teacher targets.
student_variables = [v for v in tf.trainable_variables() if 'student/' in v.name]
train_op = optimizer.minimize(
    loss, global_step = global_step, var_list = student_variables
)

In [33]:
sess = tf.Session()
# Initialise every variable first; the restore below then overwrites the
# non-student ones, so only the student keeps its fresh initialisation.
sess.run(tf.global_variables_initializer())
# Trainable variables outside the 'student/' scope, i.e. those built by Model.
var_lists = [v for v in tf.trainable_variables() if 'student/' not in v.name]
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, ckpt_path)

INFO:tensorflow:Restoring parameters from base/model.ckpt-500000


INFO:tensorflow:Restoring parameters from base/model.ckpt-500000


In [36]:
# Run one optimisation step and fetch the three normalised losses; the
# train_op entry itself comes back as None.
sess.run([loss, task_loss, distill_loss, train_op])

[6.537529, 6.585449, 6.4896092, None]

In [37]:
# Saver.save() does not create the checkpoint's parent directory, so make
# sure it exists when the notebook runs in a fresh working directory.
os.makedirs('student', exist_ok = True)
# Only the trainable student variables are written; the global step and
# optimizer state are not part of this checkpoint.
var_lists = [v for v in tf.trainable_variables() if 'student/' in v.name]
saver = tf.train.Saver(var_list = var_lists)
saver.save(sess, 'student/model.ckpt')

'student/model.ckpt'