In [1]:
import tensorflow as tf
import torch
from transformer.modeling import TinyBertForPreTraining, BertForMaskedLM, TinyBertForPreTraining
import bert.modeling, bert.tokenization

In [3]:
# Load the PyTorch TinyBERT checkpoint whose weights are copied into the
# TensorFlow graph later in this notebook.
pt_checkpoint_dir = './tiny-bert-bahasa-cased-combined'
pt_model = TinyBertForPreTraining.from_pretrained(pt_checkpoint_dir)

In [5]:
# Open an interactive session; it installs itself as the default session, which
# is why later cells can call `.run()` / `.eval()` without passing a session.
sess = tf.InteractiveSession()

In [6]:
import json

In [7]:
# Mirror the PyTorch model's hyper-parameters in a TensorFlow BertConfig,
# then display the resulting attributes as a JSON string for inspection.
pt_config_dict = pt_model.config.to_dict()
config = bert.modeling.BertConfig.from_dict(pt_config_dict)
json.dumps(vars(config))

'{"vocab_size": 32000, "hidden_size": 312, "num_hidden_layers": 4, "num_attention_heads": 12, "hidden_act": "gelu", "intermediate_size": 1200, "hidden_dropout_prob": 0.1, "attention_probs_dropout_prob": 0.1, "max_position_embeddings": 512, "type_vocab_size": 2, "initializer_range": 0.02, "pre_trained": "", "training": "", "cell": {}, "emb_size": 312, "structure": []}'

In [8]:
# Placeholders for the three BERT inputs; both dimensions are left dynamic.
input_ids = tf.placeholder(tf.int32, [None, None])
input_mask = tf.placeholder(tf.int32, [None, None])
token_type_ids = tf.placeholder(tf.int32, [None, None])

# Build the encoder in inference mode (is_training=False).
model = bert.modeling.BertModel(
    config=config,
    is_training=False,
    input_ids=input_ids,
    input_mask=input_mask,
    token_type_ids=token_type_ids,
)

output_layer = model.get_sequence_output()
embedding = model.get_embedding_table()

# Masked-LM prediction head. The scope names below determine the TF variable
# names ('cls/predictions/...'), so they must not be changed: the weight-copy
# loop resolves each TF variable name to an attribute path on the PyTorch model.
with tf.variable_scope('cls/predictions'):
    with tf.variable_scope('transform'):
        input_tensor = tf.layers.dense(
            output_layer,
            units=config.hidden_size,
            activation=bert.modeling.get_activation(config.hidden_act),
            kernel_initializer=bert.modeling.create_initializer(
                config.initializer_range
            ),
        )
        input_tensor = bert.modeling.layer_norm(input_tensor)

    output_bias = tf.get_variable(
        'output_bias',
        shape=[config.vocab_size],
        initializer=tf.zeros_initializer(),
    )
    # The output projection reuses the embedding table (transposed) as weights.
    logits = tf.matmul(input_tensor, embedding, transpose_b=True)
    # Fix: the bias variable was created but never applied, so `logits` ignored
    # the bias values loaded from the PyTorch model.
    logits = tf.nn.bias_add(logits, output_bias)




The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.


In [9]:
# sess.run(model.embedding_table)

In [10]:
import re

# Every graph variable that belongs to the encoder ('bert') or to the
# masked-LM head ('cls'); initialise them so they can be overwritten below.
bert_variables = [
    v
    for v in tf.get_collection('variables')
    if 'bert' in v.name or 'cls' in v.name
]
tf.variables_initializer(bert_variables).run()

# TF variable-name components that map to a differently named PyTorch attribute.
# Any component not listed here (including 'cls') is looked up under its own name.
tf_to_pt_attr = {
    'kernel': 'weight',
    'gamma': 'weight',
    'output_weights': 'weight',
    'output_bias': 'bias',
    'beta': 'bias',
}

# Based on: convert_tf_checkpoint_to_pytorch.py from pytorch-pretrained-BERT
for tf_var in bert_variables:
    scope_parts = tf_var.name.split(':')[0].split('/')
    tf_value = tf_var.eval()
    # adam_v and adam_m are slots used by AdamWeightDecayOptimizer to calculate
    # m and v; they are not required for using the pretrained model.
    if 'adam_v' in scope_parts or 'adam_m' in scope_parts:
        print("Skipping {}".format("/".join(scope_parts)))
        continue

    # Walk the PyTorch module tree following the TF variable's scope path.
    pt_param = pt_model
    for part in scope_parts:
        # Components such as 'layer_0' are split into the attribute name and an index.
        pieces = (
            re.split(r'_(\d+)', part)
            if re.fullmatch(r'[A-Za-z]+_\d+', part)
            else [part]
        )
        pt_param = getattr(pt_param, tf_to_pt_attr.get(pieces[0], pieces[0]))
        if len(pieces) >= 2:
            pt_param = pt_param[int(pieces[1])]

    # `part` still holds the last path component here.
    if part.endswith('_embeddings'):
        # Embedding modules keep their table in `.weight`.
        pt_param = getattr(pt_param, 'weight')
    elif part == 'kernel':
        # The kernel is transposed to line up with the TF variable's shape,
        # which the assertion below verifies.
        pt_param = pt_param.t()

    try:
        assert pt_param.shape == tf_value.shape
    except AssertionError as err:
        print(err)
        err.args += (pt_param.shape, tf_value.shape)
        raise

    # print("Extracting PyTorch weight {}".format(name))
    tf_var.load(pt_param.detach().cpu().numpy())

Instructions for updating:
Prefer Variable.assign which has equivalent behavior in 2.X.


In [11]:
# Display the TF variables selected for the weight copy (names and shapes).
bert_variables

[<tf.Variable 'bert/embeddings/word_embeddings:0' shape=(32000, 312) dtype=float32_ref>,
 <tf.Variable 'bert/embeddings/token_type_embeddings:0' shape=(2, 312) dtype=float32_ref>,
 <tf.Variable 'bert/embeddings/position_embeddings:0' shape=(512, 312) dtype=float32_ref>,
 <tf.Variable 'bert/embeddings/LayerNorm/beta:0' shape=(312,) dtype=float32_ref>,
 <tf.Variable 'bert/embeddings/LayerNorm/gamma:0' shape=(312,) dtype=float32_ref>,
 <tf.Variable 'bert/encoder/layer_0/attention/self/query/kernel:0' shape=(312, 312) dtype=float32_ref>,
 <tf.Variable 'bert/encoder/layer_0/attention/self/query/bias:0' shape=(312,) dtype=float32_ref>,
 <tf.Variable 'bert/encoder/layer_0/attention/self/key/kernel:0' shape=(312, 312) dtype=float32_ref>,
 <tf.Variable 'bert/encoder/layer_0/attention/self/key/bias:0' shape=(312,) dtype=float32_ref>,
 <tf.Variable 'bert/encoder/layer_0/attention/self/value/kernel:0' shape=(312, 312) dtype=float32_ref>,
 <tf.Variable 'bert/encoder/layer_0/attention/self/value/bia

In [12]:
# Re-query the graph for the encoder ('bert') and masked-LM head ('cls')
# variables and display them.
[
    graph_var
    for graph_var in tf.get_collection('variables')
    if 'bert' in graph_var.name or 'cls' in graph_var.name
]

[<tf.Variable 'bert/embeddings/word_embeddings:0' shape=(32000, 312) dtype=float32_ref>,
 <tf.Variable 'bert/embeddings/token_type_embeddings:0' shape=(2, 312) dtype=float32_ref>,
 <tf.Variable 'bert/embeddings/position_embeddings:0' shape=(512, 312) dtype=float32_ref>,
 <tf.Variable 'bert/embeddings/LayerNorm/beta:0' shape=(312,) dtype=float32_ref>,
 <tf.Variable 'bert/embeddings/LayerNorm/gamma:0' shape=(312,) dtype=float32_ref>,
 <tf.Variable 'bert/encoder/layer_0/attention/self/query/kernel:0' shape=(312, 312) dtype=float32_ref>,
 <tf.Variable 'bert/encoder/layer_0/attention/self/query/bias:0' shape=(312,) dtype=float32_ref>,
 <tf.Variable 'bert/encoder/layer_0/attention/self/key/kernel:0' shape=(312, 312) dtype=float32_ref>,
 <tf.Variable 'bert/encoder/layer_0/attention/self/key/bias:0' shape=(312,) dtype=float32_ref>,
 <tf.Variable 'bert/encoder/layer_0/attention/self/value/kernel:0' shape=(312, 312) dtype=float32_ref>,
 <tf.Variable 'bert/encoder/layer_0/attention/self/value/bia

In [13]:
import os

# Saver.save raises if the parent directory of the checkpoint prefix is missing,
# so create it first; this keeps the cell runnable on a fresh checkout.
checkpoint_path = 'bert-tiny-v1/model.ckpt'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# Persist only the trainable variables, which now hold the PyTorch weights.
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, checkpoint_path)

'bert-tiny-v1/model.ckpt'