In [14]:
# !mkdir out-tiny
# !gsutil cp gs://mesolitica-general/albert-tiny-actual/model.ckpt-70000.data-00000-of-00001 out-tiny
# !gsutil cp gs://mesolitica-general/albert-tiny-actual/model.ckpt-70000.index out-tiny
# !gsutil cp gs://mesolitica-general/albert-tiny-actual/model.ckpt-70000.meta out-tiny

In [3]:
# !mkdir albert-base-2020-04-10
# !cp sp10m.cased.v10.* albert-base-2020-04-10
# !cp BASE_config.json albert-base-2020-04-10/config.json
# !cp out/model.ckpt-400000* albert-base-2020-04-10
# !tar cvzf albert-base-2020-04-10.tar.gz albert-base-2020-04-10

In [4]:
import os

import numpy as np
import tensorflow as tf

import modeling
import optimization
import tokenization

In [5]:
# !pip3 install sentencepiece

In [6]:
# Build the SentencePiece-backed tokenizer from the local vocab/model files.
# Lower-casing is switched off so the input casing is kept.
tokenizer = tokenization.FullTokenizer(
    vocab_file='sp10m.cased.v10.vocab',
    spm_model_file='sp10m.cased.v10.model',
    do_lower_case=False,
)

INFO:tensorflow:loading sentence piece model


In [7]:
# Sanity check: tokenize a short sample phrase and display the resulting pieces.
tokenizer.tokenize('Husein comel')

['▁Hu', 'se', 'in', '▁comel']

In [9]:
# Load the ALBERT hyperparameters from TINY_config.json.
# The Model class defined in this notebook reads this module-level name.
albert_config = modeling.AlbertConfig.from_json_file('TINY_config.json')
albert_config

<modeling.AlbertConfig at 0x7f4f2d765b38>

In [10]:
def gather_indexes(sequence_tensor, positions):
    """Select the vectors found at `positions` for every example of a minibatch.

    `sequence_tensor` must be rank 3 (enforced via `expected_rank=3`). It is
    flattened over its first two dimensions, `positions` are shifted by each
    example's starting row in that flattened view, and the matching rows are
    gathered. The result has one row of size `width` per requested position.
    """
    batch_size, seq_length, width = modeling.get_shape_list(
        sequence_tensor, expected_rank=3)

    # Row index at which each example starts inside the flattened tensor.
    row_starts = tf.range(0, batch_size, dtype=tf.int32) * seq_length
    example_offsets = tf.reshape(row_starts, [-1, 1])

    # Per-example positions -> absolute row indices, flattened to 1-D.
    flat_indices = tf.reshape(positions + example_offsets, [-1])
    flattened = tf.reshape(sequence_tensor, [batch_size * seq_length, width])
    return tf.gather(flattened, flat_indices)

class Model:
    """ALBERT encoder plus masked-LM prediction head, built with is_training=False.

    Constructing an instance adds the encoder and the `cls/predictions` head
    to the current default TensorFlow graph.

    Attributes:
        X: int32 placeholder passed to the encoder as `input_ids`.
        segment_ids: int32 placeholder passed as `token_type_ids`.
        input_masks: int32 placeholder passed as `input_mask`.
        logits: prediction-head output (transformed sequence output multiplied
            by the transposed embedding table, plus `output_bias`).
        log_probs: log-softmax of `logits` over the last axis.
    """

    def __init__(
        self,
        config=None,
    ):
        """Build the graph.

        Args:
            config: ALBERT config object to build from. When omitted, the
                notebook-level `albert_config` is used, so `Model()` behaves
                exactly as before.
        """
        if config is None:
            config = albert_config

        # Rank-2 placeholders with both dimensions left unspecified.
        self.X = tf.placeholder(tf.int32, [None, None])
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.int32, [None, None])

        model = modeling.AlbertModel(
            config=config,
            is_training=False,
            input_ids=self.X,
            input_mask=self.input_masks,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=False)

        input_tensor = model.get_sequence_output()
        # The embedding table is reused as the output projection matrix.
        output_weights = model.get_embedding_table()

        # Scope names determine the variable names, which must match the
        # checkpoint being restored -- do not rename them.
        with tf.variable_scope("cls/predictions"):
            with tf.variable_scope("transform"):
                input_tensor = tf.layers.dense(
                    input_tensor,
                    units=config.embedding_size,
                    activation=modeling.get_activation(config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        config.initializer_range))
                input_tensor = modeling.layer_norm(input_tensor)

            output_bias = tf.get_variable(
                "output_bias",
                shape=[config.vocab_size],
                initializer=tf.zeros_initializer())
            logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)

            # Keep the head outputs on the instance so they can be fetched
            # with sess.run; previously they were discarded locals.
            self.logits = logits
            self.log_probs = tf.nn.log_softmax(logits, axis=-1)

In [11]:
# Reset the default graph, open an interactive session, then build the model
# into that fresh graph. The order of these three statements matters.
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model()

# Run the initializers for all global variables; the checkpoint values are
# restored over them by a later cell.
sess.run(tf.global_variables_initializer())

Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.


In [15]:
# Collect the trainable variables under the 'bert' and 'cls' scopes and
# restore only those from the training checkpoint.
var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert')
cls = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'cls')
saver = tf.train.Saver(var_list = var_lists + cls)
saver.restore(sess, 'out-tiny/model.ckpt-70000')

INFO:tensorflow:Restoring parameters from out-tiny/model.ckpt-70000


In [16]:
# Re-save only the trainable variables as a new checkpoint under 'albert-tiny'.
# The directory is created first: Saver.save can fail when the parent
# directory of the checkpoint prefix does not exist.
os.makedirs('albert-tiny', exist_ok=True)
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'albert-tiny/model.ckpt')

'albert-tiny/model.ckpt'

In [18]:
# !cp sp10m.cased.v10.* albert-base
# !cp BASE_config.json albert-base/config.json
# !tar cvzf albert-base.tar.gz albert-base

In [17]:
import os

# Export directory; created if missing, left untouched if it already exists.
out = 'albert-tiny-bahasa-cased'
os.makedirs(out, exist_ok=True)

In [18]:
from transformers import AlbertTokenizer, AlbertModel, AlbertConfig, AutoTokenizer, AutoModelWithLMHead, pipeline

In [19]:
# Wrap the SentencePiece model in the transformers ALBERT tokenizer and write
# its files into the export directory.
# NOTE(review): this rebinds `tokenizer`, which earlier held the TF-side FullTokenizer.
tokenizer = AlbertTokenizer('sp10m.cased.v10.model', do_lower_case = False)
tokenizer.save_pretrained('albert-tiny-bahasa-cased')

('albert-tiny-bahasa-cased/spiece.model',
 'albert-tiny-bahasa-cased/special_tokens_map.json',
 'albert-tiny-bahasa-cased/added_tokens.json')

In [20]:
import torch
import logging
from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert


# Emit INFO-level log records so library progress messages are shown.
logging.basicConfig(level=logging.INFO)


def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
    """Convert a TensorFlow ALBERT checkpoint into a PyTorch state-dict file.

    Args:
        tf_checkpoint_path: TensorFlow checkpoint to read the weights from.
        albert_config_file: JSON file describing the ALBERT architecture.
        pytorch_dump_path: destination file for the serialized state dict.
    """
    building_msg = "Building PyTorch model from configuration: {}"
    saving_msg = "Save PyTorch model to {}"

    # Instantiate the masked-LM model described by the JSON config.
    albert_cfg = AlbertConfig.from_json_file(albert_config_file)
    print(building_msg.format(str(albert_cfg)))
    pt_model = AlbertForMaskedLM(albert_cfg)

    # Copy the TensorFlow checkpoint weights into the PyTorch model.
    load_tf_weights_in_albert(pt_model, albert_cfg, tf_checkpoint_path)

    # Only the state dict is written here; the config is not saved by this function.
    print(saving_msg.format(pytorch_dump_path))
    torch.save(pt_model.state_dict(), pytorch_dump_path)

In [21]:
# Convert the re-saved TF checkpoint and write the PyTorch weights into the
# export directory.
convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path='albert-tiny/model.ckpt',
    albert_config_file='TINY_config.json',
    pytorch_dump_path='albert-tiny-bahasa-cased/pytorch_model.bin',
)

Building PyTorch model from configuration: AlbertConfig {
  "_num_labels": 2,
  "architectures": null,
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "decoder_start_token_id": null,
  "do_sample": false,
  "down_scale_factor": 1,
  "early_stopping": false,
  "embedding_size": 128,
  "eos_token_id": 3,
  "finetuning_task": null,
  "gap_size": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0,
  "hidden_size": 336,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 1344,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "albert",
  "net_structure_type": 0,
  "no_repeat_ngram_size": 0,
  "num_attention_heads": 12,
  "num_beams": 1,
  "num_hidden_groups": 1,


INFO:transformers.modeling_albert:Converting TensorFlow checkpoint from /home/ubuntu/notebook/albert/albert-tiny/model.ckpt
INFO:transformers.modeling_albert:Loading TF weight bert/embeddings/LayerNorm/beta with shape [128]
INFO:transformers.modeling_albert:Loading TF weight bert/embeddings/LayerNorm/gamma with shape [128]
INFO:transformers.modeling_albert:Loading TF weight bert/embeddings/position_embeddings with shape [512, 128]
INFO:transformers.modeling_albert:Loading TF weight bert/embeddings/token_type_embeddings with shape [2, 128]
INFO:transformers.modeling_albert:Loading TF weight bert/embeddings/word_embeddings with shape [32000, 128]
INFO:transformers.modeling_albert:Loading TF weight bert/encoder/embedding_hidden_mapping_in/bias with shape [336]
INFO:transformers.modeling_albert:Loading TF weight bert/encoder/embedding_hidden_mapping_in/kernel with shape [128, 336]
INFO:transformers.modeling_albert:Loading TF weight bert/encoder/transformer/group_0/inner_group_0/LayerNorm/b

bert/embeddings/LayerNorm/beta
bert/embeddings/LayerNorm/gamma
bert/embeddings/position_embeddings
bert/embeddings/token_type_embeddings
bert/embeddings/word_embeddings
bert/encoder/embedding_hidden_mapping_in/bias
bert/encoder/embedding_hidden_mapping_in/kernel
bert/encoder/transformer/group_0/inner_group_0/LayerNorm/beta
bert/encoder/transformer/group_0/inner_group_0/LayerNorm/gamma
bert/encoder/transformer/group_0/inner_group_0/LayerNorm_1/beta
bert/encoder/transformer/group_0/inner_group_0/LayerNorm_1/gamma
bert/encoder/transformer/group_0/inner_group_0/attention_1/output/dense/bias
bert/encoder/transformer/group_0/inner_group_0/attention_1/output/dense/kernel
bert/encoder/transformer/group_0/inner_group_0/attention_1/self/key/bias
bert/encoder/transformer/group_0/inner_group_0/attention_1/self/key/kernel
bert/encoder/transformer/group_0/inner_group_0/attention_1/self/query/bias
bert/encoder/transformer/group_0/inner_group_0/attention_1/self/query/kernel
bert/encoder/transformer/gr

In [22]:
# Reload the tokenizer from the files saved in the export directory.
tokenizer = AlbertTokenizer.from_pretrained('./albert-tiny-bahasa-cased', do_lower_case = False)

INFO:transformers.tokenization_utils:Model name './albert-tiny-bahasa-cased' not found in model shortcut name list (albert-base-v1, albert-large-v1, albert-xlarge-v1, albert-xxlarge-v1, albert-base-v2, albert-large-v2, albert-xlarge-v2, albert-xxlarge-v2). Assuming './albert-tiny-bahasa-cased' is a path, a model identifier, or url to a directory containing tokenizer files.
INFO:transformers.tokenization_utils:Didn't find file ./albert-tiny-bahasa-cased/added_tokens.json. We won't load it.
INFO:transformers.tokenization_utils:loading file ./albert-tiny-bahasa-cased/spiece.model
INFO:transformers.tokenization_utils:loading file None
INFO:transformers.tokenization_utils:loading file ./albert-tiny-bahasa-cased/special_tokens_map.json
INFO:transformers.tokenization_utils:loading file ./albert-tiny-bahasa-cased/tokenizer_config.json


In [24]:
# AlbertConfig's first positional parameter is `vocab_size`, not a file path,
# so AlbertConfig('TINY_config.json') never read the JSON file. Build the
# config from explicit keyword arguments instead. The resulting field values
# are the same as before; every field not listed keeps the library default.
config = AlbertConfig(
    vocab_size=32000,
    hidden_size=336,
    intermediate_size=1344,
    num_attention_heads=12,
    num_hidden_groups=1,
    num_hidden_layers=4,
)

In [25]:
# Load the converted weights file directly, using the explicitly built config.
# NOTE(review): this rebinds `model`, which earlier held the TF-side Model instance.
model = AutoModelWithLMHead.from_pretrained('./albert-tiny-bahasa-cased/pytorch_model.bin', config = config)

INFO:transformers.modeling_utils:loading weights file ./albert-tiny-bahasa-cased/pytorch_model.bin


In [26]:
# Build a fill-mask pipeline from the locally loaded model and tokenizer.
fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)

In [27]:
# Smoke test: display the pipeline's predictions for the [MASK] position.
fill_mask('tolonglah gov buat something, kami dah [MASK]')

[{'sequence': '[CLS] tolonglah gov buat something, kami dah kahwin[SEP]',
  'score': 0.020490359514951706,
  'token': 1326},
 {'sequence': '[CLS] tolonglah gov buat something, kami dah?[SEP]',
  'score': 0.011642860248684883,
  'token': 28},
 {'sequence': '[CLS] tolonglah gov buat something, kami dah tahu[SEP]',
  'score': 0.007895083166658878,
  'token': 178},
 {'sequence': '[CLS] tolonglah gov buat something, kami dah lama[SEP]',
  'score': 0.00789356604218483,
  'token': 222},
 {'sequence': '[CLS] tolonglah gov buat something, kami dah ni[SEP]',
  'score': 0.007744117174297571,
  'token': 34}]

In [28]:
# Write the model into the export directory via save_pretrained.
model.save_pretrained('albert-tiny-bahasa-cased')

INFO:transformers.configuration_utils:Configuration saved in albert-tiny-bahasa-cased/config.json
INFO:transformers.modeling_utils:Model weights saved in albert-tiny-bahasa-cased/pytorch_model.bin


In [23]:
# !transformers-cli upload ./albert-tiny-bahasa-cased

In [29]:
# Load the model again, this time by its hub identifier rather than a local path.
model = AutoModelWithLMHead.from_pretrained('huseinzol05/albert-tiny-bahasa-cased', config = config)

INFO:filelock:Lock 139976865798072 acquired on /home/ubuntu/.cache/torch/transformers/012922c7e0fdfe2ddc58e35274debe862ce8a2dc627e3f5cb784f983ae8f3be6.de6d44e60aa3f72beeb9726ba6ff879337721eb27c49f517dc145e348b7a34d4.lock
INFO:transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/albert-tiny-bahasa-cased/pytorch_model.bin not found in cache or force_download set to True, downloading to /home/ubuntu/.cache/torch/transformers/tmpqp10_0_j


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=23019367.0, style=ProgressStyle(descrip…

INFO:transformers.file_utils:storing https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/albert-tiny-bahasa-cased/pytorch_model.bin in cache at /home/ubuntu/.cache/torch/transformers/012922c7e0fdfe2ddc58e35274debe862ce8a2dc627e3f5cb784f983ae8f3be6.de6d44e60aa3f72beeb9726ba6ff879337721eb27c49f517dc145e348b7a34d4
INFO:transformers.file_utils:creating metadata file for /home/ubuntu/.cache/torch/transformers/012922c7e0fdfe2ddc58e35274debe862ce8a2dc627e3f5cb784f983ae8f3be6.de6d44e60aa3f72beeb9726ba6ff879337721eb27c49f517dc145e348b7a34d4
INFO:filelock:Lock 139976865798072 released on /home/ubuntu/.cache/torch/transformers/012922c7e0fdfe2ddc58e35274debe862ce8a2dc627e3f5cb784f983ae8f3be6.de6d44e60aa3f72beeb9726ba6ff879337721eb27c49f517dc145e348b7a34d4.lock
INFO:transformers.modeling_utils:loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/albert-tiny-bahasa-cased/pytorch_model.bin from cache at /home/ubuntu/.cache/torch/transformers/012922c7e0fdfe2




In [30]:
# Load the tokenizer by the same hub identifier.
tokenizer = AlbertTokenizer.from_pretrained('huseinzol05/albert-tiny-bahasa-cased', do_lower_case = False)

INFO:transformers.tokenization_utils:Model name 'huseinzol05/albert-tiny-bahasa-cased' not found in model shortcut name list (albert-base-v1, albert-large-v1, albert-xlarge-v1, albert-xxlarge-v1, albert-base-v2, albert-large-v2, albert-xlarge-v2, albert-xxlarge-v2). Assuming 'huseinzol05/albert-tiny-bahasa-cased' is a path, a model identifier, or url to a directory containing tokenizer files.
INFO:filelock:Lock 139976863721400 acquired on /home/ubuntu/.cache/torch/transformers/9af39e08496cfbc2beca52ea108da93f4e0faf2165f204dfe41625af07cc265c.62912bc1f6182c2bdac801dba22c51182bb7bdbc199b220c540bbb4dada8ed34.lock
INFO:transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/albert-tiny-bahasa-cased/spiece.model not found in cache or force_download set to True, downloading to /home/ubuntu/.cache/torch/transformers/tmpu9eoku8_


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=778641.0, style=ProgressStyle(descripti…

INFO:transformers.file_utils:storing https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/albert-tiny-bahasa-cased/spiece.model in cache at /home/ubuntu/.cache/torch/transformers/9af39e08496cfbc2beca52ea108da93f4e0faf2165f204dfe41625af07cc265c.62912bc1f6182c2bdac801dba22c51182bb7bdbc199b220c540bbb4dada8ed34
INFO:transformers.file_utils:creating metadata file for /home/ubuntu/.cache/torch/transformers/9af39e08496cfbc2beca52ea108da93f4e0faf2165f204dfe41625af07cc265c.62912bc1f6182c2bdac801dba22c51182bb7bdbc199b220c540bbb4dada8ed34
INFO:filelock:Lock 139976863721400 released on /home/ubuntu/.cache/torch/transformers/9af39e08496cfbc2beca52ea108da93f4e0faf2165f204dfe41625af07cc265c.62912bc1f6182c2bdac801dba22c51182bb7bdbc199b220c540bbb4dada8ed34.lock





INFO:filelock:Lock 139978021282872 acquired on /home/ubuntu/.cache/torch/transformers/aa2391aa261ccc65ae4b6269f8e7db027364d5f1b76fbf52ff4b41561d644810.4f0d42b1849e2d6fd72c735fba48dff0d2f0a55f5d1961e79bcfce337d354167.lock
INFO:transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/albert-tiny-bahasa-cased/special_tokens_map.json not found in cache or force_download set to True, downloading to /home/ubuntu/.cache/torch/transformers/tmp4xauqpf0


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=156.0, style=ProgressStyle(description_…

INFO:transformers.file_utils:storing https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/albert-tiny-bahasa-cased/special_tokens_map.json in cache at /home/ubuntu/.cache/torch/transformers/aa2391aa261ccc65ae4b6269f8e7db027364d5f1b76fbf52ff4b41561d644810.4f0d42b1849e2d6fd72c735fba48dff0d2f0a55f5d1961e79bcfce337d354167
INFO:transformers.file_utils:creating metadata file for /home/ubuntu/.cache/torch/transformers/aa2391aa261ccc65ae4b6269f8e7db027364d5f1b76fbf52ff4b41561d644810.4f0d42b1849e2d6fd72c735fba48dff0d2f0a55f5d1961e79bcfce337d354167
INFO:filelock:Lock 139978021282872 released on /home/ubuntu/.cache/torch/transformers/aa2391aa261ccc65ae4b6269f8e7db027364d5f1b76fbf52ff4b41561d644810.4f0d42b1849e2d6fd72c735fba48dff0d2f0a55f5d1961e79bcfce337d354167.lock





INFO:filelock:Lock 139976865793584 acquired on /home/ubuntu/.cache/torch/transformers/af67b61a872b064e1b4e3f539ecb7aa59f4700db79093e8c74f3809e5d7e946b.3889713104075cfee9e96090bcdd0dc753733b3db9da20d1dd8b2cd1030536a2.lock
INFO:transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/albert-tiny-bahasa-cased/tokenizer_config.json not found in cache or force_download set to True, downloading to /home/ubuntu/.cache/torch/transformers/tmp5_srz_sh


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…

INFO:transformers.file_utils:storing https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/albert-tiny-bahasa-cased/tokenizer_config.json in cache at /home/ubuntu/.cache/torch/transformers/af67b61a872b064e1b4e3f539ecb7aa59f4700db79093e8c74f3809e5d7e946b.3889713104075cfee9e96090bcdd0dc753733b3db9da20d1dd8b2cd1030536a2
INFO:transformers.file_utils:creating metadata file for /home/ubuntu/.cache/torch/transformers/af67b61a872b064e1b4e3f539ecb7aa59f4700db79093e8c74f3809e5d7e946b.3889713104075cfee9e96090bcdd0dc753733b3db9da20d1dd8b2cd1030536a2
INFO:filelock:Lock 139976865793584 released on /home/ubuntu/.cache/torch/transformers/af67b61a872b064e1b4e3f539ecb7aa59f4700db79093e8c74f3809e5d7e946b.3889713104075cfee9e96090bcdd0dc753733b3db9da20d1dd8b2cd1030536a2.lock
INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/albert-tiny-bahasa-cased/spiece.model from cache at /home/ubuntu/.cache/torch/transformers/9af39e08496cfbc2beca




In [31]:
# Rebuild the fill-mask pipeline from the hub-loaded model and tokenizer and
# display predictions for a sample sentence.
fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)
fill_mask('makan ayam dengan [MASK]')

[{'sequence': '[CLS] makan ayam dengan ayam[SEP]',
  'score': 0.05121927708387375,
  'token': 629},
 {'sequence': '[CLS] makan ayam dengan sayur[SEP]',
  'score': 0.04497420787811279,
  'token': 1639},
 {'sequence': '[CLS] makan ayam dengan nasi[SEP]',
  'score': 0.039827536791563034,
  'token': 453},
 {'sequence': '[CLS] makan ayam dengan rendang[SEP]',
  'score': 0.032997727394104004,
  'token': 2451},
 {'sequence': '[CLS] makan ayam dengan makan[SEP]',
  'score': 0.031354598701000214,
  'token': 129}]