In [1]:
# # !mkdir out
!gsutil cp gs://mesolitica-general/albert-base-actual/model.ckpt-30000.data-00000-of-00001 out
!gsutil cp gs://mesolitica-general/albert-base-actual/model.ckpt-30000.index out
!gsutil cp gs://mesolitica-general/albert-base-actual/model.ckpt-30000.meta out

Copying gs://mesolitica-general/albert-base-actual/model.ckpt-30000.data-00000-of-00001...
\ [1 files][138.2 MiB/138.2 MiB]                                                
Operation completed over 1 objects/138.2 MiB.                                    
Copying gs://mesolitica-general/albert-base-actual/model.ckpt-30000.index...
/ [1 files][  2.0 KiB/  2.0 KiB]                                                
Operation completed over 1 objects/2.0 KiB.                                      
Copying gs://mesolitica-general/albert-base-actual/model.ckpt-30000.meta...
/ [1 files][  2.1 MiB/  2.1 MiB]                                                
Operation completed over 1 objects/2.1 MiB.                                      


In [2]:
import modeling
import optimization
import tokenization
import tensorflow as tf
import numpy as np

In [3]:
# !pip3 install sentencepiece

In [4]:
# Tokenizer built from the local `sp10m.cased.v10` vocab and SentencePiece
# model files, with lower-casing disabled.
tokenizer = tokenization.FullTokenizer(
      vocab_file='sp10m.cased.v10.vocab', do_lower_case=False,
      spm_model_file='sp10m.cased.v10.model')

INFO:tensorflow:loading sentence piece model


In [5]:
# Sanity check: display the pieces the tokenizer produces for a short phrase.
tokenizer.tokenize('Husein comel')

['▁Hu', 'se', 'in', '▁comel']

In [6]:
# Read the ALBERT configuration from the JSON file; the bare name on the last
# line makes the notebook display the resulting object.
albert_config = modeling.AlbertConfig.from_json_file('BASE_config.json')
albert_config

<modeling.AlbertConfig at 0x7f54b9ebe9b0>

In [7]:
def gather_indexes(sequence_tensor, positions):
    """Pick out the vectors at `positions` from every sequence in a minibatch.

    The rank-3 `sequence_tensor` is flattened to two dimensions, and each row
    of `positions` is shifted by that row's starting offset in the flattened
    tensor so that a single `tf.gather` fetches all requested vectors.

    Args:
        sequence_tensor: rank-3 tensor (the rank is checked by
            `modeling.get_shape_list`); presumably [batch, seq, width].
        positions: integer tensor of per-example indices into the second
            axis; presumably [batch, n] -- TODO confirm against callers.

    Returns:
        A 2-D tensor holding one gathered vector per entry of `positions`.
    """
    n_batch, n_steps, n_width = modeling.get_shape_list(
        sequence_tensor, expected_rank=3)

    # Column vector with the start index of each example in the flat tensor.
    row_starts = tf.reshape(
        tf.range(0, n_batch, dtype=tf.int32) * n_steps, [-1, 1])
    absolute_positions = tf.reshape(positions + row_starts, [-1])
    flattened = tf.reshape(sequence_tensor, [n_batch * n_steps, n_width])
    return tf.gather(flattened, absolute_positions)

class Model:
    """ALBERT encoder plus the masked-LM prediction head, built in the default graph.

    Constructing an instance creates the variables under the encoder's own
    scopes and under `cls/predictions`, which is what the checkpoint
    restore/save cells rely on.

    Attributes:
        X: int32 placeholder of shape [None, None], passed as `input_ids`.
        segment_ids: int32 placeholder of shape [None, None], passed as
            `token_type_ids`.
        input_masks: int32 placeholder of shape [None, None], passed as
            `input_mask`.
        logits: output of the prediction head before the softmax.
        log_probs: log-softmax of `logits` over the last axis.
    """

    def __init__(
        self,
        config=None,
    ):
        """Build the graph.

        Args:
            config: ALBERT configuration object; when omitted, the
                notebook-level `albert_config` is used, so `Model()` behaves
                as before.
        """
        if config is None:
            config = albert_config

        self.X = tf.placeholder(tf.int32, [None, None])
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.int32, [None, None])

        encoder = modeling.AlbertModel(
            config=config,
            is_training=False,
            input_ids=self.X,
            input_mask=self.input_masks,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=False)

        input_tensor = encoder.get_sequence_output()
        # The embedding table doubles as the output projection (see the
        # transposed matmul below).
        output_weights = encoder.get_embedding_table()

        with tf.variable_scope("cls/predictions"):
            with tf.variable_scope("transform"):
                # Project the encoder output to the embedding width so it can
                # be multiplied with the embedding table.
                input_tensor = tf.layers.dense(
                              input_tensor,
                              units=config.embedding_size,
                              activation=modeling.get_activation(config.hidden_act),
                              kernel_initializer=modeling.create_initializer(
                                  config.initializer_range))
                input_tensor = modeling.layer_norm(input_tensor)

            output_bias = tf.get_variable(
                "output_bias",
                shape=[config.vocab_size],
                initializer=tf.zeros_initializer())
            logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            log_probs = tf.nn.log_softmax(logits, axis=-1)

        # Keep the head outputs reachable; previously they were built and
        # then discarded, so the head could not be evaluated.
        self.logits = logits
        self.log_probs = log_probs

In [8]:
# Clear the default graph, open an interactive session, then build the model
# in that graph. The order matters: the model must be created after the reset.
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model()

# Initialise every variable in the graph before any checkpoint is restored.
sess.run(tf.global_variables_initializer())

Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.


In [9]:
# Collect the trainable variables under the 'bert' and 'cls' scopes and
# restore only those from the downloaded checkpoint in `out/`.
var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert')
cls = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'cls')
saver = tf.train.Saver(var_list = var_lists + cls)
saver.restore(sess, 'out/model.ckpt-30000')

INFO:tensorflow:Restoring parameters from out/model.ckpt-30000


In [10]:
import os

# `Saver.save` does not create a missing parent directory, so make sure the
# target folder exists before writing.
os.makedirs('albert-base', exist_ok=True)

# Re-save all trainable variables under a new checkpoint prefix; this is the
# checkpoint path passed to the PyTorch conversion function further down.
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'albert-base/model.ckpt')

'albert-base/model.ckpt'

In [11]:
import os

# Directory that receives the exported tokenizer files and PyTorch weights;
# exist_ok=True makes re-running this cell safe.
out = 'albert-base-bahasa-cased'
os.makedirs(out, exist_ok=True)

In [12]:
from transformers import AlbertTokenizer, AlbertModel, AlbertConfig, AutoTokenizer, AutoModelWithLMHead, pipeline

In [13]:
# Wrap the same SentencePiece model in the transformers tokenizer and write
# its files into the export directory.
# NOTE: this rebinds `tokenizer`, which previously held the
# `tokenization.FullTokenizer` instance.
tokenizer = AlbertTokenizer('sp10m.cased.v10.model', do_lower_case = False)
tokenizer.save_pretrained('albert-base-bahasa-cased')

('albert-base-bahasa-cased/spiece.model',
 'albert-base-bahasa-cased/special_tokens_map.json',
 'albert-base-bahasa-cased/added_tokens.json')

In [14]:
import torch
import logging
from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert


# Show INFO-level log records for the rest of the notebook.
logging.basicConfig(level=logging.INFO)


def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
    """Convert a TensorFlow ALBERT checkpoint into a PyTorch state dict file.

    Args:
        tf_checkpoint_path: path (prefix) of the TensorFlow checkpoint to read.
        albert_config_file: JSON file describing the model configuration.
        pytorch_dump_path: file the PyTorch `state_dict` is written to.
    """
    # Build an `AlbertForMaskedLM` from the JSON description.
    albert_cfg = AlbertConfig.from_json_file(albert_config_file)
    print("Building PyTorch model from configuration: {}".format(albert_cfg))
    torch_model = AlbertForMaskedLM(albert_cfg)

    # Load the TensorFlow checkpoint weights into the PyTorch model.
    load_tf_weights_in_albert(torch_model, albert_cfg, tf_checkpoint_path)

    # Persist only the weights (state dict), not the whole module object.
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    weights = torch_model.state_dict()
    torch.save(weights, pytorch_dump_path)

In [16]:
# NOTE(review): this call is left disabled. It is the step that writes
# 'albert-base-bahasa-cased/pytorch_model.bin', which a later cell loads, so
# it must be run once (uncommented) on a fresh machine.
# convert_tf_checkpoint_to_pytorch('albert-base/model.ckpt', 'BASE_config.json', 'albert-base-bahasa-cased/pytorch_model.bin')

In [17]:
# Reload the tokenizer from the exported directory to check that the saved
# files are usable on their own.
tokenizer = AlbertTokenizer.from_pretrained('./albert-base-bahasa-cased', do_lower_case = False)

INFO:transformers.tokenization_utils:Model name './albert-base-bahasa-cased' not found in model shortcut name list (albert-base-v1, albert-large-v1, albert-xlarge-v1, albert-xxlarge-v1, albert-base-v2, albert-large-v2, albert-xlarge-v2, albert-xxlarge-v2). Assuming './albert-base-bahasa-cased' is a path, a model identifier, or url to a directory containing tokenizer files.
INFO:transformers.tokenization_utils:Didn't find file ./albert-base-bahasa-cased/added_tokens.json. We won't load it.
INFO:transformers.tokenization_utils:loading file ./albert-base-bahasa-cased/spiece.model
INFO:transformers.tokenization_utils:loading file None
INFO:transformers.tokenization_utils:loading file ./albert-base-bahasa-cased/special_tokens_map.json
INFO:transformers.tokenization_utils:loading file ./albert-base-bahasa-cased/tokenizer_config.json


In [18]:
# Load the configuration from the JSON file, the same way the conversion
# function does. Calling `AlbertConfig('BASE_config.json')` would pass the
# path as the first positional argument (the vocabulary size) and never read
# the file, leaving every other field at the library default.
config = AlbertConfig.from_json_file('BASE_config.json')

# Explicit values kept from the original cell so these fields stay pinned.
config.vocab_size = 32000
config.intermediate_size = 3072
config.hidden_size = 768
config.num_attention_heads = 12
config.num_hidden_groups = 1

In [19]:
# Load the converted weights from the local .bin file using the config above.
# NOTE: this rebinds `model`, which previously held the TensorFlow `Model`
# instance.
model = AutoModelWithLMHead.from_pretrained('./albert-base-bahasa-cased/pytorch_model.bin', config = config)

INFO:transformers.modeling_utils:loading weights file ./albert-base-bahasa-cased/pytorch_model.bin


In [20]:
# Build a fill-mask pipeline from the locally loaded model and tokenizer.
fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)

In [21]:
# Smoke test: display the pipeline's candidates for the [MASK] position.
fill_mask('makan ayam dengan [MASK]')

[{'sequence': '[CLS] makan ayam dengan ayam[SEP]',
  'score': 0.044952988624572754,
  'token': 629},
 {'sequence': '[CLS] makan ayam dengan sayur[SEP]',
  'score': 0.03621877357363701,
  'token': 1639},
 {'sequence': '[CLS] makan ayam dengan ikan[SEP]',
  'score': 0.034429922699928284,
  'token': 758},
 {'sequence': '[CLS] makan ayam dengan nasi[SEP]',
  'score': 0.032447945326566696,
  'token': 453},
 {'sequence': '[CLS] makan ayam dengan rendang[SEP]',
  'score': 0.028885239735245705,
  'token': 2451}]

In [23]:
# Load the model again, this time by its hub identifier instead of the local
# file, reusing the same config object.
model = AutoModelWithLMHead.from_pretrained('huseinzol05/albert-base-bahasa-cased', config = config)

INFO:transformers.modeling_utils:loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/albert-base-bahasa-cased/pytorch_model.bin from cache at /home/ubuntu/.cache/torch/transformers/ea11a3ad24741e88ffe3afdba3b4e9f717f246fa1735f969817c4016c768ff34.322beaf401f5ff6adcbf4123172f5afb9d7ba4020398dcf569f42745da818c5e


In [25]:
# Load the tokenizer by the same hub identifier.
tokenizer = AlbertTokenizer.from_pretrained('huseinzol05/albert-base-bahasa-cased', do_lower_case = False)

INFO:transformers.tokenization_utils:Model name 'huseinzol05/albert-base-bahasa-cased' not found in model shortcut name list (albert-base-v1, albert-large-v1, albert-xlarge-v1, albert-xxlarge-v1, albert-base-v2, albert-large-v2, albert-xlarge-v2, albert-xxlarge-v2). Assuming 'huseinzol05/albert-base-bahasa-cased' is a path, a model identifier, or url to a directory containing tokenizer files.
INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/albert-base-bahasa-cased/spiece.model from cache at /home/ubuntu/.cache/torch/transformers/5e5d2b3ecd5e53c40b88133bc5ccf6c527407004bf26ac19df9764e2e196798c.62912bc1f6182c2bdac801dba22c51182bb7bdbc199b220c540bbb4dada8ed34
INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/albert-base-bahasa-cased/added_tokens.json from cache at None
INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface

In [26]:
# Repeat the fill-mask smoke test with the hub-loaded model and tokenizer; the
# output can be compared with the result from the local files above.
fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)
fill_mask('makan ayam dengan [MASK]')

[{'sequence': '[CLS] makan ayam dengan ayam[SEP]',
  'score': 0.044952988624572754,
  'token': 629},
 {'sequence': '[CLS] makan ayam dengan sayur[SEP]',
  'score': 0.03621877357363701,
  'token': 1639},
 {'sequence': '[CLS] makan ayam dengan ikan[SEP]',
  'score': 0.034429922699928284,
  'token': 758},
 {'sequence': '[CLS] makan ayam dengan nasi[SEP]',
  'score': 0.032447945326566696,
  'token': 453},
 {'sequence': '[CLS] makan ayam dengan rendang[SEP]',
  'score': 0.028885239735245705,
  'token': 2451}]