In [1]:
from transformers import AlbertTokenizer, BertModel, BertConfig, AutoTokenizer, AutoModelWithLMHead, pipeline

In [3]:
import os

out = 'tiny-bert-bahasa-cased'
os.makedirs(out, exist_ok=True)

In [5]:
tokenizer = AlbertTokenizer('sp10m.cased.bert.model', unk_token='[UNK]',pad_token='[PAD]', do_lower_case = False)
tokenizer.save_pretrained('tiny-bert-bahasa-cased')

('tiny-bert-bahasa-cased/spiece.model',
 'tiny-bert-bahasa-cased/special_tokens_map.json',
 'tiny-bert-bahasa-cased/added_tokens.json')

In [6]:
tokenizer = AlbertTokenizer.from_pretrained('./tiny-bert-bahasa-cased', 
                                            unk_token='[UNK]',pad_token='[PAD]', do_lower_case = False)

In [16]:
# !transformers-cli convert --model_type bert \
#   --tf_checkpoint ../bert/tiny-bert-v1/model.ckpt \
#   --config ../bert/tiny-bert-v1/config.json \
#   --pytorch_dump_output tiny-bert-bahasa-cased/pytorch_model.bin

In [17]:
config = BertConfig('tiny-bert-bahasa-cased-combined/config.json')
config.vocab_size = 32000
config.hidden_size = 312
config.intermediate_size = 1200

In [18]:
model = AutoModelWithLMHead.from_pretrained('./tiny-bert-bahasa-cased-combined/pytorch_model.bin', 
                                            config = config)

In [19]:
fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)

In [20]:
fill_mask('makan ayam dengan [MASK]')

[{'sequence': '[CLS] makan ayam dengan berbual[SEP]',
  'score': 0.00015769545279908925,
  'token': 17859},
 {'sequence': '[CLS] makan ayam dengan kembar[SEP]',
  'score': 0.0001448775001335889,
  'token': 8289},
 {'sequence': '[CLS] makan ayam dengan memaklumkan[SEP]',
  'score': 0.00013484008377417922,
  'token': 6881},
 {'sequence': '[CLS] makan ayam dengan Senarai[SEP]',
  'score': 0.00013061291247140616,
  'token': 11698},
 {'sequence': '[CLS] makan ayam dengan Tiga[SEP]',
  'score': 0.00012453157978598028,
  'token': 4232}]

In [21]:
model.save_pretrained('tiny-bert-bahasa-cased')

In [13]:
# !transformers-cli upload ./tiny-bert-bahasa-cased

In [22]:
model = BertModel.from_pretrained('huseinzol05/tiny-bert-bahasa-cased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1023.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=96362955.0, style=ProgressStyle(descrip…




In [23]:
tokenizer = AlbertTokenizer.from_pretrained('huseinzol05/tiny-bert-bahasa-cased', 
                                            unk_token='[UNK]',pad_token='[PAD]', do_lower_case = False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=778744.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=156.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




In [24]:
import torch

In [25]:
input_ids = torch.tensor([tokenizer.encode("husein tk suka mkan ayam", add_special_tokens=True)])

In [26]:
with torch.no_grad():
    last_hidden_states = model(input_ids)[0]
    
last_hidden_states

tensor([[[-0.0894,  0.3321, -0.2377,  ...,  0.9428,  5.1734, -0.6816],
         [-0.1833, -0.0646, -0.3805,  ...,  0.4032,  7.5724, -0.1227],
         [ 1.0851,  0.4930,  1.0498,  ...,  0.3571,  6.7428, -0.2703],
         ...,
         [ 0.1324,  0.5455, -0.4745,  ..., -0.1279,  4.4239, -0.1930],
         [-0.2522,  0.0767, -0.0873,  ...,  0.6880,  4.6996, -0.3993],
         [ 0.2774,  0.3506, -0.1620,  ...,  0.9974,  5.2789, -0.9876]]])

In [29]:
model = AutoModelWithLMHead.from_pretrained('huseinzol05/tiny-bert-bahasa-cased')
fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)
fill_mask('makan ayam dengan [MASK]')

[{'sequence': '[CLS] makan ayam dengan berbual[SEP]',
  'score': 0.00015769545279908925,
  'token': 17859},
 {'sequence': '[CLS] makan ayam dengan kembar[SEP]',
  'score': 0.0001448775001335889,
  'token': 8289},
 {'sequence': '[CLS] makan ayam dengan memaklumkan[SEP]',
  'score': 0.00013484008377417922,
  'token': 6881},
 {'sequence': '[CLS] makan ayam dengan Senarai[SEP]',
  'score': 0.00013061291247140616,
  'token': 11698},
 {'sequence': '[CLS] makan ayam dengan Tiga[SEP]',
  'score': 0.00012453157978598028,
  'token': 4232}]