In [1]:
import os

# Folder that collects everything produced here: tokenizer files, config and
# the converted PyTorch weights.
out = 'bert-base-bahasa-standard-cased'
# exist_ok=True keeps the cell re-runnable when the folder is already there.
os.makedirs(name=out, exist_ok=True)

In [2]:
# Alias for the output folder; later cells build file paths from `directory`.
directory = out

In [3]:
from transformers import (
    AutoModelForMaskedLM,
    AutoModelWithLMHead,
    AutoTokenizer,
    BertConfig,
    BertModel,
    BertTokenizer,
    pipeline,
)

In [4]:
# !pip3 install torch==1.6.0+cpu torchvision==0.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html

In [5]:
# Build a cased WordPiece tokenizer (do_lower_case=False keeps the original
# casing) from the vocabulary file and export it into the output folder.
tokenizer = BertTokenizer('BERT.wordpiece', do_lower_case = False)
# Write into `directory` so the tokenizer lands in the same folder as the other
# artifacts. As the cell's last expression, the tuple of written file paths is
# displayed.
tokenizer.save_pretrained(directory)

('bert-base-bahasa-standard-cased/vocab.txt',
 'bert-base-bahasa-standard-cased/special_tokens_map.json',
 'bert-base-bahasa-standard-cased/added_tokens.json')

In [6]:
# Reload the tokenizer from the exported files (presumably to confirm that the
# saved folder is loadable on its own). The leading "./" keeps the argument an
# explicit local path.
tokenizer = BertTokenizer.from_pretrained(f'./{directory}', do_lower_case = False)

In [7]:
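# Reference only: CLI form of the TF -> PyTorch conversion, shown with the
# bert-large paths. It is not used for the base model handled in this notebook.
# !transformers-cli convert --model_type bert \
#   --tf_checkpoint bert-large/model.ckpt-700000 \
#   --config LARGE_config.json \
#   --pytorch_dump_output bert-large-bahasa-standard-cased/pytorch_model.bin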

In [10]:
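# One-off conversion of the TensorFlow checkpoint into the PyTorch weights file
# that is loaded further down. Uncomment and run it once if
# bert-base-bahasa-standard-cased/pytorch_model.bin does not exist yet.
# import transformers.convert_bert_original_tf_checkpoint_to_pytorch

# transformers.convert_bert_original_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
# 'bert-base/model.ckpt-1000000', 'BASE_config.json', 'bert-base-bahasa-standard-cased/pytorch_model.bin')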

In [11]:
# Model configuration for the converted checkpoint: the library defaults for
# every field except the vocabulary size, which is set to 32000.
#
# BertConfig's first positional parameter is `vocab_size`, not a file path, so
# a path string passed there is never read as a file. The size is therefore
# given by keyword. The resulting object is the same defaults-plus-32000
# configuration that the displayed config shows.
config = BertConfig(vocab_size=32000)

In [12]:
# Display the configuration so the values used for loading can be checked by eye.
config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 32000
}

In [13]:
# Load the converted weights directly from the .bin file. The config is passed
# explicitly rather than looked up next to the weights file.
# AutoModelForMaskedLM is the non-deprecated replacement for
# AutoModelWithLMHead when the goal is masked-token prediction; with a
# BertConfig it builds a BertForMaskedLM.
model = AutoModelForMaskedLM.from_pretrained(f'./{directory}/pytorch_model.bin', config = config)

INFO:transformers.modeling_utils:loading weights file ./bert-base-bahasa-standard-cased/pytorch_model.bin
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
INFO:transformers.modeling_utils:All the weights of BertForMaskedLM were initialized from the model checkpoint at ./bert-base-bahasa-standard-cased/pytorch_model.bin.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use BertForMaskedLM for predictions without further training.


In [14]:
# Wrap the loaded model and tokenizer in a fill-mask pipeline so that
# predictions for a [MASK] position can be requested with a plain string.
fill_mask = pipeline(
    'fill-mask',
    model=model,
    tokenizer=tokenizer,
)

In [15]:
# Smoke test: ask the pipeline for fillers of the [MASK] token in a sample
# sentence. The displayed value is a list of candidates, each with the completed
# sequence, its score, the token id and the token string.
fill_mask('mahathir mohamad sebenarnya [MASK] terhadap rakyatnya')

	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:766.)
  masked_index = (input_ids == self.tokenizer.mask_token_id).nonzero()


[{'sequence': '[CLS] mahathir mohamad sebenarnya zalim terhadap rakyatnya [SEP]',
  'score': 0.08475544303655624,
  'token': 16278,
  'token_str': 'zalim'},
 {'sequence': '[CLS] mahathir mohamad sebenarnya bertanggungjawab terhadap rakyatnya [SEP]',
  'score': 0.06203313171863556,
  'token': 4657,
  'token_str': 'bertanggungjawab'},
 {'sequence': '[CLS] mahathir mohamad sebenarnya marah terhadap rakyatnya [SEP]',
  'score': 0.045952457934617996,
  'token': 6638,
  'token_str': 'marah'},
 {'sequence': '[CLS] mahathir mohamad sebenarnya benci terhadap rakyatnya [SEP]',
  'score': 0.033943504095077515,
  'token': 14934,
  'token_str': 'benci'},
 {'sequence': '[CLS] mahathir mohamad sebenarnya khianat terhadap rakyatnya [SEP]',
  'score': 0.029009034857153893,
  'token': 27803,
  'token_str': 'khianat'}]

In [16]:
# Second smoke test with a different sample sentence; the output has the same
# structure (completed sequence, score, token id, token string per candidate).
fill_mask('mahathir sebenarnya sangat [MASK] tanah airnya')

[{'sequence': '[CLS] mahathir sebenarnya sangat mencintai tanah airnya [SEP]',
  'score': 0.11776357144117355,
  'token': 14727,
  'token_str': 'mencintai'},
 {'sequence': '[CLS] mahathir sebenarnya sangat suka tanah airnya [SEP]',
  'score': 0.1127161756157875,
  'token': 3085,
  'token_str': 'suka'},
 {'sequence': '[CLS] mahathir sebenarnya sangat sayangkan tanah airnya [SEP]',
  'score': 0.036616381257772446,
  'token': 22562,
  'token_str': 'sayangkan'},
 {'sequence': '[CLS] mahathir sebenarnya sangat menyayangi tanah airnya [SEP]',
  'score': 0.03625521436333656,
  'token': 27640,
  'token_str': 'menyayangi'},
 {'sequence': '[CLS] mahathir sebenarnya sangat menghargai tanah airnya [SEP]',
  'score': 0.024249471724033356,
  'token': 8159,
  'token_str': 'menghargai'}]

In [17]:
# Write the model weights and config.json into the output folder, alongside the
# tokenizer files saved earlier. This rewrites the pytorch_model.bin that the
# model was loaded from, since both use the same folder.
model.save_pretrained(directory)

INFO:transformers.configuration_utils:Configuration saved in bert-base-bahasa-standard-cased/config.json
INFO:transformers.modeling_utils:Model weights saved in bert-base-bahasa-standard-cased/pytorch_model.bin


In [37]:
# Upload the exported model folder with the transformers command-line tool.
# NOTE(review): the recorded output of this cell is "command not found", so the
# CLI was not on the shell's PATH when this was run — confirm how (or whether)
# the upload was actually performed.
!transformers-cli upload ./bert-base-bahasa-standard-cased

/bin/bash: transformers-cli: command not found
