In [1]:
import os
# Environment flags are set here, before torch / transformers are imported in
# the next cell. An empty CUDA_VISIBLE_DEVICES exposes no CUDA device to this
# process.
# NOTE(review): TF_FORCE_GPU_ALLOW_GROWTH is a TensorFlow option and no
# TensorFlow import is visible in this notebook — possibly a leftover; confirm.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '',
    'TF_FORCE_GPU_ALLOW_GROWTH': 'true',
})

In [2]:
import torch
import numpy as np
import transformers
from transformers import BertTokenizer, AutoModelForMaskedLM, BertConfig, BertForMaskedLM

In [3]:
# List the contents of the MLM training output directory; the later cells load
# a checkpoint from it and copy its `runs` folder.
!ls ./malay-cased-bert-base-mlm

checkpoint-570000  eval_results_mlm_wwm.txt  trainer_state.json
checkpoint-580000  pytorch_model.bin	     training_args.bin
checkpoint-590000  runs			     train_results.txt
checkpoint-600000  special_tokens_map.json   vocab.txt
checkpoint-610000  tokenizer_config.json
config.json	   tokenizer.json


In [4]:
# Load the masked-LM model from checkpoint-610000, the highest-numbered
# checkpoint in the directory listing above.
model = BertForMaskedLM.from_pretrained(
    './malay-cased-bert-base-mlm/checkpoint-610000'
)

In [5]:
# The tokenizer is read from './malay-cased-bert-base', which is a different
# directory from the one holding the model checkpoints.
# NOTE(review): './malay-cased-bert-base-mlm' also lists vocab.txt and
# tokenizer files — confirm both directories carry the same vocabulary.
tokenizer = BertTokenizer.from_pretrained(
    './malay-cased-bert-base'
)

In [9]:
# Publish the model weights to the Hub repository
# 'bert-base-standard-bahasa-cased' under the 'mesolitica' organization.
# The call's return value (the commit URL) is displayed as the cell output.
# NOTE(review): newer transformers releases appear to expect a namespaced
# repo id ('org/name') instead of `organization=` — confirm against the
# installed version before re-running.
model.push_to_hub(
    'bert-base-standard-bahasa-cased',
    organization='mesolitica',
)

Cloning https://huggingface.co/mesolitica/bert-base-standard-bahasa-cased into local empty directory.


Upload file pytorch_model.bin:   0%|          | 4.00k/422M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/mesolitica/bert-base-standard-bahasa-cased
   13c596b..ef8fa11  main -> main



'https://huggingface.co/mesolitica/bert-base-standard-bahasa-cased/commit/ef8fa1193d5aa1107abcbfae19306d01a3333adf'

In [10]:
# Publish the tokenizer files to the same Hub repository and organization as
# the model; the returned commit URL is displayed as the cell output.
# NOTE(review): newer transformers releases appear to expect a namespaced
# repo id ('org/name') instead of `organization=` — confirm against the
# installed version before re-running.
tokenizer.push_to_hub(
    'bert-base-standard-bahasa-cased',
    organization='mesolitica',
)

To https://huggingface.co/mesolitica/bert-base-standard-bahasa-cased
   ef8fa11..a98734c  main -> main



'https://huggingface.co/mesolitica/bert-base-standard-bahasa-cased/commit/a98734cce021b1723ad91021a16fd65aa8715dac'

In [None]:
# Bring the local 'bert-base-standard-bahasa-cased' working copy up to date
# before adding files to it (presumably the clone created by push_to_hub).
!cd bert-base-standard-bahasa-cased && git pull

In [None]:
# Copy the training `runs` directory into the local Hub working copy, then
# stage everything, commit and push. The commit message indicates these are
# TensorBoard logs.
!cp -r malay-cased-bert-base-mlm/runs bert-base-standard-bahasa-cased
!cd bert-base-standard-bahasa-cased && git add . && git commit -m 'add tensorboard' && git push

In [11]:
from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline

# Rebuild the tokenizer and model through the Auto* classes from the same
# local paths used earlier in the notebook, then wrap both in a fill-mask
# pipeline for the demo cells that follow.
tokenizer = AutoTokenizer.from_pretrained(
    './malay-cased-bert-base', do_lower_case=False
)
model = AutoModelForMaskedLM.from_pretrained(
    './malay-cased-bert-base-mlm/checkpoint-610000'
)
fill_mask = pipeline('fill-mask', tokenizer=tokenizer, model=model)

In [15]:
# Ask the pipeline to fill the [MASK] slot. The output below lists five
# candidates, each with a score, token id, token string and completed sequence.
fill_mask(
    'Permohonan Najib untuk dengar isu perlembagaan [MASK] .'
)

[{'score': 0.22050447762012482,
  'token': 9687,
  'token_str': 'ditolak',
  'sequence': 'Permohonan Najib untuk dengar isu perlembagaan ditolak.'},
 {'score': 0.09319040179252625,
  'token': 2673,
  'token_str': 'dibuat',
  'sequence': 'Permohonan Najib untuk dengar isu perlembagaan dibuat.'},
 {'score': 0.06558076292276382,
  'token': 5730,
  'token_str': 'dikemukakan',
  'sequence': 'Permohonan Najib untuk dengar isu perlembagaan dikemukakan.'},
 {'score': 0.04289316013455391,
  'token': 3812,
  'token_str': 'diterima',
  'sequence': 'Permohonan Najib untuk dengar isu perlembagaan diterima.'},
 {'score': 0.03790011256933212,
  'token': 14732,
  'token_str': 'dibatalkan',
  'sequence': 'Permohonan Najib untuk dengar isu perlembagaan dibatalkan.'}]

In [13]:
# Second probe of the fill-mask pipeline with a different sentence; the
# candidates for the [MASK] slot are shown in the cell output.
fill_mask(
    'aku ni tak faham lah, kau sebenarnya nak [MASK] .'
)

[{'score': 0.07294066250324249,
  'token': 3569,
  'token_str': 'faham',
  'sequence': 'aku ni tak faham lah, kau sebenarnya nak faham.'},
 {'score': 0.05976617708802223,
  'token': 1956,
  'token_str': 'tahu',
  'sequence': 'aku ni tak faham lah, kau sebenarnya nak tahu.'},
 {'score': 0.05713663622736931,
  'token': 4248,
  'token_str': 'berubah',
  'sequence': 'aku ni tak faham lah, kau sebenarnya nak berubah.'},
 {'score': 0.028323516249656677,
  'token': 2723,
  'token_str': 'belajar',
  'sequence': 'aku ni tak faham lah, kau sebenarnya nak belajar.'},
 {'score': 0.01882651261985302,
  'token': 4330,
  'token_str': 'tolong',
  'sequence': 'aku ni tak faham lah, kau sebenarnya nak tolong.'}]

In [16]:
# Third probe of the fill-mask pipeline.
# NOTE(review): 'sya' looks like informal shorthand, presumably used on
# purpose to test non-standard spelling — confirm it is not a typo.
fill_mask(
    'sya nak makan [MASK] .'
)

[{'score': 0.06390859186649323,
  'token': 2423,
  'token_str': 'je',
  'sequence': 'sya nak makan je.'},
 {'score': 0.058914054185152054,
  'token': 1485,
  'token_str': 'apa',
  'sequence': 'sya nak makan apa.'},
 {'score': 0.044890888035297394,
  'token': 1617,
  'token_str': 'banyak',
  'sequence': 'sya nak makan banyak.'},
 {'score': 0.03408944234251976,
  'token': 1473,
  'token_str': 'lagi',
  'sequence': 'sya nak makan lagi.'},
 {'score': 0.031660545617341995,
  'token': 2124,
  'token_str': 'sekali',
  'sequence': 'sya nak makan sekali.'}]

In [18]:
# Fourth probe: here the [MASK] slot sits in the middle of the sentence
# rather than at the end.
fill_mask(
    'awak nak [MASK] apa?'
)

[{'score': 0.4603753983974457,
  'token': 1722,
  'token_str': 'buat',
  'sequence': 'awak nak buat apa?'},
 {'score': 0.07606375217437744,
  'token': 1868,
  'token_str': 'makan',
  'sequence': 'awak nak makan apa?'},
 {'score': 0.059539202600717545,
  'token': 2532,
  'token_str': 'cakap',
  'sequence': 'awak nak cakap apa?'},
 {'score': 0.045212969183921814,
  'token': 1619,
  'token_str': 'kata',
  'sequence': 'awak nak kata apa?'},
 {'score': 0.03384365886449814,
  'token': 1801,
  'token_str': 'jadi',
  'sequence': 'awak nak jadi apa?'}]