In [1]:
import os

# Name of the local export folder; later cells save the tokenizer and model
# into a folder with this same name.
out = 'xlnet-base-bahasa-cased'
# exist_ok=True keeps the cell re-runnable when the folder already exists.
os.makedirs(name=out, exist_ok=True)

In [4]:
from transformers import XLNetTokenizer, XLNetModel, XLNetConfig, AutoTokenizer, AutoModelWithLMHead, pipeline

In [5]:
# Build an XLNet tokenizer from the local vocabulary model file and export it.
spiece_path = 'sp10m.cased.v9.model'
tokenizer = XLNetTokenizer(spiece_path, do_lower_case=False)
# Last expression of the cell: the paths of the written files are displayed.
tokenizer.save_pretrained('xlnet-base-bahasa-cased')

('xlnet-base-bahasa-cased/spiece.model',
 'xlnet-base-bahasa-cased/special_tokens_map.json',
 'xlnet-base-bahasa-cased/added_tokens.json')

In [6]:
# Reload the tokenizer from the local export folder, replacing the instance
# that was constructed directly from the vocabulary model file.
local_dir = './xlnet-base-bahasa-cased'
tokenizer = XLNetTokenizer.from_pretrained(local_dir, do_lower_case=False)

In [9]:
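# One-off conversion of the TensorFlow checkpoint to PyTorch, kept commented out for reference.
# Later cells read config.json and pytorch_model.bin from the xlnet-base-bahasa-cased folder,
# which is the --pytorch_dump_output target below, so this presumably has to be run first.
# !transformers-cli convert --model_type xlnet \
#   --tf_checkpoint xlnet-base-26-03-2020/model.ckpt-192000 \
#   --config xlnet-base-26-03-2020/config.json \
#   --pytorch_dump_output xlnet-base-bahasa-cased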

In [17]:
directory = 'xlnet-base-bahasa-cased'
# Read the configuration from the JSON file in the export folder.
# Calling XLNetConfig(path) does not load the file: the first positional
# parameter is `vocab_size`, so the path string ended up as the vocabulary
# size and every other field silently kept its library default.
config = XLNetConfig.from_json_file(f'{directory}/config.json')
# Explicit hyperparameters, kept so the resulting values are exactly the ones
# this cell has always set, whatever the JSON file contains for them.
config.vocab_size = 32000
config.d_inner = 3072
config.d_model = 768
config.n_head = 12
config.n_layer = 12

In [18]:
# Display the configuration as the cell output to inspect the values set above.
config

XLNetConfig {
  "architectures": null,
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": null,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "do_sample": false,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_ids": null,
  "ff_activation": "gelu",
  "finetuning_task": null,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "mem_len": null,
  "model_type": "xlnet",
  "n_head": 12,
  "n_layer": 12,
  "num_beams": 1,
  "num_labels": 2,
  "num_return_sequences": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": null,
  "pruned_heads": {},
  "repetition_penalty": 1.0,
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_dropout": 0.1,
  "summary_type

In [19]:
# Instantiate the LM-head model from the local weight file, using the
# configuration object prepared earlier in the notebook.
weights_file = './xlnet-base-bahasa-cased/pytorch_model.bin'
model = AutoModelWithLMHead.from_pretrained(weights_file, config=config)

In [20]:
# Combine the locally loaded model and tokenizer into a fill-mask pipeline.
fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)

In [22]:
# Smoke test: ask for completions of the <mask> position. The cell output is a
# list of candidates, each with the filled sequence, a score and a token id.
fill_mask('makan ayam dengan <mask>')

[{'sequence': 'makan ayam dengan trans<sep><cls>',
  'score': 0.03274673596024513,
  'token': 6666},
 {'sequence': 'makan ayam dengan<eod><sep><cls>',
  'score': 0.024489127099514008,
  'token': 7},
 {'sequence': 'makan ayam dengan pre<sep><cls>',
  'score': 0.017077995464205742,
  'token': 3207},
 {'sequence': 'makan ayam dengan smua<sep><cls>',
  'score': 0.015039748512208462,
  'token': 5283},
 {'sequence': 'makan ayam dengan dkat<sep><cls>',
  'score': 0.01295025646686554,
  'token': 12913}]

In [23]:
# Save the loaded model into the same folder the tokenizer was saved to.
model.save_pretrained('xlnet-base-bahasa-cased')

In [24]:
# Load the published checkpoint by its hub identifier (download progress is
# shown on first use). This rebinds `model` from the LM-head model to the
# plain XLNetModel.
hub_name = 'huseinzol05/xlnet-base-bahasa-cased'
model = XLNetModel.from_pretrained(hub_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1196.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467039971.0, style=ProgressStyle(descri…




In [25]:
# Replace the local tokenizer with the one published under the hub identifier.
hub_name = 'huseinzol05/xlnet-base-bahasa-cased'
tokenizer = XLNetTokenizer.from_pretrained(hub_name, do_lower_case=False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=778744.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=202.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




In [26]:
import torch

In [27]:
# Encode one sample sentence, special tokens included.
token_ids = tokenizer.encode("husein tk suka mkan ayam", add_special_tokens=True)
# Wrapping the id list in another list gives the tensor a leading dimension of size 1.
input_ids = torch.tensor([token_ids])

In [28]:
# Run the model on the encoded sentence with gradient tracking disabled and
# keep the first element of its output.
with torch.no_grad():
    model_outputs = model(input_ids)
    last_hidden_states = model_outputs[0]

# Last expression of the cell: the tensor is displayed as the output.
last_hidden_states


tensor([[[-0.2750, -0.1085, -0.1126,  ..., -3.1199, -5.3235, -3.4431],
         [ 3.9551, -0.0135, -2.7542,  ..., -1.8817,  0.4461,  0.7132],
         [ 3.3713, -0.5035, -2.0820,  ..., -1.2420, -3.6033,  0.8162],
         ...,
         [-1.6819,  0.0522, -1.6247,  ..., -0.2171, -1.2538, -2.2032],
         [ 2.4307,  3.4743, -1.6003,  ..., -3.7947,  5.2761, -0.1022],
         [ 1.1901,  4.7745, -3.9913,  ..., -2.2523,  0.3405,  2.0578]]])

In [30]:
# Repeat the fill-mask smoke test, this time with the LM-head model loaded by
# its hub identifier instead of from the local weight file.
hub_name = 'huseinzol05/xlnet-base-bahasa-cased'
model = AutoModelWithLMHead.from_pretrained(hub_name)
fill_mask = pipeline('fill-mask', tokenizer=tokenizer, model=model)
# Last expression of the cell: the candidate list is displayed as the output.
masked_text = 'makan ayam dengan <mask>'
fill_mask(masked_text)

[{'sequence': 'makan ayam dengan trans<sep><cls>',
  'score': 0.03274673596024513,
  'token': 6666},
 {'sequence': 'makan ayam dengan<eod><sep><cls>',
  'score': 0.024489127099514008,
  'token': 7},
 {'sequence': 'makan ayam dengan pre<sep><cls>',
  'score': 0.017077995464205742,
  'token': 3207},
 {'sequence': 'makan ayam dengan smua<sep><cls>',
  'score': 0.015039748512208462,
  'token': 5283},
 {'sequence': 'makan ayam dengan dkat<sep><cls>',
  'score': 0.01295025646686554,
  'token': 12913}]