In [1]:
from transformers import AlbertTokenizer, BertModel, BertConfig, AutoTokenizer, AutoModelWithLMHead, pipeline

In [2]:
import os

out = 'bert-base-bahasa'
os.makedirs(out, exist_ok=True)

In [3]:
tokenizer = AlbertTokenizer('sp10m.cased.bert.model', unk_token='[UNK]',pad_token='[PAD]', do_lower_case = False)
tokenizer.save_pretrained('bert-base-bahasa')

('bert-base-bahasa/spiece.model',
 'bert-base-bahasa/special_tokens_map.json',
 'bert-base-bahasa/added_tokens.json')

In [4]:
tokenizer = AlbertTokenizer.from_pretrained('./bert-base-bahasa', 
                                            unk_token='[UNK]',pad_token='[PAD]', do_lower_case = False)

In [5]:
config = BertConfig(f'tiny-bert-bahasa-cased-parliament/config.json')
config.vocab_size = 32000
config.hidden_size = 312
config.intermediate_size = 1200

In [6]:
config

BertConfig {
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": null,
  "do_sample": false,
  "eos_token_ids": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 312,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1200,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_beams": 1,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "num_return_sequences": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": null,
  "pruned_heads": {},
  "repetition_penalty": 1.0,
  "temperature": 1.0,
  "top_k": 50,
  "top_p": 1.0,
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size

In [11]:
model = AutoModelWithLMHead.from_pretrained('./tiny-bert-bahasa-cased-combined/pytorch_model.bin', config = config)

In [12]:
fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)

In [14]:
fill_mask('makan ayam dengan [MASK]')

[{'sequence': '[CLS] makan ayam dengan menengah[SEP]',
  'score': 0.00014604821626562625,
  'token': 4705},
 {'sequence': '[CLS] makan ayam dengan berita[SEP]',
  'score': 0.00014147661568131298,
  'token': 1395},
 {'sequence': '[CLS] makan ayam dengan diakses[SEP]',
  'score': 0.00014008581638336182,
  'token': 22560},
 {'sequence': '[CLS] makan ayam dengan bertutur[SEP]',
  'score': 0.00013591423339676112,
  'token': 20570},
 {'sequence': '[CLS] makan ayam dengan percaya[SEP]',
  'score': 0.0001338910369668156,
  'token': 937}]

In [1]:
!tar cvzf tiny-bert-01-04-2020-twitter.tar.gz tiny-bert-bahasa-cased-combined

tiny-bert-bahasa-cased-combined/
tiny-bert-bahasa-cased-combined/pytorch_model.bin
tiny-bert-bahasa-cased-combined/log.txt
tiny-bert-bahasa-cased-combined/config.json
tiny-bert-bahasa-cased-combined/.ipynb_checkpoints/


In [2]:
import boto3

bucketName = 'huseinhouse-storage'
Key = 'tiny-bert-01-04-2020-twitter.tar.gz'
outPutname = "bert-bahasa/tiny-bert-01-04-2020-twitter.tar.gz"

s3 = boto3.client('s3')
s3.upload_file(Key,bucketName,outPutname)