In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys
# sys.path.append('../')
# os.chdir('../')

import random
import numpy as np
import pandas as pd
import torch
from torch import optim
from tqdm import tqdm

from transformers import BertConfig, BertTokenizer
from nltk.tokenize import word_tokenize

from nergrit.word_classification import BertForWordClassification
from nergrit.forward_fn import forward_word_classification
from nergrit.metrics import ner_metrics_fn
from nergrit.data import NerKtpDataset, NerDataLoader

In [3]:
###
# common functions
###
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    
def count_param(module, trainable=False):
    if trainable:
        return sum(p.numel() for p in module.parameters() if p.requires_grad)
    else:
        return sum(p.numel() for p in module.parameters())
    
def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

def metrics_to_string(metric_dict):
    string_list = []
    for key, value in metric_dict.items():
        string_list.append('{}:{:.2f}'.format(key, value))
    return ' '.join(string_list)

In [4]:
# Set random seed
set_seed(26092020)

# Load IndoBert Model

In [5]:
# Load Tokenizer and Config
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p2')
config = BertConfig.from_pretrained('indobenchmark/indobert-base-p2')
config.num_labels = NerKtpDataset.NUM_LABELS

# Instantiate model
model = BertForWordClassification.from_pretrained('indobenchmark/indobert-base-p2', config=config)

Some weights of BertForWordClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
model

BertForWordClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise

In [7]:
count_param(model)

124494405

# Prepare Named Entity Recognition Dataset (NERGrit)

In [8]:
train_dataset_path = 'data/idcard/ktp_ner_train_dataset.csv'
valid_dataset_path = 'data/idcard/ktp_ner_valid_dataset.csv'
test_dataset_path = 'data/idcard/ktp_ner_test_dataset.csv'

In [65]:
%%time
train_dataset = NerKtpDataset(train_dataset_path, tokenizer, lowercase=False)
valid_dataset = NerKtpDataset(valid_dataset_path, tokenizer, lowercase=False)
test_dataset = NerKtpDataset(test_dataset_path, tokenizer, lowercase=False)

100%|██████████| 8250/8250 [00:42<00:00, 192.20it/s]
100%|██████████| 1788/1788 [00:08<00:00, 198.84it/s]
100%|██████████| 962/962 [00:04<00:00, 199.77it/s]

CPU times: user 57.1 s, sys: 150 ms, total: 57.2 s
Wall time: 56.9 s





In [66]:
train_loader = NerDataLoader(dataset=train_dataset, max_seq_len=512, batch_size=8, num_workers=16, shuffle=True)  
valid_loader = NerDataLoader(dataset=valid_dataset, max_seq_len=512, batch_size=8, num_workers=16, shuffle=False)  
test_loader = NerDataLoader(dataset=test_dataset, max_seq_len=512, batch_size=8, num_workers=16, shuffle=False)

In [67]:
w2i, i2w = NerKtpDataset.LABEL2INDEX, NerKtpDataset.INDEX2LABEL
print(w2i)
print(i2w)

{'U-FLD_PROV': 0, 'B-VAL_PROV': 1, 'L-VAL_PROV': 2, 'U-FLD_KAB': 3, 'U-VAL_KAB': 4, 'U-FLD_NIK': 5, 'U-VAL_NIK': 6, 'U-FLD_NAMA': 7, 'B-VAL_NAMA': 8, 'L-VAL_NAMA': 9, 'B-FLD_TTL': 10, 'L-FLD_TTL': 11, 'B-VAL_TTL': 12, 'L-VAL_TTL': 13, 'B-FLD_GDR': 14, 'L-FLD_GDR': 15, 'U-VAL_GDR': 16, 'B-FLD_GLD': 17, 'L-FLD_GLD': 18, 'U-VAL_GLD': 19, 'U-FLD_ADR': 20, 'B-VAL_ADR': 21, 'I-VAL_ADR': 22, 'L-VAL_ADR': 23, 'U-FLD_RTW': 24, 'U-VAL_RTW': 25, 'U-FLD_KLH': 26, 'U-VAL_KLH': 27, 'U-FLD_KCM': 28, 'U-VAL_KCM': 29, 'U-FLD_RLG': 30, 'U-VAL_RLG': 31, 'B-FLD_KWN': 32, 'L-FLD_KWN': 33, 'B-VAL_KWN': 34, 'L-VAL_KWN': 35, 'U-FLD_KRJ': 36, 'U-VAL_KRJ': 37, 'U-FLD_WRG': 38, 'U-VAL_WRG': 39, 'B-FLD_BLK': 40, 'L-FLD_BLK': 41, 'B-VAL_BLK': 42, 'L-VAL_BLK': 43, 'U-VAL_SGP': 44, 'U-VAL_SGD': 45, 'B-VAL_KAB': 46, 'L-VAL_KAB': 47, 'U-VAL_NAMA': 48, 'B-VAL_KLH': 49, 'L-VAL_KLH': 50, 'B-VAL_KRJ': 51, 'I-VAL_KRJ': 52, 'L-VAL_KRJ': 53, 'B-VAL_SGP': 54, 'L-VAL_SGP': 55, 'I-VAL_TTL': 56, 'L-VAL_KCM': 57, 'B-VAL_KCM': 58,

In [68]:
batch = next(iter(train_loader))
subword_batch, mask_batch, subword_to_word_indices_batch, seq_label_batch, seq_list = batch
seq_label_batch

array([[   0,    1,    2,    3,    4,    5,    6,    7,    8,   61,    9,
          10,   11,   12,   13,   14,   15,   16,   17,   18,   19,   20,
          21,   22,   22,   22,   23,   24,   25,   26,   27,   28,   29,
          30,   31,   32,   33,   59,   36,   37,   38,   39,   40,   41,
          42,   43,   44,   45, -100, -100, -100, -100, -100],
       [   0,    1,    2,    3,    4,    5,    6,    7,    8,   61,    9,
          10,   11,   12,   13,   14,   15,   16,   17,   18,   19,   20,
          21,   22,   22,   22,   23,   24,   25,   26,   27,   28,   29,
          30,   31,   32,   33,   59,   36,   51,   53,   38,   39,   40,
          41,   42,   43,   44,   45, -100, -100, -100, -100],
       [   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
          11,   12,   13,   14,   15,   16,   17,   18,   19,   20,   21,
          23,   24,   25,   26,   27,   28,   29,   30,   31,   32,   33,
          59,   36,   51,   53,   38,   39,   40,   41,   42

# Test model on sample sentences

In [69]:
def word_subword_tokenize(sentence, tokenizer):
    # Add CLS token
    subwords = [tokenizer.cls_token_id]
    subword_to_word_indices = [-1] # For CLS

    # Add subwords
    for word_idx, word in enumerate(sentence):
        subword_list = tokenizer.encode(word, add_special_tokens=False)
        subword_to_word_indices += [word_idx for i in range(len(subword_list))]
        subwords += subword_list

    # Add last SEP token
    subwords += [tokenizer.sep_token_id]
    subword_to_word_indices += [-1]

    return subwords, subword_to_word_indices

In [70]:
text = word_tokenize('Nama Lalu Erfandi Maula Yusnu')
subwords, subword_to_word_indices = word_subword_tokenize(text, tokenizer)

subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
subword_to_word_indices = torch.LongTensor(subword_to_word_indices).view(1, -1).to(model.device)
logits = model(subwords, subword_to_word_indices)[0]

preds = torch.topk(logits, k=1, dim=-1)[1].squeeze().numpy()
labels = [i2w[preds[i]] for i in range(len(preds))]

pd.DataFrame({'words': text, 'label': labels})

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [71]:
text = word_tokenize('Alamat JL MUSIUM NO 19 A')
subwords, subword_to_word_indices = word_subword_tokenize(text, tokenizer)

subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
subword_to_word_indices = torch.LongTensor(subword_to_word_indices).view(1, -1).to(model.device)
logits = model(subwords, subword_to_word_indices)[0]

preds = torch.topk(logits, k=1, dim=-1)[1].squeeze().cpu().numpy()
labels = [i2w[preds[i]] for i in range(len(preds))]

pd.DataFrame({'words': text, 'label': labels})

Unnamed: 0,words,label
0,Alamat,U-FLD_ADR
1,JL,B-VAL_ADR
2,MUSIUM,I-VAL_ADR
3,NO,I-VAL_ADR
4,19,I-VAL_ADR
5,A,L-VAL_ADR


In [72]:
text = word_tokenize('PROVINSI NUSA TENGGARA BARAT')
subwords, subword_to_word_indices = word_subword_tokenize(text, tokenizer)

subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
subword_to_word_indices = torch.LongTensor(subword_to_word_indices).view(1, -1).to(model.device)
logits = model(subwords, subword_to_word_indices)[0]

preds = torch.topk(logits, k=1, dim=-1)[1].squeeze().cpu().numpy()
labels = [i2w[preds[i]] for i in range(len(preds))]

pd.DataFrame({'words': text, 'label': labels})

Unnamed: 0,words,label
0,PROVINSI,U-FLD_PROV
1,NUSA,B-VAL_PROV
2,TENGGARA,I-VAL_PROV
3,BARAT,L-VAL_PROV


# Fine Tuning & Evaluation

In [73]:
optimizer = optim.Adam(model.parameters(), lr=2e-5)
model = model.cuda()

In [None]:
# Train
n_epochs = 8
for epoch in range(n_epochs):
    model.train()
    torch.set_grad_enabled(True)
 
    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model
        loss, batch_hyp, batch_label = forward_word_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss

        # Calculate metrics
        list_hyp += batch_hyp
        list_label += batch_label

        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Calculate train metric
    metrics = ner_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

    # Evaluate on validation
    model.eval()
    torch.set_grad_enabled(False)
    
    total_loss, total_correct, total_labels = 0, 0, 0
    list_hyp, list_label = [], []

    pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
    for i, batch_data in enumerate(pbar):
        batch_seq = batch_data[-1]        
        loss, batch_hyp, batch_label = forward_word_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
        
        # Calculate total loss
        valid_loss = loss.item()
        total_loss = total_loss + valid_loss

        # Calculate evaluation metrics
        list_hyp += batch_hyp
        list_label += batch_label
        metrics = ner_metrics_fn(list_hyp, list_label)

        pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))
        
    metrics = ner_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/(i+1), metrics_to_string(metrics)))

(Epoch 1) TRAIN LOSS:0.0663 LR:0.00002000: 100%|██████████| 1032/1032 [01:52<00:00,  9.15it/s]
  0%|          | 0/224 [00:00<?, ?it/s]

(Epoch 1) TRAIN LOSS:0.0663 ACC:0.99 F1:0.98 REC:0.98 PRE:0.98 LR:0.00002000


VALID LOSS:0.1019 ACC:0.99 F1:0.98 REC:0.98 PRE:0.98: 100%|██████████| 224/224 [00:39<00:00,  5.65it/s]
  0%|          | 0/1032 [00:00<?, ?it/s]

(Epoch 1) VALID LOSS:0.1019 ACC:0.99 F1:0.98 REC:0.98 PRE:0.98


(Epoch 2) TRAIN LOSS:0.0482 LR:0.00002000: 100%|██████████| 1032/1032 [01:52<00:00,  9.15it/s]
  0%|          | 0/224 [00:00<?, ?it/s]

(Epoch 2) TRAIN LOSS:0.0482 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00002000


VALID LOSS:0.0935 ACC:0.99 F1:0.98 REC:0.98 PRE:0.98: 100%|██████████| 224/224 [00:39<00:00,  5.65it/s]
  0%|          | 0/1032 [00:00<?, ?it/s]

(Epoch 2) VALID LOSS:0.0935 ACC:0.99 F1:0.98 REC:0.98 PRE:0.98


(Epoch 3) TRAIN LOSS:0.0374 LR:0.00002000: 100%|██████████| 1032/1032 [01:52<00:00,  9.17it/s]
  0%|          | 0/224 [00:00<?, ?it/s]

(Epoch 3) TRAIN LOSS:0.0374 ACC:1.00 F1:0.99 REC:0.99 PRE:0.99 LR:0.00002000


VALID LOSS:0.0972 ACC:0.99 F1:0.98 REC:0.98 PRE:0.98: 100%|██████████| 224/224 [00:39<00:00,  5.64it/s]
  0%|          | 0/1032 [00:00<?, ?it/s]

(Epoch 3) VALID LOSS:0.0972 ACC:0.99 F1:0.98 REC:0.98 PRE:0.98


(Epoch 4) TRAIN LOSS:0.0311 LR:0.00002000: 100%|██████████| 1032/1032 [01:53<00:00,  9.11it/s]
  0%|          | 0/224 [00:00<?, ?it/s]

(Epoch 4) TRAIN LOSS:0.0311 ACC:1.00 F1:0.99 REC:0.99 PRE:0.99 LR:0.00002000


VALID LOSS:0.0933 ACC:0.99 F1:0.98 REC:0.98 PRE:0.98: 100%|██████████| 224/224 [00:39<00:00,  5.62it/s]
  0%|          | 0/1032 [00:00<?, ?it/s]

(Epoch 4) VALID LOSS:0.0933 ACC:0.99 F1:0.98 REC:0.98 PRE:0.98


(Epoch 5) TRAIN LOSS:0.0256 LR:0.00002000: 100%|██████████| 1032/1032 [01:52<00:00,  9.16it/s]
  0%|          | 0/224 [00:00<?, ?it/s]

(Epoch 5) TRAIN LOSS:0.0256 ACC:1.00 F1:0.99 REC:0.99 PRE:0.99 LR:0.00002000


VALID LOSS:0.0916 ACC:0.99 F1:0.98 REC:0.98 PRE:0.98: 100%|██████████| 224/224 [00:39<00:00,  5.64it/s]
  0%|          | 0/1032 [00:00<?, ?it/s]

(Epoch 5) VALID LOSS:0.0916 ACC:0.99 F1:0.98 REC:0.98 PRE:0.98


(Epoch 6) TRAIN LOSS:0.0209 LR:0.00002000: 100%|██████████| 1032/1032 [01:53<00:00,  9.12it/s]
  0%|          | 0/224 [00:00<?, ?it/s]

(Epoch 6) TRAIN LOSS:0.0209 ACC:1.00 F1:0.99 REC:0.99 PRE:0.99 LR:0.00002000


VALID LOSS:0.0872 ACC:0.99 F1:0.98 REC:0.98 PRE:0.98: 100%|██████████| 224/224 [00:39<00:00,  5.66it/s]
  0%|          | 0/1032 [00:00<?, ?it/s]

(Epoch 6) VALID LOSS:0.0872 ACC:0.99 F1:0.98 REC:0.98 PRE:0.98


(Epoch 7) TRAIN LOSS:0.0185 LR:0.00002000: 100%|██████████| 1032/1032 [01:53<00:00,  9.12it/s]
  0%|          | 0/224 [00:00<?, ?it/s]

(Epoch 7) TRAIN LOSS:0.0185 ACC:1.00 F1:0.99 REC:0.99 PRE:0.99 LR:0.00002000


VALID LOSS:0.0896 ACC:0.99 F1:0.98 REC:0.98 PRE:0.98: 100%|██████████| 224/224 [00:39<00:00,  5.65it/s]
  0%|          | 0/1032 [00:00<?, ?it/s]

(Epoch 7) VALID LOSS:0.0896 ACC:0.99 F1:0.98 REC:0.98 PRE:0.98


(Epoch 8) TRAIN LOSS:0.0168 LR:0.00002000:  57%|█████▋    | 584/1032 [01:04<00:48,  9.28it/s]

In [None]:
# Evaluate on test
model.eval()
torch.set_grad_enabled(False)

total_loss, total_correct, total_labels = 0, 0, 0
list_hyp, list_label = [], []

pbar = tqdm(test_loader, leave=True, total=len(test_loader))
for i, batch_data in enumerate(pbar):
    _, batch_hyp, _ = forward_word_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
    list_hyp += batch_hyp

# Save prediction
df = pd.DataFrame({'label':list_hyp}).reset_index()
df.to_csv('pred.txt', index=False)

print(df)

# Test fine-tuned model with sample sentences

In [None]:
text = word_tokenize('Jl Jendral Sudirman No 15')
subwords, subword_to_word_indices = word_subword_tokenize(text, tokenizer)

subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
subword_to_word_indices = torch.LongTensor(subword_to_word_indices).view(1, -1).to(model.device)
logits = model(subwords, subword_to_word_indices)[0]

preds = torch.topk(logits, k=1, dim=-1)[1].squeeze().cpu().numpy()
labels = [i2w[preds[i]] for i in range(len(preds))]

pd.DataFrame({'words': text, 'label': labels})

In [None]:
text = word_tokenize('Budi pergi ke mall kelapa gading membeli kue bantal')
subwords, subword_to_word_indices = word_subword_tokenize(text, tokenizer)

subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
subword_to_word_indices = torch.LongTensor(subword_to_word_indices).view(1, -1).to(model.device)
logits = model(subwords, subword_to_word_indices)[0]

preds = torch.topk(logits, k=1, dim=-1)[1].squeeze().cpu().numpy()
labels = [i2w[preds[i]] for i in range(len(preds))]

pd.DataFrame({'words': text, 'label': labels})

In [None]:
text = word_tokenize('Saya sudah sampai di depan menara bca')
subwords, subword_to_word_indices = word_subword_tokenize(text, tokenizer)

subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
subword_to_word_indices = torch.LongTensor(subword_to_word_indices).view(1, -1).to(model.device)
logits = model(subwords, subword_to_word_indices)[0]

preds = torch.topk(logits, k=1, dim=-1)[1].squeeze().cpu().numpy()
labels = [i2w[preds[i]] for i in range(len(preds))]

pd.DataFrame({'words': text, 'label': labels})