# Finetuning NERGrit
NERGrit is a Named Entity Recognition dataset with 3 possible entity tags (`PERSON`, `PLACE`, `ORGANISATION`), annotated in the IOB chunking format.

In [1]:
import os, sys
sys.path.append('../')
os.chdir('../')

import random
import numpy as np
import pandas as pd
import torch
from torch import optim
from tqdm import tqdm

from transformers import BertConfig, BertTokenizer
from nltk.tokenize import word_tokenize

from modules.word_classification import BertForWordClassification
from utils.forward_fn import forward_word_classification
from utils.metrics import ner_metrics_fn
from utils.data_utils import NerGritDataset, NerDataLoader

In [2]:
###
# common functions
###
def set_seed(seed):
    """Seed Python's `random`, NumPy, and PyTorch (CPU and CUDA) with the same `seed`."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    
def count_param(module, trainable=False):
    """Return the total number of elements over `module.parameters()`.

    When `trainable` is truthy, only parameters with `requires_grad` set are counted.
    """
    params = module.parameters()
    if trainable:
        params = (p for p in params if p.requires_grad)
    return sum(p.numel() for p in params)
    
def get_lr(optimizer):
    """Return the `'lr'` of the first entry in `optimizer.param_groups` (None if there is none)."""
    first_group = next(iter(optimizer.param_groups), None)
    return None if first_group is None else first_group['lr']

def metrics_to_string(metric_dict):
    """Render metrics as space-separated `name:value` pairs, each value with two decimals."""
    return ' '.join(
        '{}:{:.2f}'.format(name, score) for name, score in metric_dict.items()
    )

In [3]:
# Seed Python, NumPy and PyTorch RNGs with one fixed value before anything else runs
set_seed(26092020)

# Load IndoBERT Model

In [4]:
# One checkpoint name feeds the tokenizer, the config and the model weights
pretrained_name = 'indobenchmark/indobert-base-p1'

# Load Tokenizer and Config; the label count comes from the dataset class
tokenizer = BertTokenizer.from_pretrained(pretrained_name)
config = BertConfig.from_pretrained(pretrained_name)
config.num_labels = NerGritDataset.NUM_LABELS

# Instantiate model
model = BertForWordClassification.from_pretrained(pretrained_name, config=config)

Some weights of BertForWordClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Display the module tree of the loaded model
model

BertForWordClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise

In [6]:
# Total parameter count (trainable defaults to False, so every parameter is included)
count_param(model)

124446727

# Prepare Named Entity Recognition Dataset (NERGrit)

In [7]:
# In-repo locations of the train/validation splits (kept for reference, not active)
# train_dataset_path = './dataset/nergrit_ner-grit/train_preprocess.txt'
# valid_dataset_path = './dataset/nergrit_ner-grit/valid_preprocess.txt'
test_dataset_path = './dataset/nergrit_ner-grit/test_preprocess_masked_label.txt'

# Active train/validation files are read from one directory above the working directory
train_dataset_path = './../train_preprocess.txt'
valid_dataset_path = './../valid_preprocess.txt'

In [8]:
# NOTE(review): this prints the number of characters in the path string,
# not the number of training examples
print(len(train_dataset_path))

25


In [9]:
# Build one dataset per split, all sharing the tokenizer and lowercasing
train_dataset, valid_dataset, test_dataset = (
    NerGritDataset(split_path, tokenizer, lowercase=True)
    for split_path in (train_dataset_path, valid_dataset_path, test_dataset_path)
)

batch_size = 128

# Settings common to all three loaders; only the training loader shuffles
loader_options = dict(max_seq_len=512, batch_size=batch_size, num_workers=16)
train_loader = NerDataLoader(dataset=train_dataset, shuffle=True, **loader_options)
valid_loader = NerDataLoader(dataset=valid_dataset, shuffle=False, **loader_options)
test_loader = NerDataLoader(dataset=test_dataset, shuffle=False, **loader_options)

In [10]:
# Label -> index and index -> label lookup tables defined on the dataset class
w2i = NerGritDataset.LABEL2INDEX
i2w = NerGritDataset.INDEX2LABEL
print(w2i, i2w, sep='\n')

{'I-PERSON': 0, 'B-ORGANISATION': 1, 'I-ORGANISATION': 2, 'B-PLACE': 3, 'I-PLACE': 4, 'O': 5, 'B-PERSON': 6}
{0: 'I-PERSON', 1: 'B-ORGANISATION', 2: 'I-ORGANISATION', 3: 'B-PLACE', 4: 'I-PLACE', 5: 'O', 6: 'B-PERSON'}


# Test the model on sample sentences (before fine-tuning)

In [11]:
def word_subword_tokenize(sentence, tokenizer):
    """Encode a sequence of words into subword ids plus a subword-to-word alignment.

    Args:
        sentence: iterable of word strings.
        tokenizer: object exposing `cls_token_id`, `sep_token_id` and
            `encode(word, add_special_tokens=False)`.

    Returns:
        `(subwords, subword_to_word_indices)`: two equal-length lists. The first is
        the CLS id, every word's subword ids in order, then the SEP id. The second
        holds, for each position, the index of the word it came from, with -1
        marking the CLS and SEP positions.
    """
    # CLS opens the sequence and belongs to no word
    subwords = [tokenizer.cls_token_id]
    subword_to_word_indices = [-1]

    for position, token in enumerate(sentence):
        pieces = tokenizer.encode(token, add_special_tokens=False)
        subwords.extend(pieces)
        subword_to_word_indices.extend([position] * len(pieces))

    # SEP closes the sequence and belongs to no word either
    subwords.append(tokenizer.sep_token_id)
    subword_to_word_indices.append(-1)

    return subwords, subword_to_word_indices

In [12]:
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
# text = word_tokenize('Bung Tomo adalah pahlawan nasional Republik Indonesia')
text = word_tokenize("jl. amd, komplek borneo lestari, blok 2, no. 30")
subwords, subword_to_word_indices = word_subword_tokenize(text, tokenizer)

# Shape both id lists as a batch of one on the model's device
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
subword_to_word_indices = torch.LongTensor(subword_to_word_indices).view(1, -1).to(model.device)

# Inference only, so no autograd graph is needed
with torch.no_grad():
    logits = model(subwords, subword_to_word_indices)[0]

# .cpu() before .numpy() so the cell also runs once the model has been moved to the GPU
preds = torch.topk(logits, k=1, dim=-1)[1].squeeze().cpu().numpy()
labels = [i2w[preds[i]] for i in range(len(preds))]

pd.DataFrame({'words': text, 'label': labels})

Unnamed: 0,words,label
0,jl,B-PERSON
1,.,B-PERSON
2,amd,I-ORGANISATION
3,",",B-PERSON
4,komplek,I-ORGANISATION
5,borneo,I-ORGANISATION
6,lestari,I-PERSON
7,",",B-PERSON
8,blok,O
9,2,I-PERSON


In [14]:
# text = word_tokenize('Budi pergi ke mall kelapa gading membeli kue bantal')
text = word_tokenize("raya samb gede, 299 toko bb kids")
subwords, subword_to_word_indices = word_subword_tokenize(text, tokenizer)

# Wrap each id list as a single-row long tensor on the model's device
subwords = torch.tensor([subwords], dtype=torch.long).to(model.device)
subword_to_word_indices = torch.tensor([subword_to_word_indices], dtype=torch.long).to(model.device)
logits = model(subwords, subword_to_word_indices)[0]

# Highest-scoring label index per position, mapped back to its tag name
preds = logits.topk(k=1, dim=-1).indices.squeeze().cpu().numpy()
labels = [i2w[label_id] for label_id in preds]

pd.DataFrame({'words': text, 'label': labels})

Unnamed: 0,words,label
0,raya,B-PERSON
1,samb,B-ORGANISATION
2,gede,I-PLACE
3,",",B-ORGANISATION
4,299,B-PLACE
5,toko,I-PERSON
6,bb,I-PERSON
7,kids,I-PERSON


In [15]:
# text = word_tokenize('Saya sudah sampai di depan menara bca')
text = word_tokenize("laundry kiloan restu ibu rasamala")
subwords, subword_to_word_indices = word_subword_tokenize(text, tokenizer)

# Wrap each id list as a single-row long tensor on the model's device
subwords = torch.tensor([subwords], dtype=torch.long).to(model.device)
subword_to_word_indices = torch.tensor([subword_to_word_indices], dtype=torch.long).to(model.device)
logits = model(subwords, subword_to_word_indices)[0]

# Highest-scoring label index per position, mapped back to its tag name
preds = logits.topk(k=1, dim=-1).indices.squeeze().cpu().numpy()
labels = [i2w[label_id] for label_id in preds]

pd.DataFrame({'words': text, 'label': labels})

Unnamed: 0,words,label
0,laundry,B-PERSON
1,kiloan,B-PERSON
2,restu,B-PERSON
3,ibu,B-PERSON
4,rasamala,I-PLACE


# Fine Tuning & Evaluation

In [16]:
# Move the model to the GPU first, then build the optimizer over the parameters
# it will actually train (the order recommended by the torch.optim documentation)
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

In [34]:
# Train
n_epochs = 1
# File that receives the weights whenever an epoch sets a new best (relative to the working directory)
checkpoint_path = 'best_model.pt'

# Best validation loss / F1 seen so far; both must improve for a checkpoint to be written
min_loss = sys.maxsize
max_f1 = 0
for epoch in range(n_epochs):
    model.train()
    torch.set_grad_enabled(True)

    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model (the last element of each batch is not passed to the forward function)
        loss, batch_hyp, batch_label = forward_word_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss

        # Accumulate predictions and references for the epoch-level metrics
        list_hyp += batch_hyp
        list_label += batch_label

        # Running mean of the loss over the batches seen so far
        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Calculate train metric
    metrics = ner_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

    # Evaluate on validation
    # NOTE: eval mode and disabled grad are global switches; both stay in effect
    # after the last epoch, so the cells that follow run with them.
    model.eval()
    torch.set_grad_enabled(False)

    total_loss = 0
    list_hyp, list_label = [], []

    pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
    for i, batch_data in enumerate(pbar):
        loss, batch_hyp, batch_label = forward_word_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Calculate total loss
        valid_loss = loss.item()
        total_loss = total_loss + valid_loss

        # Calculate evaluation metrics
        # (recomputed over everything seen so far, only to feed the progress bar)
        list_hyp += batch_hyp
        list_label += batch_label
        metrics = ner_metrics_fn(list_hyp, list_label)

        pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))

    metrics = ner_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/(i+1), metrics_to_string(metrics)))
    if total_loss/(i+1) < min_loss and metrics["F1"] > max_f1 :
        print("save model checkpoint")
        # Actually persist the weights the message announces
        torch.save(model.state_dict(), checkpoint_path)
        min_loss = total_loss/(i+1)
        max_f1 = metrics["F1"]

(Epoch 1) TRAIN LOSS:0.4593 LR:0.00002000: 100%|██████████| 1483/1483 [06:02<00:00,  4.09it/s]
  0%|          | 0/165 [00:00<?, ?it/s]

(Epoch 1) TRAIN LOSS:0.4593 ACC:0.99 F1:0.94 REC:0.93 PRE:0.94 LR:0.00002000


VALID LOSS:0.4507 ACC:0.98 F1:0.89 REC:0.90 PRE:0.89: 100%|██████████| 165/165 [01:19<00:00,  2.07it/s]


(Epoch 1) VALID LOSS:0.4507 ACC:0.98 F1:0.89 REC:0.90 PRE:0.89
save model checkpoint


In [35]:
# Evaluate on test
# model.eval()
# torch.set_grad_enabled(False)

# total_loss, total_correct, total_labels = 0, 0, 0
# list_hyp, list_label = [], []

# pbar = tqdm(test_loader, leave=True, total=len(test_loader))
# for i, batch_data in enumerate(pbar):
#     _, batch_hyp, _ = forward_word_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
#     list_hyp += batch_hyp

# # Save prediction
# df = pd.DataFrame({'label':list_hyp}).reset_index()
# df.to_csv('pred.txt', index=False)

# print(df)

# Test fine-tuned model with sample sentences

In [36]:
# text = word_tokenize('Bung Tomo adalah pahlawan nasional Republik Indonesia')
text = word_tokenize("jl. amd, komplek borneo lestari, blok 2, no. 30")

subwords, subword_to_word_indices = word_subword_tokenize(text, tokenizer)

# Wrap each id list as a single-row long tensor on the model's device
subwords = torch.tensor([subwords], dtype=torch.long).to(model.device)
subword_to_word_indices = torch.tensor([subword_to_word_indices], dtype=torch.long).to(model.device)
logits = model(subwords, subword_to_word_indices)[0]

# Highest-scoring label index per position, mapped back to its tag name
preds = logits.topk(k=1, dim=-1).indices.squeeze().cpu().numpy()
labels = [i2w[label_id] for label_id in preds]

pd.DataFrame({'words': text, 'label': labels})

Unnamed: 0,words,label
0,jl,B-PLACE
1,.,B-PLACE
2,amd,I-PLACE
3,",",O
4,komplek,B-ORGANISATION
5,borneo,I-ORGANISATION
6,lestari,I-ORGANISATION
7,",",O
8,blok,O
9,2,O


In [37]:
# text = word_tokenize('Budi pergi ke mall kelapa gading membeli kue bantal')
# text = word_tokenize("raya samb gede, 299 toko bb kids")
text = word_tokenize("urip sumoharjo 59 mangkujayan cv. tri saka buana ponorogo")
subwords, subword_to_word_indices = word_subword_tokenize(text, tokenizer)

# Wrap each id list as a single-row long tensor on the model's device
subwords = torch.tensor([subwords], dtype=torch.long).to(model.device)
subword_to_word_indices = torch.tensor([subword_to_word_indices], dtype=torch.long).to(model.device)
logits = model(subwords, subword_to_word_indices)[0]

# Highest-scoring label index per position, mapped back to its tag name
preds = logits.topk(k=1, dim=-1).indices.squeeze().cpu().numpy()
labels = [i2w[label_id] for label_id in preds]

pd.DataFrame({'words': text, 'label': labels})

Unnamed: 0,words,label
0,urip,B-PLACE
1,sumoharjo,I-PLACE
2,59,O
3,mangkujayan,B-ORGANISATION
4,cv,I-ORGANISATION
5,.,I-ORGANISATION
6,tri,I-ORGANISATION
7,saka,I-ORGANISATION
8,buana,I-ORGANISATION
9,ponorogo,O


In [38]:
# text = word_tokenize('Saya sudah sampai di depan menara bca')
text = word_tokenize("laundry kiloan restu ibu rasamala")
subwords, subword_to_word_indices = word_subword_tokenize(text, tokenizer)

# Wrap each id list as a single-row long tensor on the model's device
subwords = torch.tensor([subwords], dtype=torch.long).to(model.device)
subword_to_word_indices = torch.tensor([subword_to_word_indices], dtype=torch.long).to(model.device)
logits = model(subwords, subword_to_word_indices)[0]

# Highest-scoring label index per position, mapped back to its tag name
preds = logits.topk(k=1, dim=-1).indices.squeeze().cpu().numpy()
labels = [i2w[label_id] for label_id in preds]

pd.DataFrame({'words': text, 'label': labels})

Unnamed: 0,words,label
0,laundry,B-ORGANISATION
1,kiloan,I-ORGANISATION
2,restu,I-ORGANISATION
3,ibu,I-ORGANISATION
4,rasamala,B-PLACE


In [39]:
def _collect_entity_words(words, labels, entity):
    """Join with single spaces every word lying in a `B-<entity>` [`I-<entity>` ...] span."""
    begin_tag, inside_tag = "B-" + entity, "I-" + entity
    picked = []
    in_span = False
    # zip() stops at the shorter sequence, so a word/label count mismatch cannot raise
    for word, label in zip(words, labels):
        # A begin tag always opens or continues a span; an inside tag counts only
        # when the previous word was already part of a span.
        in_span = label == begin_tag or (in_span and label == inside_tag)
        if in_span:
            picked.append(word)
    return " ".join(picked)


def extract_poi_street(text) :
    """Tag an address with the global `model` and return it as `"<POI>/<street>"`.

    Words in ORGANISATION spans form the POI part and words in PLACE spans form
    the street part. Words of several separate spans of the same kind are joined
    with single spaces. Either part may be empty.

    Relies on the notebook globals `model`, `tokenizer` and `i2w`; the caller is
    expected to have put `model` in eval mode.

    Args:
        text: raw address string. Anything that is not a non-blank string
            (e.g. "", whitespace, NaN from pandas) yields "/".

    Returns:
        A string of the form "POI/street".
    """
    if not isinstance(text, str) or not text.strip():
        return "/"
    words = word_tokenize(text)
    if not words:
        return "/"
    subwords, subword_to_word_indices = word_subword_tokenize(words, tokenizer)

    subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
    subword_to_word_indices = torch.LongTensor(subword_to_word_indices).view(1, -1).to(model.device)
    # Inference only: do not depend on grad mode having been disabled elsewhere
    with torch.no_grad():
        logits = model(subwords, subword_to_word_indices)[0]

    # reshape(-1) keeps a 1-D result even for a single word, where squeeze() would give a 0-d array
    preds = torch.topk(logits, k=1, dim=-1)[1].reshape(-1).cpu().numpy()
    labels = [i2w[label_id] for label_id in preds.tolist()]

    poi = _collect_entity_words(words, labels, "ORGANISATION")
    street = _collect_entity_words(words, labels, "PLACE")
    return "{}/{}".format(poi, street)

#     return pd.DataFrame({'words': text, 'label': labels})

In [40]:
# Sample addresses tried so far; only the last one is passed to the extractor
sample_addresses = [
    "primkob pabri adiwerna",
    "jalan mh thamrin, sei rengas i kel. medan kota",
    "laundry kiloan restu ibu rasamala",
    "adi",
    "urip sumoharjo 59 mangkujayan cv. tri saka buana ponorogo",
]
s = sample_addresses[-1]

extract_poi_street(s)

'mangkujayan cv . tri saka buana/urip sumoharjo'

In [41]:
# Load the addresses to label; the path is relative to the working directory
df = pd.read_csv("../test.csv")

In [42]:
# Preview the raw input frame (id, raw_address)
df

Unnamed: 0,id,raw_address
0,0,s. par 53 sidanegara 4 cilacap tengah
1,1,"angg per, baloi indah kel. lubuk baja"
2,2,"asma laun, mand imog,"
3,3,"ud agung rej, raya nga sri wedari karanganyar"
4,4,"cut mutia, 35 baiturrahman"
...,...,...
49995,49995,toko mbak farid semboro semboro
49996,49996,"vie - tk. ridho kids, vete 3 cari, 16720 ciawi"
49997,49997,"mart dan roti bakar malabar, nasio,"
49998,49998,graha indah pamulang jl. mujair raya bambu apu...


In [43]:
# Run the extractor on every raw address and store the "POI/street" result alongside it
raw_addresses = df["raw_address"]
df["POI/street"] = raw_addresses.map(extract_poi_street)

In [44]:
# Keep only the submission columns. errors="ignore" lets this cell be re-run
# after the column is already gone instead of raising KeyError.
df = df.drop(columns=["raw_address"], errors="ignore")

In [45]:
# Preview the submission frame (id, POI/street)
df

Unnamed: 0,id,POI/street
0,0,/s. par
1,1,/angg per
2,2,/mand imog
3,3,/raya nga
4,4,/cut mutia
...,...,...
49995,49995,toko mbak farid/
49996,49996,vie - tk . ridho kids/vete 3 cari
49997,49997,mart dan roti bakar malabar/nasio
49998,49998,graha indah/jl . mujair raya


In [46]:
from datetime import datetime 
import pytz 
  
# Current time in the Singapore timezone, used to name the submission file
SGT = pytz.timezone('Singapore')
datetime_sgt = datetime.now(SGT)
# No ':' in the stamp: it ends up in a file name, and colons are not valid there on Windows
time_now = datetime_sgt.strftime('%Y%m%d-%H%M%S')

In [47]:
# Write the frame, without its index column, to a timestamped CSV one directory above the working directory
df.to_csv("../submission-{}.csv".format(time_now), index=False)