In [6]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
import os
import json
import logging
import random
from pathlib import Path

import cv2 as cv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm, trange

import torch
import torch.nn as nn
import torch.optim as optim

import pytorch_lightning as pl
from pytorch_lightning import loggers as pl_loggers



from laylm.data import utils 
from laylm.data import loader
from laylm.data.dataset import IDCardDataset
from laylm.config import label as label_cfg
from laylm.trainer.task import TaskLayoutLM


from transformers import BertTokenizer, AutoModel
from transformers import (
    WEIGHTS_NAME,
    AdamW,
    BertConfig,
    BertTokenizer,
    BertForTokenClassification,
    LayoutLMConfig,
    LayoutLMForTokenClassification,
    get_linear_schedule_with_warmup,
)

from seqeval.metrics import (
    classification_report,
    f1_score,
    precision_score,
    recall_score,
    accuracy_score,
)

In [8]:
SAVED_CHECKPOINT_PATH = "../checkpoints/"
SAVED_LOGS_PATH = "../logs/"

SUMMARY = "top"
MANUAL_SEED = 1261
MAX_EPOCH = 10
MAX_STEPS = None
VALCHECK_INTERVAL = 2000
NUM_GPUS = 1
DISTRIBUTED_BACKEND = None
LOG_FREQ = 100
DETERMINISTIC = True
BENCHMARK = True
CHECKPOINT_PATH = None
BATCH_SIZE = 4
NUM_WORKERS=16

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [9]:
tokenizer = BertTokenizer.from_pretrained(
    "indobenchmark/indobert-base-p2",
    do_lower_case=True,
    cache_dir=None,
)

In [10]:
path = '/data/idcard/combined/1606753021/'
train_loader, valid_loader = loader.get_loader(
    path, tokenizer=tokenizer, 
    batch_size=BATCH_SIZE, 
    num_workers=NUM_WORKERS,
    rand_seq=True,
    rand_seq_prob=0.5
)

In [11]:
len(valid_loader.dataset),(len(train_loader.dataset))

(19039, 76153)

In [12]:
config = LayoutLMConfig.from_pretrained(
    "microsoft/layoutlm-base-uncased",
    num_labels=label_cfg.num_labels,
    cache_dir=None
)

In [13]:
model = LayoutLMForTokenClassification.from_pretrained(
    'microsoft/layoutlm-base-uncased',
    config=config,
#     return_dict=True
)

model.resize_token_embeddings(len(tokenizer))

Some weights of the model checkpoint at microsoft/layoutlm-base-uncased were not used when initializing LayoutLMForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing LayoutLMForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LayoutLMForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LayoutLMForTokenClassification were not initialized from the model checkpoint at microsoft

Embedding(30521, 768)

In [14]:
task = TaskLayoutLM(model, tokenizer)

In [15]:
ckpt_path = '../checkpoints/layoutlm-v2-epoch=6.ckpt'
task = TaskLayoutLM.load_from_checkpoint(ckpt_path, model=model, tokenizer=tokenizer)
model = task.model
model = model.to(device)

In [12]:
# DEFAULTS used by the Trainer
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    filepath=SAVED_CHECKPOINT_PATH,
#     save_top_k=1,
    verbose=True,
#     monitor='val_loss',
#     mode='min',
    prefix='layoutlm-v2'
)

tb_logger = pl_loggers.TensorBoardLogger(SAVED_LOGS_PATH)
pl.trainer.seed_everything(MANUAL_SEED)

1261

In [None]:
trainer = pl.Trainer(
    weights_summary=SUMMARY,
    max_epochs=MAX_EPOCH,
    max_steps=MAX_STEPS,
    val_check_interval=VALCHECK_INTERVAL,
    gpus=NUM_GPUS,
    distributed_backend=DISTRIBUTED_BACKEND,
    log_every_n_steps=LOG_FREQ,
    deterministic=DETERMINISTIC,
    benchmark=BENCHMARK,
    logger=tb_logger, 
    checkpoint_callback=checkpoint_callback, 
    resume_from_checkpoint=CHECKPOINT_PATH
)

trainer.fit(task, train_loader, valid_loader)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                           | Params
---------------------------------------------------------
0 | model | LayoutLMForTokenClassification | 112 M 


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 0: val_loss reached 0.05990 (best 0.05990), saving model to ../checkpoints/layoutlm-v2-epoch=0.ckpt as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 0: val_loss reached 0.04429 (best 0.04429), saving model to ../checkpoints/layoutlm-v2-epoch=0-v0.ckpt as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 0: val_loss reached 0.03862 (best 0.03862), saving model to ../checkpoints/layoutlm-v2-epoch=0.ckpt as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 0: val_loss reached 0.03537 (best 0.03537), saving model to ../checkpoints/layoutlm-v2-epoch=0-v0.ckpt as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 0: val_loss reached 0.03408 (best 0.03408), saving model to ../checkpoints/layoutlm-v2-epoch=0.ckpt as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 0: val_loss reached 0.03183 (best 0.03183), saving model to ../checkpoints/layoutlm-v2-epoch=0-v0.ckpt as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 0: val_loss reached 0.03039 (best 0.03039), saving model to ../checkpoints/layoutlm-v2-epoch=0.ckpt as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 0: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 0: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 1: val_loss reached 0.03005 (best 0.03005), saving model to ../checkpoints/layoutlm-v2-epoch=1.ckpt as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 1: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 1: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 1: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 1: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 1: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 1: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 1: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 1: val_loss reached 0.03001 (best 0.03001), saving model to ../checkpoints/layoutlm-v2-epoch=1-v0.ckpt as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 2: val_loss reached 0.03001 (best 0.03001), saving model to ../checkpoints/layoutlm-v2-epoch=2.ckpt as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 2: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 2: val_loss reached 0.02991 (best 0.02991), saving model to ../checkpoints/layoutlm-v2-epoch=2-v0.ckpt as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 2: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 2: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 2: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 2: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 2: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 2: val_loss reached 0.02976 (best 0.02976), saving model to ../checkpoints/layoutlm-v2-epoch=2.ckpt as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 3: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 3: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 3: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 3: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 3: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 3: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 3: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 3: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 3: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 4: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 4: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 4: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 4: val_loss reached 0.02971 (best 0.02971), saving model to ../checkpoints/layoutlm-v2-epoch=4.ckpt as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 4: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 4: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 4: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 4: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 4: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 5: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 5: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 5: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 5: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 5: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 5: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 5: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 5: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 5: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 6: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 6: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 6: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 6: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 6: val_loss reached 0.02965 (best 0.02965), saving model to ../checkpoints/layoutlm-v2-epoch=6.ckpt as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 6: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 6: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 6: val_loss was not in top 1


In [None]:
# ckpt_path = '../checkpoints/layoutlm-layoutlm.ckpt-v1.ckpt'
# loaded_ckpt = torch.load(ckpt_path)

In [17]:
ckpt_path = '../checkpoints/layoutlm-v2-epoch=6.ckpt'
task = TaskLayoutLM.load_from_checkpoint(ckpt_path, model=model, tokenizer=tokenizer)
model = task.model
model = model.to(device)

In [43]:
batch = next(iter(valid_loader))
# batch[0]

# valid_loader.dataset[0]

In [44]:
inputs = {
    "input_ids": batch[0].to(device),
    "attention_mask": batch[1].to(device),
#     "token_type_ids": batch[2].to(device),
    "labels": batch[3].to(device),
    "bbox": batch[4].to(device)
}
outputs = model(**inputs)

In [4]:
from laylm.trainer import metrics
words, label_preds, label_gts = metrics.normalized_words_labels_preds(inputs, outputs, tokenizer)

accuracy_score(label_preds, label_gts)
f1_score(label_preds, label_gts)
precision_score(label_preds, label_gts)
recall_score(label_preds, label_gts)
# classification_report(label_preds, label_gts)

pd.set_option('display.max_columns', None)
pd.set_option('max_columns', None)
pd.set_option("max_rows", None)
pd.DataFrame({
    'words': words[0],
    'pred': label_preds[0],
    'gt': label_gts[0]
})

NameError: name 'inputs' is not defined

In [17]:
input_data = {
    'input_ids':[],
    'attention_mask': [],
    'bbox': []
}

In [23]:
torch.save(model.state_dict(), '../weights/layoutlm_v2_ktp_1606753021.pth')

In [25]:
state_dict = torch.load('../weights/layoutlm_v2_ktp_1606753021.pth')

In [16]:
from laylm.data.dataset import IDCardAnnoDataset
from laylm.data import utils
from laylm.config import token as token_cfg
from sklearn.utils import shuffle


path = '/data/idcard/combined/1606753021/'
annoset = IDCardAnnoDataset(path, tokenizer)

In [19]:
objects = annoset[2005]
objects = shuffle(objects)

data_dict = utils.annoset_transform(objects, tokenizer, max_seq_length=512)
inputs_data = utils.annoset_inputs(data_dict, device=device)

outputs = model(**inputs_data)

label_preds = normalized_prediction(outputs, tokenizer)
data_dict['labels'] = label_preds[0]
data = clean_prediction_data(data_dict, tokenizer)
data = rebuild_prediction_data(data)
data

{'provinsi': 'JAWA TIMUR',
 'kabupaten': 'BOJONEGORO',
 'nik': '3522261306877447',
 'nama': 'NUZULIA RAHMAWATI',
 'ttl': 'BOJONEGORO, 13-06-1987',
 'gender': 'LAKI-LAKI',
 'goldar': 'O',
 'alamat': 'BARU JL. SOMBA UJUNGPANDANG OPU 107',
 'rtrw': '021/023',
 'kelurahan': 'SAMBONGREJO',
 'kecamatan': 'GONDANG',
 'agama': 'ISLAM',
 'perkawinan': 'CERAI HIDUP',
 'pekerjaan': 'PETANI/PEKEBUN',
 'kewarganegaraan': 'WNI',
 'berlaku': 'SEUMUR HIDUP',
 'sign_place': 'BOJONEGORO',
 'sign_date': '22-05-2016'}

In [230]:
# objects

In [18]:
from laylm.trainer import metrics

def normalized_prediction(outputs, tokenizer):
    preds = prediction_index(outputs)
    
    bsize = preds.shape[0]
    
    labels = []
    for idx in range(bsize):
        label_pred = []
        for pds in preds[idx].tolist():
            lbl = label_cfg.idx_to_label.get(pds, "O")
            label_pred.append(lbl)
        labels.append(label_pred)
    
    return labels

    
def prediction_index(outputs):
    if len(outputs)>1:
        preds = outputs[1]
    else:
        preds = outputs[0]
    preds = torch.argmax(preds, dim=2)
    return preds

def clean_prediction_data(data_dict, tokenizer):
    words = data_dict['words']
    boxes = data_dict['bboxes']
    tokens = data_dict['tokens']
    labels = data_dict['labels']
    gseq = data_dict['gseq']
    wseq = data_dict['wseq']

    data = {
        'words':[],
        'bboxes': [],
        'tokens': [],
        'labels': [],
        'gseq': [],
        'wseq': [],
    }

    for (w,b,t,l,gq,wq) in zip(words, boxes, tokens, labels, gseq, wseq):
        if not (w==tokenizer.cls_token or 
                w==tokenizer.sep_token or 
                w==tokenizer.pad_token):

            data['words'].append(w)
            data['bboxes'].append(b)
            data['tokens'].append(t)
            data['labels'].append(l)
            data['gseq'].append(gq)
            data['wseq'].append(wq)
            
    return data

def sort_multidim(data):
    sorter = lambda x: (x[2][1], x[1])
    # x[2][1] sort by y position
    # x[1] sort by BILOU
    
    return sorted(data, key=sorter)


def word_taken(data):
    str_out = ""
    for idx in range(len(data)):
        w = data[idx][0]
        if w!="" and len(w)!=0:
            str_out += w
            if idx!=len(data)-1:
                str_out += " "
            
    return str_out

from laylm.config import label as label_cfg

def rebuild_prediction_data(data):
    df = pd.DataFrame(data)
    dfg = df.groupby('gseq').aggregate({
        'words': 'min', 
        'bboxes':'last',
        'tokens':'sum',
        'labels':'first'
    })
    
    base_data = dict((k,[]) for k,v in label_cfg.base_label_name.items())
    for idx in range(len(dfg)):
        labels = dfg.iloc[idx]['labels']
        bbox = dfg.iloc[idx]['bboxes']
        if not labels=="O":
            bil, val = labels.split("-")
            val_type, val_label = val.split("_")
            if val_type=="VAL":
                word = dfg.iloc[idx]['words']
                key = label_cfg.label_to_name[val_label]
                base_data[key].append((word, bil, bbox))


    for k,v in base_data.items():
        sorted_data = sort_multidim(v)
        base_data[k] = word_taken(sorted_data)
    
    return base_data
    

{'provinsi': 'JAWA TENGAH',
 'kabupaten': 'KEBUMEN',
 'nik': '3305082709904677',
 'nama': 'ARINI ASARI',
 'ttl': 'KEBUMEN, 27-09-1990',
 'gender': 'PEREMPUAN',
 'goldar': 'A',
 'alamat': 'JL. PALANG MERAH 1',
 'rtrw': '002/021',
 'kelurahan': 'LEMBUPURWO',
 'kecamatan': 'MIRIT',
 'agama': 'ISLAM',
 'perkawinan': 'KAWIN',
 'pekerjaan': 'PETANI/PEKEBUN',
 'kewarganegaraan': 'WNI',
 'berlaku': 'SEUMUR HIDUP',
 'sign_place': 'KEBUMEN',
 'sign_date': '25-07-2015'}

In [246]:
# dfg
# dfg['bboxes']

In [265]:


    

base_data

{'provinsi': 'JAWA TENGAH',
 'kabupaten': 'KLATEN',
 'nik': '3310021810894916',
 'nama': 'INES CARON',
 'ttl': 'KLATEN, 18-10-1989',
 'gender': 'PEREMPUAN',
 'goldar': 'A',
 'alamat': 'JL.  RAYA CEMPAKA PUTIH BLOK B/16',
 'rtrw': '026/027',
 'kelurahan': 'MLESE',
 'kecamatan': 'GANTIWARNO',
 'agama': 'ISLAM',
 'perkawinan': 'CERAI MATI',
 'pekerjaan': 'BELUM/TIDAK BEKERJA',
 'kewarganegaraan': 'WNI',
 'berlaku': 'SEUMUR HIDUP',
 'sign_place': 'KLATEN',
 'sign_date': '08-06-2010'}

In [252]:
alamat = base_data['alamat']



[('JL.', 'B', [291.0, 265.0, 311.0, 295.0]),
 ('', 'I', [291.0, 299.0, 291.0, 299.0]),
 ('RAYA', 'I', [288.0, 305.0, 309.0, 364.0]),
 ('CEMPAKA', 'I', [283.0, 368.0, 307.0, 472.0]),
 ('PUTIH', 'I', [280.0, 477.0, 302.0, 541.0]),
 ('BLOK', 'I', [277.0, 546.0, 299.0, 604.0]),
 ('B/16', 'I', [275.0, 608.0, 296.0, 653.0])]

In [141]:
pd.set_option('display.max_columns', None)
pd.set_option('max_columns', None)
pd.set_option("max_rows", None)
pd.DataFrame(data_dict)

Unnamed: 0,words,bboxes,tokens,token_ids,mask,gseq,wseq,labels
0,[CLS],"[0, 0, 0, 0]",[CLS],2,1,-100,-100,L-VAL_KLH
1,PROVINSI,"[103.0, 302.0, 130.0, 434.0]",provinsi,2142,1,0,0,U-FLD_PROV
2,JAWA,"[103.0, 442.0, 130.0, 519.0]",jawa,1069,1,1,0,B-VAL_PROV
3,TENGAH,"[103.0, 526.0, 130.0, 641.0]",tengah,1172,1,2,0,L-VAL_PROV
4,KABUPATEN,"[132.0, 328.0, 160.0, 496.0]",kabupaten,1133,1,3,0,U-FLD_KAB
5,KENDAL,"[132.0, 503.0, 160.0, 615.0]",kendal,14964,1,4,0,U-VAL_KAB
6,NIK,"[171.0, 34.0, 207.0, 103.0]",nik,7443,1,5,0,U-FLD_NIK
7,:,"[171.0, 209.0, 207.0, 232.0]",:,30472,1,6,0,O
8,3324032212022779,"[171.0, 245.0, 207.0, 614.0]",33,5476,1,7,0,B-VAL_NIK
9,3324032212022779,"[171.0, 245.0, 207.0, 614.0]",##240,26411,1,7,1,I-VAL_NIK


In [76]:
# accuracy_score(label_preds, label_gts)
# f1_score(label_preds, label_gts)
# precision_score(label_preds, label_gts)
# recall_score(label_preds, label_gts)
# classification_report(label_preds, label_gts)

pd.set_option('display.max_columns', None)
pd.set_option('max_columns', None)
pd.set_option("max_rows", None)
pd.DataFrame({
    'word': data_dict['words'],
    'token': words[0],
    'pred': label_preds[0],
})

ValueError: arrays must all be same length

In [40]:
words

[['b',
  '/',
  '##26',
  '##nt',
  '00',
  '##o',
  'jel',
  ':',
  ':',
  '/',
  'alamat',
  ':',
  'jenis',
  'agama',
  ':',
  'rw',
  '25',
  'kewarganegaraan',
  'baru',
  'provinsi',
  '##81',
  ':',
  'kecamatan',
  'cerai',
  '-',
  'wni',
  'kresna',
  '12',
  '##03',
  'tgl',
  'jawa',
  '2013',
  ':',
  'gol',
  ',',
  'lumajang',
  '350',
  'jatim',
  'status',
  '/',
  ':',
  '08',
  'tempat',
  '##ir',
  'laki',
  'pekerjaan',
  'kun',
  'hingga',
  '37',
  'perkawinan',
  '##88',
  ':',
  '-',
  '01',
  'mati',
  '/',
  'islam',
  'desa',
  '##ambar',
  'laki',
  '##7',
  'jl',
  '/',
  'seumur',
  'lahir',
  ':',
  '-',
  'a',
  'kabupaten',
  'kel',
  'mahasiswa',
  'kelamin',
  '##6',
  '-',
  ':',
  ':',
  'hidup',
  ':',
  ':',
  '.',
  'lumajang',
  '.',
  '[UNK]',
  'lumajang',
  'nama',
  'berlaku',
  '-',
  'nik',
  'rt',
  ':',
  '##60',
  '007',
  'timur',
  'darah',
  'pelajar',
  '##ulyo',
  '-',
  '1960',
  '##80'],
 ['provinsi',
  'jawa',
  'timur',
  'ka

In [None]:
!pip install seqeval