In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# pl.__version__

In [3]:
import os
import json
import logging
import random
from pathlib import Path

import cv2 as cv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm, trange

import torch
import torch.nn as nn
import torch.optim as optim

import pytorch_lightning as pl
from pytorch_lightning import loggers as pl_loggers

In [4]:
import transformers
transformers.__version__

'3.5.0'

In [5]:
from iqradre.extract.data import utils 
from iqradre.extract.data import loader
from iqradre.extract.data.dataset import IDCardDataset
from iqradre.extract.config import label as label_cfg
from iqradre.extract.trainer.task import TaskLayoutLM


from transformers import BertTokenizer, AutoModel
from transformers import (
    WEIGHTS_NAME,
    AdamW,
    BertConfig,
    BertTokenizer,
    BertForTokenClassification,
    LayoutLMConfig,
    LayoutLMForTokenClassification,
    get_linear_schedule_with_warmup,
)

from seqeval.metrics import (
    classification_report,
    f1_score,
    precision_score,
    recall_score,
    accuracy_score,
)

In [6]:
SAVED_CHECKPOINT_PATH = "../checkpoints/v2/"
SAVED_LOGS_PATH = "../logs/v2/"

SUMMARY = "top"
MANUAL_SEED = 1261
MAX_EPOCH = 10
MAX_STEPS = None
VALCHECK_INTERVAL = 2000
NUM_GPUS = 1
DISTRIBUTED_BACKEND = None
LOG_FREQ = 100
DETERMINISTIC = True
BENCHMARK = True
CHECKPOINT_PATH = None
BATCH_SIZE = 4
NUM_WORKERS=16

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [7]:
device = "cpu"

In [8]:
tokenizer = BertTokenizer.from_pretrained(
    "indobenchmark/indobert-base-p2",
    do_lower_case=True,
    cache_dir=None,
)

ValueError: Connection error, and we cannot find the requested files in the cached path. Please try again or make sure your Internet connection is on.

In [None]:
path = '/data/idcard/results/combined/layoutlm/20kv1'
train_loader, valid_loader = loader.get_loader(
    path, tokenizer=tokenizer, 
    batch_size=BATCH_SIZE, 
    num_workers=NUM_WORKERS,
    rand_seq=True,
    rand_seq_prob=0.5
)

In [103]:
len(valid_loader.dataset),(len(train_loader.dataset))

(13522, 54088)

In [104]:
res = valid_loader.dataset
# res[4]
input_ids, input_masks, segment_ids, label_ids, boxes, img, mask = res[2000]

In [105]:
img.shape

(750, 1000, 3)

In [None]:
config = LayoutLMConfig.from_pretrained(
    "microsoft/layoutlm-base-uncased",
    num_labels=label_cfg.num_labels,
    cache_dir=None
)

model = LayoutLMForTokenClassification.from_pretrained(
    'microsoft/layoutlm-base-uncased',
    config=config,
#     return_dict=True
)

model.resize_token_embeddings(len(tokenizer))
model = model.to(device)

In [121]:
class DownstreamModel(nn.Module):
    def __init__(self, layoutlm_model):
        self.layoutlm_model = layoutlm_model
        self.layoutlm_embedding = nn.Embedding(250,512)
        self.image_embedding = nn.Embedding(250,512)
        self.classifier = nn.Linear()
        
    def forward(
        self,
        input_ids=None,
        bbox=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        images=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        
        layoutlm_outputs = self.layoutlm_model(x)
        if len(outputs)>2:
            loss, logits = layoutlm_outputs
        else:
            logits = layoutlm_outputs
        
        lylm_embed = self.layoutlm_embedding(logits)
        img_embed = self.image_embedding(logits)
        combine = torch.cat([lylm_embed, img_embed])
        
        outputs = self.classifier(combine)
        return outputs
        
        

In [16]:
from torchvision.models.resnet import resnet34
resnet = resnet34(pretrained=True)
resnet.fc

Linear(in_features=512, out_features=1000, bias=True)

In [96]:
embed = nn.Embedding(15,2)
input = torch.LongTensor([
    [1,12]
])

input.shape
embed(input)

tensor([[[-1.4523,  1.7728],
         [-0.7685,  1.5386]]], grad_fn=<EmbeddingBackward>)

In [107]:
batch = next(iter(train_loader))
inputs = {
            "input_ids": batch[0].to(device),
            "attention_mask": batch[1].to(device),
            "token_type_ids": batch[2].to(device),
            "labels": batch[3].to(device),
            "bbox": batch[4].to(device)
        }
outputs = model.forward(**inputs)
loss, logits = outputs[0], outputs[1]

RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.FloatTensor instead (while checking arguments for embedding)

In [118]:
preds = torch.argmax(logits, dim=2)
preds.shape
embed = nn.Embedding(250, 512)
embed(preds)

tensor([[[ 0.7477, -1.7225,  0.9895,  ..., -1.0284, -0.1249, -0.7358],
         [ 0.1325, -1.2952,  1.7751,  ..., -0.1205,  0.1244, -0.5969],
         [ 0.1325, -1.2952,  1.7751,  ..., -0.1205,  0.1244, -0.5969],
         ...,
         [ 0.1289,  0.5206, -0.5304,  ...,  0.1863, -1.2301,  0.5379],
         [ 0.1289,  0.5206, -0.5304,  ...,  0.1863, -1.2301,  0.5379],
         [ 0.1289,  0.5206, -0.5304,  ...,  0.1863, -1.2301,  0.5379]],

        [[ 1.2384,  1.0449,  0.6403,  ..., -1.6067,  0.3973,  0.5023],
         [ 0.1650,  0.0522, -0.5625,  ..., -0.1893, -0.9771,  1.2926],
         [ 0.1325, -1.2952,  1.7751,  ..., -0.1205,  0.1244, -0.5969],
         ...,
         [ 0.1289,  0.5206, -0.5304,  ...,  0.1863, -1.2301,  0.5379],
         [ 0.1289,  0.5206, -0.5304,  ...,  0.1863, -1.2301,  0.5379],
         [ 0.0341, -0.6797,  1.5708,  ..., -0.8141,  0.7259,  0.2665]],

        [[ 0.7477, -1.7225,  0.9895,  ..., -1.0284, -0.1249, -0.7358],
         [ 0.1325, -1.2952,  1.7751,  ..., -0

In [62]:
bsize = preds.shape[0]
labels = []
for idx in range(bsize):
    label_pred = []
    for pds in preds[idx].tolist():
        lbl = label_cfg.idx_to_label.get(pds, "O")
        label_pred.append(lbl)
    labels.append(label_pred)


[['L-VAL_RLG',
  'I-FLD_WRG',
  'U-VAL_WRG',
  'U-VAL_GDR',
  'O',
  'L-VAL_NIK',
  'L-FLD_KAB',
  'U-VAL_GDR',
  'B-FLD_KCM',
  'U-VAL_GDR',
  'U-VAL_ADR',
  'U-VAL_GDR',
  'O',
  'U-VAL_ADR',
  'U-VAL_GDR',
  'L-FLD_KAB',
  'U-VAL_WRG',
  'U-VAL_GDR',
  'I-FLD_WRG',
  'U-VAL_WRG',
  'U-VAL_GDR',
  'I-FLD_WRG',
  'L-FLD_KAB',
  'U-VAL_ADR',
  'I-FLD_WRG',
  'B-VAL_SGP',
  'U-VAL_RLG',
  'I-VAL_KLH',
  'U-VAL_WRG',
  'U-VAL_GDR',
  'U-VAL_GDR',
  'U-VAL_GDR',
  'L-FLD_KAB',
  'L-FLD_KAB',
  'B-VAL_SGP',
  'L-FLD_KAB',
  'O',
  'B-VAL_SGP',
  'U-VAL_WRG',
  'U-VAL_GDR',
  'U-VAL_GDR',
  'B-VAL_KAB',
  'O',
  'U-VAL_WRG',
  'B-FLD_KAB',
  'L-FLD_KAB',
  'U-VAL_WRG',
  'I-FLD_WRG',
  'I-VAL_SGD',
  'U-VAL_WRG',
  'L-FLD_KAB',
  'U-VAL_GDR',
  'U-VAL_KCM',
  'L-FLD_KAB',
  'I-FLD_WRG',
  'U-VAL_RLG',
  'L-FLD_KAB',
  'I-FLD_WRG',
  'I-FLD_PROV',
  'U-VAL_WRG',
  'L-FLD_KAB',
  'U-VAL_KCM',
  'I-FLD_WRG',
  'L-FLD_KAB',
  'I-FLD_WRG',
  'B-VAL_SGP',
  'U-VAL_RLG',
  'U-VAL_WRG',
  'U-VAL_GD

In [15]:
task = TaskLayoutLM(model, tokenizer)

In [14]:
# ckpt_path = '../checkpoints/layoutlm-v2-epoch=6.ckpt'
# task = TaskLayoutLM.load_from_checkpoint(ckpt_path, model=model, tokenizer=tokenizer)
# model = task.model
# model = model.to(device)

In [16]:
# DEFAULTS used by the Trainer
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    filepath=SAVED_CHECKPOINT_PATH,
#     save_top_k=1,
    verbose=True,
#     monitor='val_loss',
#     mode='min',
    prefix='layoutlm-v2'
)

tb_logger = pl_loggers.TensorBoardLogger(SAVED_LOGS_PATH)
pl.trainer.seed_everything(MANUAL_SEED)

Global seed set to 1261


1261

In [17]:
trainer = pl.Trainer(
    weights_summary=SUMMARY,
    max_epochs=MAX_EPOCH,
    max_steps=MAX_STEPS,
    val_check_interval=1000,
    gpus=1,
    distributed_backend=DISTRIBUTED_BACKEND,
    log_every_n_steps=LOG_FREQ,
    deterministic=DETERMINISTIC,
    benchmark=BENCHMARK,
    logger=tb_logger, 
    checkpoint_callback=checkpoint_callback, 
#     resume_from_checkpoint=CHECKPOINT_PATH
)

trainer.fit(task, train_loader, valid_loader)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AttributeError: '_IncompatibleKeys' object has no attribute 'named_parameters'

In [19]:
# ckpt_path = '../checkpoints/layoutlm-layoutlm.ckpt-v1.ckpt'
# loaded_ckpt = torch.load(ckpt_path)

In [16]:
ckpt_path = '../checkpoints/layoutlm-v2-epoch=6.ckpt'
task = TaskLayoutLM.load_from_checkpoint(ckpt_path, model=model, tokenizer=tokenizer)
model = task.model
model = model.to(device)

In [None]:
batch = next(iter(valid_loader))
# batch[0]

# valid_loader.dataset[0]

In [None]:
inputs = {
    "input_ids": batch[0].to(device),
    "attention_mask": batch[1].to(device),
#     "token_type_ids": batch[2].to(device),
    "labels": batch[3].to(device),
    "bbox": batch[4].to(device)
}
outputs = model(**inputs)

In [None]:
from laylm.trainer import metrics
words, label_preds, label_gts = metrics.normalized_words_labels_preds(inputs, outputs, tokenizer)

accuracy_score(label_preds, label_gts)
f1_score(label_preds, label_gts)
precision_score(label_preds, label_gts)
recall_score(label_preds, label_gts)
# classification_report(label_preds, label_gts)

pd.set_option('display.max_columns', None)
pd.set_option('max_columns', None)
pd.set_option("max_rows", None)
pd.DataFrame({
    'words': words[0],
    'pred': label_preds[0],
    'gt': label_gts[0]
})

In [None]:
input_data = {
    'input_ids':[],
    'attention_mask': [],
    'bbox': []
}

In [19]:
model = task.model
torch.save(model.state_dict(), '../weights/layoutlm_v2_ktp_20kv1_vacc_0.981_vloss_0.26.pth')

In [17]:
state_dict = torch.load('../weights/layoutlm_v2_ktp_20kv1_vacc_0.981_vloss_0.26.pth')
model = model.load_state_dict(state_dict)
# task.model = model

TypeError: cannot assign 'torch.nn.modules.module._IncompatibleKeys' as child module 'model' (torch.nn.Module or None expected)

In [21]:
from laylm.data.dataset import IDCardAnnoDataset
from laylm.data import utils
from laylm.config import token as token_cfg
from sklearn.utils import shuffle


path = '/data/idcard/combined/1606753021/'
annoset = IDCardAnnoDataset(path, tokenizer)

In [None]:
objects = annoset[2005]
objects = shuffle(objects)

data_dict = utils.annoset_transform(objects, tokenizer, max_seq_length=512)
inputs_data = utils.annoset_inputs(data_dict, device=device)

outputs = model(**inputs_data)

label_preds = normalized_prediction(outputs, tokenizer)
data_dict['labels'] = label_preds[0]
data = clean_prediction_data(data_dict, tokenizer)
data = rebuild_prediction_data(data)
data

In [None]:
annoset[0]

In [None]:
from laylm.trainer import metrics

def normalized_prediction(outputs, tokenizer):
    preds = prediction_index(outputs)
    
    bsize = preds.shape[0]
    
    labels = []
    for idx in range(bsize):
        label_pred = []
        for pds in preds[idx].tolist():
            lbl = label_cfg.idx_to_label.get(pds, "O")
            label_pred.append(lbl)
        labels.append(label_pred)
    
    return labels

    
def prediction_index(outputs):
    if len(outputs)>1:
        preds = outputs[1]
    else:
        preds = outputs[0]
    preds = torch.argmax(preds, dim=2)
    return preds

def clean_prediction_data(data_dict, tokenizer):
    words = data_dict['words']
    boxes = data_dict['bboxes']
    tokens = data_dict['tokens']
    labels = data_dict['labels']
    gseq = data_dict['gseq']
    wseq = data_dict['wseq']

    data = {
        'words':[],
        'bboxes': [],
        'tokens': [],
        'labels': [],
        'gseq': [],
        'wseq': [],
    }

    for (w,b,t,l,gq,wq) in zip(words, boxes, tokens, labels, gseq, wseq):
        if not (w==tokenizer.cls_token or 
                w==tokenizer.sep_token or 
                w==tokenizer.pad_token):

            data['words'].append(w)
            data['bboxes'].append(b)
            data['tokens'].append(t)
            data['labels'].append(l)
            data['gseq'].append(gq)
            data['wseq'].append(wq)
            
    return data

def sort_multidim(data):
    sorter = lambda x: (x[2][1], x[1])
    # x[2][1] sort by y position
    # x[1] sort by BILOU
    
    return sorted(data, key=sorter)


def word_taken(data):
    str_out = ""
    for idx in range(len(data)):
        w = data[idx][0]
        if w!="" and len(w)!=0:
            str_out += w
            if idx!=len(data)-1:
                str_out += " "
            
    return str_out

from laylm.config import label as label_cfg

def rebuild_prediction_data(data):
    df = pd.DataFrame(data)
    dfg = df.groupby('gseq').aggregate({
        'words': 'min', 
        'bboxes':'last',
        'tokens':'sum',
        'labels':'first'
    })
    
    base_data = dict((k,[]) for k,v in label_cfg.base_label_name.items())
    for idx in range(len(dfg)):
        labels = dfg.iloc[idx]['labels']
        bbox = dfg.iloc[idx]['bboxes']
        if not labels=="O":
            bil, val = labels.split("-")
            val_type, val_label = val.split("_")
            if val_type=="VAL":
                word = dfg.iloc[idx]['words']
                key = label_cfg.label_to_name[val_label]
                base_data[key].append((word, bil, bbox))


    for k,v in base_data.items():
        sorted_data = sort_multidim(v)
        base_data[k] = word_taken(sorted_data)
    
    return base_data
    

In [None]:
# dfg
# dfg['bboxes']

In [None]:


    

base_data

In [None]:
alamat = base_data['alamat']


In [None]:
pd.set_option('display.max_columns', None)
pd.set_option('max_columns', None)
pd.set_option("max_rows", None)
pd.DataFrame(data_dict)

In [None]:
# accuracy_score(label_preds, label_gts)
# f1_score(label_preds, label_gts)
# precision_score(label_preds, label_gts)
# recall_score(label_preds, label_gts)
# classification_report(label_preds, label_gts)

pd.set_option('display.max_columns', None)
pd.set_option('max_columns', None)
pd.set_option("max_rows", None)
pd.DataFrame({
    'word': data_dict['words'],
    'token': words[0],
    'pred': label_preds[0],
})

In [None]:
words

In [None]:
!pip install seqeval