<div class="alert alert-success">  
</div>

<div>
    <h1 align="center">KoBERT Multi-label text classifier</h1>
    <h4 align="center">By: Myeonghak Lee</h4>
</div>

<div class="alert alert-success">  
</div>

In [1]:
# Input data preparation: third-party imports, project modules, and config handles.

# import torchtext
import pandas as pd
import numpy as np

import os
import re

# Project-local modules providing the shared settings and the preprocessing entry point.
import config
from config import expand_pandas
from preprocess import preprocess

# Data location as declared in the project config.
DATA_PATH=config.DATA_PATH

# Hyper-parameter mapping; later cells index it by key
# (e.g. "max_len", "batch_size", "dr_rate", "learning_rate", "num_epochs").
model_config=config.model_config

In [2]:
import warnings
# Silences every warning category for the rest of the session, including
# deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

In [3]:
# Call the project's pandas display helper with the limits below.
# NOTE(review): presumably this sets pandas display options — confirm in config.py.
config.expand_pandas(max_rows=100, max_cols=100,width=1000,max_info_cols=500)

done


### **configs**

In [4]:
# Experiment settings: size of the label space and the weight-file version.
num_class = 17
ver_num = 1

# Label names handed to data.label_process(...) as `except_labels`.
except_labels = ["변경/취소", "예약기타"]

# Two-digit, zero-padded version tag embedded in the checkpoint file name.
version_info = f"{ver_num:02d}"
weight_path = "../weights/weight_" + version_info + ".pt"

### **preprocess**

In [12]:
# Instantiate the project's preprocessing object. Later cells call its
# make_table() / label_process() methods and read .data, .label_cols and .voc_total.
data=preprocess()

In [13]:
# data_orig=data.voc_total["종합본"]
# NOTE(review): make_table() presumably builds the working table on `data` — confirm in preprocess.py.
data.make_table()

# Attach the label columns, using the label count and excluded label names
# from the configs cell.
data.label_process(num_labels=num_class, except_labels=except_labels)

True

In [14]:
# Keep a handle on the "종합본" entry of data.voc_total.
# NOTE(review): `orig` is not referenced again in the visible cells.
orig=data.voc_total["종합본"]

In [15]:
# Label column names; passed to Data_for_BERT and used to build the
# index <-> label-name dictionaries in the XAI section.
label_cols=data.label_cols

In [16]:
# Work on a copy so the preprocessing object's own table is left untouched.
df=data.data.copy()

In [17]:
# Replace the index with a fresh 0..n-1 range; later cells address rows by position.
voc_dataset=df.reset_index(drop=True)

# Modeling part

In [None]:
import torch
from torch import nn

from metrics_for_multilabel import calculate_metrics, colwise_accuracy

from bert_model import Data_for_BERT, BERTClassifier, EarlyStopping

from transformers import get_linear_schedule_with_warmup, AdamW

In [15]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows. The "국내선" column is split alongside the frame,
# but only the frame halves are used below.
train_input, test_input, train_target, test_target = train_test_split(
    voc_dataset,
    voc_dataset["국내선"],
    test_size=0.25,
    random_state=42,
)

# train=pd.concat([train_input,train_target],axis=1)
# test=pd.concat([test_input,test_target],axis=1)

# Independent copies of each split with a fresh 0..n-1 index.
train = train_input.copy().reset_index(drop=True)
test = test_input.copy().reset_index(drop=True)

# Wrap each split for the BERT classifier.
# NOTE(review): the positional True/False flags are presumably pad / pair — confirm against Data_for_BERT.
data_train = Data_for_BERT(
    train, model_config["max_len"], True, False, label_cols=label_cols
)
data_test = Data_for_BERT(
    test, model_config["max_len"], True, False, label_cols=label_cols
)

# Batch the datasets for PyTorch. num_workers is the number of loader
# subprocesses; 0 loads the data in the main process.
train_dataloader = torch.utils.data.DataLoader(
    data_train,
    batch_size=model_config["batch_size"],
    num_workers=0,
)
test_dataloader = torch.utils.data.DataLoader(
    data_test,
    batch_size=model_config["batch_size"],
    num_workers=0,
)

NameError: name 'Data_for_BERT' is not defined

In [18]:
# Build the KoBERT-based classifier; .to(device) moves the whole model to the
# selected device.
model = BERTClassifier(num_classes=num_class, dr_rate = model_config["dr_rate"]).to(device)

# Optimizer and schedule (linear warmup followed by linear decay).
# Parameters whose names contain one of these substrings are excluded from
# weight decay (weight_decay=0.0); every other parameter decays with 0.01.
no_decay = ['bias', 'LayerNorm.weight']

optimizer_grouped_parameters = [
    {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay' : 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

# Optimizer: AdamW over the two parameter groups; loss: binary cross-entropy on
# raw logits (one independent sigmoid per label).
# torch.optim.AdamW is used instead of the deprecated transformers.AdamW.
# eps=1e-6 keeps the epsilon that transformers.AdamW used by default.
optimizer = torch.optim.AdamW(
    optimizer_grouped_parameters,
    lr=model_config["learning_rate"],
    eps=1e-6,
)
loss_fn=nn.BCEWithLogitsLoss()

# One scheduler step is taken per training batch, so the total number of steps
# is batches-per-epoch times epochs. The learning rate rises linearly during
# the first `warmup_step` steps and then decays linearly to zero, which damps
# the instability of early training.
t_total = len(train_dataloader) * model_config["num_epochs"]
warmup_step = int(t_total * model_config["warmup_ratio"])
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [23]:
def train_model(model, batch_size, patience, n_epochs,path):
    """Train `model` with early stopping and return it with the checkpoint at `path` loaded.

    Uses the notebook-level `train_dataloader`, `test_dataloader`, `optimizer`,
    `scheduler`, `loss_fn`, `device` and `model_config`.

    Args:
        model: Classifier called as ``model(token_ids, valid_length, segment_ids)``.
        batch_size: Unused; kept for backward compatibility (batching is fixed
            by the data loaders).
        patience: Forwarded to ``EarlyStopping``.
        n_epochs: Maximum number of epochs.
        path: Checkpoint path given to ``EarlyStopping`` and reloaded at the end.

    Returns:
        ``(model, avg_train_losses, avg_valid_losses)`` where the two lists hold
        one mean loss per completed epoch.
    """
    # Mean loss per epoch, returned to the caller.
    avg_train_losses = []
    avg_valid_losses = []

    early_stopping = EarlyStopping(patience=patience, verbose=True, path=path)

    for epoch in range(1, n_epochs + 1):

        # ---------------- training ----------------
        model.train()
        train_losses = []
        train_epoch_pred = []

        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(train_dataloader):
            optimizer.zero_grad()

            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            # BCEWithLogitsLoss expects float targets.
            label = label.float().to(device)

            out = model(token_ids, valid_length, segment_ids)
            loss = loss_fn(out, label)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), model_config["max_grad_norm"])
            optimizer.step()
            scheduler.step()  # Update learning rate schedule

            # Store a plain float: keeping the loss tensor itself would keep the
            # autograd graph of every batch alive until the end of the epoch.
            train_losses.append(loss.item())

            # Keep the raw outputs for the epoch-level metrics.
            train_pred = out.detach().cpu().numpy()
            train_epoch_pred.append(train_pred)

            # Batch metrics are only reported every 50 batches, so only compute them then.
            if batch_id%50==0:
                train_real = label.detach().cpu().numpy()
                train_batch_result = calculate_metrics(train_pred, train_real)
                print(f"batch number {batch_id}, train col-wise accuracy is : {train_batch_result['Column-wise Accuracy']}")

        train_epoch_pred = np.concatenate(train_epoch_pred)
        # Assumes dataset.labels is in loader order (the loaders are built without shuffling).
        train_epoch_target = train_dataloader.dataset.labels
        train_epoch_result = calculate_metrics(target=train_epoch_target, pred=train_epoch_pred)
        train_loss = np.average(train_losses)

        print(f"=====Training Report: mean loss is {train_loss}=====")
        print(train_epoch_result)

        print("=====train done!=====")

        # ---------------- evaluation ----------------
        valid_losses = []
        test_epoch_pred = []

        model.eval()
        with torch.no_grad():
            for batch_id, (token_ids, valid_length, segment_ids, test_label) in enumerate(test_dataloader):

                token_ids = token_ids.long().to(device)
                segment_ids = segment_ids.long().to(device)
                test_label = test_label.float().to(device)

                test_out = model(token_ids, valid_length, segment_ids)
                test_loss = loss_fn(test_out, test_label)
                valid_losses.append(test_loss.item())

                test_pred = test_out.detach().cpu().numpy()
                test_epoch_pred.append(test_pred)

                if batch_id%50==0:
                    test_real = test_label.detach().cpu().numpy()
                    test_batch_result = calculate_metrics(test_pred, test_real)
                    print(f"batch number {batch_id}, test col-wise accuracy is : {test_batch_result['Column-wise Accuracy']}")

        test_epoch_pred = np.concatenate(test_epoch_pred)
        test_epoch_target = test_dataloader.dataset.labels
        test_epoch_result = calculate_metrics(target=test_epoch_target, pred=test_epoch_pred)
        valid_loss = np.average(valid_losses)

        print(f"=====Testing Report: mean loss is {valid_loss}=====")
        print(test_epoch_result)

        avg_train_losses.append(train_loss)
        avg_valid_losses.append(valid_loss)

        # EarlyStopping receives the validation loss and the model each epoch.
        # NOTE(review): presumably it checkpoints to `path` when the loss improves — confirm in bert_model.py.
        early_stopping(valid_loss, model)

        if early_stopping.early_stop:
            print("Early stopping")
            break

    # Reload the checkpoint written to `path`. map_location makes this work even
    # when the checkpoint was saved from a different device.
    model.load_state_dict(torch.load(path, map_location=device))

    return  model, avg_train_losses, avg_valid_losses
        

In [None]:
# Early-stopping patience: how many epochs to wait after the validation loss
# last improved.
patience = 10

model, train_loss, valid_loss = train_model(
    model=model,
    batch_size=model_config["batch_size"],
    patience=patience,
    n_epochs=model_config["num_epochs"],
    path=weight_path,
)


# test performance

In [54]:
# Checkpoint to evaluate. Derived from `version_info` (configs cell) instead of
# a hard-coded file name, so it cannot drift when `ver_num` changes.
weight_path=f"../weights/weight_{version_info}.pt"

In [55]:
# map_location remaps the stored tensors onto the active device, so a
# checkpoint saved on a GPU still loads on a CPU-only machine.
model.load_state_dict(torch.load(weight_path, map_location=device))

<All keys matched successfully>

In [56]:
# Evaluate the loaded model on the held-out split.
# test_epoch_pred  : per-batch raw outputs, concatenated after the loop
# test_loss_record : per-batch loss tensors (for the reported mean)
# valid_losses     : the same losses as Python floats
test_epoch_pred, test_loss_record, valid_losses = [], [], []

model.eval()
with torch.no_grad():
    for batch_id, batch in enumerate(test_dataloader):
        token_ids, valid_length, segment_ids, test_label = batch

        # Move inputs to the model's device; valid_length is passed through as-is.
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        # BCEWithLogitsLoss expects float targets.
        test_label = test_label.float().to(device)

        test_out = model(token_ids, valid_length, segment_ids)
        test_loss = loss_fn(test_out, test_label)

        test_loss_record.append(test_loss)
        valid_losses.append(test_loss.item())

        test_pred = test_out.detach().cpu().numpy()
        test_real = test_label.detach().cpu().numpy()

        test_batch_result = calculate_metrics(np.array(test_pred), np.array(test_real))

        # Report the batch-level column-wise accuracy every 50 batches.
        if batch_id % 50 == 0:
            print(f"batch number {batch_id}, test col-wise accuracy is : {test_batch_result['Column-wise Accuracy']}")

        # Keep the raw outputs for the epoch-level metrics.
        test_epoch_pred.append(test_pred)

    # Epoch-level metrics over all batches.
    test_epoch_pred = np.concatenate(test_epoch_pred)
    test_epoch_target = test_dataloader.dataset.labels
    test_epoch_result = calculate_metrics(target=test_epoch_target, pred=test_epoch_pred)

    print(f"=====Testing Report: mean loss is {sum(test_loss_record)/len(test_loss_record)}=====")
    print(test_epoch_result)

batch number 0, test col-wise accuracy is : 0.9294117647058825
batch number 50, test col-wise accuracy is : 0.9411764705882353
batch number 100, test col-wise accuracy is : 0.9058823529411765
=====Testing Report: mean loss is 0.20872297883033752=====
{'Accuracy': 0.22437137330754353, 'Column-wise Accuracy': 0.9210376607122539, 'micro/precision': 0.7973273942093542, 'micro/recall': 0.372528616024974, 'micro/f1': 0.5078014184397163, 'macro/precision': 0.6019785504362263, 'macro/recall': 0.28350515905377016, 'macro/f1': 0.34598051554393844, 'samples/precision': 0.563023855577047, 'samples/recall': 0.3934235976789168, 'samples/f1': 0.4393478861563968}


In [58]:
import metrics_for_multilabel as metrics

In [59]:
# Rank over the full label set: k follows `num_class` from the configs cell
# instead of repeating the literal 17.
metrics.mean_ndcg_score(test_epoch_target,test_epoch_pred, k=num_class)

0.840774483390301

In [60]:
def _top_k_label_overlap(pred, target):
    """Mean overlap between each row's top-k scores and its k true labels.

    For every row, k is the number of nonzero entries in `target`. The row
    score is the fraction of those true labels found among the k
    highest-scoring entries of `pred`.

    Rows without any positive label are skipped: with k == 0 the slice
    ``[-0:]`` selects every label, so such a row would wrongly count as a
    perfect match. Returns NaN when no row has a positive label.
    """
    overlaps = []
    for n in range(pred.shape[0]):
        true_idx = np.nonzero(target[n])[0]
        k = len(true_idx)
        if k == 0:
            continue
        top_k = pred[n].argsort()[-k:]
        overlaps.append(len(set(top_k) & set(true_idx)) / k)
    return sum(overlaps) / len(overlaps) if overlaps else float("nan")


print(f"accuracy: {_top_k_label_overlap(test_epoch_pred, test_epoch_target)}")

accuracy: 0.6367182462927145


In [None]:
# Recompute the metrics with an explicit threshold of -1 rather than
# calculate_metrics' default.
# NOTE(review): the predictions are raw model outputs (the loss is BCEWithLogitsLoss),
# so the threshold presumably applies to logits — confirm in metrics_for_multilabel.
calculate_metrics(target=test_epoch_target, pred=test_epoch_pred, threshold=-1)

In [62]:
# Label names indexed by output position; get_prediction_from_txt uses this to
# turn predicted indices into names.
label_cases_sorted_target=data.label_cols

In [63]:
# Sentence -> (token ids, valid length, segment ids). The max length comes from
# model_config, the same value given to Data_for_BERT; the bare name `max_len`
# is not defined anywhere in the visible notebook.
# NOTE(review): `nlp` and `tok` are not defined in any visible cell either — confirm
# they are provided by an import before this cell runs.
transform = nlp.data.BERTSentenceTransform(tok, max_seq_length = model_config["max_len"], pad=True, pair=False)

def get_prediction_from_txt(input_text, threshold=0.0, true_label=None):
    """Print the predicted tags for one text and, when available, the true tags.

    Args:
        input_text: Raw text to classify.
        threshold: A label is predicted when its raw model output exceeds this value.
        true_label: Optional multi-hot label vector for `input_text`. When
            omitted, the notebook-level `input_text_label` is used if it exists
            (the behavior existing callers rely on); otherwise the true-tag
            line is skipped.
    """
    sentences = transform([input_text])
    # Each field gets a leading batch dimension of 1, matching interpret_sentence.
    token_ids = torch.tensor(sentences[0]).long().unsqueeze(0).to(device)
    valid_length = torch.tensor(sentences[1]).unsqueeze(0)
    segment_ids = torch.tensor(sentences[2]).long().unsqueeze(0).to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        get_pred = model(token_ids, valid_length, segment_ids)

    pred = np.nonzero(get_pred.cpu().numpy()[0] > threshold)[0].tolist()
    print(f"분석 결과, 대화의 예상 태그는 {[label_cases_sorted_target[i] for i in pred]} 입니다.")

    if true_label is None:
        true_label = globals().get("input_text_label")
    if true_label is not None:
        true = np.nonzero(true_label)[0].tolist()
        print(f"실제 태그는 {[label_cases_sorted_target[i] for i in true]} 입니다.")



In [64]:
# Row position of the example to inspect.
input_text_num=17
# NOTE(review): assumes column 0 of voc_dataset holds the text — confirm.
input_text=voc_dataset.iloc[input_text_num,0]
# input_text=test.iloc[input_text_num,0]
# NOTE(review): assumes columns 1.. are the label columns, in the same order as data.label_cols — confirm.
input_text_label=voc_dataset.iloc[input_text_num,1:].tolist()

In [65]:
# Use -1 as the decision threshold instead of the function's default of 0.0.
get_prediction_from_txt(input_text, -1)

분석 결과, 대화의 예상 태그는 ['대기예약', '무상신규예약'] 입니다.
실제 태그는 ['무상신규예약'] 입니다.


# XAI

In [69]:
from captum_tools_vocvis import *

In [None]:
from captum.attr import LayerIntegratedGradients, TokenReferenceBase, visualization

# model = BERTClassifier(bertmodel, dr_rate = 0.4).to(device)
# model.load_state_dict(torch.load(os.getcwd()+"/chat_voc_model.pt", map_location=device))
# Put the model in evaluation mode before computing attributions.
model.eval()

In [94]:
# Id of the tokenizer's padding token; it fills the reference (baseline) input.
PAD_IND = tok.convert_tokens_to_ids(tok.vocab.padding_token)
token_reference = TokenReferenceBase(reference_token_idx=PAD_IND)

# Integrated gradients taken with respect to the model's BERT embedding layer.
lig = LayerIntegratedGradients(model, model.bert.embeddings)

In [95]:
# Sentence transform used by the attribution cells (fixed length of 64).
transform = nlp.data.BERTSentenceTransform(tok, max_seq_length=64, pad=True, pair=False)

# label name -> position of that name in label_cols
voc_label_dict_inverse = dict(zip(label_cols, map(label_cols.index, label_cols)))

# position in label_cols -> label name
voc_label_dict = dict(zip(map(label_cols.index, label_cols), label_cols))

In [96]:
def forward_with_sigmoid_for_bert(input,valid_length,segment_ids):
    """Run the notebook-level `model` and squash its output with a sigmoid."""
    logits = model(input, valid_length, segment_ids)
    return logits.sigmoid()


In [97]:
def forward_for_bert(input,valid_length,segment_ids):
    """Run the notebook-level `model` and apply a softmax over dimension 1."""
    scores = model(input, valid_length, segment_ids)
    return torch.softmax(scores, dim=1)

In [109]:
# Accumulates one visualization record per interpret_sentence call.
vis_data_records_ig = []

def interpret_sentence(model, sentence, min_len = 64, label = 0, n_steps=10):
    """Attribute the model output for `label` to the tokens of `sentence`.

    Encodes the sentence with the notebook-level `transform`, prints the sigmoid
    outputs, runs layer integrated gradients (`lig`) against an all-padding
    reference, and appends a record to `vis_data_records_ig`.

    Args:
        model: Model whose gradients are cleared before attribution. The forward
            passes themselves go through the notebook-level `model` via
            `forward_with_sigmoid_for_bert` and `lig`.
        sentence: Raw text to explain.
        min_len: Retained for backward compatibility. The reference length is
            taken from the encoded input so the two shapes always match.
        label: Index of the output the attribution is computed for.
        n_steps: Number of integration steps passed to `lig.attribute`.
    """
    # Tokenize and build the model inputs, each with a batch dimension of 1.
    seq_tokens = transform([sentence])
    input_indices = torch.tensor(seq_tokens[0]).long().unsqueeze(0).to(device)
    valid_length = torch.tensor(seq_tokens[1]).long().unsqueeze(0)
    segment_ids = torch.tensor(seq_tokens[2]).long().unsqueeze(0).to(device)
    # NOTE(review): these are the bare sentencepiece tokens; if `transform` adds special
    # tokens, token positions are shifted relative to the attributions — confirm.
    tokens = list(tok.sentencepiece(sentence))

    model.zero_grad()

    # Single forward pass: index of the highest-scoring label and its sigmoid score.
    with torch.no_grad():
        probs = forward_with_sigmoid_for_bert(input_indices, valid_length, segment_ids)
    print(probs)
    pred_ind = int(probs.argmax().item())
    pred = probs.flatten()[pred_ind].item()

    # The reference (all padding ids) must have the same length as the input.
    seq_length = input_indices.shape[1]
    reference_indices = token_reference.generate_reference(seq_length, device=device).unsqueeze(0)

    # Attributions and approximation delta from layer integrated gradients.
    attributions_ig, delta = lig.attribute(input_indices, reference_indices,
                                           n_steps=n_steps, return_convergence_delta=True, target=label,
                                           additional_forward_args=(valid_length, segment_ids))

    # Label names come from voc_label_dict, the same mapping used by
    # add_attributions_to_visualizer.
    print('pred: ', voc_label_dict[pred_ind], '(', '%.2f'%pred, ')', ', delta: ', abs(delta))

    add_attributions_to_visualizer(attributions_ig, tokens, pred, pred_ind, label, delta, vis_data_records_ig)

In [110]:
def add_attributions_to_visualizer(attributions, input_text, pred, pred_ind, label, delta, vis_data_records):
    """Append one Captum visualization record for a single sentence.

    Args:
        attributions: Attribution tensor; it is summed over dimension 2 and the
            leading dimension is squeezed, giving one score per token position.
        input_text: Tokens displayed next to the scores.
        pred: Score shown for the predicted label.
        pred_ind: Index of the predicted label (key of `voc_label_dict`).
        label: Index of the label the attribution was computed for; shown both
            as the true label and as the attribution label.
        delta: Convergence delta reported by the attribution method.
        vis_data_records: List the new record is appended to.
    """
    attributions = attributions.sum(dim=2).squeeze(0)
    # L2-normalize, leaving an all-zero vector untouched instead of dividing by zero.
    norm = torch.norm(attributions)
    if norm > 0:
        attributions = attributions / norm
    attributions = attributions.cpu().detach().numpy()

    # Positional order: word attributions, predicted score, predicted class,
    # true class, attribution class, attribution score, tokens, convergence delta.
    vis_data_records.append(visualization.VisualizationDataRecord(
                            attributions,
                            pred,
                            voc_label_dict[pred_ind],
                            voc_label_dict[label],
                            voc_label_dict[label],  # attribution class (was the placeholder 100)
                            attributions.sum(),
                            input_text,
                            delta))

In [126]:
# Text of the row at position 22, selected as the example sentence.
sentence=voc_dataset.iloc[22].text

# NOTE(review): no visible cell calls interpret_sentence(...), so vis_data_records_ig
# may still be empty here — confirm a call was not lost.
# NOTE(review): visualize_text is not defined in a visible cell; presumably it comes
# from `from captum_tools_vocvis import *` — confirm.
visualize_text(vis_data_records_ig)

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
