# Environment Setup
* Python 3.7.12, Pytorch 1.10 + cu113
* AMD 5900X, RTX 3090Ti, Local Jupyter Notebook
* To install KoBERT, run the following command:  
```pip install git+https://git@github.com/SKTBrain/KoBERT.git@master kobert-transformers ```
* This project uses Hugging Face's ```Accelerator```, so a GPU is essentially required. (A single GPU is enough.)
* **Do not confuse the two CRF libraries**: both pytorch-crf and TorchCRF exist. This project uses TorchCRF.
* (Not for team projects) If you have any problems running the code after installing the library, please leave an issue.

# Datafile
* We use data ```./dataset/NLNE2202211219.json``` by default.

# 1. Default Setting

In [None]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np

from torch.utils.tensorboard import SummaryWriter
from typing import List, Dict, Tuple, Any
from accelerate import Accelerator
from transformers import AdamW, TrainingArguments, get_linear_schedule_with_warmup
from transformers.trainer_pt_utils import get_parameter_names
from process import CustomDataset, collect_fn_bert, tagging, load_files, collect_fn
from bert_model import NERBertCRF, NERBertSVM
from svm import KernelSVM
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence
from kobert_transformers import get_tokenizer, get_kobert_model

In [None]:
def label_init() :
    """Build the NER tag vocabulary and its two lookup tables.

    The vocabulary is ordered as: every ``B-<type>`` tag, then every
    ``I-<type>`` tag (15 entity types each), followed by ``O``, ``[CLS]``,
    ``[SEP]`` and ``[PAD]`` -- 34 labels in total, so ``O`` has index 30 and
    ``[PAD]`` has index 33.

    Returns:
        A tuple ``(label_fin, label_to_idx, idx_to_label)``: the ordered list
        of label strings, a dict mapping label -> index, and a dict mapping
        index -> label.
    """
    entity_types = ('PS', 'FD', 'TR', 'AF', 'OG', 'LC', 'CV', 'DT', 'TI', 'QT', 'EV', 'AM', 'PT', 'MT', "TM")  # 15 entity types
    special_labels = ('O', '[CLS]', '[SEP]', '[PAD]')

    # Outer loop over the prefix keeps all B- tags ahead of all I- tags.
    label_fin = [prefix + entity for prefix in ('B-', 'I-') for entity in entity_types]
    label_fin.extend(special_labels)

    idx_to_label = dict(enumerate(label_fin))
    label_to_idx = {label: idx for idx, label in idx_to_label.items()}
    return label_fin, label_to_idx, idx_to_label


In [None]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
_device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)

# 2. Training Part, Bert
* You can choose whether the final layer of the model is an SVM or a CRF.
* This is controlled by the variable ```model_base```: set it to ```"CRF"``` to use the CRF model; any other value (e.g. ```"SVM"```) selects the SVM model.
* In addition to the model architecture, if you want to change the training arguments, you can modify training_args.
    * However, because this project uses an old version of Transformers, some of the arguments described in the current documentation may not work.

In [None]:
# Training
# Model variant: "CRF" builds NERBertCRF, any other value builds NERBertSVM.
model_base = "CRF"
# The (misspelled) name `traing_args` is kept because later cells refer to it.
traing_args = TrainingArguments(
    output_dir=f"./checkpoints/Bert_{model_base}_checkpoints",
    per_device_train_batch_size=32,
    gradient_accumulation_steps=8,
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=12,
    warmup_ratio=0.1,
    warmup_steps=0,
    # **default_args
) # Optional

PATH = traing_args.output_dir

# exist_ok avoids the separate "does it exist?" check (and the race between
# that check and the creation).
os.makedirs(PATH, exist_ok=True)

df = load_files() # Have to control path (argparser)
texts = df['form'].to_list()
ne = df['NE'].to_list()

# Fixed random_state so the train/test split is the same on every run.
train_texts, test_texts, train_ne, test_ne = train_test_split(texts, ne, test_size=0.2, random_state=42) # fix training dataset
train_dataset = CustomDataset(train_texts, train_ne)
test_dataset = CustomDataset(test_texts, test_ne)
train_loader = DataLoader(train_dataset, batch_size=traing_args.per_device_train_batch_size, shuffle=True, collate_fn=collect_fn_bert)
# per_device_eval_batch_size is not set above, so the TrainingArguments default is used here.
test_loader = DataLoader(test_dataset, batch_size=traing_args.per_device_eval_batch_size, shuffle=True, collate_fn=collect_fn_bert)

# Prepare Model, optimizer
if model_base == "CRF" :
    model = NERBertCRF()
else :
    model = NERBertSVM()

# Weight decay is applied to every parameter except LayerNorm parameters and biases.
decay_parameters = get_parameter_names(model, [nn.LayerNorm])
decay_parameters = [name for name in decay_parameters if "bias" not in name]
_decay_names = set(decay_parameters)  # set: O(1) membership test per parameter
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if n in _decay_names],
        "weight_decay": traing_args.weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if n not in _decay_names],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=traing_args.learning_rate, eps=traing_args.adam_epsilon)
# NOTE(review): warmup_ratio=0.1 is configured above, but the scheduler receives
# traing_args.warmup_steps (0), so no warmup is applied. Left unchanged here;
# confirm whether warmup was intended before switching to the ratio.
scheduler = get_linear_schedule_with_warmup(optimizer, traing_args.warmup_steps, len(train_loader) * traing_args.num_train_epochs)
accelerator = Accelerator(gradient_accumulation_steps=traing_args.gradient_accumulation_steps)
model, train_loader, test_loader, optimizer, scheduler = accelerator.prepare(model, train_loader, test_loader, optimizer, scheduler)

# Counters used by the training loop.
running_loss = 0.0
correct_label_wr, total_label_wr = 0, 0

# TensorBoard logs go to ./runs/bert/<model_base>.
writer = SummaryWriter('./runs/bert/' + model_base)

# Label ids that are excluded from the accuracy counters. In label_init()
# index 30 is 'O' and index 33 is '[PAD]'.
# NOTE(review): assumes the model uses the same label -> index mapping.
O_LABEL_IDX = 30
PAD_LABEL_IDX = 33
# Number of batches between two points of the 'train/training_loss' curve.
LOG_EVERY = 100


def _count_correct_and_total(predictions, labels):
    """Return (correct, counted) as tensors, ignoring 'O' and padding positions.

    A position is counted when its gold label is neither PAD_LABEL_IDX nor
    O_LABEL_IDX; it is correct when the prediction equals the gold label.
    """
    counted = (labels != PAD_LABEL_IDX) & (labels != O_LABEL_IDX)
    return torch.sum((predictions == labels) & counted), torch.sum(counted)


for epoch in range(traing_args.num_train_epochs) :
    model.train()
    # Start every epoch with an empty loss window so the leftover batches of
    # the previous epoch do not leak into this epoch's first logged average.
    running_loss = 0.0
    with tqdm(train_loader, unit="batch") as pbar:
        for i, data in enumerate(pbar) :
            optimizer.zero_grad()
            pbar.set_description(f"Epoch {epoch+1}")
            batch_train, batch_label = data['texts'], data['labels']
            with accelerator.accumulate(model) :
            # TODO: Seperate SVM & CRF
                now_loss, batch_labels, pad_output = model(batch_train, batch_label) # Output
                if model_base == "CRF" :
                    # The CRF value is negated before it is used as the loss.
                    now_loss = -now_loss
                batch_loss = now_loss.mean()
                accelerator.backward(batch_loss)
                optimizer.step()
                scheduler.step()
            # Accumulate a plain Python float: summing the loss tensors
            # themselves would keep autograd history alive across iterations.
            batch_loss_value = batch_loss.item()
            running_loss += batch_loss_value
            batch_correct, batch_total = _count_correct_and_total(pad_output, batch_labels)
            correct_label_wr += batch_correct
            total_label_wr += batch_total
            if i % LOG_EVERY == LOG_EVERY - 1 :
                # Average over the window that was actually accumulated.
                writer.add_scalar('train/training_loss',
                                  running_loss / LOG_EVERY,
                                  epoch * len(train_loader) + i + 1)
                running_loss = 0.0

            pbar.set_postfix(loss = batch_loss_value, correct_label = correct_label_wr.item(), total_label = total_label_wr.item())
    # One epoch ends
    accuracy = correct_label_wr / total_label_wr
    writer.add_scalar('train/accuracy',
                        accuracy.item(),
                        epoch + 1)
    writer.add_scalar('train/correct labels',
                        correct_label_wr.item(),
                        epoch + 1)
    correct_label_wr, total_label_wr = 0, 0
    # Save state
    # accelerator.save_model(model, traing_args.output_dir)
    state = {
        'epoch' : epoch,
        'state_dict' : model.state_dict(),
        'optimizer' : optimizer.state_dict(),
        # Last batch loss of the epoch, stored without its autograd graph.
        'loss' : now_loss.detach()
    }
    # The file name is concatenated directly onto PATH (no path separator);
    # the checkpoint-loading cell rebuilds exactly this string, so the naming
    # is kept unchanged.
    torch.save(state, PATH + model_base + f"{epoch+1}_edit.pkl")
    # Validation Part
    print("==========Validation Start==========")
    correct_val, totals_val = 0, 0
    model.eval()
    with torch.no_grad() :
        for data in test_loader :
            batch_test, batch_test_label = data['texts'], data['labels']
            _, batch_val_labels, val_pad_output = model(batch_test, batch_test_label)
            batch_correct, batch_total = _count_correct_and_total(val_pad_output, batch_val_labels)
            correct_val += batch_correct
            totals_val += batch_total
    accuracy_val = correct_val / totals_val
    print(f"Correct labels: {correct_val.item()}, total labels: {totals_val.item()}, so accuracy is: {accuracy_val.item()}")
    writer.add_scalar('val/validation_accuracy',
                      accuracy_val.item(),
                      epoch + 1)
    writer.add_scalar('val/validation_labels',
                      correct_val.item(),
                      epoch + 1)

# (Optional) Model & dataset Loading for Inference
* If you don't train, then please use these cells

In [None]:
# When training was not run in this session, define the two names below first
# (adapt them to your checkpoint location):
# model_base = "Bert_CRF"
# PATH = f"./checkpoints/{model_name}_checkpoints/temps/"
checkpoint_num = 12 # Change checkpoint

# The file name is "<last 3 chars of model_base><checkpoint_num>_edit.pkl" and is
# appended directly to PATH. If other path, have to change.
# NOTE: PATH is modified in place, so running this cell a second time appends
# the file name again -- re-run the cell that defines PATH first.
checkpoint_file = f"{model_base[-3:]}{checkpoint_num}_edit.pkl"
PATH = PATH + checkpoint_file

print(PATH)

In [None]:
# Optional
# df = load_files() # Have to control path (argparser)
# texts = df['form'].to_list()
# ne = df['NE'].to_list()

# # tokenizer
# train_texts, test_texts, train_ne, test_ne = train_test_split(texts, ne, test_size=0.2, random_state=42) # fix training dataset
# test_dataset = CustomDataset(test_texts, test_ne)

In [None]:
# traing_args = TrainingArguments(
#     output_dir=f"./checkpoints/Bert_{model_name[-3:]}_checkpoints/",
#     per_device_train_batch_size=32,
#     gradient_accumulation_steps=8,
#     learning_rate=5e-5,
#     weight_decay=0.01,
#     num_train_epochs=12,
#     warmup_ratio=0.1,
#     warmup_steps=0,
#     # **default_args
# ) # Optional
# Load the checkpoint dict written by the training loop
# (keys: 'epoch', 'state_dict', 'optimizer', 'loss').
# map_location="cpu" lets a checkpoint saved from a GPU run be opened on any
# machine; the values are copied into the model and optimizer when their
# state dicts are loaded.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
checkpoint = torch.load(PATH, map_location="cpu")

# 4. Inference & Export results by csv (Total label, correct label, accuracy)

In [None]:
# Evaluation loader: no shuffling, so every run visits the test set in the
# same order and the exported results are reproducible.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collect_fn_bert)

# Rebuild the same model variant that was trained ("CRF" -> NERBertCRF,
# anything else -> NERBertSVM).
if model_base == "CRF" :
    model = NERBertCRF()
else :
    model = NERBertSVM()

# The optimizer is rebuilt with the same two parameter groups as in training
# (weight decay everywhere except LayerNorm parameters and biases) so that the
# saved optimizer state can be loaded into it.
decay_parameters = get_parameter_names(model, [nn.LayerNorm])
decay_parameters = [name for name in decay_parameters if "bias" not in name]
_decay_names = set(decay_parameters)  # set: O(1) membership test per parameter
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if n in _decay_names],
        "weight_decay": traing_args.weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if n not in _decay_names],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=traing_args.learning_rate, eps=traing_args.adam_epsilon)

# Restore the weights and optimizer state from the loaded checkpoint before
# handing everything to Accelerate.
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
accelerator = Accelerator()
model, test_loader, optimizer = accelerator.prepare(model, test_loader, optimizer)

model.eval()

In [None]:
# Answer label
# Take the index <-> label lookup tables from the loaded model.
idx_to_label = model.idx_to_label
label_to_idx = model.label_to_idx

In [None]:
# Show the index -> label table so the hard-coded ids used in the next cells
# (30 and 33, and the [:30] slice) can be checked against it.
print(idx_to_label)

In [None]:
# Overall counters (ids 30 and 33 are excluded; in label_init() these are 'O'
# and '[PAD]' -- NOTE(review): assumes the model uses the same mapping).
correct_test, totals_test = 0, 0

# Per-label counters, one slot per label id. They are accumulated as
# fixed-length tensors: torch.bincount only returns max(id) + 1 entries, so
# without minlength a batch that lacks the highest ids would shorten the
# result (and zipping against it would silently drop those slots for good).
num_labels = len(label_to_idx)
totals_count = torch.zeros(num_labels, dtype=torch.long)
correct_count = torch.zeros(num_labels, dtype=torch.long)

with torch.no_grad() :
    for data in tqdm(test_loader) :
        batch_test, batch_test_label = data['texts'], data['labels']
        _, batch_val_labels, val_pad_output = model(batch_test, batch_test_label)

        is_correct = val_pad_output == batch_val_labels
        is_counted = (batch_val_labels != 33) & (batch_val_labels != 30)
        correct_test += torch.sum(is_correct & is_counted)
        totals_test += torch.sum(is_counted)

        # Wrong predictions are relabelled as id 33 so they never count
        # towards a real label; slot 33 of the "correct" counts is therefore
        # not a meaningful per-label value.
        correct_labels_c = torch.where(is_correct, val_pad_output, 33)

        # [:num_labels] ignores ids beyond the label table, as the original
        # zip-based accumulation did.
        totals_count += torch.bincount(batch_val_labels.reshape(-1), minlength=num_labels)[:num_labels].cpu()
        correct_count += torch.bincount(correct_labels_c.reshape(-1), minlength=num_labels)[:num_labels].cpu()

# Plain lists of Python ints, indexed by label id, for the reporting cell.
totals_labels = totals_count.tolist()
correct_labels = correct_count.tolist()

In [None]:
# Keep only label ids 0-29. In label_init() these are the B-/I- entity tags
# (everything before 'O' and the special tokens).
# NOTE(review): assumes the model's label table has the same layout.
useful_correct = correct_labels[:30]
useful_totals = totals_labels[:30]
print(len(useful_correct))

# One row per label: [name, correct, total, accuracy]; accuracy is 0 when the
# label never occurs in the test set.
temp = []
for i, (c, t) in enumerate(zip(useful_correct, useful_totals)) :
    label_name = idx_to_label[i]
    ratio = c / t if t != 0 else 0
    print(f"labels: {label_name} / correct labels: {c} with total lebels: {t} => accuracy: {ratio}")
    temp.append([label_name, c, t, ratio])

# Export the table; the file name carries the model variant.
df = pd.DataFrame(temp, columns=["label_name", "correct_label", "total_label", "accuracy"])
df.to_csv(f"results_{model_base}.csv")