### 한국어 지식 데이터의 관계 분류

<br>

[한국어 지식기반 관계 데이터](https://www.aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&aihubDataSe=realm&dataSetSn=71633)
<br>[한국어 지식 데이터의 관계 분류](https://aifactory.space/task/2658/leaderboard)

<br>

In [None]:
%pip install accelerate -U

In [1]:
import os
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt


import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from safetensors.torch import load_model, save_model
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoConfig, AutoTokenizer

In [2]:
print(torch.cuda.is_available())

True


In [20]:
train_data = pd.read_csv('data/train_x.csv', encoding='utf-8')
train_y = pd.read_csv('data/train_y.csv', encoding='utf-8')
train_data['label'] = train_y['label']
test_data = pd.read_csv('data/test_x.csv', encoding='utf-8')

train_data['instruction'] = "문장에서 " + train_data['subj'] + " 그리고 " + train_data['obj'] + "는 관계가 있다"
test_data['instruction'] = "문장에서 " + test_data['subj'] + " 그리고 " + test_data['obj'] + "는 관계가 있다"
train_data['sentence'] = train_data['sentence'].str.replace('[^A-Za-z0-9ㄱ-ㅎ가-힣]', ' ', regex=True)
test_data['sentence'] = test_data['sentence'].str.replace('[^A-Za-z0-9ㄱ-ㅎ가-힣]', ' ', regex=True)

"""
train_sentences= []
for i in tqdm(train_data['sentence']):
    train_sentences.append(spacing(i))
train_data['sentence'] = train_sentences

test_sentences = []
for i in tqdm(test_data['sentence']):
    test_sentences.append(spacing(i))
test_data['sentence'] = test_sentences
"""

"\ntrain_sentences= []\nfor i in tqdm(train_data['sentence']):\n    train_sentences.append(spacing(i))\ntrain_data['sentence'] = train_sentences\n\ntest_sentences = []\nfor i in tqdm(test_data['sentence']):\n    test_sentences.append(spacing(i))\ntest_data['sentence'] = test_sentences\n"

In [4]:
sentence_max_len = np.max(train_data['sentence'].str.len())
sentence_min_len = np.min(train_data['sentence'].str.len())
sentence_mean_len = np.mean(train_data['sentence'].str.len())
instruction_max_len = np.max(train_data['instruction'].str.len())
instruction_min_len = np.min(train_data['instruction'].str.len())
instruction_mean_len = np.mean(train_data['instruction'].str.len())
label_num = len(set(train_data['label']))

print('Max Sentence Length: ', sentence_max_len)
print('Min Sentence Length: ', sentence_min_len)
print('Mean Sentence Lengh: ', sentence_mean_len)
print('Max Instruction Length: ', instruction_max_len)
print('Min Instruction Length: ', instruction_min_len)
print('Mean Instruction Lengh: ', instruction_mean_len)
print('The Number of Label: ', label_num)

Max Sentence Length:  497
Min Sentence Length:  13
Mean Sentence Lengh:  84.307
Max Instruction Length:  62
Min Instruction Length:  21
Mean Instruction Lengh:  29.139
The Number of Label:  19


In [23]:
MODEL_NAME = 'kykim/bert-kor-base' # 'kykim/bert-kor-base'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

config = AutoConfig.from_pretrained(MODEL_NAME)
config.num_labels = 20

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at kykim/bert-kor-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
train_dataset, eval_dataset = train_test_split(train_data, test_size=0.1, shuffle=True, stratify=train_data['label'])

tokenized_train = tokenizer(
    list(train_dataset['sentence']),
    list(train_dataset['instruction']),
    return_tensors="pt",
    max_length=512, 
    padding=True,
    truncation=True,
    add_special_tokens=True
)

tokenized_eval = tokenizer(
    list(eval_dataset['sentence']),
    list(eval_dataset['instruction']),
    return_tensors="pt",
    max_length=512,
    padding=True,
    truncation=True,
    add_special_tokens=True
)

# print(tokenized_train['input_ids'][0])
# print(tokenizer.decode(tokenized_train['input_ids'][0]))

In [19]:
class BERTDataset(torch.utils.data.Dataset):
    def __init__(self, pair_dataset, label):
        self.pair_dataset = pair_dataset
        self.label = label

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.pair_dataset.items()}
        item['label'] = torch.tensor(self.label[idx])
        
        return item

    def __len__(self):
        return len(self.label)

In [12]:
train_dataset = BERTDataset(tokenized_train, train_dataset['label'].values)
eval_dataset = BERTDataset(tokenized_eval, eval_dataset['label'].values)

In [29]:
def compute_metrics(pred):
  
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  probs = pred.predictions

  acc = accuracy_score(labels, preds) 

  return {
      'accuracy': acc,
  }

In [28]:
training_ars = TrainingArguments(
    output_dir='output',
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    save_total_limit=10,
    save_strategy='epoch', 
    evaluation_strategy='epoch',
    load_best_model_at_end = True,
)

trainer = Trainer(
    model=model,
    args=training_ars,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics, 
)

In [29]:
trainer.train()
trainer.save_model()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.314619,0.26
2,No log,2.146322,0.26
3,No log,1.993048,0.35
4,No log,1.858849,0.4
5,No log,1.76952,0.46
6,No log,1.756153,0.47
7,No log,1.826705,0.43
8,No log,1.848759,0.46
9,No log,1.872167,0.49
10,No log,1.896422,0.49




### Model Test

In [39]:
MODEL_NAME = 'output' # 'kykim/bert-kor-base'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_safetensors=True)

config = AutoConfig.from_pretrained(MODEL_NAME)
config.num_labels = 20

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)

In [40]:
tokenized_test = tokenizer(
    list(test_data['sentence']),
    list(test_data['instruction']),
    return_tensors="pt",
    max_length=512, 
    padding=True,
    truncation=True,
    add_special_tokens=True
)

In [41]:
test_dataset = BERTDataset(tokenized_test, train_dataset['label'].values[0:len(test_data['sentence'])])

In [42]:
test_args = TrainingArguments(
    output_dir = 'test_output',
    do_train = False,
    do_predict = True,
    per_device_eval_batch_size = 32,   
    dataloader_drop_last = False    
)

trainer = Trainer(
              model = model, 
              args = test_args, 
              compute_metrics = compute_metrics)

predictions, labels, metrics = trainer.predict(test_dataset)
predictions = np.argmax(predictions, axis=1)



In [43]:
result_csv = pd.DataFrame(labels)
result_csv.columns = {"label"}

result_csv.to_csv("result.csv")

### Reference

[월간 데이콘 한국어 문장 관계 분류 경진대회 - Hugging Face를 활용한 Modeling(public: 0.841)](https://github.com/ldj7672/Deep-Learning-Tutorials/blob/main/HuggingFace/HuggingFace_SwinT_image_classification.ipynb)