# Set parameters

In [1]:
# Hugging Face model id; used below to load both the tokenizer and the classifier.
MODEL_NAME = "klue/bert-base"
# Learning rate passed to TrainingArguments.
MODEL_LEARNING_RATE = 1e-5
# Per-device batch size, used for both training and evaluation.
MODEL_BATCH_SIZE = 128
# Number of training epochs.
MODEL_EPOCHS_NUM = 10
# Directory the Trainer writes to, and where the final model is saved and reloaded from.
MODEL_OUTPUT_DIR = "trained_model/"

In [2]:
import os
import torch

# Restrict this process to the CUDA device with index 1.
# NOTE(review): the device index is hard-coded — presumably specific to the
# author's machine; confirm or adjust before running on another host.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# Sanity check: displays whether PyTorch reports a usable CUDA device.
torch.cuda.is_available()

True

# Load data

In [3]:
from datasets import load_dataset

# Map each split name to the CSV file that backs it, then load all splits at once.
data_files = {
  "train": "train.csv",
  "eval": "eval.csv",
  "test": "test.csv",
}
dataset = load_dataset("csv", data_files=data_files)
dataset

Using custom data configuration default-d039a187e0a04ce4
Found cached dataset csv (/home/bill/.cache/huggingface/datasets/csv/default-d039a187e0a04ce4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'honorific'],
        num_rows: 24000
    })
    eval: Dataset({
        features: ['sentence', 'honorific'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['sentence', 'honorific'],
        num_rows: 4800
    })
})

In [4]:
import pandas as pd

# Materialise the training split as a pandas DataFrame so it can be inspected
# and used to fit the label encoder.
df = pd.DataFrame(dataset["train"])
df.head()

Unnamed: 0,sentence,honorific
0,내일 반품할 노트북이 삼성인지 엘지인지 알려주세요,1
1,내일 삼성노트북을 반품할거에요 엘지노트북을 반품할거에요,1
2,내일 삼성노트북이랑 엘지노트북 중에 어떤 노트북을 반품하는거에요,1
3,혹시 내일 반품할 노트북이 삼성과 엘지 중에 무엇인지 아시나요,1
4,내일 노트북 반품이 삼성노트북인지 엘지노트북인지 확인부탁드립니다,1


# Set tokenizer

In [5]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenize one sample sentence to inspect the fields the tokenizer produces.
sample_sentence = "내일 삼성노트북이랑 엘지노트북 중에 어떤 노트북을 반품하는거에요"
tokenizer(sample_sentence)

{'input_ids': [2, 5420, 3840, 2406, 2265, 2547, 2052, 2379, 17360, 2406, 2265, 2547, 1570, 2170, 3711, 11161, 2069, 24183, 2205, 2259, 2180, 2170, 2182, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# Set encoder

In [6]:
# Peek at the label column, selected as a one-column DataFrame (the same shape
# the encoder is fitted on).
df[["honorific"]].head()

Unnamed: 0,honorific
0,1
1,1
2,1
3,1
4,1


In [7]:
from sklearn.preprocessing import OneHotEncoder

# Build the one-hot encoder, then fit it on the label column of the training frame.
encoder = OneHotEncoder()
encoder.fit(df[["honorific"]])
encoder.categories_

[array([0, 1])]

In [8]:
# Example: encode the single label value 1 and take the resulting one-hot row.
encoder.transform(pd.DataFrame({"honorific": [1]})).toarray()[0]

array([0., 1.])

# Process data

In [9]:
def process(dataslice):
  """Tokenize a batch of sentences and attach one-hot encoded labels.

  Intended for `dataset.map(process, batched=True)`.

  Args:
    dataslice: A batch read as a mapping with a "sentence" column (texts) and
      a "honorific" column (label values known to `encoder`).

  Returns:
    The tokenizer output for the batch, with an added "label" entry holding
    one one-hot vector (a list of floats) per row.
  """
  # Truncate to the tokenizer's maximum length so an over-long sentence cannot
  # exceed the model's position embeddings; shorter sentences are unaffected.
  tokenized_inputs = tokenizer(dataslice["sentence"], truncation=True)

  honorifics = dataslice["honorific"]
  if len(honorifics) == 0:
    # Nothing to encode; keep the empty-batch result an empty label list.
    labels = []
  else:
    # Encode the whole batch in one call instead of building a DataFrame and
    # invoking the encoder once per row.
    label_frame = pd.DataFrame({"honorific": honorifics})
    labels = encoder.transform(label_frame).toarray().tolist()

  tokenized_inputs["label"] = labels
  return tokenized_inputs

# Apply `process` to every split in batches; the result keeps the original
# columns and adds the tokenizer fields plus "label".
processed_dataset = dataset.map(process, batched=True)
processed_dataset

Loading cached processed dataset at /home/bill/.cache/huggingface/datasets/csv/default-d039a187e0a04ce4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-88a7f6e85c0b3612.arrow
Loading cached processed dataset at /home/bill/.cache/huggingface/datasets/csv/default-d039a187e0a04ce4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-92441cd9a8defedd.arrow
Loading cached processed dataset at /home/bill/.cache/huggingface/datasets/csv/default-d039a187e0a04ce4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-7c5c86aff84acbef.arrow


DatasetDict({
    train: Dataset({
        features: ['sentence', 'honorific', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 24000
    })
    eval: Dataset({
        features: ['sentence', 'honorific', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['sentence', 'honorific', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 4800
    })
})

In [10]:
# Inspect one processed training example (raw columns, tokenizer fields and "label").
processed_dataset["train"][1]

{'sentence': '내일 삼성노트북을 반품할거에요 엘지노트북을 반품할거에요',
 'honorific': 1,
 'input_ids': [2,
  5420,
  3840,
  2406,
  2265,
  2547,
  2069,
  24183,
  2085,
  2180,
  2170,
  2182,
  17360,
  2406,
  2265,
  2547,
  2069,
  24183,
  2085,
  2180,
  2170,
  2182,
  3],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'label': [0.0, 1.0]}

# Set categories

In [11]:
# Label values learned by the encoder for its single input column; used later to
# size the classification head and to map a predicted class index back to a label.
categories = encoder.categories_[0]
categories

array([0, 1])

# Set model

In [12]:
from transformers import BertForSequenceClassification

# Load the pretrained checkpoint with a sequence-classification head that has
# one output per label category.
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(categories))

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

# Set trainer

In [13]:
from transformers import TrainingArguments

# Training configuration built from the constants in the parameter cell.
# The same batch size is used for training and evaluation. No evaluation or
# seed options are passed here, so those fall back to the library defaults.
training_args = TrainingArguments(
  output_dir=MODEL_OUTPUT_DIR,
  learning_rate=MODEL_LEARNING_RATE,
  per_device_train_batch_size=MODEL_BATCH_SIZE,
  per_device_eval_batch_size=MODEL_BATCH_SIZE,
  num_train_epochs=MODEL_EPOCHS_NUM
)

In [14]:
from transformers import Trainer

# Wire the model, training arguments and the processed train/eval splits into a
# Trainer. The tokenizer is passed as well; the checkpoint logs below show its
# files being saved next to each checkpoint.
trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=processed_dataset["train"],
  eval_dataset=processed_dataset["eval"],
  tokenizer=tokenizer
)

# Train

In [15]:
# Toggle for the expensive training step; set to False to skip fine-tuning.
train_model = True
if train_model:
  trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence, honorific. If sentence, honorific are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 24000
  Num Epochs = 10
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 1880
  Number of trainable parameters = 110618882
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.0376
1000,0.0067
1500,0.0061


Saving model checkpoint to trained_model/checkpoint-500
Configuration saved in trained_model/checkpoint-500/config.json
Model weights saved in trained_model/checkpoint-500/pytorch_model.bin
tokenizer config file saved in trained_model/checkpoint-500/tokenizer_config.json
Special tokens file saved in trained_model/checkpoint-500/special_tokens_map.json
Saving model checkpoint to trained_model/checkpoint-1000
Configuration saved in trained_model/checkpoint-1000/config.json
Model weights saved in trained_model/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in trained_model/checkpoint-1000/tokenizer_config.json
Special tokens file saved in trained_model/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to trained_model/checkpoint-1500
Configuration saved in trained_model/checkpoint-1500/config.json
Model weights saved in trained_model/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in trained_model/checkpoint-1500/tokenizer_config.json
Special tok

In [16]:
# Persist the fine-tuned weights whenever this run actually trained the model.
# The evaluation section reloads from MODEL_OUTPUT_DIR, so leaving the flag off
# after training means a fresh "Run All" has no final model to load. Tying the
# flag to `train_model` also avoids overwriting a previously saved model with
# untrained weights when training is skipped.
save_model = train_model
if save_model:
  model.save_pretrained(MODEL_OUTPUT_DIR)

Configuration saved in trained_model/config.json
Model weights saved in trained_model/pytorch_model.bin


# Evaluate

In [17]:
# Reload the fine-tuned classifier from the directory the save cell writes to.
trained_model = BertForSequenceClassification.from_pretrained(MODEL_OUTPUT_DIR)

loading configuration file trained_model/config.json
Model config BertConfig {
  "_name_or_path": "klue/bert-base",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "multi_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file trained_model/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the model checkpoint 

In [18]:
# Raw sentences of the test split, used as inference input; show the first five.
test_texts = processed_dataset["test"]["sentence"]
test_texts[:5]

['삼성과 엘지 중에 에어컨이 더 저렴한 곳 확인 부탁드립니다',
 '에어컨이 더 저렴한 곳은 어디인가요 삼성과 엘지 중에',
 '삼성 에어컨이 더 저렴한가요 엘지 에어컨이 더 저렴한가요',
 '에어컨이 더 저렴한 곳은 삼성인가요 엘지인가요',
 '삼성과 엘지 중에 에어컨이 더 저렴한 곳을 아십니까']

In [19]:
# Tokenize the whole test set once, padded to a common length, so every
# mini-batch below is sliced from identically shaped tensors.
model_inputs = tokenizer(test_texts, padding=True, return_tensors="pt")

# Inference only: make sure the model is in eval mode and disable autograd so
# no computation graph is kept for the forward passes.
trained_model.eval()
with torch.no_grad():
  # Run the model in mini-batches instead of one forward pass over every
  # test sentence, which keeps peak memory bounded.
  logits = torch.cat([
    trained_model(
      **{name: tensor[start:start + MODEL_BATCH_SIZE] for name, tensor in model_inputs.items()}
    ).logits
    for start in range(0, len(test_texts), MODEL_BATCH_SIZE)
  ])
logits[:5]

tensor([[-7.4885,  7.5042],
        [-7.4665,  7.4227],
        [-7.4214,  7.5274],
        [-7.4219,  7.4593],
        [-7.4195,  7.3443]], grad_fn=<SliceBackward0>)

In [20]:
from torch import nn

# Normalise the logits of each row over the class dimension.
# NOTE(review): the reloaded config reports problem_type
# "multi_label_classification"; softmax is kept as-is here. The argmax taken in
# the next step is the same as the argmax of the raw logits.
predicted_probabilities = logits.softmax(dim=-1)
predicted_probabilities[:5]

tensor([[3.0816e-07, 1.0000e+00],
        [3.4176e-07, 1.0000e+00],
        [3.2196e-07, 1.0000e+00],
        [3.4450e-07, 1.0000e+00],
        [3.8744e-07, 1.0000e+00]], grad_fn=<SliceBackward0>)

In [21]:
import numpy as np

# Pick the highest-probability class index for each row, then map every index
# back to its label value through `categories`.
label_indices = predicted_probabilities.detach().numpy().argmax(axis=1)
predicted_labels = [categories[label_idx] for label_idx in label_indices]
predicted_labels[:5]

[1, 1, 1, 1, 1]

In [22]:
# Show the first ten predictions next to the sentences they belong to.
preview_lines = [f"{predicted_labels[idx]}: {test_texts[idx]}" for idx in range(10)]
print("\n".join(preview_lines))

1: 삼성과 엘지 중에 에어컨이 더 저렴한 곳 확인 부탁드립니다
1: 에어컨이 더 저렴한 곳은 어디인가요 삼성과 엘지 중에
1: 삼성 에어컨이 더 저렴한가요 엘지 에어컨이 더 저렴한가요
1: 에어컨이 더 저렴한 곳은 삼성인가요 엘지인가요
1: 삼성과 엘지 중에 에어컨이 더 저렴한 곳을 아십니까
1: 넷플릭스랑 왓챠 중에 가성비가 더 좋은 곳을 알려주세요
1: 가성비가 더 좋은 곳이 넷플릭스인지 왓챠인지 확인부탁드립니다
1: 넷플릭스가 가성비가 더 좋아요 왓챠가 가성비가 더 좋아요
1: 넷플릭스와 왓챠 중에 어디가 더 가성비가 좋은지 좀 알려주세요
1: 가성비가 더 좋은 곳 확인해주세요 넷플릭스랑 왓챠 중에


In [23]:
# Accuracy on the test split: the fraction of predictions equal to the gold label.
# The gold label column is fetched once up front rather than being re-fetched
# from the dataset on every loop iteration.
gold_labels = processed_dataset["test"]["honorific"]
corrects_num = sum(
  int(predicted == gold) for predicted, gold in zip(predicted_labels, gold_labels)
)
corrects_num / len(predicted_labels)

0.9977083333333333