# Set parameters

In [1]:
KR_DIR = "/home/bill/Desktop/kr/"
CLASSIFIER_NAME = "klue/bert-base"
CLASSIFIER_DIR = KR_DIR + "kr-honorifics-classification/trained_model/"
MODEL_NAME = "gogamza/kobart-summarization"
MODEL_MAX_LENGTH = 128
MODEL_OUTPUT_DIR = "trained_model/"
MODEL_EVALUATION_STRATEGY = "epoch"
MODEL_LEARNING_RATE = 2e-5
MODEL_BATCH_SIZE = 128
MODEL_WEIGHT_DECAY = 0.01
MODEL_EPOCHS_NUM = 1
MODEL_PREDICT_WITH_GENERATE = True
MODEL_FP16 = True

In [2]:
import os
import torch

os.environ["CUDA_VISIBLE_DEVICES"] = "1"
torch.cuda.is_available()

True

# Load dataset

In [3]:
from datasets import load_dataset

dataset = load_dataset("csv", data_files={"train": "train.csv", "eval": "eval.csv", "test": "test.csv"})
dataset

Using custom data configuration default-475d303c6f95de56
Found cached dataset csv (/home/bill/.cache/huggingface/datasets/csv/default-475d303c6f95de56/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['non-honorific', 'honorific'],
        num_rows: 12000
    })
    eval: Dataset({
        features: ['non-honorific', 'honorific'],
        num_rows: 600
    })
    test: Dataset({
        features: ['non-honorific', 'honorific'],
        num_rows: 2400
    })
})

In [4]:
import pandas as pd

df = pd.DataFrame(dataset["train"])
df.head()

Unnamed: 0,non-honorific,honorific
0,저기 내일 반품할 노트북이 삼성이니 엘지니,내일 반품할 노트북이 삼성인지 엘지인지 알려주세요
1,야 내일 삼성노트북 반품할거냐 엘지노트북 반품할거냐,내일 삼성노트북을 반품할거에요 엘지노트북을 반품할거에요
2,야 내일 삼성노트북이랑 엘지노트북 중에 어떤 노트북 반품하는거냐,내일 삼성노트북이랑 엘지노트북 중에 어떤 노트북을 반품하는거에요
3,저기 내일 반품할 노트북이 삼성이랑 엘지 중에 뭔지 알고있니,혹시 내일 반품할 노트북이 삼성과 엘지 중에 무엇인지 아시나요
4,저기 내일 노트북 반품하는게 삼성건지 엘지건지 확인좀 해봐,내일 노트북 반품이 삼성노트북인지 엘지노트북인지 확인부탁드립니다


# Set tokenizer

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer("내일 삼성노트북이랑 엘지노트북 중에 어떤 노트북을 반품하는거에요")

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/177k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/682k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/111 [00:00<?, ?B/s]

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


{'input_ids': [9517, 12041, 14588, 25569, 10949, 12034, 10227, 16584, 24828, 13328, 10949, 17003, 14593, 27476, 12007, 14149, 13514, 14049, 9031, 11786, 11900], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# Process data

In [6]:
def process(dataslice):
  model_inputs = tokenizer(dataslice["non-honorific"], truncation=True, max_length=MODEL_MAX_LENGTH)
  with tokenizer.as_target_tokenizer():
    labels = tokenizer(dataslice["honorific"], truncation=True, max_length=MODEL_MAX_LENGTH)
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

processed_dataset = dataset.map(process, batched=True)
processed_dataset

  0%|          | 0/12 [00:00<?, ?ba/s]



  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['non-honorific', 'honorific', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 12000
    })
    eval: Dataset({
        features: ['non-honorific', 'honorific', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 600
    })
    test: Dataset({
        features: ['non-honorific', 'honorific', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2400
    })
})

In [7]:
processed_dataset["train"][1]

{'non-honorific': '야 내일 삼성노트북 반품할거냐 엘지노트북 반품할거냐',
 'honorific': '내일 삼성노트북을 반품할거에요 엘지노트북을 반품할거에요',
 'input_ids': [11734,
  18683,
  14588,
  25569,
  10949,
  14149,
  13514,
  13594,
  9031,
  9531,
  16584,
  24828,
  13328,
  10949,
  14149,
  13514,
  13594,
  9031,
  9531],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [9517,
  12041,
  14588,
  25569,
  10949,
  12007,
  14149,
  13514,
  13594,
  9031,
  11786,
  11900,
  16584,
  24828,
  13328,
  10949,
  12007,
  14149,
  13514,
  13594,
  9031,
  11786,
  11900]}

# Set model

In [8]:
from transformers import BartForConditionalGeneration

model = BartForConditionalGeneration.from_pretrained(MODEL_NAME)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


Downloading:   0%|          | 0.00/496M [00:00<?, ?B/s]

# Set trainer

## Set training arguments

In [9]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
  output_dir=MODEL_OUTPUT_DIR,
  evaluation_strategy=MODEL_EVALUATION_STRATEGY,
  learning_rate=MODEL_LEARNING_RATE,
  per_device_train_batch_size=MODEL_BATCH_SIZE,
  per_device_eval_batch_size=MODEL_BATCH_SIZE,
  weight_decay=MODEL_WEIGHT_DECAY,
  num_train_epochs=MODEL_EPOCHS_NUM,
  predict_with_generate=MODEL_PREDICT_WITH_GENERATE,
  fp16=MODEL_FP16
)

## Set data collator

In [10]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
data_collator

DataCollatorForSeq2Seq(tokenizer=PreTrainedTokenizerFast(name_or_path='gogamza/kobart-summarization', vocab_size=30000, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}), model=BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(30000, 768, padding_idx=3)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(30000, 768, padding_idx=3)
      (embed_positions): BartLearnedPositionalEmbedding(1028, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_pro

## Set metric

In [11]:
from datasets import load_metric

metric = load_metric("sacrebleu")
metric

  metric = load_metric("sacrebleu")


Metric(name: "sacrebleu", features: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}, usage: """
Produces BLEU scores along with its sufficient statistics
from a source against one or more references.

Args:
    predictions (`list` of `str`): list of translations to score. Each translation should be tokenized into a list of tokens.
    references (`list` of `list` of `str`): A list of lists of references. The contents of the first sub-list are the references for the first prediction, the contents of the second sub-list are for the second prediction, etc. Note that there must be the same number of references for each prediction (i.e. all sub-lists must be of the same length).
    smooth_method (`str`): The smoothing method to use, defaults to `'exp'`. Possible values are:
        - `'none'`: no smoothing
        - `'floor'`: increment zero counts
        - `'add-k'`: increment num/deno

In [12]:
def postprocess(predictions, labels):
  predictions = [prediction.strip() for prediction in predictions]
  labels = [[label.strip()] for label in labels]
  return predictions, labels

In [13]:
import numpy as np

def compute_metrics(evaluation_predictions):
  predictions, labels = evaluation_predictions
  if isinstance(predictions, tuple):
    predictions = predictions[0]
  decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
  decoded_predictions, decoded_labels = postprocess(decoded_predictions, decoded_labels)
  result = metric.compute(predictions=decoded_predictions, references=decoded_labels)
  result = {"bleu": result["score"]}
  prediction_lengths = [np.count_nonzero(prediction != tokenizer.pad_token_id) for prediction in predictions]
  result["get_len"] = np.mean(prediction_lengths)
  result = {key: round(value, 4) for key, value in result.items()}
  return result

## Set trainer

In [14]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
  model=model,
  args=training_args,
  train_dataset=processed_dataset["train"],
  eval_dataset=processed_dataset["eval"],
  data_collator=data_collator,
  tokenizer=tokenizer,
  compute_metrics=compute_metrics
)

Using cuda_amp half precision backend


# Train

In [15]:
train_model = True
if train_model:
  trainer.train()

The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: non-honorific, honorific. If non-honorific, honorific are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 12000
  Num Epochs = 1
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 94
  Number of trainable parameters = 123859968
You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Get Len
1,No log,1.098305,32.057,19.87


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: non-honorific, honorific. If non-honorific, honorific are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 600
  Batch size = 128


Training completed. Do not forget to share your model on huggingface.co/models =)




In [16]:
save_model = True
if save_model:
  model.save_pretrained(MODEL_OUTPUT_DIR)

Configuration saved in trained_model/config.json
Model weights saved in trained_model/pytorch_model.bin


# Evaluate

## Generate outputs

In [17]:
trained_model = BartForConditionalGeneration.from_pretrained(MODEL_OUTPUT_DIR)

loading configuration file trained_model/config.json
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
Model config BartConfig {
  "_name_or_path": "gogamza/kobart-summarization",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "author": "Heewon Jeon(madjakarta@gmail.com)",
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.1,
  "d_model": 768,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "do_blenderbot_90_layernorm": false,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 1,
  "extra_pos_embeddings": 2,
  "

In [18]:
test_texts = processed_dataset["test"]["non-honorific"]
test_texts[:5]

['삼성이랑 엘지 중에 에어컨 더 싼 곳 확인해줄래',
 '에어컨이 더 싼 곳이 삼성이냐 엘지냐',
 '삼성 에어컨 이 더 싸냐 엘지가 더 싸냐',
 '얘 에어컨이 더 싼 곳이 삼성이니 엘지니',
 '저기 삼성과 엘지 중에 에어컨 더 싼 곳 알고있니']

In [19]:
model_inputs = tokenizer(test_texts, padding=True, truncation=True, max_length=MODEL_MAX_LENGTH)
model_inputs["input_ids"][0]

[11220,
 14439,
 10227,
 16584,
 12332,
 17003,
 26232,
 14166,
 24022,
 14551,
 14715,
 25040,
 10232,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3]

In [20]:
model_inputs["input_ids"] = [[tokenizer.bos_token_id] + input_indices + [tokenizer.eos_token_id]
                             for input_indices in model_inputs["input_ids"]]
model_inputs["input_ids"][0]

[0,
 11220,
 14439,
 10227,
 16584,
 12332,
 17003,
 26232,
 14166,
 24022,
 14551,
 14715,
 25040,
 10232,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 1]

In [21]:
model_outputs = trained_model.generate(torch.tensor(model_inputs["input_ids"]))
model_outputs[0]



tensor([    2, 11220, 15355, 11863, 16584, 12332, 17003, 26232, 14166, 24022,
        19105, 14715, 17801, 17275,     1,     3,     3,     3,     3,     3])

In [22]:
outputs = tokenizer.batch_decode(model_outputs, skip_special_tokens=True)
outputs[:5]

['삼성과와 엘지 중에 에어컨 더 싼 곳은 확인해주세요',
 '에어컨이 더 싼 곳이 삼성이냐 엘지냐에 대해 삼성과 엘지가',
 '삼성 에어컨 에어컨 이 더 싸냐 엘지가 더 싸냐요 삼성 에어컨 이 더 싸',
 '에어컨이 더 싼 곳은 삼성이니 엘지니 하는 말이 있죠 에어컨이 더 싼',
 '삼성과 엘지 중에 에어컨 더 싼 곳은 알고있나요 삼성과 엘지 중에']

## Load classifier

In [23]:
from transformers import BertForSequenceClassification

classifier_tokenizer = AutoTokenizer.from_pretrained(CLASSIFIER_NAME)
classifier = BertForSequenceClassification.from_pretrained(CLASSIFIER_DIR)

loading configuration file config.json from cache at /home/bill/.cache/huggingface/hub/models--klue--bert-base/snapshots/812449f1a6bc736e693db7aa0e513e5e90795a62/config.json
Model config BertConfig {
  "_name_or_path": "klue/bert-base",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

loading file vocab.txt from cache at /home/bill/.cache/huggingface/hub/models--klue--bert-base/snapshots/812449f1a6bc736e693db7aa0e513e5e90795a62/vocab.txt
loading file tokenizer.json from cache at /home

## Evaluate

In [24]:
classifier_inputs = classifier_tokenizer(outputs, padding=True, return_tensors="pt")
logits = classifier(**classifier_inputs).logits
logits[:5]

tensor([[-7.4841,  7.5277],
        [ 6.1777, -6.3077],
        [-1.3936,  1.3218],
        [-7.2436,  7.2471],
        [-7.3923,  7.4635]], grad_fn=<SliceBackward0>)

In [25]:
from torch import nn

predicted_probabilities = nn.functional.softmax(logits, dim=-1)
predicted_probabilities[:5]

tensor([[3.0231e-07, 1.0000e+00],
        [1.0000e+00, 3.7816e-06],
        [6.2070e-02, 9.3793e-01],
        [5.0902e-07, 1.0000e+00],
        [3.5335e-07, 1.0000e+00]], grad_fn=<SliceBackward0>)

In [26]:
predicted_labels = np.argmax(predicted_probabilities.detach().numpy(), axis=1) # TODO add categories
predicted_labels[:5]

array([1, 0, 1, 1, 1])

In [27]:
honorifics_num = 0
for predicted_label in predicted_labels:
  if predicted_label == 1:
    honorifics_num += 1
honorifics_num / len(predicted_labels)

0.7408333333333333