In [1]:
from transformers import DistilBertForSequenceClassification
from kobert_tokenizer import KoBertTokenizer
tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
model = DistilBertForSequenceClassification.from_pretrained('monologg/distilkobert', num_labels=6)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'KoBertTokenizer'.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at monologg/distilkobert and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-2): 3 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
  

In [4]:
import os
import pandas as pd

path = os.getcwd()

train_data = pd.read_csv("../../datasets/train.csv", encoding = 'utf-8')

train_data.dropna(inplace = True)
train_data.drop(['감정_소분류','시스템문장1','시스템문장2','시스템문장3', '사람문장2', '사람문장3', '신체질환'], axis=1, inplace=True)
train_data.rename({'사람문장1':'text', '감정_대분류':'label'}, axis=1, inplace=True)

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
train_data['label'] = label_encoder.fit_transform(train_data['label'])

train_data

Unnamed: 0.1,Unnamed: 0,연령,성별,상황키워드,label,text
304,305,중년,여성,"재정,은퇴,노후준비",4,지금까지 힘들게 일했는데 은퇴해서 돈이 없다고 하니 자식이 화를 내서 상처를 받았어.
305,306,중년,여성,"재정,은퇴,노후준비",4,친구한테 은퇴할 거라고 얘기했더니 앞으로 뭘 먹고 살 거냐면서 비웃더라고. 기분이 ...
306,307,중년,여성,"재정,은퇴,노후준비",4,친구한테 은퇴한다고 했더니 그게 말이나 되는 거냐며 날 한심한 사람 취급해서 서운했어.
307,308,중년,여성,"재정,은퇴,노후준비",4,그동안 열심히 달려와서 좀 쉬려고 하는데 은퇴한다고 하니 주변에서 다 말려서 기분이...
308,309,중년,여성,"재정,은퇴,노후준비",4,많은 고민 후 은퇴를 결심했는데 주변에서 다들 섣부른 생각이라고 해서 마음이 안 좋아.
...,...,...,...,...,...,...
51625,51626,노년,남성,재정,2,나이가 먹고 이제 돈도 못 벌어 오니까 어떻게 살아가야 할지 막막해. 능력도 없고.
51626,51627,노년,여성,재정,3,몸이 많이 약해졌나 봐. 이제 전과 같이 일하지 못할 것 같아 너무 짜증 나.
51627,51628,노년,여성,재정,4,이제 어떻게 해야 할지 모르겠어. 남편도 그렇고 노후 준비도 안 되어서 미래가 걱정돼.
51628,51629,노년,여성,대인관계,3,몇십 년을 함께 살았던 남편과 이혼했어. 그동안의 세월에 배신감을 느끼고 너무 화가 나.


In [5]:
import numpy as np
import evaluate
from datasets import Dataset
from transformers import TrainingArguments, Trainer

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


datasets = Dataset.from_dict(train_data[['text', 'label']].to_dict('list'))

def tokenize_function(dataset):
    return tokenizer(dataset["text"], padding="max_length", truncation=True)

tokenized_datasets = datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.class_encode_column("label")
tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.1, stratify_by_column='label', shuffle=True)

train_dataset = tokenized_datasets['train'].shuffle(seed=42)
val_dataset = tokenized_datasets['test'].shuffle(seed=42)


training_args = TrainingArguments(
    output_dir="checkpoint",
    logging_dir="logs",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

Map: 100%|██████████| 42694/42694 [00:04<00:00, 10239.82 examples/s]
Stringifying the column: 100%|██████████| 42694/42694 [00:00<00:00, 462357.74 examples/s]
Casting to class labels: 100%|██████████| 42694/42694 [00:00<00:00, 465534.62 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [6]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.5368,1.319543,0.501639
2,1.3136,1.254006,0.524356
3,1.2648,1.242506,0.52904


TrainOutput(global_step=1803, training_loss=1.34964844020816, metrics={'train_runtime': 2323.1701, 'train_samples_per_second': 49.618, 'train_steps_per_second': 0.776, 'total_flos': 7741093439619072.0, 'train_loss': 1.34964844020816, 'epoch': 3.0})

In [7]:
trainer.evaluate()

{'eval_loss': 1.2425061464309692,
 'eval_accuracy': 0.52903981264637,
 'eval_runtime': 26.7855,
 'eval_samples_per_second': 159.414,
 'eval_steps_per_second': 2.501,
 'epoch': 3.0}