In [1]:
!pip install transformers torch datasets transformers[torch]

from google.colab import drive
import pandas as pd
from datasets import Dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, EvalPrediction
import torch
import json
import numpy as np
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.20.3 (from transformers)
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
Installing collected pa

In [2]:
# Mount Google Drive so that files under MyDrive can be read and written.
drive.mount('/content/drive')

# Base directory for every artifact this notebook loads or saves.
file_path = '/content/drive/MyDrive/'
# Identifier of the pretrained checkpoint that gets fine-tuned below.
model_ckpt = 'beomi/KcELECTRA-base-v2022'

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using {device} device...')

# Load the DatasetDict stored on Drive and display its splits and features.
ds = load_from_disk(file_path + 'clean_dataset')
ds

Mounted at /content/drive
Using cuda device...


DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 368431
    })
    val: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10108
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10224
    })
})

In [3]:
# Tokenizer matching the checkpoint; the [SEP] / [CLS] special tokens are passed explicitly.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, sep_token = '[SEP]', cls_token = '[CLS]')
# Pretrained encoder with a sequence-classification head on top.
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt)

# Directory on Drive that receives checkpoints and logs.
model_name = file_path + f"{model_ckpt}-clean"
# A batch size of 128 ran out of memory.
batch_size = 64
# Log about once per epoch. Clamp to 1 so that a training split smaller than
# one batch does not yield logging_steps = 0, which step-based logging rejects.
logging_steps = max(1, len(ds['train']) // batch_size)

# save_steps is intentionally not set: with save_strategy = 'epoch' a checkpoint
# is written at the end of every epoch and a step interval would be ignored.
# NOTE(review): no metric_for_best_model is given, so load_best_model_at_end
# uses the Trainer's default selection metric - confirm that is the intended one.
training_args = TrainingArguments(
    output_dir = model_name,
    logging_dir = model_name + '/logs',
    num_train_epochs = 2,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    logging_steps = logging_steps,
    save_total_limit = 1,
    save_strategy = 'epoch',
    evaluation_strategy = 'epoch',
    load_best_model_at_end = True)

def compute_metrics(pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (gold labels) and ``predictions``
            (per-class scores; the class is taken as the argmax of the last axis).

    Returns:
        dict with the keys ``f1``, ``accuracy``, ``recall`` and ``precision``.
        F1, recall and precision use weighted averaging.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Shared averaging mode for the metrics that accept one.
    weighted = {'average': 'weighted'}
    return {
        'f1': f1_score(y_true, y_pred, **weighted),
        'accuracy': accuracy_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred, **weighted),
        'precision': precision_score(y_true, y_pred, **weighted),
    }

tokenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/504 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/450k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/511M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Collator that pads the examples of each batch with the tokenizer.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Fine-tune on the train split and evaluate on the val split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds['train'],
    eval_dataset=ds['val'],
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
# The trailing semicolon keeps the return value of train() out of the cell output.
trainer.train();

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Recall,Precision
1,0.3263,0.306658,0.875117,0.875445,0.875445,0.875399
2,0.2338,0.321338,0.876979,0.877226,0.877226,0.877109


In [5]:
# Evaluate the trained model on the Trainer's eval_dataset (ds['val']) and show the metrics.
trainer.evaluate()

{'eval_loss': 0.3066575825214386,
 'eval_f1': 0.8751166547644127,
 'eval_accuracy': 0.8754451919271864,
 'eval_recall': 0.8754451919271864,
 'eval_precision': 0.8753990567963569,
 'eval_runtime': 16.0497,
 'eval_samples_per_second': 629.793,
 'eval_steps_per_second': 9.844,
 'epoch': 2.0}

In [6]:
# Run the model over the test split; the full output is kept for the later cells.
pred_output = trainer.predict(ds['test'])
# Display only the aggregate test metrics.
pred_output.metrics

{'test_loss': 0.3059616684913635,
 'test_f1': 0.8758372734731511,
 'test_accuracy': 0.8760758998435054,
 'test_recall': 0.8760758998435054,
 'test_precision': 0.8760613438924002,
 'test_runtime': 17.1049,
 'test_samples_per_second': 597.722,
 'test_steps_per_second': 9.354}

In [7]:
# Inspect the complete prediction output: per-class scores, gold label ids and the metrics dict.
pred_output

PredictionOutput(predictions=array([[ 1.9510782 , -1.8165251 ],
       [-2.2154396 ,  2.0832992 ],
       [ 2.1114116 , -2.0180814 ],
       ...,
       [-1.8095937 ,  1.7408291 ],
       [-1.7918804 ,  1.7181505 ],
       [ 0.3420351 , -0.31180546]], dtype=float32), label_ids=array([0, 1, 0, ..., 1, 1, 1]), metrics={'test_loss': 0.3059616684913635, 'test_f1': 0.8758372734731511, 'test_accuracy': 0.8760758998435054, 'test_recall': 0.8760758998435054, 'test_precision': 0.8760613438924002, 'test_runtime': 17.1049, 'test_samples_per_second': 597.722, 'test_steps_per_second': 9.354})

In [8]:
# Load the test set CSV from Drive; it provides the text and gold label for each row.
df = pd.read_csv(file_path + 'clean_testset.csv')
df

Unnamed: 0,text,label
0,잠 좀 그만자라,0
1,싯팔 짱깨련이었네 느금마도 짱깨임?,1
2,오늘도 고등학생이 비탈길 내려가다 전신주 박고 죽었대.,0
3,제발..그건 아니잖아,0
4,와 역대급 빌런이다,1
...,...,...
10219,그건 그렇네 이리살꺼 전나 앂질이나하고 급탈이나 하고 죽자,1
10220,"좋아하면 뭐하냐, 잘 찍어야지",1
10221,네가 일 등 한 번도 안 해본 폐기물이라 그런 생각이 드는 거야,1
10222,진짜 운전 개좉같이 하네,1


In [9]:
# Predicted class = index of the largest raw score, the same rule compute_metrics
# uses. Working on the raw scores (rather than their sigmoid) avoids rows where
# both sigmoid values saturate to the same float and the comparison degenerates.
scores = np.asarray(pred_output.predictions)
result = scores.argmax(axis=-1)

# Predictions are attached positionally, so they must line up row-for-row with the CSV.
assert len(result) == len(df), (
    f'prediction count {len(result)} does not match CSV row count {len(df)}'
)

# Plain column assignment keeps this cell idempotent: re-running it overwrites
# the column instead of appending a duplicate. Labels are stored as integers.
df['clean_result'] = result
df

Unnamed: 0,text,label,clean_result
0,잠 좀 그만자라,0,0.0
1,싯팔 짱깨련이었네 느금마도 짱깨임?,1,1.0
2,오늘도 고등학생이 비탈길 내려가다 전신주 박고 죽었대.,0,0.0
3,제발..그건 아니잖아,0,0.0
4,와 역대급 빌런이다,1,0.0
...,...,...,...
10219,그건 그렇네 이리살꺼 전나 앂질이나하고 급탈이나 하고 죽자,1,1.0
10220,"좋아하면 뭐하냐, 잘 찍어야지",1,0.0
10221,네가 일 등 한 번도 안 해본 폐기물이라 그런 생각이 드는 거야,1,1.0
10222,진짜 운전 개좉같이 하네,1,1.0


In [10]:
# True where the model's prediction equals the gold label.
is_correct = df['label'].eq(df['clean_result'])

# Correctly classified rows (true positives / true negatives).
# Both frames keep one row per input row; rows that belong to the other frame
# are blanked to NaN, so the null counts and dropna() in the next cell still
# apply. Object dtype keeps the retained labels from being converted by the NaNs.
TPTN = df[['text', 'label']].astype(object)
TPTN.loc[~is_correct, :] = np.nan

# Misclassified rows (false positives / false negatives) with gold and predicted labels.
FPFN = (
    df[['text', 'label', 'clean_result']]
    .rename(columns={'label': 'real_label', 'clean_result': 'wrong_label'})
    .astype(object)
)
FPFN.loc[is_correct, :] = np.nan

In [11]:
# Null counts per column before cleaning.
for frame in (TPTN, FPFN):
    print(frame.isnull().sum())
print('====================')

# Drop every row that contains a missing value, then confirm none remain.
TPTN, FPFN = TPTN.dropna(), FPFN.dropna()
for frame in (TPTN, FPFN):
    print(frame.isnull().sum())

text     1267
label    1267
dtype: int64
text           8957
real_label     8957
wrong_label    8957
dtype: int64
text     0
label    0
dtype: int64
text           0
real_label     0
wrong_label    0
dtype: int64


In [12]:
# Write both frames to Drive as UTF-8 (with BOM) CSV files, without the index column.
for csv_name, frame in (('clean_TPTN.csv', TPTN), ('clean_FPFN.csv', FPFN)):
    frame.to_csv(file_path + csv_name, encoding='utf-8-sig', index=False)