In [1]:
!pip install transformers torch datasets transformers[torch]

from google.colab import drive
import pandas as pd
from datasets import Dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, EvalPrediction
import torch
import json
import numpy as np
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score



In [2]:
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/'
model_ckpt = 'beomi/KcELECTRA-base-v2022'

ds = load_from_disk(file_path+'sequence_dataset')
ds

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 270022
    })
    val: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10160
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10084
    })
})

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, sep_token = '[SEP]', cls_token = '[CLS]')
model = AutoModelForSequenceClassification.from_pretrained(file_path+'epc5_weighted_model')
model_name = file_path + f"{model_ckpt}-sequence22"
# 128, 64은 메모리 부족
batch_size = 32
logging_steps = len(ds['train']) // batch_size

training_args = TrainingArguments(
    output_dir = model_name,
    logging_dir = model_name + '/logs',
    num_train_epochs = 5,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    logging_steps = logging_steps,
    save_steps = 50,
    save_total_limit = 1,
    save_strategy = 'epoch',
    evaluation_strategy = 'epoch',
    load_best_model_at_end = True)


def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1 = f1_score(y_true=y_true, y_pred=y_pred, average='weighted')
    acc = accuracy_score(y_true, y_pred)
    recall = recall_score(y_true=y_true, y_pred=y_pred, average='weighted')
    precision = precision_score(y_true=y_true, y_pred=y_pred, average='weighted')
    # return as dictionary
    metrics = {'f1': f1,
               'accuracy': acc,
               "recall": recall,
               "precision": precision}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

In [4]:
trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=ds['train'],
                  eval_dataset=ds['val'],
                  data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
                  tokenizer=tokenizer)

# trainer.train();

In [5]:
trainer.evaluate()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 0.24951700866222382,
 'eval_f1': 0.9159652945002834,
 'eval_accuracy': 0.831496062992126,
 'eval_recall': 0.9192865507611927,
 'eval_precision': 0.9130331048799563,
 'eval_runtime': 38.0582,
 'eval_samples_per_second': 266.959,
 'eval_steps_per_second': 8.356}

In [6]:
pred_output = trainer.predict(ds['test'])
pred_output.metrics

{'test_loss': 0.26359787583351135,
 'test_f1': 0.9102871103420264,
 'test_accuracy': 0.8206069020230068,
 'test_recall': 0.9125928643492617,
 'test_precision': 0.9084511763672056,
 'test_runtime': 37.998,
 'test_samples_per_second': 265.382,
 'test_steps_per_second': 8.316}

In [7]:
df = pd.read_csv(file_path + 'sequence_binary_testset.csv')
df

Unnamed: 0,text,label
0,"['대면 예배 하지 말라는데 왜 자꾸 하는거야?', '성경에서는 교인들이 모인 곳이...","[0.0, 0.0]"
1,"['중고 거래만 주구장창 하는 남자들 보면 아주 소금 즙 흐르겠어 짜다 짜', '그...","[0.0, 1.0]"
2,"['나 여잔데 방송해보고 싶어', '니가 하고싶은거 하는거지 도전 ㄱㄱ']","[0.0, 0.0]"
3,"['야, 넌 지금 전문대도 갈 수 없는 수준이면서 인서울을 운운한다니 어이가 없네....","[1.0, 1.0]"
4,['우리가 유럽 가서 노천카페나 길거리에 백인들 부러워하고 사진찍는 거랑 똑같은거야...,"[0.0, 1.0]"
...,...,...
10079,"['너 신발이 너무 낡았다', '너가 사줄거 아니면 간섭마']","[0.0, 1.0]"
10080,"['에휴 너는 이 작품 피디가 누구인 줄이나 알고 드라마 보냐?', '표절했다는 거...","[0.0, 0.0]"
10081,"['돼공 놈들 가진거라곤 몸무게 밖에 없어서 행동 느려 터진거 보면 진짜 답답함',...","[1.0, 0.0]"
10082,['#@인간및인간집단.인물.인물_무정보.남성# 아버지 아직 아들을 못 보내셨나보다ㅜ...,"[0.0, 1.0]"


In [8]:
pred_ts = torch.Tensor(pred_output.predictions)
result_ts = torch.sigmoid(pred_ts)
result = np.empty(10084)

for i in range(len(result_ts)):
    if result_ts[i][0] > result_ts[i][1]:
        result[i] = 0
    else:
        result[i] = 1

result_df = pd.DataFrame(result, columns=['sequence_result'])
df = pd.concat([df, result_df], axis=1)
df

Unnamed: 0,text,label,sequence_result
0,"['대면 예배 하지 말라는데 왜 자꾸 하는거야?', '성경에서는 교인들이 모인 곳이...","[0.0, 0.0]",0.0
1,"['중고 거래만 주구장창 하는 남자들 보면 아주 소금 즙 흐르겠어 짜다 짜', '그...","[0.0, 1.0]",0.0
2,"['나 여잔데 방송해보고 싶어', '니가 하고싶은거 하는거지 도전 ㄱㄱ']","[0.0, 0.0]",1.0
3,"['야, 넌 지금 전문대도 갈 수 없는 수준이면서 인서울을 운운한다니 어이가 없네....","[1.0, 1.0]",0.0
4,['우리가 유럽 가서 노천카페나 길거리에 백인들 부러워하고 사진찍는 거랑 똑같은거야...,"[0.0, 1.0]",1.0
...,...,...,...
10079,"['너 신발이 너무 낡았다', '너가 사줄거 아니면 간섭마']","[0.0, 1.0]",1.0
10080,"['에휴 너는 이 작품 피디가 누구인 줄이나 알고 드라마 보냐?', '표절했다는 거...","[0.0, 0.0]",0.0
10081,"['돼공 놈들 가진거라곤 몸무게 밖에 없어서 행동 느려 터진거 보면 진짜 답답함',...","[1.0, 0.0]",0.0
10082,['#@인간및인간집단.인물.인물_무정보.남성# 아버지 아직 아들을 못 보내셨나보다ㅜ...,"[0.0, 1.0]",1.0


In [10]:
idx = list(range(10084))
TPTN = pd.DataFrame(columns = ['text', 'label'], index = idx)
FPFN = pd.DataFrame(columns = ['text', 'real_label', 'wrong_label'], index = idx)

for i in range(10084):
    if df['label'][i] == df['sequence_result'][i]:
        TPTN.iloc[i, 0] = df['text'][i]
        TPTN.iloc[i, 1] = df['label'][i]
    else:
        FPFN.iloc[i, 0] = df['text'][i]
        FPFN.iloc[i, 1] = df['label'][i]
        FPFN.iloc[i, 2] = df['sequence_result'][i]

In [11]:
print(TPTN.isnull().sum())
print(FPFN.isnull().sum())
print('====================')
TPTN = TPTN.dropna()
FPFN = FPFN.dropna()
print(TPTN.isnull().sum())
print(FPFN.isnull().sum())

text     10084
label    10084
dtype: int64
text           0
real_label     0
wrong_label    0
dtype: int64
text     0.0
label    0.0
dtype: float64
text           0
real_label     0
wrong_label    0
dtype: int64


In [None]:
TPTN.to_csv(file_path+'sequence_TPTN.csv', encoding = 'utf-8-sig', index = False)
FPFN.to_csv(file_path+'sequence_FPFN.csv', encoding = 'utf-8-sig', index = False)

In [None]:
trainer.save_model(file_path+"epc10_weighted_model")