In [1]:
!pip install transformers torch datasets transformers[torch]

from google.colab import drive
import pandas as pd
from datasets import Dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, EvalPrediction
import torch
import json
import numpy as np
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.20.3 (from transformers)
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
Installing collected pac

In [2]:
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/'
model_ckpt = "beomi/KcELECTRA-base-v2022"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # CPU or GPU
print(f'Using {device} device...')
%cd '/content/drive/MyDrive/dice_loss_for_NLP'

ds = load_from_disk(file_path+'multi_single_dataset')
ds

Mounted at /content/drive
Using cuda device...
/content/drive/MyDrive/dice_loss_for_NLP


DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 293275
    })
    val: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10372
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10637
    })
})

In [3]:
id2label = {0: 'IMMORAL_NONE',
            1: 'CENSURE',
            2: 'DISCRIMINATION',
            3: 'HATE',
            4: 'VIOLENCE',
            5: 'CRIME',
            6: 'SEXUAL',
            7: 'ABUSE'
            }
label2id = {v: k for k, v in id2label.items()}
labels = list(label2id.keys())

In [4]:
from loss.dice_loss import DiceLoss
from loss.focal_loss import FocalLoss
from torch.nn.modules import CrossEntropyLoss
from utils.get_parser import get_parser

class MyTrainer(Trainer):
    def __init__(self, loss_type, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_type= loss_type
        self.dice_smooth=1e-4
        self.dice_ohem=0.0
        self.dice_alpha=0.01
        self.dice_square="store_true"
        self.focal_gamma=2.0
        self.focal_alpha=0.25
        self.num_classes=2

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        if self.loss_type == "ce":
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(outputs[0], labels)
        elif self.loss_type == "focal":
            loss_fct = FocalLoss(gamma=self.focal_gamma, reduction="mean")
            loss = loss_fct(outputs[0], labels)
        elif self.loss_type == "dice":
            loss_fct = DiceLoss(with_logits=True, smooth=self.dice_smooth, ohem_ratio=self.dice_ohem,
                                alpha=self.dice_alpha, square_denominator=self.dice_square,
                                index_label_position=True, reduction="mean")
            loss = loss_fct(outputs[0], labels)
        else:
            raise ValueError
        return (loss, outputs) if return_outputs else loss

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, sep_token = '[SEP]', cls_token = '[CLS]')
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt,
                                                           id2label=id2label,
                                                           label2id=label2id)

model_name = file_path + f"{model_ckpt}_multi_dice"
# 128은 메모리 부족
batch_size = 64
logging_steps = len(ds['train']) // batch_size

training_args = TrainingArguments(
    output_dir = model_name,
    logging_dir = model_name + '/logs',
    num_train_epochs = 5,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    logging_steps = logging_steps,
    save_steps = 50,
    save_total_limit = 1,
    save_strategy = 'epoch',
    evaluation_strategy = 'epoch',
    load_best_model_at_end = True)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    recall = recall_score(labels, preds, average='weighted'),
    precision = precision_score(labels, preds, average='weighted'),
    return {"accuracy": acc, "f1": f1, "recall": recall, "precision": precision}

tokenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/504 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/450k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/511M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
trainer = MyTrainer(model=model, args=training_args,
                    loss_type='dice',
                    compute_metrics=compute_metrics,
                    train_dataset=ds['train'],
                    eval_dataset=ds['val'],
                    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
                    tokenizer=tokenizer)
trainer.train();

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision
1,0.8725,0.844895,0.578481,0.424002,"(0.5784805244890089,)","(0.33463971721307884,)"
2,0.8349,0.844871,0.578481,0.424002,"(0.5784805244890089,)","(0.33463971721307884,)"
3,0.8348,0.844788,0.578481,0.424002,"(0.5784805244890089,)","(0.33463971721307884,)"
4,0.8349,0.844806,0.578481,0.424002,"(0.5784805244890089,)","(0.33463971721307884,)"
5,0.8349,0.844789,0.578481,0.424002,"(0.5784805244890089,)","(0.33463971721307884,)"


  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "(0.5784805244890089,)" of type <class 'tuple'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "(0.33463971721307884,)" of type <class 'tuple'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "(0.5784805244890089,)" of type <class 'tuple'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "(0.33463971721307884,)" of type <class 'tuple'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
  _war

In [7]:
trainer.evaluate()

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "(0.5784805244890089,)" of type <class 'tuple'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "(0.33463971721307884,)" of type <class 'tuple'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.84478759765625,
 'eval_accuracy': 0.5784805244890089,
 'eval_f1': 0.4240023389853472,
 'eval_recall': (0.5784805244890089,),
 'eval_precision': (0.33463971721307884,),
 'eval_runtime': 19.82,
 'eval_samples_per_second': 523.309,
 'eval_steps_per_second': 8.224,
 'epoch': 5.0}

In [8]:
pred_output = trainer.predict(ds['test'])
pred_output.metrics

  _warn_prf(average, modifier, msg_start, len(result))


{'test_loss': 0.8401482105255127,
 'test_accuracy': 0.5808028579486697,
 'test_f1': 0.42678561479713134,
 'test_recall': (0.5808028579486697,),
 'test_precision': (0.3373319598013426,),
 'test_runtime': 20.3355,
 'test_samples_per_second': 523.075,
 'test_steps_per_second': 8.212}

In [9]:
test_df = pd.read_csv(file_path + 'multi_single_testset.csv')
test_df

Unnamed: 0,text,label
0,하루도 조용한 날이 없네 진짜,0
1,셀카로 안구테러 좀 그만 했으면 좋겠다,1
2,진짜 말하는거 짜증나 죽겠네,0
3,면상 몸매 학벌 씀씀이 배려 인간성 실력 돈 집안 머리 모두 제로인게?,0
4,쟤는 친구들을 의심하고 먼저 손절해버리는게 피해망상 환자같아,1
...,...,...
10632,그냥 이게 편해서 이렇게 먹는데요,0
10633,군인 월급 인상해서 전역때 1000만원 받는다는데?,0
10634,지나가는 남자 다 때려눞히게 생겼는데,2
10635,한숨밖에 안나온다,0


In [10]:
pred_ts = torch.Tensor(pred_output.predictions)
result_ts = torch.sigmoid(pred_ts)
result = np.empty(10637)

for i in range(len(result_ts)):
    if result_ts[i][0] > result_ts[i][1]:
        result[i] = 0
    else:
        result[i] = 1

result_df = pd.DataFrame(result, columns=['multi_single_result'])
test_df = pd.read_csv(file_path + 'multi_single_testset.csv')
df = pd.concat([test_df, result_df], axis=1)
df

Unnamed: 0,text,label,multi_single_result
0,하루도 조용한 날이 없네 진짜,0,0.0
1,셀카로 안구테러 좀 그만 했으면 좋겠다,1,0.0
2,진짜 말하는거 짜증나 죽겠네,0,0.0
3,면상 몸매 학벌 씀씀이 배려 인간성 실력 돈 집안 머리 모두 제로인게?,0,0.0
4,쟤는 친구들을 의심하고 먼저 손절해버리는게 피해망상 환자같아,1,0.0
...,...,...,...
10632,그냥 이게 편해서 이렇게 먹는데요,0,0.0
10633,군인 월급 인상해서 전역때 1000만원 받는다는데?,0,0.0
10634,지나가는 남자 다 때려눞히게 생겼는데,2,0.0
10635,한숨밖에 안나온다,0,0.0


In [11]:
idx = list(range(10637))
TPTN = pd.DataFrame(columns = ['text', 'label'], index = idx)
FPFN = pd.DataFrame(columns = ['text', 'real_label', 'wrong_label'], index = idx)

for i in range(10637):
    if df['label'][i] == df['multi_single_result'][i]:
        TPTN.iloc[i, 0] = df['text'][i]
        TPTN.iloc[i, 1] = df['label'][i]
    else:
        FPFN.iloc[i, 0] = df['text'][i]
        FPFN.iloc[i, 1] = df['label'][i]
        FPFN.iloc[i, 2] = df['multi_single_result'][i]

In [12]:
print(TPTN.isnull().sum())
print(FPFN.isnull().sum())
print('====================')
TPTN = TPTN.dropna()
FPFN = FPFN.dropna()
print(TPTN.isnull().sum())
print(FPFN.isnull().sum())

text     4459
label    4459
dtype: int64
text           6178
real_label     6178
wrong_label    6178
dtype: int64
text     0
label    0
dtype: int64
text           0
real_label     0
wrong_label    0
dtype: int64


In [13]:
TPTN.to_csv(file_path+'multi_dice_TPTN.csv', encoding = 'utf-8-sig', index = False)
FPFN.to_csv(file_path+'multi_dice_FPFN.csv', encoding = 'utf-8-sig', index = False)