In [1]:
!pip install transformers torch datasets transformers[torch]

from google.colab import drive
import pandas as pd
from datasets import Dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, EvalPrediction
import torch
import json
import numpy as np
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.20.3 (from transformers)
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
Installing collected pac

In [2]:
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/'
model_ckpt = 'beomi/KcELECTRA-base-v2022'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # CPU or GPU
print(f'Using {device} device...')

%cd '/content/drive/MyDrive/'
!git clone https://github.com/ShannonAI/dice_loss_for_NLP.git

ds = load_from_disk(file_path+'rate_change_binary_dataset')
ds

Mounted at /content/drive
Using cuda device...
/content/drive/MyDrive
fatal: destination path 'dice_loss_for_NLP' already exists and is not an empty directory.


DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 387806
    })
    val: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10210
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10353
    })
})

In [3]:
# Display the first training example to check which fields the saved dataset carries.
ds['train'][0]

{'label': 0,
 'input_ids': [2,
  2469,
  11658,
  7932,
  7958,
  19215,
  17987,
  30181,
  8600,
  15416,
  8153,
  3473,
  3],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [4]:
# Check the Python type of a label. Index the row first so that only a single
# example is decoded instead of materializing the entire `label` column.
type(ds['train'][0]['label'])

int

In [5]:
# Change into the cloned repository; presumably this is what lets the next cell
# import its `loss` and `utils` packages by bare name (resolved from the cwd).
%cd '/content/drive/MyDrive/dice_loss_for_NLP'

/content/drive/MyDrive/dice_loss_for_NLP


In [6]:
from loss.dice_loss import DiceLoss
from loss.focal_loss import FocalLoss
from torch.nn.modules import CrossEntropyLoss
from utils.get_parser import get_parser

class MyTrainer(Trainer):
    """`Trainer` subclass whose training objective is chosen by name.

    Args:
        loss_type: One of ``"ce"`` (cross entropy), ``"focal"`` (focal loss) or
            ``"dice"`` (dice loss).
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.

    Raises:
        ValueError: If ``loss_type`` is not one of the supported names.
    """

    SUPPORTED_LOSS_TYPES = ("ce", "focal", "dice")

    def __init__(self, loss_type, *args, **kwargs):
        # Fail before the (expensive) Trainer set-up rather than on the first
        # training step.
        if loss_type not in self.SUPPORTED_LOSS_TYPES:
            raise ValueError(
                f"Unsupported loss_type {loss_type!r}; "
                f"expected one of {self.SUPPORTED_LOSS_TYPES}"
            )
        super().__init__(*args, **kwargs)
        self.loss_type = loss_type
        # Dice-loss hyper-parameters, passed to DiceLoss in compute_loss.
        self.dice_smooth = 1e-4
        self.dice_ohem = 0.0
        self.dice_alpha = 0.01
        # Boolean switch. This used to hold the string "store_true" (an argparse
        # action name), which enabled the option only because a non-empty
        # string is truthy.
        self.dice_square = True
        # Focal-loss hyper-parameters. Only gamma is passed to FocalLoss below;
        # focal_alpha and num_classes are kept as attributes but are not read
        # anywhere in this class.
        self.focal_gamma = 2.0
        self.focal_alpha = 0.25
        self.num_classes = 2

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the loss selected by ``self.loss_type`` for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch dict; its ``"labels"`` entry is removed before the
                forward pass so the model does not compute its own loss.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted (and ignored) so the override stays
                callable by Trainer versions that pass this keyword.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.

        Raises:
            ValueError: If ``self.loss_type`` is not a supported name.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # No labels were given to the model, so the first output is the logits.
        logits = outputs[0]

        if self.loss_type == "ce":
            loss_fct = CrossEntropyLoss()
        elif self.loss_type == "focal":
            loss_fct = FocalLoss(gamma=self.focal_gamma, reduction="mean")
        elif self.loss_type == "dice":
            loss_fct = DiceLoss(with_logits=True, smooth=self.dice_smooth, ohem_ratio=self.dice_ohem,
                                alpha=self.dice_alpha, square_denominator=self.dice_square,
                                index_label_position=True, reduction="mean")
        else:
            # Reachable only if loss_type was reassigned after construction.
            raise ValueError(
                f"Unsupported loss_type {self.loss_type!r}; "
                f"expected one of {self.SUPPORTED_LOSS_TYPES}"
            )

        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [7]:
# Tokenizer and a freshly initialised binary classification head on top of the
# pretrained encoder.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, sep_token = '[SEP]', cls_token = '[CLS]')
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels = 2)
parser = get_parser()

model_name = file_path + f"{model_ckpt}-focal-loss"
# A batch size of 128 runs out of memory.
batch_size = 64
# Roughly one training-loss log line per epoch (floor of examples / batch size).
logging_steps = len(ds['train']) // batch_size

# Checkpointing and evaluation both happen once per epoch. `save_steps` is not
# set because it only applies when save_strategy is 'steps'.
training_args = TrainingArguments(
    output_dir = model_name,
    logging_dir = model_name + '/logs',
    num_train_epochs = 5,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    logging_steps = logging_steps,
    save_total_limit = 1,
    save_strategy = 'epoch',
    evaluation_strategy = 'epoch',
    load_best_model_at_end = True)

def compute_metrics(pred):
    """Score predictions with weighted F1 / recall / precision and accuracy.

    Args:
        pred: Object exposing ``label_ids`` (gold labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is taken as the
            predicted class).

    Returns:
        Dict with the keys ``f1``, ``accuracy``, ``recall`` and ``precision``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'f1': f1_score(y_true, y_pred, average="weighted"),
        'accuracy': accuracy_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred, average='weighted'),
        "precision": precision_score(y_true, y_pred, average='weighted'),
    }

tokenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/504 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/450k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/511M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Fine-tune with the focal-loss objective. Batches are padded dynamically by
# the collator, and the validation split is scored with `compute_metrics`.
trainer = MyTrainer(
    loss_type='focal',
    model=model,
    args=training_args,
    train_dataset=ds['train'],
    eval_dataset=ds['val'],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)
trainer.train();

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Recall,Precision
1,0.0873,0.082589,0.869161,0.869246,0.869246,0.86913
2,0.068,0.089903,0.867297,0.867091,0.867091,0.86798
3,0.047,0.1119,0.865882,0.865916,0.865916,0.865856
4,0.0284,0.157445,0.863694,0.863565,0.863565,0.863972
5,0.017,0.210814,0.861121,0.861019,0.861019,0.861313


In [9]:
# Score the trainer's eval dataset (the `val` split passed at construction).
# NOTE(review): training used load_best_model_at_end=True, so this presumably
# evaluates the reloaded best checkpoint rather than the last-epoch weights.
trainer.evaluate()

{'eval_loss': 0.08258869498968124,
 'eval_f1': 0.8691607026272515,
 'eval_accuracy': 0.8692458374142997,
 'eval_recall': 0.8692458374142997,
 'eval_precision': 0.8691303249953853,
 'eval_runtime': 17.7012,
 'eval_samples_per_second': 576.797,
 'eval_steps_per_second': 9.039,
 'epoch': 5.0}

In [10]:
# Run inference on the held-out test split; `pred_output.predictions` is used
# further down to derive a predicted label per row.
pred_output = trainer.predict(ds['test'])
pred_output.metrics

{'test_loss': 0.08242447674274445,
 'test_f1': 0.871598250222223,
 'test_accuracy': 0.8716314111851637,
 'test_recall': 0.8716314111851637,
 'test_precision': 0.8715737777118028,
 'test_runtime': 18.6874,
 'test_samples_per_second': 554.009,
 'test_steps_per_second': 8.669}

In [11]:
# Load the existing per-sentence results table (text, gold label and an
# earlier model's prediction); a column for this model is appended below.
# NOTE(review): the `Unnamed: 0` column in the output suggests the CSV was
# written together with its index; `index_col=0` would absorb it - confirm.
df = pd.read_csv(file_path + 'rate_change_binary_result.csv')
df

Unnamed: 0,text,label,binary_rate_change_result
0,걔넨 저거 한시간도 못해 멧퇴지년들은,1,1.0
1,지금은 키우기 편해도 크면 저능아되지,1,1.0
2,저 여자들은 군인 처우 개선이 그렇게 싫나?,0,0.0
3,나였으면 자살했다.,1,1.0
4,너보단 좋잖아 넌 뚱뚱하니까 말이야,1,1.0
...,...,...,...
10348,애초에 잘빠지고 늘씬한 여자만 할수있는 직업이라 그런가?ㅋㅋ,0,1.0
10349,걔네들이 웬만큼 뻔뻔한 애들이냐?,0,0.0
10350,이런데도 KT 계속 쓰면 붕신에 흑우지 ㅋㅋㅋ,1,1.0
10351,새치 당했다면서?,0,0.0


In [12]:
# Convert the test-set logits into hard 0/1 predictions.
# The arg-max is taken over the raw logits, the same rule `compute_metrics`
# applies, so the stored labels agree with the reported metrics. Comparing
# sigmoid outputs instead adds nothing (sigmoid is monotonic) and can produce
# artificial ties once float32 sigmoid saturates.
logits = np.asarray(pred_output.predictions)
if len(logits) != len(df):
    raise ValueError(
        f"Got {len(logits)} predictions for {len(df)} rows; "
        "the predictions and the results table must describe the same test set."
    )

# Stored as float so the new column has the same dtype as the existing
# `binary_rate_change_result` column.
result = logits.argmax(axis=-1).astype(float)

# `assign` overwrites the column if it already exists, so re-running this cell
# does not pile up duplicate `focal_loss_result` columns the way concat would.
df = df.assign(focal_loss_result=result)
df

Unnamed: 0,text,label,binary_rate_change_result,focal_loss_result
0,걔넨 저거 한시간도 못해 멧퇴지년들은,1,1.0,1.0
1,지금은 키우기 편해도 크면 저능아되지,1,1.0,1.0
2,저 여자들은 군인 처우 개선이 그렇게 싫나?,0,0.0,0.0
3,나였으면 자살했다.,1,1.0,1.0
4,너보단 좋잖아 넌 뚱뚱하니까 말이야,1,1.0,1.0
...,...,...,...,...
10348,애초에 잘빠지고 늘씬한 여자만 할수있는 직업이라 그런가?ㅋㅋ,0,1.0,1.0
10349,걔네들이 웬만큼 뻔뻔한 애들이냐?,0,0.0,1.0
10350,이런데도 KT 계속 쓰면 붕신에 흑우지 ㅋㅋㅋ,1,1.0,1.0
10351,새치 당했다면서?,0,0.0,0.0


In [13]:
# Split the test rows into correctly classified (TPTN) and misclassified (FPFN).
# Both frames keep one slot per test row, with NaN in the rows that belong to
# the other group, so the null counts printed by the following cell still
# report how many rows fall on each side before `dropna` removes the padding.
# A boolean mask replaces the per-row Python loop and the hard-coded row count.
is_correct = (df['label'] == df['focal_loss_result']).to_numpy()

# Object dtype keeps integer labels as integers next to the NaN padding.
TPTN = df[['text', 'label']].astype(object)
TPTN.loc[~is_correct, :] = np.nan

FPFN = (
    df[['text', 'label', 'focal_loss_result']]
    .astype(object)
    .rename(columns={'label': 'real_label', 'focal_loss_result': 'wrong_label'})
)
FPFN.loc[is_correct, :] = np.nan


In [14]:
# Null counts before clean-up: each frame is NaN in the rows that belong to
# the other group.
print(TPTN.isna().sum())
print(FPFN.isna().sum())
print('====================')
# Drop the NaN padding, then confirm that nothing null is left.
TPTN, FPFN = TPTN.dropna(), FPFN.dropna()
print(TPTN.isna().sum())
print(FPFN.isna().sum())

text     1329
label    1329
dtype: int64
text           9024
real_label     9024
wrong_label    9024
dtype: int64
text     0
label    0
dtype: int64
text           0
real_label     0
wrong_label    0
dtype: int64


In [15]:
# Persist both groups to Drive. The BOM-prefixed UTF-8 encoding is kept
# (presumably so spreadsheet tools detect the Korean text correctly), and the
# row index is not written.
TPTN.to_csv(
    file_path+'focal_loss_TPTN.csv',
    index = False,
    encoding = 'utf-8-sig',
)
FPFN.to_csv(
    file_path+'focal_loss_FPFN.csv',
    index = False,
    encoding = 'utf-8-sig',
)