In [None]:
!pip install transformers torch datasets transformers[torch]

from google.colab import drive
import pandas as pd
from datasets import Dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, EvalPrediction
import torch
import json
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m71.8 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m104.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetens

In [None]:
# Mount Google Drive so the dataset and checkpoints under MyDrive are reachable.
drive.mount('/content/drive')

# Base directory on Drive and the pretrained checkpoint to fine-tune.
file_path = '/content/drive/MyDrive/'
model_ckpt = 'beomi/KcELECTRA-base-v2022'

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using {device} device...')

# Load the dataset previously saved to Drive; the bare name on the last line
# displays its splits and features.
ds = load_from_disk(file_path + 'sequence_dataset')
ds

Mounted at /content/drive
Using cuda device...


DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 163274
    })
    val: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 72567
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 54425
    })
})

In [None]:
# Show the first training example to check the stored fields
# (label, input_ids, token_type_ids, attention_mask).
ds['train'][0]

{'label': [1.0, 1.0],
 'input_ids': [2,
  8019,
  12116,
  4235,
  4162,
  4009,
  8141,
  2207,
  4142,
  4180,
  2710,
  4138,
  4217,
  22425,
  17,
  8445,
  8019,
  1381,
  4143,
  14071,
  14612,
  17718,
  8767,
  3,
  2,
  11536,
  4535,
  4019,
  609,
  10664,
  19001,
  4063,
  7992,
  8227,
  29092,
  8083,
  1,
  17601,
  4180,
  16624,
  4029,
  14194,
  4020,
  3,
  0,
  0,
  0,
  0],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0]}

In [None]:
# Tokenizer for the checkpoint, with the separator / classifier special tokens
# set explicitly.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, sep_token = '[SEP]', cls_token = '[CLS]')

# Two-output classification head configured for multi-label targets
# (labels in the dataset are float vectors such as [1.0, 1.0]).
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt,
                                                           problem_type = 'multi_label_classification',
                                                           num_labels = 2)

# Output directory on Drive. model_ckpt contains a '/', so this resolves to a
# nested directory under file_path.
model_name = file_path + f"{model_ckpt}-binary-intent"
# A batch size of 128 ran out of memory.
batch_size = 64
# Log roughly once per epoch; clamp to 1 so a train split smaller than one
# batch does not produce logging_steps == 0.
logging_steps = max(1, len(ds['train']) // batch_size)

# save_steps is intentionally not set: checkpoints are written per epoch
# (save_strategy='epoch'), which must match evaluation_strategy for
# load_best_model_at_end to work, and save_steps only applies to the 'steps'
# strategy.
training_args = TrainingArguments(
    output_dir = model_name,
    logging_dir = model_name + '/logs',
    num_train_epochs = 5,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    logging_steps = logging_steps,
    save_total_limit = 2,
    save_strategy = 'epoch',
    evaluation_strategy = 'epoch',
    load_best_model_at_end = True)


def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute weighted F1, weighted ROC AUC and accuracy for multi-label output.

    Args:
        predictions: Raw logits of shape (n_samples, num_labels).
        labels: Binary ground-truth indicator array with the same shape.
        threshold: Probability cut-off at or above which a label is predicted
            as present.

    Returns:
        dict with keys 'f1', 'roc_auc' and 'accuracy'.
    """
    # Sigmoid (not softmax): each label is scored independently.
    probs = torch.sigmoid(
        torch.as_tensor(predictions, dtype=torch.float32)
    ).detach().cpu().numpy()

    # Hard 0/1 decisions are only needed by the threshold-dependent metrics.
    y_pred = (probs >= threshold).astype(float)
    y_true = labels

    f1_weighted = f1_score(y_true=y_true, y_pred=y_pred, average='weighted')
    # ROC AUC is a ranking metric, so it must see the continuous scores;
    # feeding it thresholded predictions collapses the curve to one point.
    roc_auc = roc_auc_score(y_true, probs, average='weighted')
    # For multi-label input scikit-learn reports exact-match (subset) accuracy:
    # a sample counts only if every label is right.
    accuracy = accuracy_score(y_true, y_pred)

    return {'f1': f1_weighted,
            'roc_auc': roc_auc,
            'accuracy': accuracy}

def compute_metrics(p: EvalPrediction):
    """Adapt an EvalPrediction to multi_label_metrics.

    When ``p.predictions`` is a tuple, only its first element is scored;
    otherwise it is used as is. Labels come from ``p.label_ids``.
    """
    raw_predictions = p.predictions
    if isinstance(raw_predictions, tuple):
        raw_predictions = raw_predictions[0]
    return multi_label_metrics(predictions=raw_predictions, labels=p.label_ids)



# def compute_metrics(pred):
#     labels = pred.label_ids
#     preds = pred.predictions # pred.predictions.argmax(-1)
#     f1 = f1_score(labels, preds, average="weighted")
#     acc = accuracy_score(labels, preds)
#     return {"accuracy": acc, "f1": f1}

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
class MyTrainer(Trainer):
    """Trainer subclass whose loss function is selected by ``loss_type``.

    Supported values are ``"ce"``, ``"focal"`` and ``"dice"``; any other value
    raises ``ValueError`` when the loss is computed.

    NOTE(review): ``FocalLoss``, ``DiceLoss`` and the ``focal_gamma`` /
    ``dice_*`` attributes read from ``self.args`` are not defined in the
    visible part of this notebook. They must be provided before the "focal"
    or "dice" branches can run.
    """

    def __init__(self, loss_type, *args, **kwargs):
        """Store ``loss_type`` and forward everything else to ``Trainer``."""
        super().__init__(*args, **kwargs)
        self.loss_type = loss_type

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and compute the loss chosen by ``self.loss_type``.

        The signature follows the hook that ``Trainer`` invokes
        (``compute_loss(model, inputs, return_outputs=...)``); the previous
        ``(logits, labels)`` signature could not be called by the training loop.

        Args:
            model: The model being trained.
            inputs: Batch mapping that contains a ``"labels"`` entry.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for compatibility with newer
                ``Trainer`` versions; unused here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs``.

        Raises:
            ValueError: If ``self.loss_type`` is not a supported name.
        """
        # Shallow copy so removing the labels does not mutate the caller's batch.
        inputs = dict(inputs)
        labels = inputs.pop("labels")

        outputs = model(**inputs)
        # Labels were withheld, so a tuple output starts with the logits.
        logits = outputs["logits"] if isinstance(outputs, dict) else outputs[0]
        # Class count comes from the logits instead of an undefined attribute.
        num_classes = logits.size(-1)
        flat_logits = logits.view(-1, num_classes)

        if self.loss_type == "ce":
            # NOTE(review): assumes one integer class index per sample;
            # multi-hot float labels will not work with this branch.
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(flat_logits, labels.view(-1))
        elif self.loss_type == "focal":
            loss_fct = FocalLoss(gamma=self.args.focal_gamma, reduction="mean")
            loss = loss_fct(flat_logits, labels.view(-1))
        elif self.loss_type == "dice":
            loss_fct = DiceLoss(with_logits=True, smooth=self.args.dice_smooth, ohem_ratio=self.args.dice_ohem,
                                alpha=self.args.dice_alpha, square_denominator=self.args.dice_square,
                                index_label_position=True, reduction="mean")
            loss = loss_fct(flat_logits, labels)
        else:
            raise ValueError(f"Unsupported loss_type: {self.loss_type!r}")

        return (loss, outputs) if return_outputs else loss



transformers.models.electra.modeling_electra.ElectraForSequenceClassification

In [None]:
# Collator that pads each batch using the tokenizer.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Uses the stock Trainer (not MyTrainer) with the train / val splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds['train'],
    eval_dataset=ds['val'],
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Trailing semicolon suppresses the returned training summary in the cell output.
trainer.train();

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.4962,0.407353,0.836984,0.806263,0.676024
2,0.3607,0.379953,0.853652,0.82442,0.706464
3,0.2746,0.384855,0.860549,0.833858,0.715739
4,0.1962,0.42141,0.864897,0.842496,0.72821
5,0.142,0.47245,0.865896,0.840705,0.727424


In [None]:
# Evaluate on the eval_dataset given to the Trainer (ds['val']).
# NOTE(review): load_best_model_at_end=True is set in the training arguments,
# so this presumably scores the restored best checkpoint rather than the
# final-epoch weights — confirm against the training log.
trainer.evaluate()

{'eval_loss': 0.3799530267715454,
 'eval_f1': 0.8536516875824545,
 'eval_roc_auc': 0.8244198021366761,
 'eval_accuracy': 0.7064643708572768,
 'eval_runtime': 281.2052,
 'eval_samples_per_second': 258.057,
 'eval_steps_per_second': 4.033,
 'epoch': 5.0}

In [None]:
# Run inference on the held-out test split and display the metrics
# that come back with the predictions.
pred_output = trainer.predict(ds['test'])
pred_output.metrics

{'test_loss': 0.38105708360671997,
 'test_f1': 0.8527028210739352,
 'test_roc_auc': 0.8245716583985656,
 'test_accuracy': 0.706550298576022,
 'test_runtime': 213.0644,
 'test_samples_per_second': 255.439,
 'test_steps_per_second': 3.994}

In [None]:
# Sanity check: the predictions and label arrays should have matching shapes.
print(pred_output.predictions.shape, pred_output.label_ids.shape)

(54425, 2) (54425, 2)


In [None]:
# Display the full object returned by trainer.predict.
pred_output

# Expected value of predictions => predictions=array([[[0.xxx, 0.xxx], [0.xxx, 0.xxx]],
#                       [[0.xxx, 0.xxx], [0.xxx, 0.xxx]],
#                       [[0.xxx, 0.xxx], [0.xxx, 0.xxx]]])
# NOTE(review): the shape printed earlier is (n_samples, 2) and the values are
# not confined to [0, 1], so these look like raw per-label logits rather than
# the nested probabilities sketched above — confirm what is actually wanted.

PredictionOutput(predictions=array([[-2.159942  ,  4.027989  ],
       [-2.9438853 , -1.3919188 ],
       [ 2.5999746 ,  3.375455  ],
       ...,
       [ 3.524044  , -3.484709  ],
       [ 3.943894  ,  1.8889935 ],
       [-0.24782501,  3.578032  ]], dtype=float32), label_ids=array([[1., 1.],
       [0., 0.],
       [1., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [0., 1.]], dtype=float32), metrics={'test_loss': 0.38105708360671997, 'test_f1': 0.8527028210739352, 'test_roc_auc': 0.8245716583985656, 'test_accuracy': 0.706550298576022, 'test_runtime': 213.0644, 'test_samples_per_second': 255.439, 'test_steps_per_second': 3.994})

In [None]:
# Persist the trainer's current model to Drive.
# NOTE(review): the directory name says "epc10", but the visible training
# arguments use num_train_epochs = 5 — confirm the name is still accurate.
trainer.save_model(file_path+"epc10_weighted_model")