In [1]:
!pip install transformers torch datasets transformers[torch]

from google.colab import drive
import pandas as pd
from datasets import Dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, EvalPrediction
import torch
import json
import numpy as np
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.20.3 (from transformers)
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
Installing collected pa

In [2]:
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/'
model_ckpt = 'beomi/KcELECTRA-base-v2022'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # CPU or GPU
print(f'Using {device} device...')

%cd '/content/drive/MyDrive/'
!git clone https://github.com/ShannonAI/dice_loss_for_NLP.git

ds = load_from_disk(file_path+'multi_sequence_dataset')
ds

Mounted at /content/drive
Using cuda device...
/content/drive/MyDrive
fatal: destination path 'dice_loss_for_NLP' already exists and is not an empty directory.


DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 181181
    })
    val: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10091
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10545
    })
})

In [3]:
id2label = {0: 'IMMORAL_NONE',
            1: 'CENSURE',
            2: 'DISCRIMINATION',
            3: 'HATE',
            4: 'VIOLENCE',
            5: 'CRIME',
            6: 'SEXUAL',
            7: 'ABUSE'
            }
label2id = {v: k for k, v in id2label.items()}
labels = list(label2id.keys())

In [5]:
%cd '/content/drive/MyDrive/dice_loss_for_NLP'

/content/drive/MyDrive/dice_loss_for_NLP


In [6]:
from loss.dice_loss import DiceLoss
from loss.focal_loss import FocalLoss
from torch.nn.modules import CrossEntropyLoss
from utils.get_parser import get_parser

class MyTrainer(Trainer):
    def __init__(self, loss_type, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_type= loss_type
        self.dice_smooth=1e-4
        self.dice_ohem=0.0
        self.dice_alpha=0.01
        self.dice_square="store_true"
        self.focal_gamma=2.0
        self.focal_alpha=0.25
        self.num_classes=2

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        if self.loss_type == "ce":
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(outputs[0], labels)
        elif self.loss_type == "focal":
            loss_fct = FocalLoss(gamma=self.focal_gamma, reduction="mean")
            loss = loss_fct(outputs[0], labels)
        elif self.loss_type == "dice":
            loss_fct = DiceLoss(with_logits=True, smooth=self.dice_smooth, ohem_ratio=self.dice_ohem,
                                alpha=self.dice_alpha, square_denominator=self.dice_square,
                                index_label_position=True, reduction="mean")
            loss = loss_fct(outputs[0], labels)
        else:
            raise ValueError
        return (loss, outputs) if return_outputs else loss

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, sep_token = '[SEP]', cls_token = '[CLS]')
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt,
                                                           id2label=id2label,
                                                           label2id=label2id)

model_name = file_path + f"{model_ckpt}-multi-class-sequence"
# 128은 메모리 부족
batch_size = 64
logging_steps = len(ds['train']) // batch_size

training_args = TrainingArguments(
    output_dir = model_name,
    logging_dir = model_name + '/logs',
    num_train_epochs = 1,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    logging_steps = logging_steps,
    save_steps = 50,
    save_total_limit = 1,
    save_strategy = 'epoch',
    evaluation_strategy = 'epoch',
    load_best_model_at_end = True)

def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    softmax = torch.nn.Softmax()
    probs = softmax(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1 = f1_score(y_true=y_true, y_pred=y_pred, average='weighted')
    accuracy = accuracy_score(y_true, y_pred)
    recall = recall_score(y_true=y_true, y_pred=y_pred, average='weighted')
    precision = precision_score(y_true=y_true, y_pred=y_pred, average='weighted')
    # return as dictionary
    metrics = {'f1': f1,
               'accuracy': accuracy,
               "recall": recall,
               "precision": precision}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

tokenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/504 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/450k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/511M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
trainer = MyTrainer(model=model, args=training_args,
                    loss_type='focal',
                  compute_metrics=compute_metrics,
                  train_dataset=ds['train'],
                  eval_dataset=ds['val'],
                  data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
                  tokenizer=tokenizer)
trainer.train();

RuntimeError: ignored

In [12]:
trainer.evaluate()

ValueError: ignored

In [None]:
pred_output = trainer.predict(ds['test'])
pred_output.metrics

In [None]:
trainer.save_model(file_path+"epc5_multi_class_model")