In [1]:
!pip install transformers torch datasets transformers[torch]

from google.colab import drive
import pandas as pd
from datasets import Dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, EvalPrediction
import torch
import json
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.5-py3-none-any.whl (7.8 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.20.3 (from transformers)
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packa

In [2]:
# Mount Google Drive so the saved dataset can be read from it.
drive.mount('/content/drive')

# Base Drive folder and the pretrained checkpoint identifier used below.
file_path = '/content/drive/MyDrive/'
model_ckpt = 'beomi/KcELECTRA-base-v2022'

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using {device} device...')

# Load the dataset previously saved to disk and display its splits.
dataset_dir = file_path + 'kcelectra_binary_dataset'
ds = load_from_disk(dataset_dir)
ds

Mounted at /content/drive
Using cuda device...


DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 229707
    })
    val: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 102093
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 76569
    })
})

In [3]:
# Seed the RNG *before* the model is built: the classification head is not in
# the checkpoint and is randomly initialised, so without this its starting
# weights (and therefore the results) change from run to run.
seed = 42
torch.manual_seed(seed)

tokenizer = AutoTokenizer.from_pretrained(model_ckpt, sep_token = '[SEP]', cls_token = '[CLS]')
# Two-way classification head, stated explicitly rather than relying on the
# library default (the dataset name suggests a binary task).
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels = 2)

# Checkpoints and logs are written under this folder.
model_name = file_path + f"{model_ckpt}-param_change"
# A batch size of 128 ran out of memory.
batch_size = 64
# Number of full batches in the training split, i.e. log about once per epoch.
logging_steps = len(ds['train']) // batch_size

training_args = TrainingArguments(
    output_dir = model_name,
    logging_dir = model_name + '/logs',
    num_train_epochs = 2,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    logging_steps = logging_steps,
    # No `save_steps`: checkpoints are saved per epoch (save_strategy below),
    # so a step interval would have no effect.
    save_total_limit = 1,
    learning_rate = 8e-5,
    seed = seed,
    save_strategy = 'epoch',
    evaluation_strategy = 'epoch',
    # NOTE(review): no metric_for_best_model is given, so "best" is presumably
    # judged by validation loss (library default) - confirm this is intended.
    load_best_model_at_end = True)

def compute_metrics(pred):
    """Score predictions with accuracy and weighted F1.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; the predicted
            class is the argmax over the last axis) and ``label_ids`` (the
            reference labels).

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

tokenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/504 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/450k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/511M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Collator that pads each batch using the tokenizer.
padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Train on the 'train' split and evaluate on the 'val' split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds['train'],
    eval_dataset=ds['val'],
    data_collator=padding_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# The trailing semicolon keeps the return value of train() out of the cell output.
trainer.train();

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3437,0.319912,0.866396,0.866401
2,0.2318,0.340538,0.871127,0.871032


In [5]:
# Evaluate the model currently held by the trainer on its evaluation dataset
# and display the resulting metrics.
eval_metrics = trainer.evaluate()
eval_metrics

{'eval_loss': 0.3199116587638855,
 'eval_accuracy': 0.8663963249194362,
 'eval_f1': 0.8664013664629457,
 'eval_runtime': 57.5486,
 'eval_samples_per_second': 1774.031,
 'eval_steps_per_second': 27.733,
 'epoch': 2.0}

In [6]:
# Run prediction on the 'test' split. The full output is kept because the raw
# predictions are needed afterwards; only the metrics are displayed here.
test_split = ds['test']
pred_output = trainer.predict(test_split)
pred_output.metrics

{'test_loss': 0.31658849120140076,
 'test_accuracy': 0.8679491700296464,
 'test_f1': 0.8679267107986641,
 'test_runtime': 44.8521,
 'test_samples_per_second': 1707.144,
 'test_steps_per_second': 26.688}

In [7]:
# Convert the test-set scores into hard 0/1 labels and save them alongside the
# original test rows.
#
# The label is the argmax over the raw scores, the same rule `compute_metrics`
# applies. The previous element-wise sigmoid was monotonic, so it could not
# change which class wins. Exact ties now resolve to class 0, as argmax does.
# The array is sized from the predictions themselves instead of a hard-coded
# row count, so a different test-set size can no longer leave uninitialised
# values in the output.
logits = np.asarray(pred_output.predictions)
# Stored as float64 so the CSV column keeps its previous 0.0 / 1.0 format.
result = logits.argmax(axis=-1).astype(np.float64)

result_df = pd.DataFrame(result, columns=['kcelectra_param_change_result'])
test_df = pd.read_csv(file_path + 'kcelectra_binary_testset.csv')

# concat(axis=1) aligns on the index: a row-count mismatch would not fail but
# would pad the shorter side with NaN, so check it explicitly.
assert len(test_df) == len(result_df), (
    f"test CSV has {len(test_df)} rows but {len(result_df)} predictions were produced"
)

result_df = pd.concat([test_df, result_df], axis=1)
result_df.to_csv(file_path+'kcelectra_param_change_result.csv', encoding = 'utf-8-sig', index = False)