In [None]:
!pip install transformers torch datasets transformers[torch]

from google.colab import drive
import pandas as pd
from datasets import Dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import torch
import json
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m45.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m104.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetens

In [None]:
# Mount Google Drive; every path used below is built from this base folder.
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/'
model_ckpt = "beomi/kcbert-base"

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using {device} device...')

# Load the DatasetDict stored on Drive and show its summary as the cell output.
ds = load_from_disk(f'{file_path}sequence_dataset2')
ds

Mounted at /content/drive
Using cuda device...


DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 163274
    })
    val: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 72567
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 54425
    })
})

In [None]:
# Seed torch's global RNG before the model is loaded: the checkpoint ships
# without a classification head, so one is freshly initialized at load time and
# would otherwise differ on every run (making untrained metrics unrepeatable).
SEED = 42
torch.manual_seed(SEED)

tokenizer = AutoTokenizer.from_pretrained(model_ckpt, sep_token = '[SEP]', cls_token = '[CLS]')
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt)

# Output directory on Drive. model_ckpt contains a '/', so this resolves to a
# "beomi/" sub-folder under file_path.
model_name = file_path + f"{model_ckpt}-binary-intent"
# With the 2,1 configuration, a batch size of 128 runs out of memory.
batch_size = 64
# Log about once per pass over the training split. Clamp to at least 1 so a
# training split smaller than one batch does not produce logging_steps == 0.
logging_steps = max(1, len(ds['train']) // batch_size)

training_args = TrainingArguments(
    output_dir = model_name,
    logging_dir = model_name + '/logs',
    num_train_epochs = 1,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    logging_steps = logging_steps,
    # save_steps / save_total_limit only take effect once save_strategy is
    # switched to 'steps'; with 'no' below, no checkpoints are written.
    save_steps = 50,
    save_total_limit = 2,
    save_strategy = 'no',
    load_best_model_at_end = False,
    # Same value as the library default, stated explicitly next to SEED.
    seed = SEED)

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation/prediction pass.

    Args:
        pred: Object exposing ``label_ids`` (the true labels) and
            ``predictions`` (per-class scores). The predicted class is the
            argmax of ``predictions`` over its last axis.

    Returns:
        dict: ``{"accuracy": <float>, "f1": <float>}`` where F1 uses
        ``average="weighted"``.
    """
    truth = pred.label_ids
    predicted = pred.predictions.argmax(-1)
    weighted_f1 = f1_score(truth, predicted, average="weighted")
    return {
        "accuracy": accuracy_score(truth, predicted),
        "f1": weighted_f1,
    }

Downloading (…)okenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/250k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at beomi/kcbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Collator that pads the examples of each batch with the model's tokenizer.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Wire everything together: train split for fitting, val split for evaluation,
# compute_metrics for accuracy / weighted F1.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds['train'],
    eval_dataset=ds['val'],
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
# Training is currently disabled. While the line below stays commented out, the
# following cells score the model exactly as it was loaded from the checkpoint.
# trainer.train();

In [None]:
# Score the Trainer's eval_dataset (the val split) and display the metrics dict.
val_metrics = trainer.evaluate()
val_metrics

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 0.7609697580337524,
 'eval_accuracy': 0.47780671655160056,
 'eval_f1': 0.3559141389683873,
 'eval_runtime': 73.9665,
 'eval_samples_per_second': 981.079,
 'eval_steps_per_second': 15.331}

In [None]:
# Run prediction on the test split; keep the whole output object for the
# inspection cells that follow, and display only its metrics here.
test_split = ds['test']
pred_output = trainer.predict(test_split)
pred_output.metrics

{'test_loss': 0.7652347087860107,
 'test_accuracy': 0.4736610013780432,
 'test_f1': 0.3512768306290661,
 'test_runtime': 54.1902,
 'test_samples_per_second': 1004.333,
 'test_steps_per_second': 15.704}

In [None]:
# Sanity check: one row of per-class scores per example, one label per example.
scores_shape = pred_output.predictions.shape
labels_shape = pred_output.label_ids.shape
print(scores_shape, labels_shape)

(54425, 2) (54425,)


In [None]:
# Display the full prediction output (predictions, label_ids, metrics).
pred_output

# Expected value of predictions => predictions=array([[[0.xxx, 0.xxx], [0.xxx, 0.xxx]],
#                       [[0.xxx, 0.xxx], [0.xxx, 0.xxx]],
#                       [[0.xxx, 0.xxx], [0.xxx, 0.xxx]]])
# NOTE(review): the array actually displayed is 2-D (one row of two scores per
# example), and the scores can be negative, so they are raw scores rather than
# probabilities in [0, 1].

PredictionOutput(predictions=array([[-0.9064111 , -0.00974971],
       [-0.5776809 ,  0.24635029],
       [ 0.09601541,  0.49565482],
       ...,
       [ 0.3207142 ,  0.33953002],
       [-0.29324237,  0.22206974],
       [ 0.22461204,  0.18042941]], dtype=float32), label_ids=array([1, 0, 1, ..., 1, 1, 0]), metrics={'test_loss': 0.7652347087860107, 'test_accuracy': 0.4736610013780432, 'test_f1': 0.3512768306290661, 'test_runtime': 54.1902, 'test_samples_per_second': 1004.333, 'test_steps_per_second': 15.704})

In [None]:
# Predicted class per example: index of the highest score along the last axis.
a = np.argmax(pred_output.predictions, axis=-1)
a

array([1, 1, 1, ..., 1, 1, 0])

In [None]:
# Same reduction addressed by a positive index: predictions is 2-D, so axis 1
# is its last axis and this yields one class index per example.
b = np.argmax(pred_output.predictions, axis=1)
b

array([1, 1, 1, ..., 1, 1, 0])

In [None]:
# Reducing over axis 0 instead runs across examples: for each class column this
# gives the row index of the example with the highest score. It is not a
# per-example prediction.
c = np.argmax(pred_output.predictions, axis=0)
c

array([31386,  1576])