In [None]:
!pip install transformers datasets huggingface_hub

In [None]:
pip install transformers datasets evaluate

In [None]:
from datasets import load_dataset

# Fetch the dataset from the Hugging Face Hub; the splits it exposes are
# inspected in the next cell.
ds = load_dataset(path="yangwang825/klue-ynat")

In [None]:
# Show the available splits, then the column names of the training split.
print(ds)
train_columns = ds["train"].column_names
print(train_columns)

In [None]:
import numpy as np
import evaluate
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")


def preprocess_function(examples):
    """Tokenize the "text" column of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        examples: A batch from the dataset; only its "text" entry is read.

    Returns:
        The tokenizer output for the batch.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    return encoded


# batched=True hands the function a batch of rows per call instead of one row.
encoded_dataset = ds.map(preprocess_function, batched=True)

In [None]:
import torch
import numpy as np
from transformers import (
    Trainer,
    TrainingArguments,
    AutoModelForSequenceClassification,
    AutoTokenizer
)

def tokenize_function(examples):
    """Tokenize the "text" column of a batch, padded/truncated to 128 tokens.

    Uses the same settings as `preprocess_function` so that both helpers
    produce identically shaped encodings. It is kept for tokenizing raw text
    on demand; the splits below are already tokenized and are not re-mapped.

    Args:
        examples: A batch from the dataset; only its "text" entry is read.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples["text"], padding="max_length", truncation=True, max_length=128
    )


# Hold out 20% of the (already tokenized) training split for evaluation.
# A fixed seed makes the split, and therefore the reported metrics,
# reproducible across kernel restarts.
dataset = encoded_dataset["train"].train_test_split(test_size=0.2, seed=42)
train_ds = dataset["train"]
test_ds = dataset["test"]

# Label names come from the dataset's label feature, so the model reports
# readable class names instead of LABEL_0, LABEL_1, ...
label_names = train_ds.features["label"].names
label2id = {label: idx for idx, label in enumerate(label_names)}
id2label = {idx: label for idx, label in enumerate(label_names)}

model_id = "klue/bert-base"
# Pass the label maps at construction time so the config is consistent from
# the start instead of being patched afterwards.
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# NOTE: train_ds / test_ds are intentionally NOT tokenized again here.
# `encoded_dataset` already holds 128-token encodings; mapping a second time
# without max_length overwrote them with sequences padded to the tokenizer's
# default maximum length.


In [None]:
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into (logits, labels).

    Returns:
        Whatever the loaded "accuracy" metric's `compute` returns.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return accuracy.compute(references=references, predictions=predicted_ids)

In [None]:
from transformers import TrainingArguments, Trainer
import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and later dropped the old spelling. Because the install cells do not pin a
# version, pick whichever keyword the installed TrainingArguments accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    save_total_limit=2,  # keep only the two most recent checkpoints on disk
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    report_to="none",  # do not report to any external experiment tracker
    **{_eval_strategy_key: "epoch"},  # evaluate at the end of every epoch
)

# Same story for Trainer: `tokenizer=` was superseded by `processing_class=`.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = (
    "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

trainer.train()

In [None]:
# Capture and print the final evaluation metrics. As a bare expression in the
# middle of the cell, the value returned by evaluate() was never displayed.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Write the model and the tokenizer to the same directory; the inference cell
# loads both from "./saved_model".
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")

In [None]:
# Classifier code: load the saved model and tokenizer from disk and classify
# one sample sentence.
from transformers import pipeline

saved_model_dir = "./saved_model"
classifier = pipeline(
    "text-classification",
    model=saved_model_dir,
    tokenizer=saved_model_dir,
)

text = "유튜브 내달 2일까지 크리에이터 지원 공간 운영"
result = classifier(text)
print(result)