In [None]:
# Pytorch 및 기타 라이브러리 설치
%pip install -qqq torch torchvision setuptools scikit-learn

# Hugging Face 라이브러리 설치
%pip install  --upgrade datasets -qqq accelerate hf-transfer transformers

In [None]:
from datasets import load_dataset

# Dataset repository ID on huggingface.co/datasets
dataset_id = "burtenshaw/PleIAs_common_corpus_code_classification"

# Load the raw (untokenized) dataset; the result is indexed by split name below
dataset = load_dataset(
    dataset_id,
)

In [None]:
# Show the number of training examples, then the first example, one per line
print(len(dataset["train"]), dataset["train"][0], sep="\n")

In [None]:
from transformers import AutoTokenizer

# Model ID whose tokenizer is loaded
model_id = "answerdotai/ModernBERT-base"

# Load the tokenizer that matches the model
tokenizer = AutoTokenizer.from_pretrained(model_id)


def tokenize(batch):
    """Tokenize the ``text`` column of a batch.

    Args:
        batch: Mapping with a ``"text"`` entry, as passed by ``dataset.map``
            when ``batched=True``.

    Returns:
        The tokenizer output for the batch, padded, truncated and returned
        as PyTorch tensors.
    """
    encoded = tokenizer(
        batch['text'],
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    return encoded


# Tokenize every split and drop the raw text column afterwards
tokenized_dataset = dataset.map(
    tokenize,
    batched=True,
    remove_columns=["text"],
)

tokenized_dataset["train"].features.keys()
# dict_keys(['labels', 'input_ids', 'attention_mask'])


In [None]:
from transformers import AutoModelForSequenceClassification

# Model ID of the checkpoint to fine-tune (same value as the tokenizer cell)
model_id = "answerdotai/ModernBERT-base"

# Prepare the label mappings stored in the model config (useful at inference).
# sorted() makes the label -> id assignment reproducible across kernel
# restarts; iterating a bare set gives no ordering guarantee.
labels = sorted(set(tokenized_dataset["train"]["labels"]))
num_labels = len(labels)

# Class ids are integers: label2id maps label -> id, id2label maps id -> label.
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for idx, label in enumerate(labels)}

In [None]:
# Download the checkpoint from huggingface.co/models and load it as a
# sequence-classification model configured with our label mappings
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

In [None]:
import numpy as np
from sklearn.metrics import f1_score

# Metric helper
def compute_metrics(eval_pred):
    """Compute the weighted F1 score for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds
            per-class scores (the arg-max over axis 1 is taken as the
            predicted class); ``labels`` holds the reference class ids.

    Returns:
        dict: ``{"f1": <weighted F1 as a plain Python float>}``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    # Do NOT forward the references as f1_score's ``labels=`` argument: that
    # parameter is the set of classes to score, and passing the whole y_true
    # array repeats every class once per occurrence, distorting the weighted
    # average. ``pos_label`` is dropped as well because it only applies to
    # average="binary".
    score = f1_score(references, predicted_ids, average="weighted")
    # Always return a built-in float so the metric logs/serializes uniformly.
    return {"f1": float(score)}


In [None]:
from huggingface_hub import get_token
from transformers import Trainer, TrainingArguments

# Training arguments shared by every training run in this notebook
training_args = TrainingArguments(
    output_dir="ModernBERT-code-classifier",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,  # accumulate 16 single-example batches per optimizer step
    learning_rate=5e-5,
    num_train_epochs=5,
    bf16=True,  # bfloat16 training
    optim="adamw_torch_fused",  # fused AdamW optimizer
    # Logging and evaluation strategy
    logging_strategy="steps",
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # key returned by compute_metrics
    # Push-to-Hub parameters
    push_to_hub=True,
    hub_strategy="every_save",
    # get_token() reads the locally stored Hugging Face token (None when not
    # logged in); it replaces the legacy HfFolder.get_token() helper.
    hub_token=get_token(),
    # NOTE(review): "wandb" reporting needs the `wandb` package, which the
    # %pip cells of this notebook do not install - confirm it is available.
    report_to="wandb",
)



# 과적합

In [None]:
# Restrict training to the first 100 examples of the train split
limited_dataset = tokenized_dataset["train"].select(range(100))

# Build the Trainer on the small subset; evaluation still uses the test split
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=limited_dataset,
    eval_dataset=tokenized_dataset["test"],
)
trainer.train()

In [None]:
# Free the memory held by the finished run
import gc

import torch

# Drop the references first: empty_cache() can only hand back cached GPU
# blocks that no live object is still using.
del trainer
del model
# `limited_dataset` is intentionally kept - the later training cells reuse it.

gc.collect()
torch.cuda.empty_cache()

# 과소적합

In [None]:
# Use a deliberately tiny learning rate for the underfitting demonstration
training_args.learning_rate = 1e-7

# The preceding clean-up cell deleted `model`, so load a fresh pretrained
# checkpoint; this also keeps the run independent of the previous fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=num_labels, label2id=label2id, id2label=id2label,
)

# Re-select the 100-example subset so this cell is self-contained.
limited_dataset = tokenized_dataset["train"].select(range(100))

# Create the Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=limited_dataset,
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
# Free the memory held by the finished run
import gc

import torch

# Drop the references first: empty_cache() can only hand back cached GPU
# blocks that no live object is still using.
del trainer
del model

gc.collect()
torch.cuda.empty_cache()

# 적절한 학습! 🥣

In [None]:
# Restore a sensible learning rate
training_args.learning_rate = 5e-5

# The preceding clean-up cell deleted `model`, so load a fresh pretrained
# checkpoint instead of referencing a name that no longer exists.
model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=num_labels, label2id=label2id, id2label=id2label,
)

# Re-select the 100-example subset so this cell is self-contained.
# NOTE(review): same subset and learning rate as the overfitting run above -
# presumably the full tokenized_dataset["train"] was intended here; confirm.
limited_dataset = tokenized_dataset["train"].select(range(100))

# Create the Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=limited_dataset,
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)
trainer.train()

# 추론

In [None]:
from transformers import pipeline

# Snippet to classify
sample = """def add_numbers(a, b):
    return a + b"""

# Load a model from huggingface.co/models by repository ID and place it on
# device 0.
# NOTE(review): this repository is not the "ModernBERT-code-classifier"
# output of the training cells - confirm which model is meant to be used.
classifier = pipeline(
    task="text-classification",
    model="argilla/ModernBERT-domain-classifier",
    device=0,
)

classifier(sample)
