<a href="https://colab.research.google.com/github/minmings111/AICA_study/blob/main/AICA_2025_08_18.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 2025.08.18. --1
# Fine Tuning

!pip -q install transformers datasets

In [2]:
from datasets import load_dataset
from transformers import (AutoTokenizer,
                          AutoModelForSequenceClassification,
                          TrainingArguments,
                          Trainer,)
import numpy as np

In [3]:
# 1) Load the data (official NSMC TSV files)
#    Each split is read straight from its raw GitHub URL.
files = dict(
    train="https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt",
    test="https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt",
)

# The files are tab-separated, so the generic "csv" loader is used with a tab delimiter.
dataset = load_dataset("csv", delimiter="\t", data_files=files)

Downloading data:   0%|          | 0.00/14.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.89M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [4]:
# Work on a small, reproducible random subset of each split.
# The sizes and the seed are named so they can be tuned in one place.
SEED = 42
N_TRAIN_SAMPLES = 2000
N_TEST_SAMPLES = 500

train_data = dataset['train'].shuffle(seed=SEED).select(range(N_TRAIN_SAMPLES))
test_data = dataset['test'].shuffle(seed=SEED).select(range(N_TEST_SAMPLES))

In [5]:
# Summaries of the two sampled splits, one per line.
print(train_data, test_data, sep="\n")

Dataset({
    features: ['id', 'document', 'label'],
    num_rows: 2000
})
Dataset({
    features: ['id', 'document', 'label'],
    num_rows: 500
})


In [6]:
# First three rows of each split, followed by the column names of the training split.
print(
    train_data[:3],
    test_data[:3],
    train_data.column_names,
    sep="\n",
)

{'id': [10020916, 6297236, 10085270], 'document': ['For Carl.칼 세이건으로 시작해서 칼 세이건으로 끝난다.', '모든 면에서 너무 좋고, 특히 동양적이고', '달콤한 꿀과 톡쏘는 칠리의 만남'], 'label': [1, 1, 1]}
{'id': [7545542, 9952650, 10215056], 'document': ['how boring!', '최고의 영화....!', '정우성배우 너무 섹시하고 시간가는줄 모르고 몰입해서봤다!배우들의 엄청난 연기력! 탄탄한 스토리와 액션 너무 재미있다'], 'label': [0, 1, 1]}
['id', 'document', 'label']


In [7]:
# (rows, columns) of each sampled split, one per line.
print(train_data.shape, test_data.shape, sep="\n")

(2000, 3)
(500, 3)


In [8]:
# 2) Prepare the model and the tokenizer
#   - klue/bert-base: a BERT base model for Korean
#   - num_labels=2  -> binary classification (negative=0, positive=1)
#   - id2label / label2id are stored in the model config so that the saved or
#     pushed checkpoint reports NEGATIVE / POSITIVE instead of generic label names

model_name = 'klue/bert-base'
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {name: idx for idx, name in id2label.items()}

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# 3) Tokenization function
#   - takes the 'document' text as input and tokenizes it
#   - truncation=True       -> tokens beyond max_length are cut off
#   - padding="max_length"  -> every sequence is padded to a fixed length (64 by default)

def tokenize(batch, *, max_length=64):
  """Tokenize the 'document' field of a single example or of a batch.

  Args:
    batch: mapping with a 'document' entry that is either one string or a
      list of strings (the latter when used with ``map(..., batched=True)``).
      Missing documents (``None``) are replaced with an empty string,
      because the tokenizer only accepts text input.
    max_length: fixed sequence length used for both truncation and padding.
      Keyword-only so that extra positional arguments can never be mistaken for it.

  Returns:
    Whatever the module-level ``tokenizer`` returns for the given text(s).
  """
  documents = batch['document']
  if documents is None:
    documents = ""
  elif not isinstance(documents, str):
    # Batched input: clean each entry individually.
    documents = ["" if doc is None else doc for doc in documents]
  return tokenizer(documents, truncation=True, padding="max_length", max_length=max_length)

In [10]:
# batched=True: several samples are grouped and tokenized in one call, which is faster.
# Both splits go through the same mapping, training split first.
train_data, test_data = (
    split.map(tokenize, batched=True) for split in (train_data, test_data)
)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [11]:
# 4) Training configuration / TrainingArguments
#   - output_dir: folder where results and checkpoints are stored
#   - per_device_*_batch_size: batch size per device (GPU/CPU)
#   - num_train_epochs: number of passes over the whole training set
#   - logging_steps: how often (in steps) a log entry is produced
#   - report_to="none": turns off external logging (e.g. wandb)

args = TrainingArguments(
    output_dir="./result",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    logging_steps=50,
    report_to="none",
)

In [12]:
# 5) Evaluation metric / metrics function
#   - the Trainer hands over a (logits, labels) pair
#   - the predicted label is the argmax over the class scores,
#     e.g. scores [1.2, 3.4, 0.7] -> label 1; accuracy is the share of matches

def metrics(eval_pred):
  """Compute classification accuracy from an evaluation result.

  Args:
    eval_pred: pair ``(logits, labels)``; the scores are reduced over their last axis.

  Returns:
    dict with the single key ``'acc'``: the fraction of predictions equal to the labels.
  """
  scores, gold = eval_pred
  predicted = np.argmax(scores, axis=-1)  # axis=-1: index of the best score along the last dimension
  hits = predicted == gold
  return {'acc': hits.mean()}

In [13]:
# 6) Create the Trainer
#   - the tokenizer is passed as `processing_class`; the older `tokenizer=`
#     keyword is deprecated in recent transformers releases and triggers a
#     FutureWarning when the Trainer is constructed

trainer = Trainer(
    model = model,
    args = args,
    train_dataset = train_data,
    eval_dataset = test_data,
    processing_class = tokenizer,
    compute_metrics = metrics,
)

  trainer = Trainer(


In [14]:
# 7) Train
#    Runs fine-tuning with the Trainer configured above; the returned
#    TrainOutput is shown as the cell result.

trainer.train()

Step,Training Loss
50,0.5661
100,0.4072


TrainOutput(global_step=125, training_loss=0.4557613296508789, metrics={'train_runtime': 1253.2843, 'train_samples_per_second': 1.596, 'train_steps_per_second': 0.1, 'total_flos': 65777763840000.0, 'train_loss': 0.4557613296508789, 'epoch': 1.0})

In [15]:
# 8) Evaluate
#    Run evaluation and print the resulting metrics dict.

eval_metrics = trainer.evaluate()
print(eval_metrics)

{'eval_loss': 0.41357678174972534, 'eval_acc': 0.832, 'eval_runtime': 85.2974, 'eval_samples_per_second': 5.862, 'eval_steps_per_second': 0.188, 'epoch': 1.0}


In [16]:
# Hand-written sample reviews used to try out the fine-tuned model.
test_sentences = [
    "최고예요",
    "정말 별로였어요",
    "배우의 연기가 훌륭했어요",
    "시간이 아깝네요",
]

In [17]:
# Convert the input sentences into a Dataset

from datasets import Dataset

predict_dataset = Dataset.from_dict({'document': test_sentences})
# Reuse tokenize() so inference uses exactly the same truncation/padding
# settings as training, instead of a hand-copied duplicate that could drift.
predict_dataset = predict_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [18]:
# Predict with the Trainer

pred_output = trainer.predict(test_dataset=predict_dataset)

In [19]:
# Display the raw prediction result (per-class scores, label ids, runtime metrics).
pred_output

PredictionOutput(predictions=array([[-2.0597167 ,  1.4453682 ],
       [ 0.98135823, -1.286501  ],
       [-1.7684253 ,  1.6802918 ],
       [ 1.1680125 , -1.3850025 ]], dtype=float32), label_ids=None, metrics={'test_runtime': 0.8432, 'test_samples_per_second': 4.744, 'test_steps_per_second': 1.186})

In [20]:
# Convert the predictions into label ids and define their display names

labels = dict(enumerate(("NEGATIVE", "POSITIVE")))
# The highest score along the last axis gives the predicted class id.
preds = np.argmax(pred_output.predictions, axis=-1)

In [21]:
# Show every input sentence next to the name of its predicted label.
for sentence, label_id in zip(test_sentences, preds):
  label_name = labels[label_id]
  print(f"{sentence} => {label_name}")

최고예요 => POSITIVE
정말 별로였어요 => NEGATIVE
배우의 연기가 훌륭했어요 => POSITIVE
시간이 아깝네요 => NEGATIVE


In [22]:
# Save the model
#   - written locally under the path 'bert_kor_kosa_nsmc8'

trainer.save_model('bert_kor_kosa_nsmc8')

In [23]:
pip install huggingface_hub



In [None]:
# Log in to Hugging Face
#   NOTE(review): the following cells create a repo and upload to it, so the
#   token presumably needs write access - confirm.

from huggingface_hub import notebook_login

notebook_login()

In [None]:
from huggingface_hub import HfApi

# Create the public target repository on the Hub.
# exist_ok=True keeps this cell re-runnable: without it a second run fails
# because the repository already exists.
api = HfApi()
api.create_repo(repo_id="bert-kor-kosa-nsmc8", private=False, exist_ok=True)

In [None]:
# Upload the fine-tuned model and its tokenizer to the same Hub repository,
# so the repository can be loaded on its own (model weights + tokenizer files).
model.push_to_hub('bert-kor-kosa-nsmc8')
tokenizer.push_to_hub('bert-kor-kosa-nsmc8')