<a href="https://colab.research.google.com/github/minmings111/AICA_study/blob/main/AICA_2025_08_18.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 2025.08.18. --1
# Fine Tuning

!pip -q install transformers datasets

In [None]:
from datasets import load_dataset
from transformers import (AutoTokenizer,
                          AutoModelForSequenceClassification,
                          TrainingArguments,
                          Trainer,)
import numpy as np

In [None]:
# 1) Load the data (official NSMC release, tab-separated text files)
files = dict(
    train="https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt",
    test="https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt",
)

# The files are tab-separated, so the "csv" loader is told to split on "\t".
dataset = load_dataset("csv", delimiter="\t", data_files=files)

Downloading data:   0%|          | 0.00/14.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.89M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
# Work on small random subsets so a single epoch finishes quickly.
# The shuffle is seeded, so the same rows are picked on every run.
SUBSET_SEED = 42
N_TRAIN, N_TEST = 2000, 500


def _has_text(example):
  """Return True when the row carries review text that can be tokenized."""
  return example['document'] is not None


# A review with an empty text field is read as None and would make the
# tokenizer raise. Filtering happens AFTER sampling so the sampled rows
# (and therefore the results) stay the same whenever no row is dropped.
train_data = dataset['train'].shuffle(seed=SUBSET_SEED).select(range(N_TRAIN)).filter(_has_text)
test_data = dataset['test'].shuffle(seed=SUBSET_SEED).select(range(N_TEST)).filter(_has_text)

In [None]:
# Show the summary (features, row count) of each sampled split.
for split in (train_data, test_data):
  print(split)

Dataset({
    features: ['id', 'document', 'label'],
    num_rows: 2000
})
Dataset({
    features: ['id', 'document', 'label'],
    num_rows: 500
})


In [None]:
# Peek at the first three rows of each split, then list the column names.
for split in (train_data, test_data):
  print(split[:3])
print(train_data.column_names)

{'id': [10020916, 6297236, 10085270], 'document': ['For Carl.칼 세이건으로 시작해서 칼 세이건으로 끝난다.', '모든 면에서 너무 좋고, 특히 동양적이고', '달콤한 꿀과 톡쏘는 칠리의 만남'], 'label': [1, 1, 1]}
{'id': [7545542, 9952650, 10215056], 'document': ['how boring!', '최고의 영화....!', '정우성배우 너무 섹시하고 시간가는줄 모르고 몰입해서봤다!배우들의 엄청난 연기력! 탄탄한 스토리와 액션 너무 재미있다'], 'label': [0, 1, 1]}
['id', 'document', 'label']


In [None]:
# (rows, columns) of each split.
for split in (train_data, test_data):
  print(split.shape)

(2000, 3)
(500, 3)


In [None]:
# 2) Prepare the model and tokenizer
#   - klue/bert-base (a BERT base model suited to Korean)
#   - num_labels=2  -> binary classification (negative=0, positive=1)

model_name = 'klue/bert-base'
tokenizer= AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# 3) Tokenize function
#   - tokenizes the 'document' text of a batch
#   - truncation=True       -> tokens beyond max_length are cut off
#   - padding="max_length"  -> every sequence is padded to max_length (64 by default)

def tokenize(batch, max_length=64):
  """Tokenize the 'document' entry of a batch to a fixed sequence length.

  Args:
    batch: mapping whose 'document' entry holds the text(s) to encode.
    max_length: length each sequence is truncated/padded to. Defaults to 64,
      the value previously hard-coded here, so existing callers such as
      `Dataset.map(tokenize, batched=True)` behave as before.

  Returns:
    Whatever the notebook-level `tokenizer` returns for these inputs.
  """
  return tokenizer(
      batch['document'],
      truncation=True,
      padding="max_length",
      max_length=max_length,
  )


In [None]:
# batched=True: several samples are grouped and tokenized in one call, which is faster.
# The train split is mapped first, then the test split.

train_data, test_data = (
    split.map(tokenize, batched=True) for split in (train_data, test_data)
)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
# 4) Training configuration / TrainingArguments
#   - output_dir : folder where results and checkpoints are saved
#   - per_device_*_batch_size : batch size per device (GPU/CPU)
#   - num_train_epochs : number of passes over the whole training set
#   - logging_steps : how often (in steps) the training log is emitted
#   - report_to="none" : turns off external logging integrations such as wandb

args = TrainingArguments(
    output_dir = './result',
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 32,
    num_train_epochs = 1,
    logging_steps = 50,
    report_to="none",
)

In [None]:
# 5) Evaluation metric / Metrics function
#   - the Trainer hands over a (logits, labels) pair
#   - the arg-max over the class scores is the predicted label,
#     e.g. scores [1.2, 3.4, 0.7] -> label 1; accuracy is the share of matches

def metrics(eval_pred):
  """Compute classification accuracy from a (logits, labels) pair.

  Args:
    eval_pred: pair of class scores (one row per sample) and the true labels.

  Returns:
    dict with a single key 'acc' holding the fraction of correct predictions.
  """
  scores, gold = eval_pred
  # axis=-1 reduces over the last dimension, i.e. over the class scores of each sample.
  predicted = np.argmax(scores, axis=-1)
  hits = predicted == gold
  return {'acc': hits.mean()}

In [None]:
# 6) Create the Trainer
#   The tokenizer is passed as `processing_class`: the older `tokenizer=`
#   keyword is deprecated in recent transformers releases and emits a warning
#   when the Trainer is constructed.
#   NOTE(review): `processing_class` needs a transformers version that already
#   supports it - confirm against the installed version.

trainer = Trainer(
    model = model,
    args = args,
    train_dataset = train_data,
    eval_dataset = test_data,
    processing_class = tokenizer,
    compute_metrics = metrics,
)

  trainer = Trainer(


In [None]:
# 7) Train
#   The call is left as the last expression so its return value is shown as the cell output.

trainer.train()

Step,Training Loss
50,0.5123
100,0.3875


TrainOutput(global_step=125, training_loss=0.4237917137145996, metrics={'train_runtime': 35.4756, 'train_samples_per_second': 56.377, 'train_steps_per_second': 3.524, 'total_flos': 65777763840000.0, 'train_loss': 0.4237917137145996, 'epoch': 1.0})

In [None]:
# 8) Evaluate
#   Run evaluation with the Trainer and print the returned metrics.

eval_metrics = trainer.evaluate()
print(eval_metrics)

{'eval_loss': 0.3914499878883362, 'eval_acc': 0.846, 'eval_runtime': 1.9264, 'eval_samples_per_second': 259.551, 'eval_steps_per_second': 8.306, 'epoch': 1.0}


In [None]:
# Hand-written example sentences used below to try out the fine-tuned model.
test_sentences = [
    '최고예요',
    '정말 별로였어요',
    '배우의 연기가 훌륭했어요',
    '시간이 아깝네요'
]

In [None]:
# Convert the input sentences into a Dataset

from datasets import Dataset

predict_dataset = Dataset.from_dict({'document': test_sentences})
# Reuse tokenize() instead of repeating its settings in a lambda, so the
# inference inputs are always encoded the same way as the training data.
predict_dataset = predict_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [None]:
# Predict with the Trainer

pred_output = trainer.predict(predict_dataset)

In [None]:
# Display the object returned by trainer.predict.
pred_output

PredictionOutput(predictions=array([[-1.7731566,  1.4912181],
       [ 1.4096699, -1.2498688],
       [-1.8317807,  1.6202472],
       [ 1.4545407, -1.3258945]], dtype=float32), label_ids=None, metrics={'test_runtime': 0.027, 'test_samples_per_second': 148.33, 'test_steps_per_second': 37.083})

In [None]:
# Convert the predictions into label ids and define their display names

labels = dict(enumerate(("NEGATIVE", "POSITIVE")))
# The index of the largest score along the last axis is the predicted class id.
preds = np.argmax(pred_output.predictions, axis=-1)

In [None]:
# Pair every input sentence with the display name of its predicted class.
sentiments = (labels[class_id] for class_id in preds)
for sentence, sentiment in zip(test_sentences, sentiments):
  print(f"{sentence} => {sentiment}")

최고예요 => POSITIVE
정말 별로였어요 => NEGATIVE
배우의 연기가 훌륭했어요 => POSITIVE
시간이 아깝네요 => NEGATIVE


In [None]:
# Save the model to the local folder 'bert_kor_kosa_nsmc8'

trainer.save_model('bert_kor_kosa_nsmc8')

In [None]:
pip install huggingface_hub



In [None]:
# Log in to the Hugging Face Hub

from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import HfApi

api = HfApi()
# exist_ok=True keeps this cell re-runnable: without it the call raises
# once the repository has already been created by an earlier run.
api.create_repo(repo_id="bert-kor-kosa-nsmc8", private=False, exist_ok=True)

RepoUrl('https://huggingface.co/minmings111/bert-kor-kosa-nsmc8', endpoint='https://huggingface.co', repo_type='model', repo_id='minmings111/bert-kor-kosa-nsmc8')

In [None]:
# Upload the tokenizer together with the model so the Hub repository can be
# loaded on its own (model weights alone are not enough to encode text).
# The model push stays last so its CommitInfo remains the cell output.
tokenizer.push_to_hub('bert-kor-kosa-nsmc8')
model.push_to_hub('bert-kor-kosa-nsmc8')

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/tmpchy6ybtl/model.safetensors    :   0%|          |  556kB /  442MB            

CommitInfo(commit_url='https://huggingface.co/minmings111/bert-kor-kosa-nsmc8/commit/3a4c9d6c1f5504b6cd33b410623e76ee690fc644', commit_message='Upload BertForSequenceClassification', commit_description='', oid='3a4c9d6c1f5504b6cd33b410623e76ee690fc644', pr_url=None, repo_url=RepoUrl('https://huggingface.co/minmings111/bert-kor-kosa-nsmc8', endpoint='https://huggingface.co', repo_type='model', repo_id='minmings111/bert-kor-kosa-nsmc8'), pr_revision=None, pr_num=None)