# 0. 준비

In [11]:
!mkdir ~/transformers

mkdir: cannot create directory ‘/root/transformers’: File exists


In [12]:
!git clone https://github.com/e9t/nsmc.git

fatal: destination path 'nsmc' already exists and is not an empty directory.


In [13]:
!pip install --upgrade datasets



In [14]:
!pip install --upgrade transformers



In [27]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [15]:
import tensorflow
import numpy
import transformers
import datasets

print(tensorflow.__version__)
print(numpy.__version__)
print(transformers.__version__)
print(datasets.__version__)

2.18.0
2.0.2
4.52.3
3.6.0


# 1. NSMC 데이터 분석 및 Huggingface dataset 구성

In [16]:
from datasets import load_dataset

huggingface_dataset = load_dataset('nsmc')
print(huggingface_dataset)

Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 150001
    })
    test: Dataset({
        features: ['text'],
        num_rows: 50001
    })
})


- 컬럼이 text로만 떠서 이를 수정함

In [17]:
def split_text(example):
    parts = example['text'].split('\t')

    if len(parts) != 3 or parts[0] == "id":
        return {"id": None, "document": None, "label": None}

    return {
        "id": parts[0],
        "document": parts[1],
        "label": int(parts[2])
    }

from datasets import DatasetDict

processed_dataset = {}

for split in ['train', 'test']:
    split_data = huggingface_dataset[split].map(split_text)
    # 헤더/잘못된 행 제거
    split_data = split_data.filter(lambda ex: ex['label'] is not None)
    # text 컬럼 제거
    split_data = split_data.remove_columns(["text"])
    processed_dataset[split] = split_data

huggingface_dataset = DatasetDict(processed_dataset)

# 확인
print(huggingface_dataset)
print(huggingface_dataset['train'][0])


DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 50000
    })
})
{'id': '9976970', 'document': '아 더빙.. 진짜 짜증나네요 목소리', 'label': 0}


In [18]:
train = huggingface_dataset['train']
cols = train.column_names
cols

['id', 'document', 'label']

In [19]:
for i in range(5):
    for col in cols:
        print(col, ":", train[col][i])
    print('\n')

id : 9976970
document : 아 더빙.. 진짜 짜증나네요 목소리
label : 0


id : 3819312
document : 흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나
label : 1


id : 10265843
document : 너무재밓었다그래서보는것을추천한다
label : 0


id : 9045019
document : 교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정
label : 0


id : 6483659
document : 사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다
label : 1




# 2. klue/bert-base model 및 tokenizer 불러오기

In [20]:
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("klue/bert-base", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
def transform(data):
    return tokenizer(
        data['document'],
        truncation = True,
        padding = 'max_length',
        max_length=128,
        return_token_type_ids = False,
        )

#3. tokenizer로 데이터셋 전처리 후 model 학습 진행

In [22]:
hf_dataset = huggingface_dataset.map(transform, batched=True)

# 해당 데이터셋에는 validation이 없음
# 따라서 train 데이터를 train/validation으로 분리
split_dataset = hf_dataset['train'].train_test_split(test_size=0.1, seed=42)

hf_train_dataset = split_dataset['train']
hf_val_dataset = split_dataset['test']
hf_test_dataset = hf_dataset['test']

In [24]:
import os
import numpy as np
from transformers import Trainer, TrainingArguments

output_dir = os.getenv('HOME')+'/transformers'

training_arguments = TrainingArguments(
    output_dir,                                         # output이 저장될 경로
    eval_strategy="epoch",           #evaluation하는 빈도
    learning_rate = 2e-5,                         #learning_rate
    per_device_train_batch_size = 8,   # 각 device 당 batch size
    per_device_eval_batch_size = 8,    # evaluation 시에 batch size
    num_train_epochs = 3,                     # train 시킬 총 epochs
    weight_decay = 0.01,                        # weight decay
)

In [28]:
import evaluate
import numpy as np
from transformers import Trainer, TrainingArguments

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [29]:
trainer = Trainer(
    model=model,           # 학습시킬 model
    args=training_arguments,           # TrainingArguments을 통해 설정한 arguments
    train_dataset=hf_train_dataset,    # training dataset
    eval_dataset=hf_val_dataset,       # evaluation dataset
    compute_metrics=compute_metrics,
)
trainer.train()
print("슝~")



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mlucy071101[0m ([33mlucy071101_hyun[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3007,0.314844,0.8966
2,0.231,0.381729,0.905067
3,0.1585,0.445128,0.9076


슝~


In [31]:
trainer.evaluate(hf_test_dataset)

{'eval_loss': 0.46268805861473083,
 'eval_accuracy': 0.90374,
 'eval_runtime': 99.8904,
 'eval_samples_per_second': 500.549,
 'eval_steps_per_second': 62.569,
 'epoch': 3.0}

# 4. Fine-tuning을 통하여 모델 성능(accuarcy) 향상
- 이미 accuracy가 90% 이상이기에 실험을 해보는 데에 의의를 둠

- learning_rate를 2e-5에서 3e-5로 조절 => eval_accuracy가 오히려 미세하게 낮아짐

In [32]:
training_arguments = TrainingArguments(
    output_dir,                                         # output이 저장될 경로
    eval_strategy="epoch",           #evaluation하는 빈도
    learning_rate = 3e-5,                         #learning_rate 조절
    per_device_train_batch_size = 8,   # 각 device 당 batch size
    per_device_eval_batch_size = 8,    # evaluation 시에 batch size
    num_train_epochs = 3,                     # train 시킬 총 epochs
    weight_decay = 0.01,                        # weight decay
)

In [33]:
trainer = Trainer(
    model=model,           # 학습시킬 model
    args=training_arguments,           # TrainingArguments을 통해 설정한 arguments
    train_dataset=hf_train_dataset,    # training dataset
    eval_dataset=hf_val_dataset,       # evaluation dataset
    compute_metrics=compute_metrics,
)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2215,0.584978,0.888067
2,0.1393,0.581851,0.9028
3,0.0947,0.573139,0.9052


TrainOutput(global_step=50625, training_loss=0.1387004503791715, metrics={'train_runtime': 5469.4134, 'train_samples_per_second': 74.048, 'train_steps_per_second': 9.256, 'total_flos': 2.66399943552e+16, 'train_loss': 0.1387004503791715, 'epoch': 3.0})

In [34]:
trainer.evaluate(hf_test_dataset)

{'eval_loss': 0.5962709188461304,
 'eval_accuracy': 0.90072,
 'eval_runtime': 101.0227,
 'eval_samples_per_second': 494.938,
 'eval_steps_per_second': 61.867,
 'epoch': 3.0}

- learning_rate를 다시 되돌리고 warmup과 학습률 스케쥴러 추가

In [36]:
training_arguments = TrainingArguments(
    output_dir,                                         # output이 저장될 경로
    eval_strategy="epoch",           #evaluation하는 빈도
    learning_rate = 2e-5,                         #learning_rate
    per_device_train_batch_size = 8,   # 각 device 당 batch size
    per_device_eval_batch_size = 8,    # evaluation 시에 batch size
    num_train_epochs = 3,                     # train 시킬 총 epochs
    weight_decay = 0.01,                        # weight decay
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
)

In [37]:
trainer = Trainer(
    model=model,           # 학습시킬 model
    args=training_arguments,           # TrainingArguments을 통해 설정한 arguments
    train_dataset=hf_train_dataset,    # training dataset
    eval_dataset=hf_val_dataset,       # evaluation dataset
    compute_metrics=compute_metrics,
)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0861,0.673091,0.8912
2,0.0522,0.765229,0.8986
3,0.0784,0.695636,0.902533


TrainOutput(global_step=50625, training_loss=0.05901092862729673, metrics={'train_runtime': 5351.6672, 'train_samples_per_second': 75.677, 'train_steps_per_second': 9.46, 'total_flos': 2.66399943552e+16, 'train_loss': 0.05901092862729673, 'epoch': 3.0})

In [38]:
trainer.evaluate(hf_test_dataset)

{'eval_loss': 0.715921938419342,
 'eval_accuracy': 0.89896,
 'eval_runtime': 99.8339,
 'eval_samples_per_second': 500.832,
 'eval_steps_per_second': 62.604,
 'epoch': 3.0}

# 5. Bucketing을 적용하여 학습시키고, STEP 4의 결과와의 비교

- 고정 padding을 없애고 dynamic padding + bucketing 추가

In [39]:
def transform_noPad(data):
    return tokenizer(
        data['document'],
        truncation = True,
        padding = False,
        max_length=128,
        return_token_type_ids = False,
        )

In [40]:
dp_dataset = huggingface_dataset.map(transform_noPad, batched=True)

# 해당 데이터셋에는 validation이 없음
# 따라서 train 데이터를 train/validation으로 분리
split_dataset = dp_dataset['train'].train_test_split(test_size=0.1, seed=42)

dp_train_dataset = split_dataset['train']
dp_val_dataset = split_dataset['test']
dp_test_dataset = dp_dataset['test']

Map:   0%|          | 0/150000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [41]:
import os
import numpy as np
from transformers import Trainer, TrainingArguments

output_dir = os.getenv('HOME')+'/transformers'

training_arguments = TrainingArguments(
    output_dir,                                         # output이 저장될 경로
    eval_strategy="epoch",           #evaluation하는 빈도
    learning_rate = 2e-5,                         #learning_rate
    per_device_train_batch_size = 8,   # 각 device 당 batch size
    per_device_eval_batch_size = 8,    # evaluation 시에 batch size
    num_train_epochs = 3,                     # train 시킬 총 epochs
    weight_decay = 0.01,                        # weight decay
    group_by_length=True,                        # ✅ bucketing
)

In [42]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [43]:
trainer = Trainer(
    model=model,           # 학습시킬 model
    args=training_arguments,           # TrainingArguments을 통해 설정한 arguments
    train_dataset=dp_train_dataset,    # training dataset
    eval_dataset=dp_val_dataset,       # evaluation dataset
    compute_metrics=compute_metrics,
    data_collator=data_collator,        # ✅ dynamic padding
)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0911,0.597865,0.897533
2,0.063,0.696077,0.899067
3,0.027,0.73894,0.898733


TrainOutput(global_step=50625, training_loss=0.06861594145974995, metrics={'train_runtime': 5368.3797, 'train_samples_per_second': 75.442, 'train_steps_per_second': 9.43, 'total_flos': 4829950198794720.0, 'train_loss': 0.06861594145974995, 'epoch': 3.0})

In [44]:
trainer.evaluate(dp_test_dataset)

{'eval_loss': 0.7354622483253479,
 'eval_accuracy': 0.9003,
 'eval_runtime': 97.0817,
 'eval_samples_per_second': 515.03,
 'eval_steps_per_second': 64.379,
 'epoch': 3.0}

- 모두 다 큰 차이가 없었다.
- 하지만 dynamic padding + bucketing이 성능을 높이지 못한 이유가 궁금하다.


---


- huggingface가 직접 모델링을 하는 것보다 훨씬 편리하다는 점에서 이점이 있는 것 같다.
- 하지만 fine tuning에 대해서는 공부를 더 해야될 것 같다.