# 프로젝트 : 커스텀 프로젝트 직접 만들기

In [1]:
import tensorflow
import numpy
import transformers
import datasets
from datasets import load_dataset

In [2]:
dataset = load_dataset("nsmc")
dataset

Downloading:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/807 [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset nsmc/default (download: 18.62 MiB, generated: 20.90 MiB, post-processed: Unknown size, total: 39.52 MiB) to /aiffel/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/6.33M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.89M [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset nsmc downloaded and prepared to /aiffel/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 50000
    })
})

### model, tokenizer load

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer

model_name = "klue/bert-base"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/425 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/424M [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

Downloading:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [4]:
print(model.classifier)

Linear(in_features=768, out_features=2, bias=True)


skt-kobert가 해당 데이터셋에 대해서 90프로 넘는 accuracy를 달성했는데 dropout을 헤드부분에서 적용하여 참고하여 설정

In [5]:
import torch.nn as nn

# 기존 분류 헤드 교체 (드롭아웃 추가)
class CustomClassificationHead(nn.Module):
    def __init__(self, input_dim, output_dim, dropout_rate=0.5):
        super(CustomClassificationHead, self).__init__()
        self.dropout = nn.Dropout(p=dropout_rate)
        self.out_proj = nn.Linear(input_dim, output_dim)
    
    def forward(self, features):
        x = self.dropout(features)
        x = self.out_proj(x)
        return x

# 새로운 분류 헤드 설정
model.classifier = CustomClassificationHead(
    input_dim=model.config.hidden_size,
    output_dim=2,  # 클래스 수
    dropout_rate=0.5
)

# 확인
print(model.classifier)

CustomClassificationHead(
  (dropout): Dropout(p=0.5, inplace=False)
  (out_proj): Linear(in_features=768, out_features=2, bias=True)
)


In [6]:
def transform(data):
    return tokenizer(
        data['document'],
        truncation = True,
        padding = 'max_length',
        max_length = 128,
        return_token_type_ids = False
    )

In [7]:
# train
train_dataset = dataset["train"].map(transform, batched=True)

# val
val_dataset = dataset["test"].map(transform, batched=True)

  0%|          | 0/150 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [8]:
train_dataset[0]

{'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'input_ids': [2,
  1376,
  831,
  2604,
  18,
  18,
  4229,
  9801,
  2075,
  2203,
  2182,
  4243,
  3,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


In [9]:
# 평가 지표 - accuracy 생성
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=1)  # argmax 
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


In [10]:
from transformers import AdamW, get_cosine_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 옵티마이저와 스케줄러 설정
learning_rate = 2e-5
num_epochs = 3
warmup_ratio = 0.1

# 가중치 감쇠 설정
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]


bias와 LayerNorm.weight에 감쇠를 적용하지 않는이유 (추측)
- 감쇠를 적용하지 않아도 학습 안정성이 유지되고, 과적합에 덜 민감
- 때문에 불필요한 연산 제거

In [11]:
# 옵티마이저 정의
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

# 학습 단계 계산
t_total = len(train_dataset) // 16 * num_epochs  # 배치 크기 16 가정
warmup_steps = int(t_total * warmup_ratio)


In [12]:
# 스케줄러 정의
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)

In [13]:
# TrainingArguments 설정
training_args = TrainingArguments(
    output_dir='./results',             # 결과 저장 경로
    num_train_epochs=num_epochs,        # 총 에폭 수
    per_device_train_batch_size=16,     # 학습 배치 크기
    per_device_eval_batch_size=16,      # 평가 배치 크기
    learning_rate=learning_rate,        # 학습률
    warmup_steps=warmup_steps,          # 워밍업 단계 수
#     weight_decay=0.01,                  # 기본 weight decay
    evaluation_strategy="epoch",        # 에폭마다 평가
    save_strategy="epoch",              # 에폭마다 체크포인트 저장
    logging_dir='./logs',               # 로그 저장 경로
    logging_steps=500,                  # 로그 출력 빈도
    report_to="none",                   # 로깅 플랫폼 설정
    save_total_limit=2,                 # 저장할 체크포인트 수 제한
    load_best_model_at_end=True         # 최고의 모델 로드
)

In [35]:
# Trainer 설정
trainer = Trainer(
    model=model,                        # 학습할 모델
    args=training_args,                 # TrainingArguments
    train_dataset=train_dataset,        # 학습 데이터셋
    eval_dataset=val_dataset,           # 검증 데이터셋
    tokenizer=tokenizer,                # 토크나이저
    compute_metrics=compute_metrics,    # 평가 지표 함수
    optimizers=(optimizer, scheduler)   # 옵티마이저와 스케줄러 전달
)

In [36]:
# 모델 학습 시작
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running training *****
  Num examples = 150000
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 28125


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2737,0.249071,0.90094,0.905243,0.897152,0.901179
2,0.1984,0.268162,0.90274,0.884776,0.927621,0.905692
3,0.1213,0.377723,0.90636,0.900137,0.915584,0.907795


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-9375
Configuration saved in ./results/checkpoint-9375/config.json
Model weights saved in ./results/checkpoint-9375/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-9375/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-9375/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-18750
Configuration saved in ./results/checkpoint-18750/config.json
Model weights saved in ./results/checkpoint-18750/pytorch_model.bin
tokenizer c

TrainOutput(global_step=28125, training_loss=0.21969765818277995, metrics={'train_runtime': 11154.1173, 'train_samples_per_second': 40.344, 'train_steps_per_second': 2.521, 'total_flos': 2.9599993728e+16, 'train_loss': 0.21969765818277995, 'epoch': 3.0})

In [37]:
# 평가
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 16


{'eval_loss': 0.24907077848911285,
 'eval_accuracy': 0.90094,
 'eval_precision': 0.9052429052429053,
 'eval_recall': 0.8971517101656536,
 'eval_f1': 0.9011791464655534,
 'eval_runtime': 380.371,
 'eval_samples_per_second': 131.451,
 'eval_steps_per_second': 8.216,
 'epoch': 3.0}

루브릭 달성!

### Bucketing

In [14]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
# Trainer 생성
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # acc 함수
)

In [16]:
# 모델 학습
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running training *****
  Num examples = 150000
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 28125


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2724,0.252755,0.89986,0.899833,0.901442,0.900637
2,0.1994,0.274378,0.90094,0.882578,0.926509,0.90401
3,0.1234,0.377874,0.90464,0.900082,0.91181,0.905908


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-9375
Configuration saved in ./results/checkpoint-9375/config.json
Model weights saved in ./results/checkpoint-9375/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-9375/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-9375/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-18750
Configuration saved in ./results/checkpoint-18750/config.json
Model weights saved in ./results/checkpoint-18750/pytorch_model.bin
tokenizer c

TrainOutput(global_step=28125, training_loss=0.2228409088812934, metrics={'train_runtime': 11430.0471, 'train_samples_per_second': 39.37, 'train_steps_per_second': 2.461, 'total_flos': 2.9599993728e+16, 'train_loss': 0.2228409088812934, 'epoch': 3.0})

### 회고

뭔가 이번주는 몸도 아프고 이사준비도 해야해서 정신이없다..\
아직 못 다한 노드를 천천히 마무리 해야겠다.\
이번 노드에서는 베이스 실험들을 동기들 먼저 해주어서 성능을 높이는 방안에대해서만 생각할 수 있었다.
- 해당 task에서 대부분의 모델들의 성능이 89% 정확도를 보여주었지만 skt에서 학습한 kobert 모델을 90%을 넘겨서 해당 방법을 착안하였다.
    - classifier head에 dropout 적용 (50%)
    - weight decay 설정 (bias, layerNorm 제외)
    
또한 실험 결과에 볼 수 있듯 해당 방법으로 정확도 90%을 넘겼고 이후 동적 패딩을 data collator를 사용하여 비교실험 하였다.\
실험 결과 큰 차이가 없는것으로 보아 지금 단계의 결과에서는 큰 성능 향상이 어려운것 같다.(학습시간도 비슷하다?)\
동적 패딩은 알고는 있었지만 실제 코드 구현이 이렇게 간편할 줄은 몰랐는데 앞으로 훈련할때 자주 애용해야겠다.