# 자연어 추론을 위한 BERT fine-tunning

In [19]:
# 라이브러리 임포트
from transformers import AutoModelForSequenceClassification,AutoTokenizer, Trainer, TrainingArguments

import torch
import numpy as np

## datasets 로딩
datasets 라이브러리를 이용하여 데이터를 가져옵니다.

In [20]:
from datasets import load_dataset
datasets = load_dataset("klue", 'nli')

Downloading and preparing dataset klue/nli (download: 1.20 MiB, generated: 6.10 MiB, post-processed: Unknown size, total: 7.30 MiB) to /root/.cache/huggingface/datasets/klue/nli/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e...


Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset klue downloaded and prepared to /root/.cache/huggingface/datasets/klue/nli/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [21]:
datasets

DatasetDict({
    train: Dataset({
        features: ['guid', 'source', 'premise', 'hypothesis', 'label'],
        num_rows: 24998
    })
    validation: Dataset({
        features: ['guid', 'source', 'premise', 'hypothesis', 'label'],
        num_rows: 3000
    })
})

In [22]:
# 데이터 셋을 학습 데이터 셋과 검증 데이터 셋으로 나눔 
train_set = datasets['train']
valid_set = datasets['validation']

In [23]:
# 시퀀스 분류를 위한 robert 모델 불러오기 
model = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-base',num_labels = 3)

loading configuration file https://huggingface.co/xlm-roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/87683eb92ea383b0475fecf99970e950a03c9ff5e51648d6eee56fb754612465.ab95cf27f9419a99cce4f19d09e655aba382a2bafe2fe26d0cc24c18cf1a1af6
Model config XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "positio

In [26]:
# tokenizer 불러오기 
# xlm-roberta-base 모델을 학습할 때 사용된 tokenizer 를 불러와야 된다.

tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/87683eb92ea383b0475fecf99970e950a03c9ff5e51648d6eee56fb754612465.ab95cf27f9419a99cce4f19d09e655aba382a2bafe2fe26d0cc24c18cf1a1af6
Model config XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.1

## 데이터셋 전처리
아래의 작업을 토크나이저를 사용하여 데이터셋을 빠르게 전처리 할 수 있다.
1. 시작 부분에 [CLS]토큰을 추가하고 끝 부분에 [SEP]토큰을 추가.
2. 토큰을 토큰ID(숫자)에 매핑
3. segment_id(토큰 타입 id) 추가
    - 토큰 타입 ID는 입력 문장이 여러개일 때 각 문장을 구별하는데 사용 됨.
    - 첫 번째 문장의 모든 토큰은 0에 매핑하고 두 번째 문장의 모든 토큰은 1로 매핑
    - 현재 예제에서는 문장이 하나만 들어가기 때문에 모두 0으로 매핑
4. attention maks 생성


In [8]:
tokenizer(['한국은 사계절이 있다' ,'나는 배고프다', '나는 덥다'],padding = True,
         max_length = 5)

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "


{'input_ids': [[0, 9397, 697, 5939, 11931, 19625, 469, 3162, 2], [0, 37231, 16493, 1077, 10068, 1875, 2, 1, 1], [0, 37231, 6, 244636, 1875, 2, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 0, 0, 0]]}

위 코드 출력값을 보면 , 두 번째와 세 번째 문장에는 토큰이 2개만 있는 상태라서 [CLS]와 [SEP]를 추가하면 총 4개의 토큰이 된다.  
paddding을 True로 설정하고 max_length= 5로 성정했기 때문에 추가 [PAD] 토큰이 두 번째 및 세 번째 문장에 추가 된 것이다.  
두 번째와 세 번째 문장의 어텐션 마스크에 0이 있는 이유이기도 하다.

In [27]:
train_set

Dataset({
    features: ['guid', 'source', 'premise', 'hypothesis', 'label'],
    num_rows: 24998
})

In [29]:
# preprocess 함수를 만들어 데이터셋을 전처리 적용
#
def preprocess(data):
    return tokenizer(data['premise'],data['hypothesis'] ,
                     padding = True,
                    truncation=True # 문장 잘림 허용 옵션
                    )

In [30]:
train_set = train_set.map(preprocess, batched = True,
                         batch_size = len(train_set))


  0%|          | 0/1 [00:00<?, ?ba/s]

In [31]:
valid_set =valid_set.map(preprocess, batched = True,
                         batch_size = len(valid_set))

  0%|          | 0/1 [00:00<?, ?ba/s]

In [32]:
#set format 함수를 이용해 데이터셋에서 필요한 columns과 형식을 입력.
train_set.set_format('torch',
                     columns =['input_ids','attention_mask','label'])
valid_set.set_format('torch',
                     columns =['input_ids','attention_mask','label'])

## 모델 학습

In [33]:
# 필요한 파라미터 정의

batch = 8
epochs = 2

warmup_steps = 500
weight_decay = 0.01

In [34]:
# 학습 인수 정의

train_args = TrainingArguments(output_dir = './results',
                               num_train_epochs = epochs,
                               per_device_train_batch_size = batch,
                               per_device_eval_batch_size = batch,
                               warmup_steps = warmup_steps ,
                               weight_decay = weight_decay,
                               logging_dir='./logs')

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [35]:
# 학습 trainer 정의
trainer = Trainer(model = model,
                  args = train_args,
                  train_dataset = train_set,
                  eval_dataset = valid_set
                 )

In [36]:
# 모델 학습
trainer.train()

The following columns in the training set  don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: guid, source, premise, hypothesis.
***** Running training *****
  Num examples = 24998
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6250


Step,Training Loss
500,1.1058
1000,1.1096
1500,1.1046
2000,1.1043
2500,1.1057
3000,1.1038
3500,1.1031
4000,1.1014
4500,1.1025
5000,1.101


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Model weights saved in ./results/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-2500
Configuration saved in ./results/checkpoint-2500/config.json
Model weights saved in ./results/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-3000
Configuration saved in ./results/checkpoint-3

TrainOutput(global_step=6250, training_loss=1.103439814453125, metrics={'train_runtime': 4370.9634, 'train_samples_per_second': 11.438, 'train_steps_per_second': 1.43, 'total_flos': 2672031869148096.0, 'train_loss': 1.103439814453125, 'epoch': 2.0})

In [37]:
# 학습 후 모델 평가
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: guid, source, premise, hypothesis.
***** Running Evaluation *****
  Num examples = 3000
  Batch size = 8


{'epoch': 2.0,
 'eval_loss': 1.0999269485473633,
 'eval_runtime': 46.005,
 'eval_samples_per_second': 65.21,
 'eval_steps_per_second': 8.151}