# 텍스트 분석을 위한 BERT fine-tunning

In [1]:
# 라이브러리 임포트
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments

import torch
import numpy as np

## datasets 로딩
datasets 라이브러리를 이용하여 데이터를 가져옵니다.

In [3]:
from datasets import load_dataset
datasets = load_dataset("klue", 'ynat')

Reusing dataset klue (/home/ubuntu/.cache/huggingface/datasets/klue/ynat/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e)


  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
datasets

DatasetDict({
    train: Dataset({
        features: ['guid', 'title', 'label', 'url', 'date'],
        num_rows: 45678
    })
    validation: Dataset({
        features: ['guid', 'title', 'label', 'url', 'date'],
        num_rows: 9107
    })
})

In [5]:
# 데이터 셋을 학습 데이터 셋과 검증 데이터 셋으로 나눔 
train_set = datasets['train']
valid_set = datasets['validation']

In [12]:
# 시퀀스 분류를위한 robert 모델 불러오기 
from transformers import AutoModelForSequenceClassification,AutoTokenizer
model = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-base',num_labels = 7)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

In [13]:
# tokenizer 불러오기 - pretrained 된 모델과 똑같은 tokenizer를 가져와야됨.
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

## 데이터셋 전처리
- 토크나이저를 사용하여 데이터셋을 빠르게 전처리 할 수 있다.
- 토크나이저를 이용하면 

In [24]:
tokenizer(['한국은 사계절이 있다' ,'나는 배고프다', '나는 덥다'],padding = True,
         max_length = 5)



{'input_ids': [[0, 9397, 697, 5939, 11931, 19625, 469, 3162, 2], [0, 37231, 16493, 1077, 10068, 1875, 2, 1, 1], [0, 37231, 6, 244636, 1875, 2, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 0, 0, 0]]}

위 코드 출력값을 보면 , 두 번째와 세 번째 문장에는 토큰이 2개만 있는 상태라서 [CLS]와 [SEP]를 추가하면 총 4개의 토큰이 된다.  
paddding을 True로 설정하고 max_length= 5로 성정했기 때문에 추가 [PAD] 토큰이 두 번째 및 세 번째 문장에 추가 된 것이다.  
두 번째와 세 번째 문장의 어텐션 마스크에 0이 있는 이유이기도 하다.

In [15]:
# preprocess 함수를 만들어 데이터셋을 전처리 적용
#
def preprocess(data):
    return tokenizer(data['title'], 
                     padding = True,
                    truncation=True # 문장 잘림 허용 옵션
                    )

In [16]:
train_set = train_set.map(preprocess, batched = True,
                         batch_size = len(train_set))


  0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
valid_set =valid_set.map(preprocess, batched = True,
                         batch_size = len(valid_set))

  0%|          | 0/1 [00:00<?, ?ba/s]

In [18]:
#set format 함수를 이용해 데이터셋에서 필요한 columns과 형식을 입력.
train_set.set_format('torch',
                     columns =['input_ids','attention_mask','label'])
valid_set.set_format('torch',
                     columns =['input_ids','attention_mask','label'])

## 모델 학습

In [19]:
# 필요한 파라미터 정의

batch = 8
epochs = 2

warmup_steps = 500
weight_decay = 0.01

In [20]:
# 학습 인수 정의

train_args = TrainingArguments(output_dir = './results',
                               num_train_epochs = epochs,
                               per_device_train_batch_size = batch,
                               per_device_eval_batch_size = batch,
                               warmup_steps = warmup_steps ,
                               weight_decay = weight_decay,
                               logging_dir='./logs')

In [21]:
# 학습 trainer 정의
trainer = Trainer(model = model,
                  args = train_args,
                  train_dataset = train_set,
                  eval_dataset = valid_set
                 )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [22]:
# 모델 학습
trainer.train()

The following columns in the training set  don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: title, url, date, guid.
***** Running training *****
  Num examples = 45678
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5710
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmoo-jong[0m (use `wandb login --relogin` to force relogin)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: wandb version 0.12.9 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade




Step,Training Loss
500,1.0613
1000,0.6396
1500,0.5881
2000,0.5501
2500,0.504
3000,0.4426
3500,0.4067
4000,0.4011
4500,0.3811
5000,0.3548


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Model weights saved in ./results/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-2500
Configuration saved in ./results/checkpoint-2500/config.json
Model weights saved in ./results/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-3000
Configuration saved in ./results/checkpoint-3

TrainOutput(global_step=5710, training_loss=0.508902944908877, metrics={'train_runtime': 1823.8038, 'train_samples_per_second': 50.091, 'train_steps_per_second': 3.131, 'total_flos': 1643212583901000.0, 'train_loss': 0.508902944908877, 'epoch': 2.0})

In [23]:
# 학습 후 모델 평가
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: title, url, date, guid.
***** Running Evaluation *****
  Num examples = 9107
  Batch size = 16


{'eval_loss': 0.4689125716686249,
 'eval_runtime': 68.2208,
 'eval_samples_per_second': 133.493,
 'eval_steps_per_second': 8.355,
 'epoch': 2.0}