# 텍스트 분류를 위한 BERT fine-tunning

In [8]:
# 라이브러리 임포트
from transformers import AutoModelForSequenceClassification,AutoTokenizer, Trainer, TrainingArguments

import torch
import numpy as np

## datasets 로딩
datasets 라이브러리를 이용하여 데이터를 가져옵니다.

In [4]:
from datasets import load_dataset
datasets = load_dataset("imdb")

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /home/ubuntu/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset imdb downloaded and prepared to /home/ubuntu/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [6]:
# 데이터 셋을 학습 데이터 셋과 검증 데이터 셋으로 나눔 
train_set = datasets['train']
test_set = datasets['test']

In [9]:
# 시퀀스 분류를 위한 robert 모델 불러오기 
model = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-base',num_labels = 7)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

In [10]:
# tokenizer 불러오기 
# xlm-roberta-base 모델을 학습할 때 사용된 tokenizer 를 불러와야 된다.

tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

## 데이터셋 전처리
아래의 작업을 토크나이저를 사용하여 데이터셋을 빠르게 전처리 할 수 있다.
1. 시작 부분에 [CLS]토큰을 추가하고 끝 부분에 [SEP]토큰을 추가.
2. 토큰을 토큰ID(숫자)에 매핑
3. segment_id(토큰 타입 id) 추가
    - 토큰 타입 ID는 입력 문장이 여러개일 때 각 문장을 구별하는데 사용 됨.
    - 첫 번째 문장의 모든 토큰은 0에 매핑하고 두 번째 문장의 모든 토큰은 1로 매핑
    - 현재 예제에서는 문장이 하나만 들어가기 때문에 모두 0으로 매핑
4. attention maks 생성


In [11]:
tokenizer(['한국은 사계절이 있다' ,'나는 배고프다', '나는 덥다'],padding = True,
         max_length = 5)

{'input_ids': [[0, 9397, 697, 5939, 11931, 19625, 469, 3162, 2], [0, 37231, 16493, 1077, 10068, 1875, 2, 1, 1], [0, 37231, 6, 244636, 1875, 2, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 0, 0, 0]]}

위 코드 출력값을 보면 , 두 번째와 세 번째 문장에는 토큰이 2개만 있는 상태라서 [CLS]와 [SEP]를 추가하면 총 4개의 토큰이 된다.  
paddding을 True로 설정하고 max_length= 5로 성정했기 때문에 추가 [PAD] 토큰이 두 번째 및 세 번째 문장에 추가 된 것이다.  
두 번째와 세 번째 문장의 어텐션 마스크에 0이 있는 이유이기도 하다.

In [14]:
# preprocess 함수를 만들어 데이터셋을 전처리 적용
#
def preprocess(data):
    return tokenizer(data['text'], 
                     padding = True,
                    truncation=True # 문장 잘림 허용 옵션
                    )

In [15]:
train_set = train_set.map(preprocess, batched = True,
                         batch_size = len(train_set))


  0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
test_set =test_set.map(preprocess, batched = True,
                         batch_size = len(test_set))

  0%|          | 0/1 [00:00<?, ?ba/s]

In [18]:
#set format 함수를 이용해 데이터셋에서 필요한 columns과 형식을 입력.
train_set.set_format('torch',
                     columns =['input_ids','attention_mask','label'])
test_set.set_format('torch',
                     columns =['input_ids','attention_mask','label'])

## 모델 학습

In [19]:
# 필요한 파라미터 정의

batch = 8
epochs = 2

warmup_steps = 500
weight_decay = 0.01

In [20]:
# 학습 인수 정의

train_args = TrainingArguments(output_dir = './results',
                               num_train_epochs = epochs,
                               per_device_train_batch_size = batch,
                               per_device_eval_batch_size = batch,
                               warmup_steps = warmup_steps ,
                               weight_decay = weight_decay,
                               logging_dir='./logs')

In [21]:
# 학습 trainer 정의
trainer = Trainer(model = model,
                  args = train_args,
                  train_dataset = train_set,
                  eval_dataset = test_set
                 )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [22]:
# 모델 학습
trainer.train()

The following columns in the training set  don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 25000
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3126
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmoo-jong[0m (use `wandb login --relogin` to force relogin)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade




Step,Training Loss
500,0.5833
1000,0.3308
1500,0.2844
2000,0.2114
2500,0.1899
3000,0.1594


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Model weights saved in ./results/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-2500
Configuration saved in ./results/checkpoint-2500/config.json
Model weights saved in ./results/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-3000
Configuration saved in ./results/checkpoint-3

TrainOutput(global_step=3126, training_loss=0.28737024916186976, metrics={'train_runtime': 1593.359, 'train_samples_per_second': 31.38, 'train_steps_per_second': 1.962, 'total_flos': 1.315614336e+16, 'train_loss': 0.28737024916186976, 'epoch': 2.0})

In [23]:
# 학습 후 모델 평가
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16


{'eval_loss': 0.22251223027706146,
 'eval_runtime': 283.6543,
 'eval_samples_per_second': 88.135,
 'eval_steps_per_second': 5.51,
 'epoch': 2.0}