# BERT 활용

In [None]:
!pip install ratsnlp

In [3]:
# torch 기반 모델임

import torch
from ratsnlp.nlpbook.classification import ClassificationTrainArguments

In [5]:
# kcbert 를 pretrained 모델로
args = ClassificationTrainArguments(
    pretrained_model_name='beomi/kcbert-base',

    # nsmc : 네이버 영화 리뷰 데이터
    downstream_corpus_name='nsmc',
    downstream_model_dir='/content/drive/MyDrive/메타버스_아카데미_2기/딥러닝/7월/models/bert',
    batch_size=32 if torch.cuda.is_available() else 4,
    learning_rate= 5e-5,

    # 최대 문자 길이
    # 더 작은 문자들은 자동으로 padding 됌
    max_seq_length=128,
    epochs=3,
    seed=100,
)

In [6]:
from ratsnlp import nlpbook
nlpbook.set_seed(args)

set seed: 100


In [7]:
nlpbook.set_logger(args)

INFO:ratsnlp:Training/evaluation parameters ClassificationTrainArguments(pretrained_model_name='beomi/kcbert-base', downstream_task_name='document-classification', downstream_corpus_name='nsmc', downstream_corpus_root_dir='/content/Korpora', downstream_model_dir='/content/drive/MyDrive/메타버스_아카데미_2기/딥러닝/7월/models/bert', max_seq_length=128, save_top_k=1, monitor='min val_loss', seed=100, overwrite_cache=False, force_download=False, test_mode=False, learning_rate=5e-05, epochs=3, batch_size=32, cpu_workers=4, fp16=False, tpu_cores=0)
INFO:ratsnlp:Training/evaluation parameters ClassificationTrainArguments(pretrained_model_name='beomi/kcbert-base', downstream_task_name='document-classification', downstream_corpus_name='nsmc', downstream_corpus_root_dir='/content/Korpora', downstream_model_dir='/content/drive/MyDrive/메타버스_아카데미_2기/딥러닝/7월/models/bert', max_seq_length=128, save_top_k=1, monitor='min val_loss', seed=100, overwrite_cache=False, force_download=Fals

### 전이학습시킬 데이터 로드

In [9]:
from Korpora import Korpora

Korpora.fetch(
    corpus_name = args.downstream_corpus_name,
    root_dir = args.downstream_corpus_root_dir,
    force_download = True
)

[nsmc] download ratings_train.txt: 14.6MB [00:00, 222MB/s]
[nsmc] download ratings_test.txt: 4.90MB [00:00, 66.6MB/s]


### 토크나이저 준비

In [10]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(
    args.pretrained_model_name,
    do_lower_case = False
)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/250k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

### 학습 데이터셋 생성

In [11]:
from ratsnlp.nlpbook.classification import NsmcCorpus,ClassificationDataset

corpus = NsmcCorpus()
train_dataset = ClassificationDataset(
    args = args,
    corpus = corpus,
    tokenizer= tokenizer,
    mode = 'train'
)

INFO:ratsnlp:Creating features from dataset file at /content/Korpora/nsmc
INFO:ratsnlp:Creating features from dataset file at /content/Korpora/nsmc
INFO:ratsnlp:loading train data... LOOKING AT /content/Korpora/nsmc/ratings_train.txt
INFO:ratsnlp:loading train data... LOOKING AT /content/Korpora/nsmc/ratings_train.txt
INFO:ratsnlp:tokenize sentences, it could take a lot of time...
INFO:ratsnlp:tokenize sentences, it could take a lot of time...
INFO:ratsnlp:tokenize sentences [took 29.872 s]
INFO:ratsnlp:tokenize sentences [took 29.872 s]
INFO:ratsnlp:*** Example ***
INFO:ratsnlp:*** Example ***
INFO:ratsnlp:sentence: 아 더빙.. 진짜 짜증나네요 목소리
INFO:ratsnlp:sentence: 아 더빙.. 진짜 짜증나네요 목소리
INFO:ratsnlp:tokens: [CLS] 아 더 ##빙 . . 진짜 짜증나네 ##요 목소리 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

In [None]:
# 커스텀 데이터셋 생성
import pandas as pd
import numpy as np
from ratsnlp.nlpbook.classification import ClassificationExample

class CustomCorpus:
  def __int__(self):
    df = pd.read_csv('appreply.csv')
    df = df.iloc[3:]
    df['score_new'] = np.where(df['score']>=4,1,0)

  # mode = 'train'일 시, train 데이터셋 넣음
  # mode = 'test' 일 시, 테스트 데이터셋 넣음
  def get_examples(self,data_root_path,mode):
    examples = []
    for temp in range(len(df)):
      text_a = df.iloc[temp]['text']
      # 넣어도됌
      text_b = None
      label = df.iloc[temp]['score_new']
      # ClassificationExample()로 집어넣음
      examples.append(ClassificationExample(text_a=text_a,text_b=text_b,label=label))
    return examples

  def get_labels(self):
    return [0,1]

  @property
  def num_labels(self):
    return len(self.get_labels)

In [12]:
train_dataset[0]

ClassificationFeatures(input_ids=[2, 2170, 832, 5045, 17, 17, 7992, 29734, 4040, 10720, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], token_type_ids=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [14]:
from torch.utils.data import DataLoader,RandomSampler

train_dataloader = DataLoader(
    train_dataset,
    # 배치사이즈만큼 랜덤으로 가져옴
    batch_size=args.batch_size,
    sampler=RandomSampler(train_dataset,replacement=False),
    collate_fn = nlpbook.data_collator,
    drop_last=False
)

In [15]:
from torch.utils.data import SequentialSampler
val_dataset = ClassificationDataset(
    args=args,
    corpus=corpus,
    tokenizer=tokenizer,
    mode='test'
)

val_dataloader = DataLoader(
    val_dataset,
    # 배치사이즈만큼 랜덤으로 가져옴
    batch_size=args.batch_size,
    sampler=RandomSampler(val_dataset,replacement=False),
    collate_fn = nlpbook.data_collator,
    drop_last=False
)

INFO:ratsnlp:Creating features from dataset file at /content/Korpora/nsmc
INFO:ratsnlp:Creating features from dataset file at /content/Korpora/nsmc
INFO:ratsnlp:loading test data... LOOKING AT /content/Korpora/nsmc/ratings_test.txt
INFO:ratsnlp:loading test data... LOOKING AT /content/Korpora/nsmc/ratings_test.txt
INFO:ratsnlp:tokenize sentences, it could take a lot of time...
INFO:ratsnlp:tokenize sentences, it could take a lot of time...
INFO:ratsnlp:tokenize sentences [took 9.868 s]
INFO:ratsnlp:tokenize sentences [took 9.868 s]
INFO:ratsnlp:*** Example ***
INFO:ratsnlp:*** Example ***
INFO:ratsnlp:sentence: 굳 ㅋ
INFO:ratsnlp:sentence: 굳 ㅋ
INFO:ratsnlp:tokens: [CLS] 굳 ㅋ [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [

### 모델 생성 및 초기화

In [17]:
from transformers import BertConfig, BertForSequenceClassification

# pretrained 모델이 어떤 환경을 가지고 있는지
# BertForMaskedLM 아키텍처

pretrained_model_config = BertConfig.from_pretrained(
    args.pretrained_model_name,
    num_labels = corpus.num_labels
)
pretrained_model_config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 300,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.28.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30000
}

In [19]:
# Bert 전이학습 시 모델에 맞는 환경을 불러와야함
model = BertForSequenceClassification.from_pretrained(
    args.pretrained_model_name,
    config= pretrained_model_config
)

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initiali

### task 설정

In [21]:
from ratsnlp.nlpbook.classification import ClassificationTask
task = ClassificationTask(model,args)
task

ClassificationTask(
  (model): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30000, 768, padding_idx=0)
        (position_embeddings): Embedding(300, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=

### 모델 학습 (1에폭당 1시간)

In [23]:
trainer = nlpbook.get_trainer(args)
trainer.fit(
    task,
    # train_dataloaders 로 뀜
    train_dataloaders = train_dataloader,
    val_dataloaders = val_dataloader
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True, used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 108 M 
--------------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
435.680   Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [24]:
!pip install ratsnlp



### 학습이 완료된 후에

In [29]:
# 학습할 때와 똑같은 환경으로 맞추어 주고 load
from ratsnlp.nlpbook.classification import ClassificationTrainArguments

args = ClassificationTrainArguments(
    pretrained_model_name = 'beomi/kcbert-base',
    downstream_model_dir='/content/drive/MyDrive/메타버스_아카데미_2기/딥러닝/7월/models/bert',
    max_seq_length = 128
)

In [31]:
# 체크포인트 load
import torch
fine_tuned_model_ckpt = torch.load(
    args.downstream_model_checkpoint_path,
    map_location = torch.device('cpu')
)

AttributeError: ignored

In [32]:
from transformers import BertConfig
pretrained_model_config = BertConfig.from_pretrained(
    args.pretrained_model_name,
    num_labels = fine_tuned_model_ckpt['state_dict']['midel.classifier.bias'].shape.numel()
)

NameError: ignored

In [33]:
# 모델 초기화
from transformers import BertForSequenceClassification
model = BertForSequenceClassification(pretrained_model_config)

# 체크포인트에 있는 w,b값 업데이트
model.load_state-dict({k.replace('model.',''): v for k,v in fine_tuned_model_ckpt['state_dict'].items()})

AttributeError: ignored

In [None]:
# 문장을 집어넣으면
def inference_fn(sentence):

  # 토크나이징
  inputs = tokenizer(
      [sentence],
      max_length=arg.max_seq_length,
      padding = 'max_length',
      truncation=True
  )

  with torch.no_grad():
    outputs = model(**{k:torch.tensor(v) for k,v in inputs.items()})
    prob = outputs.logits.softmax(dim=1)
    positive_prob = round(prob[0][1].item(),4)
    negative_prob = round(prob[0][0].item(),4)
    pred = '긍정' if torch.argmax(prob) == 1 else '부정'

    return pred
