BERT 위에 출력층이 추가된 모델 클래스를 바로 사용하기<br>
(마지막 Output layer를 별도 지정하지 않아도 사용 가능)


참고 노트북: https://github.com/ukairia777/tensorflow-nlp-tutorial/blob/main/18.%20Fine-tuning%20BERT%20(Cls%2C%20NER%2C%20NLI)/18-4.%20kor_bert_nsmc_model_from_transformers_gpu.ipynb

In [None]:
# 데이터 전처리

import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import re
import urllib.request

In [None]:
# Download the NSMC train/test rating files into the working directory.
train_url = "https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt"
test_url = "https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt"

urllib.request.urlretrieve(train_url, filename="ratings_train.txt")
urllib.request.urlretrieve(test_url, filename="ratings_test.txt")

In [None]:
# Parse both downloaded files into DataFrames with pandas' read_table defaults.
train_data, test_data = (
    pd.read_table(path)
    for path in ('ratings_train.txt', 'ratings_test.txt')
)

In [None]:
# Report the row count of each split, then preview the first training rows.
for label, frame in (('train count:', train_data), ('test count:', test_data)):
    print(label, len(frame))
train_data.head()

In [None]:
# Number of distinct values in the 'document' and 'label' columns, as a pair.
tuple(train_data[column].nunique() for column in ('document', 'label'))

In [None]:
# Remove rows whose 'document' text repeats an earlier row (in place),
# then report how many rows remain in each split.
for frame in (train_data, test_data):
    frame.drop_duplicates(subset=['document'], inplace=True)

for label, frame in (('train count:', train_data), ('test count:', test_data)):
    print(label, len(frame))

In [None]:
# Drop every row that has a missing value in any column (in place).
for frame in (train_data, test_data):
    frame.dropna(how='any', inplace=True)

# Sanity check: print whether any null value is still present in each split.
for frame in (train_data, test_data):
    print(frame.isnull().values.any())

Tokenizer

In [None]:
# Notebook shell escape: install the `transformers` package with pip.
!pip install transformers

In [None]:
# Import the package and display its version string as the cell output.
import transformers
transformers.__version__

In [None]:
# NOTE(review): the original author flagged this import with "?". The Fast
# variant is presumably what makes the per-example `.tokens` / `.ids`
# inspection of X_train[0] later in this notebook possible — confirm.
from transformers import BertTokenizerFast

# Load the tokenizer published under the "klue/bert-base" checkpoint name.
tokenizer = BertTokenizerFast.from_pretrained("klue/bert-base")

In [None]:
# Convert the review texts and their labels of each split to plain Python lists.
X_train_list, y_train = (
    train_data['document'].tolist(),
    train_data['label'].tolist(),
)
X_test_list, y_test = (
    test_data['document'].tolist(),
    test_data['label'].tolist(),
)

In [None]:
# Tokenize both splits with identical options (padding and truncation enabled).
encode_options = {'padding': True, 'truncation': True}
X_train = tokenizer(X_train_list, **encode_options)
X_test = tokenizer(X_test_list, **encode_options)

In [None]:
# Print the tokens, ids, type ids and attention mask of the first
# tokenized training example, in that order.
first_encoding = X_train[0]
for field in ('tokens', 'ids', 'type_ids', 'attention_mask'):
    print(getattr(first_encoding, field))

데이터셋 생성 및 모델 학습

In [None]:
import tensorflow as tf

# Build (features, label) datasets: the tokenizer output is converted to a
# plain dict and paired element-wise with the label list of the same split.
train_features = dict(X_train)
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, y_train))

val_features = dict(X_test)
val_dataset = tf.data.Dataset.from_tensor_slices((val_features, y_test))

In [None]:
from transformers import TFBertForSequenceClassification
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Adam optimizer used for fine-tuning; the learning rate is fixed at 5e-5.
LEARNING_RATE = 5e-5
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

In [None]:
# Load the pretrained "klue/bert-base" checkpoint as a sequence classifier
# with two output labels, then compile it right away with the optimizer and
# the model's own loss function.
# NOTE(review): from_pt=True presumably tells the loader to read the PyTorch
# weights of the checkpoint — confirm against the transformers docs.
model = TFBertForSequenceClassification.from_pretrained(
    "klue/bert-base",
    num_labels=2,
    from_pt=True,
)
model.compile(
    optimizer=optimizer,
    loss=model.hf_compute_loss,
    metrics=['accuracy'],
)

In [None]:
# Display the loss callable that was handed to model.compile().
model.hf_compute_loss

In [None]:
# Stop training early when validation accuracy has not improved by at least
# 0.001 for two consecutive epochs.
callback_earlystop = EarlyStopping(
    monitor="val_accuracy",
    min_delta=0.001,
    patience=2)

# Batch sizes are fixed on the tf.data pipelines themselves (32 for training,
# 64 for validation). Keras' fit() documentation says `batch_size` must not
# be specified when the input is a Dataset, so it is deliberately not passed
# here: it would at best be ignored and at worst be rejected.
model.fit(
    train_dataset.shuffle(10000).batch(32),
    epochs=5,
    validation_data=val_dataset.shuffle(10000).batch(64),
    callbacks=[callback_earlystop]
)

In [None]:
# Evaluate on the full validation set, fed in batches of 1024 examples.
eval_batches = val_dataset.batch(1024)
model.evaluate(eval_batches)

모델 저장

In [None]:
# Write the fine-tuned model and its tokenizer to the same directory so they
# can be reloaded together.
save_dir = 'nsmc_model/bert-base'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

모델 로드 및 테스트

In [None]:
from transformers import TextClassificationPipeline

# Reload the tokenizer and the fine-tuned classifier from the directory the
# notebook saved them to.
model_dir = 'nsmc_model/bert-base'
loaded_tokenizer = BertTokenizerFast.from_pretrained(model_dir)
loaded_model = TFBertForSequenceClassification.from_pretrained(model_dir)

# Wrap both in a TensorFlow text-classification pipeline.
# NOTE(review): return_all_scores is reportedly deprecated in newer
# transformers releases in favour of top_k — confirm before upgrading.
text_classifier = TextClassificationPipeline(
    model=loaded_model,
    tokenizer=loaded_tokenizer,
    framework='tf',
    return_all_scores=True,
)

In [None]:
# Display the test DataFrame (presumably to pick a sample review to classify).
test_data

In [None]:
# Classify one sample review and show the first element of the pipeline result.
sample_review = '뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아'
text_classifier(sample_review)[0]