In [2]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [3]:
# Preprocessing: name of the pretrained checkpoint used throughout the notebook.
from transformers import BertTokenizer
BERT_NAME = 'bert-base-uncased'
# NOTE: no tokenizer is instantiated here. The slow BertTokenizer that used to be
# loaded at this point was never used; the tokenization step further down builds
# the BertTokenizerFast instance that is bound to `tokenizer`.

# Data preparation: NLTK movie_reviews corpus
import nltk
from nltk.corpus import movie_reviews
from sklearn.model_selection import train_test_split
import numpy as np
import warnings
warnings.filterwarnings('ignore')  # silences every warning category from here on

nltk.download('movie_reviews')
ids = movie_reviews.fileids()
# Raw text of each review, in the same order as `ids`.
reviews = list(map(movie_reviews.raw, ids))
# First category tag of each file is used as its class name.
categoris = [movie_reviews.categories(file_id)[0] for file_id in ids]

# Label encoding: 'pos' -> 1, anything else -> 0
y = [int(category == 'pos') for category in categoris]

# Stratified train/test split with 20% held out for testing
x_train, x_test, y_train, y_test = train_test_split(
    reviews,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,
)

# 데이터셋
import torch
from torch.utils.data import Dataset, DataLoader
class MovieReivewDataset(Dataset):
    """Dataset pairing tokenizer output with class labels.

    Args
        encodings : tokenizer output, a dict-like mapping each field name to a
                    tensor that is indexed by sample position
        labels : class label of each sample (0 or 1)
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is taken from the labels, not from the encodings.
        return len(self.labels)

    def __getitem__(self, index):
        """Return one sample as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            # Copy the row so the returned tensor does not share storage with
            # the stored encodings.
            sample[field] = values[index].clone().detach()
        sample['labels'] = torch.tensor(self.labels[index], dtype=torch.long)
        return sample

# Tokenization (BertTokenizerFast) --- the variant recommended by Hugging Face
from transformers import BertTokenizerFast
# Reuse the checkpoint name defined at the top of the cell instead of repeating
# the string literal.
tokenizer = BertTokenizerFast.from_pretrained(BERT_NAME)


def _encode_reviews(texts):
    """Tokenize a list of review strings into PyTorch tensors.

    Both splits go through this single helper so they are always encoded with
    the same settings: truncation on, padding on, a 512-token cap, and
    tensors returned in PyTorch format.

    Args
        texts : list of raw review strings
    Returns
        the tokenizer output for `texts`
    """
    return tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors='pt'
    )


train_encodings = _encode_reviews(x_train)
test_encodings = _encode_reviews(x_test)
train_dataset = MovieReivewDataset(train_encodings, y_train)
test_dataset = MovieReivewDataset(test_encodings, y_test)

# Trainer API
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report
import evaluate  # load_metric
# Load the pretrained model with a classification head for 2 classes
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)
# One metric object per reported score, loaded in the order they are reported.
accuracy_metric, precision_metric, recall_metric, f1_metric = (
    evaluate.load(metric_name)
    for metric_name in ('accuracy', 'precision', 'recall', 'f1')
)
def compute_metrics(eval_pred):
    """Turn raw evaluation output into accuracy / precision / recall / f1.

    Args
        eval_pred : a (logits, labels) pair
    Returns
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit along the last axis.
    predicted_classes = np.argmax(logits, axis=-1)
    scorers = {
        'accuracy': accuracy_metric,
        'precision': precision_metric,
        'recall': recall_metric,
        'f1': f1_metric,
    }
    # Each metric returns a dict keyed by its own name; keep only that value.
    return {
        score_name: scorer.compute(predictions=predicted_classes, references=labels)[score_name]
        for score_name, scorer in scorers.items()
    }

# TrainingArguments
# Only the output directory, the epoch count and the reporting target are set
# explicitly; every other option is left at the library default.
# Optional settings that are disabled in this run:
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=16,  # evaluation computes no gradients, so it is usually set larger than the train batch
#     learning_rate=2e-5,
#     weight_decay=0.01,              # strength of the L2 regularisation
#     logging_dir='./logs',
#     logging_steps=50,
#     eval_strategy='epoch',
#     save_strategy='epoch',
#     load_best_model_at_end=True,
#     metric_for_best_model='f1',
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,  # 2-5 epochs is the usual range for NLP fine-tuning
    report_to='none',
)
# Trainer detects the GPU automatically
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
# Run training; as the last expression of the cell its return value is displayed
trainer.train()

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Step,Training Loss
500,0.4024


TrainOutput(global_step=600, training_loss=0.3566880130767822, metrics={'train_runtime': 495.9417, 'train_samples_per_second': 9.679, 'train_steps_per_second': 1.21, 'total_flos': 1262933065728000.0, 'train_loss': 0.3566880130767822, 'epoch': 3.0})

In [4]:
# Evaluate the fine-tuned model; no dataset is passed here, so the Trainer's
# own eval_dataset is presumably the one scored (confirm against the Trainer docs).
results = trainer.evaluate()
# Bare last expression so the metrics dict is displayed as the cell output.
results

{'eval_loss': 0.5421512722969055,
 'eval_accuracy': 0.8675,
 'eval_precision': 0.8888888888888888,
 'eval_recall': 0.84,
 'eval_f1': 0.8637532133676092,
 'eval_runtime': 10.8783,
 'eval_samples_per_second': 36.77,
 'eval_steps_per_second': 4.596,
 'epoch': 3.0}

In [None]:
# Improve the model directly with a hand-written PyTorch loop
# Memory cleanup: drop the Trainer-based model before building the next one.
import gc

# pop() instead of `del` so the cell can be re-run (or run on a fresh kernel)
# without raising NameError when the names are already gone.
for _stale_name in ('model', 'trainer'):
    globals().pop(_stale_name, None)
# Collect reference cycles first; otherwise objects that are still alive keep
# their GPU memory and empty_cache() has nothing to release.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()


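# BertForSequenceClassification: BERT encoder with a built-in classification head.
# The head feeds the [CLS] token representation through a Linear layer to produce logits.

# BertModel: the bare BERT encoder.
# It has no task head, so one must be attached before it can be used for classification, regression, QA, etc.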
from transformers import BertModel
import torch.nn.functional as F
from torch.optim import AdamW
import time

