In [14]:
!pip install evaluate



In [None]:
# Preprocessing
from transformers import BertTokenizer
BERT_NAME = 'bert-base-uncased'
# No slow BertTokenizer is instantiated here: `tokenizer` is (re)assigned to a
# BertTokenizerFast later in this cell before it is ever used, so loading the
# slow tokenizer first was redundant work.

# Data preparation
import nltk
from nltk.corpus import movie_reviews
from sklearn.model_selection import train_test_split
import numpy as np
import warnings
warnings.filterwarnings('ignore')

nltk.download('movie_reviews')
ids = movie_reviews.fileids()
# `file_id` instead of `id` so the builtin `id` is not shadowed inside the comprehensions
reviews = [movie_reviews.raw(file_id) for file_id in ids]
# Only the first category reported for each file is used as its label
categoris = [movie_reviews.categories(file_id)[0] for file_id in ids]

# Label encoding: 'pos' -> 1, anything else -> 0
y = [1 if c == 'pos' else 0 for c in categoris]

# Stratified, shuffled 80/20 train/test split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    reviews,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
    shuffle=True,
)

# 데이터셋
import torch
from torch.utils.data import Dataset, DataLoader
class MovieReivewDataset(Dataset):
    """Dataset that pairs tokenizer output with class labels.

    Args:
        encodings: dict-like tokenizer output; every value is indexed with the
            sample position and the indexed element is copied.
        labels: per-sample class labels (0 or 1), indexable by sample position.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, index):
        """Return one sample as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            # Copy the row so the returned tensor does not alias the stored encodings.
            sample[field] = values[index].clone().detach()
        sample['labels'] = torch.tensor(self.labels[index], dtype=torch.long)
        return sample

# Tokenization (BertTokenizerFast) --- the tokenizer officially recommended by Hugging Face
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Encode the train split first, then the test split, with identical settings:
# truncate to at most 512 tokens, pad, and return PyTorch tensors.
train_encodings, test_encodings = (
    tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors='pt'
    )
    for texts in (x_train, x_test)
)

# Wrap each split's encodings together with its labels.
train_dataset, test_dataset = (
    MovieReivewDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, y_train),
        (test_encodings, y_test),
    )
)

# Trainer API
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report
import evaluate  # load_metric
# Load the model; number of classes = 2
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)
# Metric objects consumed by compute_metrics (loaded in this order)
accuracy_metric, precision_metric, recall_metric, f1_metric = (
    evaluate.load(metric_name)
    for metric_name in ('accuracy', 'precision', 'recall', 'f1')
)
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 from an evaluation result.

    Args:
        eval_pred: object that unpacks into ``(logits, labels)``.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)
    scorers = (
        ('accuracy', accuracy_metric),
        ('precision', precision_metric),
        ('recall', recall_metric),
        ('f1', f1_metric),
    )
    return {
        name: scorer.compute(predictions=predictions, references=labels)[name]
        for name, scorer in scorers
    }

# TrainingArguments
training_args = TrainingArguments(
    # output / logging locations and reporting
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=50,
    report_to='none',
    # optimisation settings
    num_train_epochs=3,             # 2-5 epochs is the usual range for NLP
    learning_rate=2e-5,
    weight_decay=0.01,              # L2 regularisation strength
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,  # evaluation computes no gradients, so it tends to be set larger than the train batch
    # evaluate and save once per epoch; the best model is selected by f1
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1',
)
# Trainer (detects the GPU automatically)
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)
# Run training
trainer.train()

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [None]:
# Run the Trainer's evaluation and keep what it returns in `results`;
# the bare expression on the last line displays it as the cell output.
results = trainer.evaluate()
results

In [None]:
# Direct improvement using plain PyTorch
# Memory cleanup
# NOTE(review): the `del` below is commented out, so `model` and `trainer` from the
# previous cell are presumably still referenced when empty_cache() runs; in that case
# only cached blocks that are no longer in use can be released — confirm whether the
# `del` should be re-enabled.
# del model , trainer
torch.cuda.empty_cache()


# BertForSequenceClassification  --> BERT-specific model
# Includes a classification head: [CLS] token output -> Linear layer -> logits

# BertModel : bare BERT model
# No classification head -> a head must be attached before using it directly for
# classification, regression, QA and similar tasks.
from transformers import BertModel
import torch.nn.functional as F
from torch.optim import AdamW
import time

# BERT model
bert_model = BertModel.from_pretrained('bert-base-uncased')

class ImporovedBertClassifier(torch.nn.Module):
  '''Encoder followed by dropout and a linear classification head.

  Improved version:
    - Dropout is applied to the pooled representation
    - The head is initialised explicitly (Xavier-uniform weight, zero bias)

  Args:
    pretrained_model: encoder module. It must expose ``config.hidden_size`` and,
      when called with the keywords ``input_ids``, ``attention_mask`` and
      ``token_type_ids``, return an object with a ``last_hidden_state`` attribute.
    num_labels: number of output classes.
    dropout: dropout probability applied before the linear head.
  '''
  def __init__(self, pretrained_model, num_labels=2, dropout=0.1):
    super().__init__()
    self.bert = pretrained_model
    self.dropout = torch.nn.Dropout(dropout)
    self.classifier = torch.nn.Linear(self.bert.config.hidden_size, num_labels)
    # Xavier-uniform init of the head weight (keeps input/output variance comparable)
    torch.nn.init.xavier_uniform_(self.classifier.weight)
    # Zero-initialise the bias
    torch.nn.init.zeros_(self.classifier.bias)

  def forward(self, input_ids, attention_mask=None, token_type_ids=None):
    '''Return the unnormalised class logits for a batch.

    Args:
      input_ids: token ids passed to the encoder.
      attention_mask: optional; forwarded to the encoder unchanged. ``None``
        leaves the choice of mask to the encoder.
      token_type_ids: optional; forwarded to the encoder unchanged. ``None``
        leaves the choice of segment ids to the encoder.

    Returns:
      Output of the linear head applied to the hidden state at sequence position 0.
    '''
    outputs = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids
    )
    # Position 0 of the sequence axis (the [CLS] token for BERT inputs)
    cls_output = outputs.last_hidden_state[:, 0, :]
    cls_output = self.dropout(cls_output)
    return self.classifier(cls_output)

# Model initialisation
model = ImporovedBertClassifier(bert_model, num_labels=2)
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
model.to(device)

# DataLoaders: shuffled batches of 8 for training, ordered batches of 16 for testing
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=16)

# Optimizer and loss function
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,
    weight_decay=0.01,
)
# CrossEntropyLoss applies softmax itself
criterion = torch.nn.CrossEntropyLoss()
num_epochs = 3
model.train()
start = time.time()

# Training loop
for epoch in range(num_epochs):
  # Sum of the per-batch losses, reset at the start of every epoch
  total_loss = 0
  for step,batch in enumerate(train_loader):
    optimizer.zero_grad()
    # Move the batch tensors to the target device
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    token_type_ids = batch['token_type_ids'].to(device)
    labels = batch['labels'].to(device)
    # Forward pass and loss
    outputs = model(input_ids, attention_mask, token_type_ids)
    loss = criterion(outputs,labels)

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
  # Mean batch loss of this epoch. These two lines belong inside the epoch loop:
  # at the outer level only the final epoch's loss was reported, once, after training.
  epoch_loss = total_loss / len(train_loader)
  print(f'epoch : {epoch+1}  loss : {epoch_loss}')



