In [None]:
import warnings
warnings.filterwarnings('ignore')

import nltk
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from transformers import  AutoTokenizer,  AutoModelForSequenceClassification
import datasets

In [None]:
# 데이터 다운로드
# 라벨은 pos:1  neg:0
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews
reviews = [movie_reviews.raw(id) for id in movie_reviews.fileids()]
categoris = [ movie_reviews.categories(id)[0] for id in movie_reviews.fileids() ]
labels = [  1 if label == 'pos' else 0 for label in categoris   ]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews as the test split, stratified on the labels,
# with a fixed random_state so the split is repeatable.
(train_texts, test_texts,
 train_labels, test_labels) = train_test_split(
    reviews,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

In [None]:
# Tokenizer for the pretrained checkpoint
BERT_MODEL_NAME = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)


def _encode(texts):
    """Tokenize ``texts`` with truncation, padding, max_length=512 and PyTorch tensor output."""
    return tokenizer(
        texts,
        truncation=True,
        padding=True,
        return_tensors='pt',
        max_length=512,
    )


# Tokenize the train / test texts with identical settings.
train_encodings = _encode(train_texts)
test_encodings = _encode(test_texts)
# Show the input-id tensor shapes of both splits.
train_encodings['input_ids'].shape, test_encodings['input_ids'].shape


In [None]:
# Torch dataset wrapping the tokenizer output together with the labels
class MovieReviewDataset(torch.utils.data.Dataset):
    """Pairs pre-computed encodings with their labels.

    Args:
        encodings: mapping from field name to a value that is indexed by
            sample position; each indexed entry must support
            ``.clone().detach()``.
        labels: sequence of labels, indexed by sample position.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict holding entry ``idx`` of every encoding field plus a ``labels`` tensor."""
        sample = {}
        for field, values in self.encodings.items():
            # Detached copy, so the returned tensor does not alias the stored one.
            sample[field] = values[idx].clone().detach()
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
# Wrap each split's encodings and labels in the dataset class.
train_dataset = MovieReviewDataset(encodings=train_encodings, labels=train_labels)
test_dataset = MovieReviewDataset(encodings=test_encodings, labels=test_labels)

# Report how many samples each split holds.
print(f'훈련 샘플수 : {len(train_dataset)}')
print(f'테스트 샘플수 : {len(test_dataset)}')

In [None]:
# Pull the first item from the training dataset and show which fields it has.
first_sample = next(iter(train_dataset))
first_sample.keys()

In [None]:
# Load the pretrained checkpoint for sequence classification with two labels
model = AutoModelForSequenceClassification.from_pretrained(
    BERT_MODEL_NAME,
    num_labels=2,
)

# Total element count over all parameters, then the count restricted to
# parameters that have requires_grad set.
total_params = sum(p.numel() for p in model.parameters())
print(f'파라메터수 : {total_params}')
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'학습 가능한 파라메터 : {trainable_params} ')

In [None]:
!pip install evaluate

In [None]:
# Evaluation metric
import evaluate

accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the loaded accuracy metric.

    Args:
        eval_pred: a pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``accuracy.compute`` for the arg-max predictions
        against the given labels.
    """
    scores, references = eval_pred
    # Index of the largest score along the last axis is the predicted class.
    predicted_ids = np.argmax(scores, axis=-1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
# Options for the fine-tuning run.
training_args = TrainingArguments(
    output_dir='./results',
    report_to='none',  # turn off automatic W&B / TensorBoard reporting
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Evaluation and checkpoint saving are both set to once per epoch, and
    # the best model is reloaded when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# NOTE(review): the test split doubles as the evaluation set here, so
# best-checkpoint selection presumably sees test data - consider carving out
# a separate validation split.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Echo the key settings actually stored on the arguments object.
print(
    f'에포크 : {training_args.num_train_epochs}',
    f'배치크기 : {training_args.per_device_train_batch_size}',
    f'학습률 : {training_args.learning_rate}',
    sep='\n',
)


In [None]:
# Train the model
train_result = trainer.train()

# Report runtime and loss from the metrics attached to the training result.
train_metrics = train_result.metrics
print(f'총 학습시간 : {train_metrics["train_runtime"]}')
print(f'최종손실 : {train_metrics["train_loss"]}')