In [2]:
import warnings
warnings.filterwarnings('ignore')

import nltk
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from transformers import  AutoTokenizer,  AutoModelForSequenceClassification
import datasets

In [3]:
# Download the data
# Labels: pos -> 1, neg -> 0
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews

# Fetch the file ids once and reuse them so reviews and categories stay aligned.
file_ids = movie_reviews.fileids()
reviews = [movie_reviews.raw(file_id) for file_id in file_ids]
# Only the first category reported for each file is kept.
categoris = [movie_reviews.categories(file_id)[0] for file_id in file_ids]
labels = [int(category == 'pos') for category in categoris]

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing; stratify on the labels and fix the
# random state so the split is repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    reviews,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

In [5]:
# Tokenizer
BERT_MODEL_NAME = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)

# Tokenize the train/test texts with one shared set of options so both
# splits are encoded the same way.
tokenize_options = dict(
    truncation=True,
    padding=True,
    return_tensors='pt',
    max_length=512,
)
train_encodings = tokenizer(train_texts, **tokenize_options)
test_encodings = tokenizer(test_texts, **tokenize_options)

# Display the input_ids shapes of both splits as the cell output.
train_encodings['input_ids'].shape, test_encodings['input_ids'].shape


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

(torch.Size([1600, 512]), torch.Size([400, 512]))

In [6]:
# Build the torch dataset
class MovieReviewDataset(torch.utils.data.Dataset):
  """Dataset that pairs tokenizer encodings with their labels.

  Args:
    encodings: Mapping (anything exposing ``.items()``) from field name to an
      indexable tensor; each indexed value is cloned and detached per sample.
    labels: Indexable sequence of labels; its length defines the dataset size.
  """

  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __len__(self):
    """Return the number of samples, taken from the labels."""
    return len(self.labels)

  def __getitem__(self, idx):
    """Return a dict with every encoding field at ``idx`` plus a ``labels`` tensor."""
    sample = {}
    for field, values in self.encodings.items():
      # Copy the row so callers cannot modify the stored encodings.
      sample[field] = values[idx].clone().detach()
    sample['labels'] = torch.tensor(self.labels[idx])
    return sample
# Wrap each tokenized split together with its labels, then report the sizes.
train_dataset = MovieReviewDataset(encodings=train_encodings, labels=train_labels)
test_dataset = MovieReviewDataset(encodings=test_encodings, labels=test_labels)
print(f'훈련 샘플수 : {len(train_dataset)}')
print(f'테스트 샘플수 : {len(test_dataset)}')

훈련 샘플수 : 1600
테스트 샘플수 : 400


In [7]:
# Pull the first sample from the training dataset and show which fields it carries.
first_sample = next(iter(train_dataset))
first_sample.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [8]:
# Load the model
model = AutoModelForSequenceClassification.from_pretrained(
    BERT_MODEL_NAME,
    num_labels=2,
)

# Count all parameters, and separately those that require gradients.
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'파라메터수 : {total_params}')
print(f'학습 가능한 파라메터 : {trainable_params} ')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


파라메터수 : 109483778
학습 가능한 파라메터 : 109483778 


In [9]:
!pip install evaluate



In [10]:
# Evaluation metric
import evaluate
# Load the "accuracy" metric object; compute_metrics calls accuracy.compute on it.
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
  """Score one evaluation pass with the module-level ``accuracy`` metric.

  Args:
    eval_pred: A pair that unpacks into ``(logits, labels)``.

  Returns:
    The result of ``accuracy.compute`` for the argmax predictions vs. the labels.
  """
  scores, references = eval_pred
  # The predicted class is the index of the largest score along the last axis.
  predicted_classes = np.argmax(scores, axis=-1)
  return accuracy.compute(predictions=predicted_classes, references=references)

Downloading builder script: 0.00B [00:00, ?B/s]

In [11]:
from transformers import TrainingArguments, Trainer

In [14]:
# Training configuration: evaluation and checkpoint saving both use the
# "epoch" strategy, and the best checkpoint is reloaded once training ends.
training_config = dict(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
training_args = TrainingArguments(**training_config)

# NOTE(review): the test split is used as the evaluation set, so best-checkpoint
# selection sees test data - confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
