## IMDB 데이터셋을 이용한 감정 분석

In [None]:
# 필요 패키지 설치
!pip install nlp
!pip install transformers



In [None]:
# 라이브러리 임포트
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
from nlp import load_dataset
import torch
import numpy as np

In [None]:
# 데이터셋과 모델 로딩
!gdown https://drive.google.com/uc?id=11_M4ootuT7I1G0RlihcC0cA3Elqotlc-
dataset = load_dataset('csv',data_files = './imdbs.csv', split='train')

Downloading...
From: https://drive.google.com/uc?id=11_M4ootuT7I1G0RlihcC0cA3Elqotlc-
To: /content/imdbs.csv
  0% 0.00/132k [00:00<?, ?B/s]100% 132k/132k [00:00<00:00, 40.4MB/s]


Using custom data configuration default


In [None]:
# Display the loaded dataset (its features and number of rows).
dataset

Dataset(features: {'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}, num_rows: 100)

In [None]:
# Split the dataset into train and validation parts (30% held out).
# A fixed seed makes the shuffled split identical on every run, so results
# stay comparable after a kernel restart.
dataset = dataset.train_test_split(test_size = 0.3, seed = 42)
dataset

{'test': Dataset(features: {'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}, num_rows: 30),
 'train': Dataset(features: {'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}, num_rows: 70)}

In [None]:
# Pull both splits out of the split mapping in one statement.
train_set, test_set = dataset['train'], dataset['test']

In [None]:
# Load the pretrained 'bert-base-uncased' checkpoint into a sequence-classification model.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# Load the tokenizer.
# BertTokenizerFast is used instead of BertTokenizer.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [None]:
# 토크나이저를 이용하여 데이터셋 전처리 

In [None]:
# Tokenize the sentences; per the original note, [CLS] is added at the start
# and [SEP] at the end of each sentence.
def preprocess(data):
    """Tokenize the 'text' entry of ``data`` with the module-level ``tokenizer``.

    Padding and truncation are both enabled in the tokenizer call.

    Args:
        data: Mapping that provides the raw text under the 'text' key
            (presumably a batch of strings when called through a batched
            ``map`` -- confirm against the caller).

    Returns:
        The tokenizer's output for the given text.
    """
    texts = data['text']
    encoded = tokenizer(texts, padding = True, truncation = True)
    return encoded

In [None]:
# Run preprocess over the train and test sets (in that order), in batches of up to 128 rows.
train_set, test_set = [
    split.map(preprocess, batched = True, batch_size = 128)
    for split in (train_set, test_set)
]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Display the columns of the training set after tokenization.
train_set.features

{'attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'label': Value(dtype='int64', id=None),
 'text': Value(dtype='string', id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

In [None]:
# Use set_format to choose the output type ('torch') and the columns to expose
# on each split.
for _split in (train_set, test_set):
    _split.set_format('torch',
                      columns = ['input_ids','attention_mask','label'])

In [None]:
# Model training settings: batch size (used per device for both training and
# evaluation) and number of training epochs.
batch_size = 8
epochs = 2

In [None]:
# Learning-rate warmup length and weight decay.
# Warmup is derived from the real number of optimizer steps (about 10% of the
# run) instead of a fixed 500: this run has only 18 optimization steps in
# total, so a 500-step warmup would keep the learning rate in its ramp-up
# phase for the entire training and the model would barely learn.
# The step count assumes a single device and no gradient accumulation.
steps_per_epoch = -(-len(train_set) // batch_size)  # ceiling division
total_train_steps = steps_per_epoch * epochs
warmup_steps = max(1, total_train_steps // 10)
weight_decay = 0.01

In [None]:
# Define the training arguments: output/log directories, epoch count,
# per-device batch sizes, warmup length and weight decay.
training_args = TrainingArguments(
    output_dir = './results',
    num_train_epochs = epochs,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size= batch_size,
    warmup_steps = warmup_steps,
    weight_decay = weight_decay,
    # NOTE: evaluation during training is not enabled here; evaluation is run
    # explicitly after training finishes.
    logging_dir = './logs'
)

def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: Evaluation result exposing ``predictions`` (model logits)
            and ``label_ids`` (reference labels).

    Returns:
        dict with a single 'accuracy' entry in [0, 1].
    """
    logits = eval_pred.predictions
    # Some models return a tuple of outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    predicted = np.argmax(logits, axis = -1)
    return {'accuracy': float((predicted == eval_pred.label_ids).mean())}

# Define the trainer. compute_metrics makes evaluate() report accuracy in
# addition to the loss, which is the meaningful score for a classifier.
trainer = Trainer(model = model,
                  args = training_args,
                  train_dataset = train_set,
                  eval_dataset = test_set,
                  compute_metrics = compute_metrics)

In [None]:
# Start training.
trainer.train()

# Evaluate on the eval dataset; as the last expression, the metrics dict is
# displayed as the cell output.
trainer.evaluate()

***** Running training *****
  Num examples = 70
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 18


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 30
  Batch size = 8


{'epoch': 2.0,
 'eval_loss': 0.6446303725242615,
 'eval_runtime': 2.553,
 'eval_samples_per_second': 11.751,
 'eval_steps_per_second': 1.567}