In [1]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.




In [2]:
import os
import re
import torch
import numpy as np
import pandas as pd

from tqdm import trange
from torch import nn
from torch.optim import Adam
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from transformers import get_linear_schedule_with_warmup
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from transformers import BertForSequenceClassification, AdamW, BertConfig, BertModel
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [3]:
print(torch.cuda.is_available())

True


### 데이터 load

In [4]:
train = pd.read_csv('./dataset/train.csv')
test = pd.read_csv('./dataset/test.csv')

In [5]:
test

Unnamed: 0,id,reviews
0,0,채소가 약간 시들어 있어요
1,1,발톱 두껍고 단단한 분들 써도 소용없어요 이 테이프 물렁거리고 힘이없어서 들어 올리...
2,2,부들부들 좋네요 입어보고 시원하면 또 살게요
3,3,이런 1. 8 골드 주라니깐 파란개 오네 회사전화걸어도 받지도 않고 머하자는거임?
4,4,검수도 없이 보내구 불량 배송비 5000원 청구하네요 완전별로 별하나도 아까워요
...,...,...
24995,24995,사용해보니 좋아요~^^
24996,24996,저렴한가격에. 질좋고. 핏좋고. 너무. 이쁘게. 입고다녀요..
24997,24997,세트상품이라고 써있어서 그런줄 알고 구매했더니 단품이었네요 낚인 느낌도 들고 그러네...
24998,24998,역시 로네펠트!! 좋아요.


### 토크나이저 load

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

### 데이터 전처리

#### 1) 특수문자 제거

In [7]:
clean_train = []
clean_test = []

for train_txt, test_txt in zip(train.reviews, test.reviews):
    clean_train.append(re.sub('[^가-힣ㄱ-하-ㅣ\\s]', ' ', train_txt))

#### 2) 토큰 추가

In [8]:
train_add_token = ['[CLS] '+s+' [SEP]' for s in clean_train]

In [9]:
tokenized_train = [tokenizer.tokenize(i) for i in train_add_token]

### 분류 모델

In [11]:
input_ids = [tokenizer.convert_tokens_to_ids(i) for i in tokenized_train]

len_check = []

for i in input_ids:
    len_check.append(len(i))

max(len_check)

116

In [12]:
MAX_LEN = 128 # default

input_ids= pad_sequences(input_ids, maxlen=MAX_LEN, truncating='post', padding='post')

In [13]:
input_ids[0]

array([   101,   9678,  16985,  48549,  62849,  17196,  58303,   9496,
        14153, 119423,  12965,  48549,    102,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
      

In [15]:
attention_mask = []

for ids in input_ids:
    seq_mask = [float(i>0) for i in ids]
    attention_mask.append(seq_mask)

In [16]:
attention_mask[0]

[1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [24]:
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, train['target'].values, random_state=42, test_size=0.25)

In [31]:
train_mask, validation_mask, _, _ = train_test_split(attention_mask, input_ids, random_state=42, test_size=0.2)

In [32]:
BATCH_SIZE = 32

train_data = TensorDataset(torch.tensor(train_inputs), torch.tensor(train_labels), torch.tensor(train_mask))
train_data = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=BATCH_SIZE)

validation_data = TensorDataset(torch.tensor(validation_inputs), torch.tensor(validation_labels), torch.tensor(validation_mask))
validation_data = DataLoader(validation_data, sampler=RandomSampler(validation_data), batch_size=BATCH_SIZE)

AssertionError: Size mismatch between tensors

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [29]:
print(train_inputs.shape)
print(train_labels.shape)
print(len(train_mask))

(18750, 128)
(18750,)
20000


In [None]:
model = BertModel.from_pretrained('bert-base-multilingual-cased').to(device)

In [None]:
earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001,patience=2)

In [None]:
seed_val = 42
epochs = 20

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# 그래디언트 초기화
model.zero_grad()

# 에폭만큼 반복
for epoch_i in trange(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # 시작 시간 설정
    t0 = time.time()

    # 로스 초기화
    total_loss = 0

    # 훈련모드로 변경
    model.train()
        
    # 데이터로더에서 배치만큼 반복하여 가져옴
    for step, batch in enumerate(train_dataloader):
        # 경과 정보 표시
        if step % 500 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)
        
        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_labels = batch

        # Forward 수행                
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
        
        # 로스 구함
        loss = outputs[0]

        # 총 로스 계산
        total_loss += loss.item()

        # Backward 수행으로 그래디언트 계산
        loss.backward()

        # 그래디언트 클리핑
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # 그래디언트를 통해 가중치 파라미터 업데이트
        optimizer.step()

        # 스케줄러로 학습률 감소
        scheduler.step()

        # 그래디언트 초기화
        model.zero_grad()

    # 평균 로스 계산
    avg_train_loss = total_loss / len(train_dataloader)            

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    #시작 시간 설정
    t0 = time.time()

    # 평가모드로 변경
    model.eval()

    # 변수 초기화
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # 데이터로더에서 배치만큼 반복하여 가져옴
    for batch in validation_dataloader:
        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)
        
        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_labels = batch
        
        # 그래디언트 계산 안함
        with torch.no_grad():     
            # Forward 수행
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # 로스 구함
        logits = outputs[0]

        # CPU로 데이터 이동
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # 출력 로짓과 라벨을 비교하여 정확도 계산
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")
print("")