<a href="https://colab.research.google.com/github/piabona/Natural-Language-Processing/blob/main/06_Word_embedding_%EA%B3%BC_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab only); DATA_PATH below points inside this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random
import os

def reset_seeds(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Hash randomisation is fixed when the interpreter starts, so this value
    # only takes effect in processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick a different algorithm on each run; turn it
    # off so repeated runs use the same kernels.
    torch.backends.cudnn.benchmark = False

In [None]:
# Directory on the mounted Drive that holds the dataset CSV.
DATA_PATH = "/content/drive/MyDrive/06_nlp/data/"
# Seed reused for Word2Vec, the K-fold split and reset_seeds().
SEED = 42

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [None]:
# Load the review dataset (columns: review text, sentiment label) and preview it.
df = pd.read_csv(f"{DATA_PATH}imdb_dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Encode the label as 1 for "positive" and 0 otherwise.
# The dtype guard keeps this cell idempotent: re-running it on the already
# encoded column would compare integers with "positive" and silently reset
# every label to 0.
if not pd.api.types.is_integer_dtype(df["sentiment"]):
    df["sentiment"] = (df["sentiment"] == "positive").astype(int)

In [None]:
# Labels as a column vector, one row per review; reshape(-1, 1) gives shape (n_samples, 1).
target = df.sentiment.to_numpy().reshape(-1,1)
target

array([[1],
       [1],
       [1],
       ...,
       [0],
       [0],
       [0]])

- 토큰화

In [None]:
import re
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS

In [None]:
# A token is a run of two or more word characters; one-character words never match.
pat = re.compile(r"\b\w{2,}\b")
def tokenizer(text,pat):
    """Split ``text`` into lower-cased tokens with English stop words removed.

    Args:
        text: Raw review string.
        pat: Compiled regular expression whose matches are the candidate tokens.

    Returns:
        List of tokens in their original order (duplicates kept).
    """
    return [
        token
        for token in pat.findall(text.lower())
        if token not in ENGLISH_STOP_WORDS
    ]

In [None]:
# Sanity-check the tokenizer on the first review.
tokenizer(df["review"][0],pat)

['reviewers',
 'mentioned',
 'watching',
 'just',
 'oz',
 'episode',
 'll',
 'hooked',
 'right',
 'exactly',
 'happened',
 'br',
 'br',
 'thing',
 'struck',
 'oz',
 'brutality',
 'unflinching',
 'scenes',
 'violence',
 'set',
 'right',
 'word',
 'trust',
 'faint',
 'hearted',
 'timid',
 'pulls',
 'punches',
 'regards',
 'drugs',
 'sex',
 'violence',
 'hardcore',
 'classic',
 'use',
 'word',
 'br',
 'br',
 'called',
 'oz',
 'nickname',
 'given',
 'oswald',
 'maximum',
 'security',
 'state',
 'penitentary',
 'focuses',
 'mainly',
 'emerald',
 'city',
 'experimental',
 'section',
 'prison',
 'cells',
 'glass',
 'fronts',
 'face',
 'inwards',
 'privacy',
 'high',
 'agenda',
 'em',
 'city',
 'home',
 'aryans',
 'muslims',
 'gangstas',
 'latinos',
 'christians',
 'italians',
 'irish',
 'scuffles',
 'death',
 'stares',
 'dodgy',
 'dealings',
 'shady',
 'agreements',
 'far',
 'away',
 'br',
 'br',
 'say',
 'main',
 'appeal',
 'fact',
 'goes',
 'shows',
 'wouldn',
 'dare',
 'forget',
 'pretty',

In [None]:
# Tokenize every review; docs[i] is the token list of the i-th review.
docs = [tokenizer(review, pat) for review in df["review"]]
len(docs)

50000

In [None]:
from gensim.models.word2vec import Word2Vec

In [None]:
# Train 64-dimensional skip-gram (sg=1) word embeddings on the tokenized reviews.
# NOTE(review): gensim documents that `seed` alone does not make training
# reproducible when several worker threads are used — consider workers=1 if
# identical embeddings across runs are required.
emb_model = Word2Vec(docs,vector_size=64,sg=1,seed=SEED)

# 학습 데이터 만들기
- batch, seq(단어순서), feature(단어벡터)

In [None]:
# Token count of every review, used to choose the padded sequence length.
len_list = list(map(len, docs))
max(len_list), min(len_list), np.mean(len_list)

(1304, 3, 110.85568)

In [None]:
# Fixed sequence length: the mean token count, truncated to an integer.
# Longer reviews are cut to this length and shorter ones are zero-padded below.
max_len = np.mean(len_list).astype(int)
max_len

110

In [None]:
# Build the model input: one (max_len, emb_dim) matrix of word vectors per review.
# The array is preallocated as float32 (the dtype of the embeddings) instead of
# concatenating float64 zero blocks, which halves memory; it also stays valid
# for a review with no in-vocabulary token, which becomes an all-zero row
# instead of raising IndexError on `vec.shape[1]`.
emb_dim = emb_model.wv.vector_size
train = np.zeros((len(docs), max_len, emb_dim), dtype=np.float32)
for row, doc in enumerate(tqdm(docs)):
    # Keep only tokens known to the embedding model, then the first max_len of them.
    vecs = [emb_model.wv[w] for w in doc if w in emb_model.wv.key_to_index][:max_len]
    if vecs:
        # Right-align the vectors so the zero padding comes first (pre-padding),
        # matching the original layout.
        train[row, max_len - len(vecs):] = np.stack(vecs)
train.shape

  0%|          | 0/50000 [00:00<?, ?it/s]

(50000, 110, 64)

- 이다음부터는 RNN 관련 레이어를 이용해서 PyTorch로 학습한 후에 교차검증 점수 결과를 저한테 알려주세요~~


# 데이터셋 클래스 만들기

In [None]:
class ReviewDataset(torch.utils.data.Dataset):
    """Dataset of embedded reviews with optional labels.

    Args:
        x: Indexable collection of per-review feature arrays.
        y: Optional indexable collection of labels aligned with ``x``.
            Leave as ``None`` for inference data.
    """
    def __init__(self,x,y=None):
        self.x = x
        self.y = y

    def __len__(self):
        """Return the number of reviews."""
        return len(self.x)

    def __getitem__(self,idx):
        """Return ``{"x": features}`` plus ``"y"`` when labels were given.

        Both values are float32 tensors. ``torch.as_tensor`` is used instead of
        the ``torch.Tensor(...)`` constructor because the constructor treats an
        integer scalar as a *size*, so 1-D label arrays would produce
        uninitialised tensors. When the source row is already float32 the
        returned tensor may share memory with it.
        """
        item = {"x": torch.as_tensor(self.x[idx], dtype=torch.float32)}
        if self.y is not None:
            item["y"] = torch.as_tensor(self.y[idx], dtype=torch.float32)
        return item

# 모델만들기(rnn 기반 레이어 활용하기)

In [None]:
class Net(torch.nn.Module):
    """LSTM encoder followed by a two-layer head that emits one logit per sequence.

    Args:
        input_size: Number of features per time step.
        hidden_size: LSTM hidden size; the head's inner layer uses half of it.
    """
    def __init__(self,input_size,hidden_size):
        super().__init__()
        head_size = hidden_size // 2
        self.rnn_layer = torch.nn.LSTM(input_size, hidden_size, batch_first=True)
        self.linear_layer = torch.nn.Linear(hidden_size, head_size)
        self.relu = torch.nn.ReLU()
        self.output_layer = torch.nn.Linear(head_size, 1)

    def forward(self,x):
        """Map a (batch, seq, feature) input to raw logits of shape (batch, 1)."""
        # Only the final hidden state is needed; per-step outputs and the cell
        # state are discarded.
        _, (hidden, _cell) = self.rnn_layer(x)
        # hidden is (n_layer, batch, feature); take the last layer.
        last_hidden = hidden[-1]
        return self.output_layer(self.relu(self.linear_layer(last_hidden)))

In [None]:
# Take the first two padded reviews as a small (batch, seq, feature) smoke-test batch.
batch = torch.Tensor(train[:2])
batch.shape

torch.Size([2, 110, 64])

In [None]:
# Forward the batch through an untrained model to check the output shape (one value per review).
Net(batch.shape[2],128)(batch)

tensor([[-0.1219],
        [-0.1153]], grad_fn=<AddmmBackward0>)

# 학습 loop 함수 만들기

In [None]:
def train_loop(dataloader,model,loss_fn,optimizer,device):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: Iterable of batches; each batch is a mapping with "x" and "y" tensors.
        model: Module to optimise.
        loss_fn: Callable taking (predictions, targets) and returning a scalar loss.
        optimizer: Optimizer bound to the model's parameters.
        device: Device the batch tensors are moved to.

    Returns:
        Mean of the per-batch loss values.
    """
    model.train()  # switch the model to training mode
    batch_losses = []
    for batch in dataloader:
        inputs = batch["x"].to(device)
        targets = batch["y"].to(device)
        loss = loss_fn(model(inputs), targets)

        # Clear gradients from the previous step before back-propagating.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(dataloader)

# 테스트 loop 함수 만들기


In [None]:
@torch.no_grad()
def test_loop(dataloader,model,loss_fn,device):
    model.eval() # 평가 모드
    sig = torch.nn.Sigmoid()
    pred_list = []
    epoch_loss = 0
    for batch in dataloader:
        pred = model(batch["x"].to(device))

        # 검증 평가할 경우
        if batch.get("y") is not None:
            loss = loss_fn(pred,batch["y"].to(device))
            epoch_loss += loss.item()

        # 예측값 만들기
        pred = sig(pred)
        pred = pred.to("cpu").numpy()
        pred_list.append(pred)

    pred = np.concatenate(pred_list)
    epoch_loss /= len(dataloader)
    return epoch_loss, pred

# 하이퍼파라미터 정의

In [None]:
batch_size = 32
loss_fn = torch.nn.BCEWithLogitsLoss() # loss object; consumes raw logits
epochs = 100 # upper bound per fold; the training loop stops early after 5 epochs without improvement
input_size = train.shape[2] # size of the last axis of the padded input (word-vector dimension)
hidden_size = 128
n_splits = 5 # number of folds

# 학습하기

In [None]:
from sklearn.metrics import accuracy_score
import gc

In [None]:
from sklearn.model_selection import KFold
# Shuffled K-fold splitter; the fixed random_state keeps fold membership the same across runs.
cv = KFold(n_splits=n_splits,random_state=SEED,shuffle=True)

In [None]:
# K-fold cross-validation with early stopping on validation accuracy.
# Set is_holdout = True to train on the first fold only (single hold-out split).
# NOTE(review): each fold reports its best validation accuracy, selected on that
# same validation fold, so the averaged score is optimistic.
is_holdout = False
reset_seeds(SEED)
best_score_list = []  # best validation accuracy of each fold
for i, (tri,vai) in enumerate(cv.split(train)):

    train_dt = ReviewDataset(train[tri],target[tri]) # training dataset
    valid_dt = ReviewDataset(train[vai],target[vai]) # validation dataset

    train_dl = torch.utils.data.DataLoader(train_dt,batch_size=batch_size,shuffle=True) # training data loader
    valid_dl = torch.utils.data.DataLoader(valid_dt,batch_size=batch_size,shuffle=False) # validation data loader

    # Fresh model and optimizer per fold so no weights carry over between folds.
    model = Net(input_size,hidden_size).to(device)
    optimizer = torch.optim.Adam(model.parameters())

    best_score = 0 # best validation accuracy seen so far in this fold
    patience = 0 # epochs since the last improvement (early-stopping counter)
    for epoch in range(epochs):
        train_loss = train_loop(train_dl,model,loss_fn,optimizer,device)
        valid_loss, pred = test_loop(valid_dl,model,loss_fn,device)
        pred = (pred > 0.5).astype(int)  # threshold probabilities into 0/1 labels
        score = accuracy_score(target[vai],pred)
        print(score)
        patience += 1
        if best_score < score:
            patience = 0
            best_score = score # update the best score
            torch.save(model.state_dict(), f"model_{i}.pth")  # checkpoint the best epoch's weights

        # Stop once 5 consecutive epochs have failed to beat the best score.
        if patience == 5:
            break

    print(f"{i} 번째 폴드 ACC: {best_score}")
    best_score_list.append(best_score)

    # Drop this fold's objects and release cached GPU memory before the next fold.
    del train_dl, train_dt, valid_dl, valid_dt, optimizer, model
    gc.collect()
    torch.cuda.empty_cache()
    if is_holdout:
        break

0.837
0.8623
0.8731
0.8758
0.8794
0.8817
0.8822
0.8827
0.8837
0.881
0.8795
0.853
0.8629
0.8674
0 번째 폴드 ACC: 0.8837
0.8498
0.8675
0.8742
0.8802
0.8647
0.8849
0.8858
0.888
0.881
0.875
0.8835
0.8723
0.8672
1 번째 폴드 ACC: 0.888
0.825
0.8601
0.8698
0.8528
0.8757
0.8782
0.8793
0.8796
0.8805
0.8786
0.8808
0.8788
0.8814
0.8776
0.88
0.8765
0.8777
0.8723
2 번째 폴드 ACC: 0.8814
0.8436
0.859
0.8728
0.8647
0.8795
0.8797
0.8816
0.8768
0.883
0.8806
0.8807
0.8819
0.8749
0.871
3 번째 폴드 ACC: 0.883
0.5036
0.8554
0.8673
0.8672
0.872
0.8662
0.875
0.8765
0.8799
0.8789
0.8807
0.8788
0.8786
0.8731
0.8758
0.867
4 번째 폴드 ACC: 0.8807


In [None]:
# Cross-validation score: mean of the per-fold best validation accuracies.
np.mean(best_score_list)

0.88336