#### 문장 분류 모델 생성 및 학습 <hr>

In [21]:
import torch.nn as nn
from typing import Literal

[1] 커스텀 모델 생성<hr>

In [22]:
# 모델 흐름
# 1. 입력받은 값을 통해 임베딩 값 얻음
# 2. 임베딩 값을 통해 출력값 얻음
# 3. 출력값의 마지막 시점만을 활용하여 분류

class textCLF(nn.Module):
    """Binary sentence classifier: embedding -> RNN/LSTM -> dropout -> linear.

    The forward pass embeds the token ids, runs the embeddings through the
    recurrent network, keeps only the output of the final time step and maps
    it to a single logit per sequence.

    Args:
        n_vocab: Size of the vocabulary (number of embedding rows).
        hidden_dim: Hidden size of the recurrent layer.
        embedding_dim: Dimension of each token embedding.
        n_layers: Number of stacked recurrent layers.
        model_type: ``'rnn'`` selects ``nn.RNN``, ``'lstm'`` selects ``nn.LSTM``.
        dropout: Dropout probability used between stacked recurrent layers
            and on the features fed to the classifier.
        bidirectional: Whether the recurrent layer runs in both directions.

    Raises:
        ValueError: If ``model_type`` is neither ``'rnn'`` nor ``'lstm'``.
    """

    def __init__(self, n_vocab, hidden_dim, embedding_dim, n_layers,
                 model_type: Literal['lstm', 'rnn'], dropout=0.5, bidirectional=True):
        super().__init__()

        # Fail fast: without this check an unknown model_type would leave
        # self.model undefined and only blow up later inside forward().
        recurrent_layers = {'rnn': nn.RNN, 'lstm': nn.LSTM}
        if model_type not in recurrent_layers:
            raise ValueError(
                f"model_type must be 'rnn' or 'lstm', got {model_type!r}"
            )

        # Embedding layer; num_embeddings is the vocabulary size and index 0
        # is declared as the padding index.
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=embedding_dim,
            padding_idx=0
        )

        # Inter-layer dropout only exists between stacked layers; passing a
        # non-zero value with a single layer has no effect and makes PyTorch
        # emit a warning, so it is disabled in that case.
        recurrent_dropout = dropout if n_layers > 1 else 0.0
        self.model = recurrent_layers[model_type](
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=recurrent_dropout,
            batch_first=True
        )

        # A bidirectional network concatenates both directions, doubling the
        # feature size seen by the classifier.
        if bidirectional:
            self.classifier = nn.Linear(hidden_dim * 2, 1)
        else:
            self.classifier = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(dropout)

    # Forward pass
    def forward(self, input):
        """Return one logit per sequence for a batch of token-id sequences."""
        embeddings = self.embedding(input)
        # The second return value (final hidden/cell state) is not used.
        output, _ = self.model(embeddings)
        # batch_first=True, so axis 1 is time: keep only the last time step.
        # NOTE(review): the padding() helper in this file pads on the right,
        # so for short sequences this step corresponds to padding tokens --
        # confirm this is intended.
        last_output = output[:, -1, :]
        last_output = self.dropout(last_output)
        logits = self.classifier(last_output)
        return logits

[2] 데이터 불러오기<hr>

In [23]:
import pandas as pd
from Korpora import Korpora
from torch.utils.data import Dataset, DataLoader

In [24]:
# Load the NSMC corpus through Korpora and build a DataFrame from its
# `test` portion only.
corpus= Korpora.load('nsmc')
corpusDF=pd.DataFrame(corpus.test)

# Split into train / test: a reproducible 90% random sample is used for
# training and the remaining rows (by index) are held out.
# NOTE: the names `train` and `test` are rebound to functions further down in
# this file (In [31]); cells that use these DataFrames must run before that.
train=corpusDF.sample(frac=0.9, random_state=42)
test=corpusDF.drop(train.index)

# Displayed as the cell result: number of rows in each split.
len(train), len(test)



    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at C:\Users\KDP-25\Korpora\nsmc\ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at C:\Users\KD

(45000, 5000)

In [25]:
# 토큰화 및 단어사전 구축
from collections import Counter
import re
from konlpy.tag import Okt

def build_vocab(corpus, n_vocab, special_tokens):
    """Build a vocabulary list from tokenized sentences.

    Args:
        corpus: Iterable of token sequences (one sequence per sentence).
        n_vocab: Maximum number of most frequent tokens to keep.
        special_tokens: Tokens placed at the front of the vocabulary, in
            the given order. The argument itself is not modified.

    Returns:
        A new list containing the special tokens followed by up to
        ``n_vocab`` corpus tokens in descending order of frequency.
    """
    counter = Counter()
    for tokens in corpus:
        counter.update(tokens)

    # Copy so the caller's list is not extended in place.
    vocab = list(special_tokens)
    vocab.extend(token for token, _ in counter.most_common(n_vocab))
    return vocab
# KoNLPy Okt analyzer, used here as the tokenizer.
tokenizer = Okt()

# Tokenization: split every review of both splits into morphemes.
train_tokens = list(map(tokenizer.morphs, train.text))
test_tokens = list(map(tokenizer.morphs, test.text))

# The vocabulary comes from the training tokens only: the two special tokens
# first, then up to 5000 of the most frequent tokens.
vocab = build_vocab(
    corpus=train_tokens,
    n_vocab=5000,
    special_tokens=['<PAD>', '<UNK>'],
)

# Lookup tables in both directions (id -> token and token -> id).
id_to_token = dict(enumerate(vocab))
token_to_id = {token: idx for idx, token in id_to_token.items()}


In [26]:
# 정수 인코딩 및 패딩
import numpy as np

def padding(sequences, max_length, pad_value):
    """Truncate or right-pad every sequence to exactly ``max_length`` items.

    Args:
        sequences: Iterable of lists of ids.
        max_length: Target length of every row.
        pad_value: Value appended to rows shorter than ``max_length``.

    Returns:
        The fixed-length rows converted with ``np.asarray``.
    """
    # Cut long rows first, then fill whatever is still missing on the right.
    clipped = (seq[:max_length] for seq in sequences)
    rows = [seq + [pad_value] * (max_length - len(seq)) for seq in clipped]
    return np.asarray(rows)

# Ids of the two special tokens and the fixed sequence length.
unk_id = token_to_id['<UNK>']
pad_id = token_to_id['<PAD>']
max_length = 32

# Integer-encode each review (tokens missing from the vocabulary map to
# unk_id), then truncate / right-pad every review to max_length.
train_ids = padding(
    [[token_to_id.get(token, unk_id) for token in review] for review in train_tokens],
    max_length,
    pad_id,
)
test_ids = padding(
    [[token_to_id.get(token, unk_id) for token in review] for review in test_tokens],
    max_length,
    pad_id,
)


In [27]:
# Inspect the shapes of the padded id matrix and of the label tensor.
# NOTE: train_labels is first assigned in the following cell (In [28]); when
# the cells are run strictly top to bottom this line raises NameError.
train_ids.shape, train_labels.shape

((45000, 32), torch.Size([45000]))

In [28]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Convert the padded id arrays to tensors.
train_ids = torch.tensor(train_ids)
test_ids = torch.tensor(test_ids)

# Labels are stored as float32, matching the BCEWithLogitsLoss criterion
# used for training in this file.
train_labels = torch.tensor(train.label.values, dtype=torch.float32)
test_labels = torch.tensor(test.label.values, dtype=torch.float32)

# Pair every id sequence with its label.
train_dataset = TensorDataset(train_ids, train_labels)
test_dataset = TensorDataset(test_ids, test_labels)

# Only the training data is shuffled. The evaluation loop averages per-batch
# losses, so a fixed batch composition keeps the reported validation loss
# reproducible from run to run.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [29]:
# 손실 및 최적화 함수
import torch.nn as nn
from torch import optim

# Model hyper-parameters.
n_vocab = len(token_to_id)
hidden_dim = 64
embedding_dim = 128
n_layers = 2

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# LSTM-based sentence classifier, moved to the selected device.
classifier = textCLF(
    n_vocab=n_vocab,
    hidden_dim=hidden_dim,
    embedding_dim=embedding_dim,
    n_layers=n_layers,
    model_type='lstm',
)
classifier = classifier.to(device)

# Binary cross-entropy on raw logits, optimized with RMSprop.
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.RMSprop(classifier.parameters(), lr=0.001)


In [31]:
def train(model, datasets, criterion, optimizer, device, interval):
    """Run one training pass over ``datasets``, printing the running loss.

    Args:
        model: Module to optimize; it is switched to training mode.
        datasets: Iterable yielding ``(input_ids, labels)`` batches.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches are moved to.
        interval: The running mean loss is printed every ``interval`` steps.
    """
    model.train()
    batch_losses = []
    for step, batch in enumerate(datasets):
        token_ids, targets = batch
        token_ids = token_ids.to(device)
        # Add a trailing axis so the targets line up with the (N, 1) logits.
        targets = targets.to(device).unsqueeze(1)

        loss = criterion(model(token_ids), targets)
        batch_losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % interval != 0:
            continue
        # Mean of all batch losses seen so far in this pass.
        print(f'train_loss {step}: {np.mean(batch_losses)}')

def test(model, datasets, criterion, device):
    model.eval()
    losses=[]
    corrects=[]

    for step, (input_ids, labels) in enumerate(datasets):
        input_ids= input_ids.to(device)
        labels=labels.to(device).unsqueeze(1)

        logits=model(input_ids)
        loss= criterion(logits, labels)
        losses.append(loss.item())
        yhat=torch.sigmoid(logits)>.5
        corrects.extend(torch.eq(yhat, labels).cpu().tolist())
    print(f'Val loss: {np.mean(losses)}, accuracy: {np.mean(corrects)}')

# Number of passes over the training data, and how often (in steps) the
# running training loss is printed.
epoch=5
interval=500
# Each epoch runs one training pass on train_loader, then prints the loss and
# accuracy measured on test_loader.
for ep in range(epoch):
    train(classifier, train_loader, criterion, optimizer, device, interval)
    test(classifier, test_loader, criterion, device)
    

train_loss 0: 0.33660516142845154
train_loss 500: 0.5284082805384657
train_loss 1000: 0.5144564527255314
train_loss 1500: 0.5061360617167627
train_loss 2000: 0.500360908522003
train_loss 2500: 0.4909531065269929
Val loss: 0.4538654964952804, accuracy: 0.7912
train_loss 0: 0.33634525537490845
train_loss 500: 0.40618007601854095
train_loss 1000: 0.41413508829745377
train_loss 1500: 0.411471781484053
train_loss 2000: 0.4117932712462829
train_loss 2500: 0.41090639629086606
Val loss: 0.42927986450088673, accuracy: 0.8108
train_loss 0: 0.415261834859848
train_loss 500: 0.3814060431189404


KeyboardInterrupt: 