<a href="https://colab.research.google.com/github/miiiingi/dacon_sentiment/blob/main/dacon_sentimental_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/gdrive/')
my_folder = '/gdrive/MyDrive/ColabNotebooks/dacon_senti/dataset/dataset'

Mounted at /gdrive/


In [2]:
import pandas as pd
import numpy as np
!pip install konlpy
from konlpy.tag import Mecab
from konlpy.tag import Okt
from tqdm import tqdm
from torchtext.legacy import data, datasets
import torch
import random
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 1.1 MB/s 
Collecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 70.4 MB/s 
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.3.0 konlpy-0.6.0


Torchtext - Field 정의

In [3]:
okt = Okt()
ID = data.Field(use_vocab=False, sequential=False, batch_first= True)
TEXT = data.Field(use_vocab = True, tokenize=okt.morphs, sequential= True, batch_first = True)
LABEL = data.Field(use_vocab= False, sequential= False, is_target=True, batch_first= True)
fields = [('id', ID), ('document',TEXT), ('label',LABEL)]

Torchtext - 데이터셋 불러와서 정의

In [4]:
train_data, test_data = data.TabularDataset.splits(
                            path = f'{my_folder}/',
                            train = 'train.csv',
                            test = 'test.csv',
                            format = 'csv',
                            fields = fields,
                            skip_header = True,
)
train_data, valid_data = train_data.split(random_state=random.seed(722), split_ratio = 0.5)

Torchtext - 단어 집합 만들기

In [5]:
TEXT.build_vocab(train_data, min_freq=5) # 단어 집합 생성
LABEL.build_vocab(train_data)

Torchtext - 배치화 시키기

In [6]:
train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train_data, valid_data, test_data), batch_size=32, sort_key = lambda x: len(x.document), sort_within_batch = True,
        shuffle=True, repeat=False)

GRU Model 정의

In [7]:
from torch import nn as nn 
import torch.nn.functional as F
class GRU(nn.Module):
    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p=0.2):
        super(GRU, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(embed_dim, self.hidden_dim,
                          num_layers=self.n_layers * 2,
                          batch_first=True,
                          bidirectional = True)
        self.out = nn.Linear(self.hidden_dim * 2, n_classes)

    def forward(self, x):
        x = self.embed(x)
        # h_0 = self._init_state(batch_size=x.size(0)) # 첫번째 히든 스테이트를 0벡터로 초기화
        x, _ = self.gru(x)  # GRU의 리턴값은 (배치 크기, 시퀀스 길이, 은닉 상태의 크기)
        h_t = x[:,-1,:] # (배치 크기, 은닉 상태의 크기)의 텐서로 크기가 변경됨. 즉, 마지막 time-step의 은닉 상태만 가져온다.
        self.dropout(h_t)
        logit = self.out(h_t)  # (배치 크기, 은닉 상태의 크기) -> (배치 크기, 출력층의 크기)
        return logit

    def _init_state(self, batch_size=1):
        weight = next(self.parameters()).data
        return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_()

In [8]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = GRU(1, 256, len(TEXT.vocab), 128, 2, 0.5).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [9]:
def train(model, optimizer, train_iter):
    model.train()
    for b, batch in enumerate(train_iter):
        x, y = batch.document.to(DEVICE), batch.label.to(DEVICE)
        optimizer.zero_grad()
        logit = model(x)
        loss = F.cross_entropy(logit, y)
        loss.backward()
        optimizer.step()

In [10]:
def evaluate(model, val_iter):
    """evaluate model"""
    model.eval()
    corrects, total_loss = 0, 0
    for batch in val_iter:
        x, y = batch.document.to(DEVICE), batch.label.to(DEVICE)
        logit = model(x)
        loss = F.cross_entropy(logit, y, reduction='sum')
        total_loss += loss.item()
        corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()
    size = len(val_iter.dataset)
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy

In [None]:
best_val_loss = None
for e in tqdm(range(300)):
    train(model, optimizer, train_iter)
    val_loss, val_accuracy = evaluate(model, val_iter)

    print("[Epoch: %d] val loss : %5.2f | val accuracy : %5.2f" % (e+1, val_loss, val_accuracy))

    # 검증 오차가 가장 적은 최적의 모델을 저장
    if not best_val_loss or val_loss < best_val_loss:
        if not os.path.isdir("/gdrive/MyDrive/ColabNotebooks/dacon_senti/snapshot"):
            os.makedirs("/gdrive/MyDrive/ColabNotebooks/dacon_senti/snapshot")
        torch.save(model.state_dict(), f'/gdrive/MyDrive/ColabNotebooks/dacon_senti/snapshot/sentiment.pt')
        best_val_loss = val_loss

  0%|          | 1/300 [00:07<37:45,  7.58s/it]

[Epoch: 1] val loss :  0.59 | val accuracy : 72.00


  1%|          | 2/300 [00:15<38:05,  7.67s/it]

[Epoch: 2] val loss :  0.50 | val accuracy : 77.00


  1%|          | 3/300 [00:22<37:58,  7.67s/it]

[Epoch: 3] val loss :  0.52 | val accuracy : 77.00


  1%|▏         | 4/300 [00:30<37:35,  7.62s/it]

[Epoch: 4] val loss :  0.53 | val accuracy : 75.68


  2%|▏         | 5/300 [00:38<37:21,  7.60s/it]

[Epoch: 5] val loss :  0.58 | val accuracy : 76.60


  2%|▏         | 6/300 [00:45<37:09,  7.58s/it]

[Epoch: 6] val loss :  0.55 | val accuracy : 77.52


  2%|▏         | 7/300 [00:53<36:52,  7.55s/it]

[Epoch: 7] val loss :  0.56 | val accuracy : 77.84


  3%|▎         | 8/300 [01:00<36:53,  7.58s/it]

[Epoch: 8] val loss :  0.58 | val accuracy : 77.72


  3%|▎         | 9/300 [01:08<36:53,  7.61s/it]

[Epoch: 9] val loss :  0.62 | val accuracy : 78.40


  3%|▎         | 10/300 [01:16<36:51,  7.62s/it]

[Epoch: 10] val loss :  0.77 | val accuracy : 74.04


  4%|▎         | 11/300 [01:23<36:33,  7.59s/it]

[Epoch: 11] val loss :  0.67 | val accuracy : 78.00


  4%|▍         | 12/300 [01:31<36:22,  7.58s/it]

[Epoch: 12] val loss :  0.85 | val accuracy : 76.56


  4%|▍         | 13/300 [01:38<36:01,  7.53s/it]

[Epoch: 13] val loss :  0.73 | val accuracy : 77.76


  5%|▍         | 14/300 [01:46<35:48,  7.51s/it]

[Epoch: 14] val loss :  0.77 | val accuracy : 78.04


  5%|▌         | 15/300 [01:53<35:39,  7.51s/it]

[Epoch: 15] val loss :  0.80 | val accuracy : 77.44


  5%|▌         | 16/300 [02:01<35:29,  7.50s/it]

[Epoch: 16] val loss :  0.92 | val accuracy : 77.80


  6%|▌         | 17/300 [02:08<35:27,  7.52s/it]

[Epoch: 17] val loss :  0.99 | val accuracy : 78.60


  6%|▌         | 18/300 [02:16<35:28,  7.55s/it]

[Epoch: 18] val loss :  1.05 | val accuracy : 77.04


  6%|▋         | 19/300 [02:23<35:19,  7.54s/it]

[Epoch: 19] val loss :  1.01 | val accuracy : 77.84


  7%|▋         | 20/300 [02:31<35:06,  7.52s/it]

[Epoch: 20] val loss :  1.10 | val accuracy : 78.20


  7%|▋         | 21/300 [02:38<34:56,  7.51s/it]

[Epoch: 21] val loss :  1.22 | val accuracy : 77.52


  7%|▋         | 22/300 [02:46<34:58,  7.55s/it]

[Epoch: 22] val loss :  1.24 | val accuracy : 78.20


  8%|▊         | 23/300 [02:53<34:44,  7.53s/it]

[Epoch: 23] val loss :  1.12 | val accuracy : 77.04


  8%|▊         | 24/300 [03:01<34:37,  7.53s/it]

[Epoch: 24] val loss :  1.05 | val accuracy : 77.68


  8%|▊         | 25/300 [03:08<34:35,  7.55s/it]

[Epoch: 25] val loss :  1.13 | val accuracy : 77.84


  9%|▊         | 26/300 [03:16<34:34,  7.57s/it]

[Epoch: 26] val loss :  1.28 | val accuracy : 77.56


  9%|▉         | 27/300 [03:24<34:30,  7.58s/it]

[Epoch: 27] val loss :  1.34 | val accuracy : 77.60


  9%|▉         | 28/300 [03:31<34:24,  7.59s/it]

[Epoch: 28] val loss :  1.35 | val accuracy : 77.52
