<a href="https://colab.research.google.com/github/miiiingi/dacon_sentiment/blob/main/dacon_sentimental_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/gdrive/')
my_folder = '/gdrive/MyDrive/ColabNotebooks/dacon_senti/dataset/dataset'

Mounted at /gdrive/


In [2]:
import pandas as pd
import numpy as np
!pip install konlpy
from konlpy.tag import Mecab
from konlpy.tag import Okt
from tqdm import tqdm
from torchtext.legacy import data, datasets
import torch
import random
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
SEED = 722
def seed_all(seed_value):
    random.seed(seed_value) # Python
    np.random.seed(seed_value) # cpu vars
    torch.manual_seed(seed_value) # cpu  vars
    
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value) # gpu vars
        torch.backends.cudnn.deterministic = True  #needed
        torch.backends.cudnn.benchmark = False

seed_all(SEED)

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 1.2 MB/s 
Collecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 73.6 MB/s 
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.3.0 konlpy-0.6.0


Torchtext - Field 정의

In [36]:
okt = Okt()
ID = data.Field(use_vocab=False, batch_first= True)
TEXT = data.Field(use_vocab = True, tokenize=okt.morphs, batch_first = True)
LABEL = data.LabelField(use_vocab= False, is_target=True, batch_first= True, dtype = torch.float)
fields = [('id', ID), ('document',TEXT), ('label',LABEL)]

ID_te = data.Field(use_vocab=False, batch_first= True)
TEXT_te = data.Field(use_vocab = True, tokenize=okt.morphs, batch_first = True)
fields_te = [('id', ID_te), ('document',TEXT_te)]

Torchtext - 데이터셋 불러와서 정의

In [48]:
train_data, test_data = data.TabularDataset.splits(
                            path = f'{my_folder}/',
                            train = 'train.csv',
                            test = 'test.csv',
                            format = 'csv',
                            fields = fields,
                            skip_header = True,
)
train_data, valid_data = train_data.split(random_state=random.seed(722), split_ratio = 0.5)
# train_data, valid_data = train_data[0].split(random_state=random.seed(722), split_ratio = 0.5)

In [38]:
test_data = data.TabularDataset.splits(
                            path = f'{my_folder}/',
                            train = 'test.csv',
                            format = 'csv',
                            fields = fields_te,
                            skip_header = True,
)
test_data = test_data[0]

In [39]:
print(vars(test_data[1]))
print(vars(train_data[0]))

{'id': ['2'], 'document': ['훈훈한', '정이', '느껴지는', '영화', '!', '가족', '끼리', '드라마', '보듯이', '보면', '딱', '~!']}
{'id': ['1242'], 'document': ['한화', '한화', '가', '한편', '의', '영화', '같아서', '재밌네요', '!'], 'label': '1'}


Torchtext - 단어 집합 만들기

In [40]:
TEXT.build_vocab(train_data, min_freq=5) # 단어 집합 생성
LABEL.build_vocab(train_data)

In [41]:
TEXT_te.build_vocab(test_data, min_freq=5) # 단어 집합 생성

Torchtext - 배치화 시키기

In [42]:
train_iter, val_iter = data.BucketIterator.splits(
        (train_data, valid_data), batch_size=32, sort_key = lambda x: len(x.document), sort_within_batch = True,
        shuffle=True, repeat=False)

In [43]:
test_iter = data.BucketIterator.splits(
        (test_data), batch_size=32, sort_key = lambda x: len(x.document), sort_within_batch = True,
        shuffle=True, repeat=False)

In [44]:
print(dir(train_iter))
print(type(train_iter))
print(dir(train_iter.data))
print(type(val_iter))
print(type(test_iter))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_iterations_this_epoch', '_random_state_this_epoch', '_restored_from_state', 'batch_size', 'batch_size_fn', 'create_batches', 'data', 'dataset', 'device', 'epoch', 'init_epoch', 'iterations', 'load_state_dict', 'random_shuffler', 'repeat', 'shuffle', 'sort', 'sort_key', 'sort_within_batch', 'splits', 'state_dict', 'train']
<class 'torchtext.legacy.data.iterator.BucketIterator'>
['__call__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__func__', '__ge__', '__get__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__ne__', '__new__', '__reduce__', 

In [45]:
print(dir(train_iter))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_iterations_this_epoch', '_random_state_this_epoch', '_restored_from_state', 'batch_size', 'batch_size_fn', 'create_batches', 'data', 'dataset', 'device', 'epoch', 'init_epoch', 'iterations', 'load_state_dict', 'random_shuffler', 'repeat', 'shuffle', 'sort', 'sort_key', 'sort_within_batch', 'splits', 'state_dict', 'train']


In [46]:
print(dir(next(iter(train_iter))))
print(val_iter)
print(dir(next(iter(test_iter))))

ValueError: ignored

GRU Model 정의

In [None]:
from torch import nn as nn 
import torch.nn.functional as F
class GRU(nn.Module):
    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p=0.2):
        super(GRU, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(embed_dim, self.hidden_dim,
                          num_layers=self.n_layers * 2,
                          batch_first=True,
                          bidirectional = True)
        self.out = nn.Linear(self.hidden_dim * 2, n_classes)

    def forward(self, x):
        x = self.embed(x)
        # h_0 = self._init_state(batch_size=x.size(0)) # 첫번째 히든 스테이트를 0벡터로 초기화
        x, _ = self.gru(x)  # GRU의 리턴값은 (배치 크기, 시퀀스 길이, 은닉 상태의 크기)
        h_t = x[:,-1,:] # (배치 크기, 은닉 상태의 크기)의 텐서로 크기가 변경됨. 즉, 마지막 time-step의 은닉 상태만 가져온다.
        self.dropout(h_t)
        logit = self.out(h_t)  # (배치 크기, 은닉 상태의 크기) -> (배치 크기, 출력층의 크기)
        return logit

    def _init_state(self, batch_size=1):
        weight = next(self.parameters()).data
        return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_()

In [None]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = GRU(1, 256, len(TEXT.vocab), 128, 2, 0.3).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [None]:
def train(model, optimizer, train_iter):
    model.train()
    for b, batch in enumerate(train_iter):
        x, y = batch.document.to(DEVICE), batch.label.to(DEVICE)
        print(batch)
        print(x)
        print(y)
        optimizer.zero_grad()
        logit = model(x)
        loss = F.cross_entropy(logit, y)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
        loss.backward()
        optimizer.step()

In [None]:
def evaluate(model, val_iter):
    """evaluate model"""
    model.eval()
    corrects, total_loss = 0, 0
    for batch in val_iter:
        x, y = batch.document.to(DEVICE), batch.label.to(DEVICE)
        logit = model(x)
        loss = F.cross_entropy(logit, y, reduction='sum')
        total_loss += loss.item()
        corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()
    size = len(val_iter.dataset)
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy

In [None]:
def evaluate_test(model, val_iter):
    """test model"""
    model.eval()
    corrects, total_loss = 0, 0
    print(next(iter(val_iter)))
    for batch in val_iter:
        print(dir(batch))

        x = batch.document.to(DEVICE)
        logit = model(x)
        print(logit)
    #     loss = F.cross_entropy(logit, y, reduction='sum')
    #     total_loss += loss.item()
    #     corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()
    # size = len(val_iter.dataset)
    # avg_loss = total_loss / size
    # avg_accuracy = 100.0 * corrects / size
    # return avg_loss, avg_accuracy

In [None]:
best_val_loss = None
val_accuracy_accum = 0
Epochs = 10
for e in tqdm(range(Epochs)):
    train(model, optimizer, train_iter)
    train_loss, train_accuracy = evaluate(model, train_iter)
    val_loss, val_accuracy = evaluate(model, val_iter)
    val_accuracy_accum += val_accuracy

    print("[Epoch: %d] train loss : %5.2f | train acc : %5.2f | val loss : %5.2f | val accuracy : %5.2f" % (e+1, train_loss, train_accuracy, val_loss, val_accuracy))

    # 검증 오차가 가장 적은 최적의 모델을 저장
    if not best_val_loss or val_loss < best_val_loss:
        if not os.path.isdir("/gdrive/MyDrive/ColabNotebooks/dacon_senti/snapshot"):
            os.makedirs("/gdrive/MyDrive/ColabNotebooks/dacon_senti/snapshot")
        torch.save(model.state_dict(), f'/gdrive/MyDrive/ColabNotebooks/dacon_senti/snapshot/sentiment.pt')
        best_val_loss = val_loss
print(f'final validation score : {val_accuracy_accum / Epochs}')

  0%|          | 0/10 [00:00<?, ?it/s]


ValueError: ignored

In [None]:
model.load_state_dict(torch.load(f'/gdrive/MyDrive/ColabNotebooks/dacon_senti/snapshot/sentiment.pt'))
evaluate_test(model, test_iter)

RuntimeError: ignored