<a href="https://colab.research.google.com/github/miiiingi/dacon_sentiment/blob/main/dacon_sentimental_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/gdrive/')
my_folder = '/gdrive/MyDrive/ColabNotebooks/dacon_senti/dataset/dataset'

Drive already mounted at /gdrive/; to attempt to forcibly remount, call drive.mount("/gdrive/", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
!pip install konlpy
from konlpy.tag import Mecab
from konlpy.tag import Okt
from tqdm import tqdm
from torchtext.legacy import data, datasets
import torch
import random
import os
import math
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
SEED = 722
def seed_all(seed_value):
    random.seed(seed_value) # Python
    np.random.seed(seed_value) # cpu vars
    torch.manual_seed(seed_value) # cpu  vars
    
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value) # gpu vars
        torch.backends.cudnn.deterministic = True  #needed
        torch.backends.cudnn.benchmark = False

seed_all(SEED)



Torchtext - Field 정의

In [3]:
tokenizer = Okt()
ID = data.Field(sequential=False, use_vocab=False)
TEXT = data.Field(sequential=True, use_vocab=True, tokenize=tokenizer.morphs)
LABEL = data.LabelField(sequential=False, use_vocab=False, is_target=True)

첫 번째 실험 : tokenizer을 morphs(75.37 / clip grad norm x > 과적합 심하게 발생) > nouns(68.4)로 변화 / morphs \\
두 번째 실험 : clip grad norm(73.9 > 과적합 발생)


Torchtext - 데이터셋 불러와서 정의

In [27]:
train_data = data.TabularDataset(
                            path = f'{my_folder}/train.csv',
                            format = 'csv',
                            fields = [('id', ID), ('document', TEXT), ('label', LABEL)],
                            skip_header = True,)
test_data = data.TabularDataset(
                            path = f'{my_folder}/test.csv',
                            format = 'csv',
                            fields = [('id', ID), ('document', TEXT)],
                            skip_header = True,)
train_data, valid_data = train_data.split(split_ratio = 0.8)

Torchtext - 단어 집합 만들기

In [28]:
TEXT.build_vocab(train_data, min_freq=5, max_size= 1000) # 단어 집합 생성
# LABEL.build_vocab(train_data)

In [29]:
vars(train_data[1])

{'document': ['우리나라', '좀비', '영화', '는', '왜', '이러냐', '..'],
 'id': '1494',
 'label': '0'}

Torchtext - 배치화 시키기

In [30]:
batch_size = 64
train_loader = data.Iterator(train_data, batch_size=batch_size, shuffle=True)
val_loader = data.Iterator(valid_data, batch_size=batch_size, train=False, sort=False)
test_loader = data.Iterator(test_data, batch_size=batch_size, train=False, sort=False)

In [31]:
print(next(iter(train_loader)))
print(next(iter(val_loader)))
print(next(iter(test_loader)))


[torchtext.legacy.data.batch.Batch of size 64]
	[.id]:[torch.LongTensor of size 64]
	[.document]:[torch.LongTensor of size 19x64]
	[.label]:[torch.LongTensor of size 64]

[torchtext.legacy.data.batch.Batch of size 64]
	[.id]:[torch.LongTensor of size 64]
	[.document]:[torch.LongTensor of size 20x64]
	[.label]:[torch.LongTensor of size 64]

[torchtext.legacy.data.batch.Batch of size 64]
	[.id]:[torch.LongTensor of size 64]
	[.document]:[torch.LongTensor of size 19x64]


Embedding layer 어떻게 처리할 지 생각해보자

GRU Model 정의

In [32]:
from torch import nn as nn 
import torch.nn.functional as F
class GRU(nn.Module):
    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p=0.2):
        super(GRU, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(embed_dim, self.hidden_dim,
                          num_layers=self.n_layers * 2,
                          batch_first=True,
                          bidirectional = True)
        self.out = nn.Linear(self.hidden_dim * 2, n_classes)

    def forward(self, x):
        x = self.embed(x)
        print(x.shape)
        # h_0 = self._init_state(batch_size=x.size(0)) # 첫번째 히든 스테이트를 0벡터로 초기화
        x, _ = self.gru(x)  # GRU의 리턴값은 (배치 크기, 시퀀스 길이, 은닉 상태의 크기)
        h_t = x[:,-1,:] # (배치 크기, 은닉 상태의 크기)의 텐서로 크기가 변경됨. 즉, 마지막 time-step의 은닉 상태만 가져온다.
        self.dropout(h_t)
        logit = self.out(h_t)  # (배치 크기, 은닉 상태의 크기) -> (배치 크기, 출력층의 크기)
        return logit

    def _init_state(self, batch_size=1):
        weight = next(self.parameters()).data
        return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_()

In [33]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()       
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        even_div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        odd_div_term = torch.exp(torch.arange(1, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * even_div_term)
        pe[:, 1::2] = torch.cos(position * odd_div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        pe.requires_grad = False
        self.register_buffer('pe', pe)
    def forward(self, x):
        return x + self.pe[:x.size(0), :]

class Transformer(nn.Module):
    def __init__(self, n_vocab, n_embed_pre, n_embed_post, n_layers, n_head, n_linear, n_classes, dropout):
        super(Transformer, self).__init__()
        self.src_mask = None
        self.embed = nn.Embedding(n_vocab, n_embed_post)
        self.pos_enc = PositionalEncoding(n_embed_post)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=n_embed_post, nhead = n_head, dropout=dropout)
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=n_layers)        
        self.linear = nn.Linear(n_embed_post, n_classes)
        self.n_embed_post = n_embed_post
        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.embed.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        # print(src.shape)
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            self.src_mask = self._generate_square_subsequent_mask(src.shape[0]).to(src.device)
        # print(src.shape)
        # src = self.embed(src) * math.sqrt(self.n_embed_post)
        src = self.embed(src)
        # print(src.shape)
        src = self.pos_enc(src)
        # print(src.shape)
        # print('yes')
        # print(self.src_mask.shape)
        output = self.encoder(src, self.src_mask)
        # print(output.shape)
        # print('yes')
        output = self.linear(output)
        return output

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

In [34]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# model = GRU(1, 256, len(TEXT.vocab), 128, 2, 0.3).to(DEVICE)
model = Transformer(n_vocab = len(TEXT.vocab),n_embed_pre = 25, n_embed_post = 512, n_layers = 1, n_head = 8, n_linear = 1024, n_classes = 1, dropout = 0).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

RuntimeError: ignored

In [12]:
def train(model, optimizer, train_iter):
    model.train()
    for b, batch in enumerate(train_iter):
        x, y = batch.document.to(DEVICE), batch.label.long().to(DEVICE)
        logit = torch.squeeze(model(x), -1)
        logit = logit.permute(1, 0)
        loss = F.cross_entropy(logit, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [13]:
def evaluate(model, val_iter):
    """evaluate model"""
    model.eval()
    corrects, total_loss = 0, 0
    with torch.no_grad():
      for batch in val_iter:
          x, y = batch.document.to(DEVICE), batch.label.to(DEVICE)
          logit = torch.squeeze(model(x), -1)
          logit = logit.permute(1, 0)
          loss = F.cross_entropy(logit, y)
          total_loss += loss.item()
          corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()
      size = len(val_iter.dataset)
      avg_loss = total_loss / size
      avg_accuracy = 100.0 * corrects / size
      return avg_loss, avg_accuracy

In [14]:
def evaluate_test(model, test_loader, device):
    result = []
    model.to(device)
    model.eval()
    with torch.no_grad():
        for batch in tqdm(test_loader):
            x = batch.document.to(device)
            out = model(x)
            pred = torch.argmax(out, dim=1)
            result.append(pred)
    return torch.cat(result).cpu().numpy()

In [16]:
best_val_loss = None
val_accuracy_accum = 0
Epochs = 1
for e in tqdm(range(Epochs)):
    train(model, optimizer, train_loader)
    train_loss, train_accuracy = evaluate(model, train_loader)
    val_loss, val_accuracy = evaluate(model, val_loader)
    val_accuracy_accum += val_accuracy

    print("[Epoch: %d] train loss : %5.2f | train acc : %5.2f | val loss : %5.2f | val accuracy : %5.2f" % (e+1, train_loss, train_accuracy, val_loss, val_accuracy))

    # 검증 오차가 가장 적은 최적의 모델을 저장
    if not best_val_loss or val_loss < best_val_loss:
        if not os.path.isdir("/gdrive/MyDrive/ColabNotebooks/dacon_senti/snapshot"):
            os.makedirs("/gdrive/MyDrive/ColabNotebooks/dacon_senti/snapshot")
        torch.save(model.state_dict(), f'/gdrive/MyDrive/ColabNotebooks/dacon_senti/snapshot/sentiment.pt')
        best_val_loss = val_loss
print(f'final validation score : {val_accuracy_accum / Epochs}')

  0%|          | 0/1 [00:00<?, ?it/s]

torch.Size([20, 64])
torch.Size([64])
torch.Size([20, 64])
torch.Size([64])
torch.Size([18, 64])
torch.Size([64])
torch.Size([20, 64])
torch.Size([64])
torch.Size([21, 64])
torch.Size([64])
torch.Size([22, 64])
torch.Size([64])
torch.Size([24, 64])
torch.Size([64])
torch.Size([20, 64])
torch.Size([64])
torch.Size([19, 64])
torch.Size([64])
torch.Size([23, 64])
torch.Size([64])
torch.Size([18, 64])
torch.Size([64])
torch.Size([20, 64])
torch.Size([64])
torch.Size([19, 64])
torch.Size([64])
torch.Size([21, 64])
torch.Size([64])
torch.Size([19, 64])
torch.Size([64])
torch.Size([20, 64])
torch.Size([64])
torch.Size([19, 64])
torch.Size([64])
torch.Size([22, 64])
torch.Size([64])
torch.Size([19, 64])
torch.Size([64])
torch.Size([17, 64])
torch.Size([64])
torch.Size([19, 64])
torch.Size([64])
torch.Size([19, 64])
torch.Size([64])
torch.Size([20, 64])
torch.Size([64])
torch.Size([21, 64])
torch.Size([64])
torch.Size([20, 64])
torch.Size([64])
torch.Size([21, 64])
torch.Size([64])
torch.Size([

100%|██████████| 1/1 [00:02<00:00,  2.15s/it]

[Epoch: 1] train loss :  0.01 | train acc : 79.84 | val loss :  0.02 | val accuracy : 56.24
final validation score : 56.23999786376953





검증 데이터셋으로 구한 초모수를 통해 전체 데이터셋을 훈련시켜 테스트에 사용

In [17]:
train_data = data.TabularDataset(
                            path = f'{my_folder}/train.csv',
                            format = 'csv',
                            fields = [('id', ID), ('document', TEXT), ('label', LABEL)],
                            skip_header = True,)
test_data = data.TabularDataset(
                            path = f'{my_folder}/test.csv',
                            format = 'csv',
                            fields = [('id', ID), ('document', TEXT)],
                            skip_header = True,)

In [18]:
TEXT.build_vocab(train_data, min_freq=5, max_size= 1000) # 단어 집합 생성
# LABEL.build_vocab(train_data)

In [24]:
batch_size = 64
train_loader = data.Iterator(train_data, batch_size=batch_size, shuffle=True)
test_loader = data.Iterator(test_data, batch_size=batch_size, train=False, sort=False)

In [25]:
print(next(iter(train_loader)))
print(next(iter(test_loader)))


[torchtext.legacy.data.batch.Batch of size 64]
	[.id]:[torch.LongTensor of size 64]
	[.document]:[torch.LongTensor of size 20x64]
	[.label]:[torch.LongTensor of size 64]

[torchtext.legacy.data.batch.Batch of size 64]
	[.id]:[torch.LongTensor of size 64]
	[.document]:[torch.LongTensor of size 19x64]


In [26]:
model = Transformer(n_vocab = len(TEXT.vocab),n_embed_pre = 25, n_embed_post = 512, n_layers = 1, n_head = 8, n_linear = 1024, n_classes = 1, dropout = 0).to(DEVICE)
model.load_state_dict(torch.load(f'/gdrive/MyDrive/ColabNotebooks/dacon_senti/snapshot/sentiment.pt'))
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
Epochs = 100
for e in tqdm(range(Epochs)):
    train(model, optimizer, train_loader)
    train_loss, train_accuracy = evaluate(model, train_loader)
torch.save(model.state_dict(), f'/gdrive/MyDrive/ColabNotebooks/dacon_senti/snapshot/sentiment.pt')

RuntimeError: ignored

TEST 진행

In [None]:
model.load_state_dict(torch.load(f'/gdrive/MyDrive/ColabNotebooks/dacon_senti/snapshot/sentiment.pt'))
result = evaluate_test(model, test_loader, DEVICE)

100%|██████████| 79/79 [00:01<00:00, 76.45it/s]

[0 1 0 ... 1 1 1]





In [None]:
submission = pd.read_csv(f"{my_folder}/sample_submission.csv")
submission["label"] = result
print(submission)
submission.to_csv(f"{my_folder}/submission.csv", index=False)

        id  label
0        1      0
1        2      1
2        3      0
3        4      1
4        5      0
...    ...    ...
4995  4996      0
4996  4997      0
4997  4998      1
4998  4999      1
4999  5000      1

[5000 rows x 2 columns]
