<a href="https://colab.research.google.com/github/miiiingi/dacon_sentiment/blob/main/dacon_sentimental_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/gdrive/')
my_folder = '/gdrive/MyDrive/ColabNotebooks/dacon_senti/dataset/dataset'

Drive already mounted at /gdrive/; to attempt to forcibly remount, call drive.mount("/gdrive/", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
!pip install konlpy
from konlpy.tag import Mecab
from konlpy.tag import Okt
from tqdm import tqdm
from torchtext.legacy import data, datasets
import torch
import random
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
SEED = 722
def seed_all(seed_value):
    random.seed(seed_value) # Python
    np.random.seed(seed_value) # cpu vars
    torch.manual_seed(seed_value) # cpu  vars
    
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value) # gpu vars
        torch.backends.cudnn.deterministic = True  #needed
        torch.backends.cudnn.benchmark = False

seed_all(SEED)



In [323]:
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
%cd Mecab-ko-for-Google-Colab
!bash install_mecab-ko_on_colab190912.sh
%cd ..

Cloning into 'Mecab-ko-for-Google-Colab'...
remote: Enumerating objects: 109, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 109 (delta 7), reused 10 (delta 3), pack-reused 91[K
Receiving objects: 100% (109/109), 1.27 MiB | 8.33 MiB/s, done.
Resolving deltas: 100% (46/46), done.
/content/Mecab-ko-for-Google-Colab
Installing konlpy.....
Done
Installing mecab-0.996-ko-0.9.2.tar.gz.....
Downloading mecab-0.996-ko-0.9.2.tar.gz.......
from https://bitbucket.org/eunjeon/mecab-ko/downloads/mecab-0.996-ko-0.9.2.tar.gz
--2022-01-19 08:56:18--  https://bitbucket.org/eunjeon/mecab-ko/downloads/mecab-0.996-ko-0.9.2.tar.gz
Resolving bitbucket.org (bitbucket.org)... 104.192.141.1, 2406:da00:ff00::3403:4be7, 2406:da00:ff00::22e9:9f55, ...
Connecting to bitbucket.org (bitbucket.org)|104.192.141.1|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://bbuseruploads.s3.amazonaws.com/eunjeon/meca

Torchtext - Field 정의

In [3]:
tokenizer = Mecab()
ID = data.Field(sequential=False, use_vocab=False)
TEXT = data.Field(sequential=True, use_vocab=True, tokenize=tokenizer.morphs, lower=True, batch_first=True)
LABEL = data.LabelField(sequential=False, use_vocab=False, is_target=True)

첫 번째 실험 : tokenizer을 morphs(75.37 / clip grad norm x > 과적합 심하게 발생) > nouns(68.4)로 변화 / morphs \\
두 번째 실험 : clip grad norm(73.9 > 과적합 발생)


Torchtext - 데이터셋 불러와서 정의

In [4]:
train_data = data.TabularDataset(
                            path = f'{my_folder}/train.csv',
                            format = 'csv',
                            fields = [('id', ID), ('document', TEXT), ('label', LABEL)],
                            skip_header = True,)
test_data = data.TabularDataset(
                            path = f'{my_folder}/test.csv',
                            format = 'csv',
                            fields = [('id', ID), ('document', TEXT)],
                            skip_header = True,)
train_data, valid_data = train_data.split(split_ratio = 0.5)

Torchtext - 단어 집합 만들기

In [5]:
TEXT.build_vocab(train_data, min_freq=2, max_size= 1000) # 단어 집합 생성
# LABEL.build_vocab(train_data)

In [6]:
vars(train_data[1])

{'document': ['우리', '나라', '좀', '비', '영화', '는', '왜', '이러', '냐', '.', '.'],
 'id': '1494',
 'label': '0'}

Torchtext - 배치화 시키기

In [7]:
batch_size = 64
train_loader = data.Iterator(train_data, batch_size=batch_size, shuffle=True)
val_loader = data.Iterator(valid_data, batch_size=batch_size, train=False, sort=False)
test_loader = data.Iterator(test_data, batch_size=batch_size, train=False, sort=False)

In [8]:
print(next(iter(train_loader)))
print(next(iter(val_loader)))
print(next(iter(test_loader)))


[torchtext.legacy.data.batch.Batch of size 64]
	[.id]:[torch.LongTensor of size 64]
	[.document]:[torch.LongTensor of size 64x25]
	[.label]:[torch.LongTensor of size 64]

[torchtext.legacy.data.batch.Batch of size 64]
	[.id]:[torch.LongTensor of size 64]
	[.document]:[torch.LongTensor of size 64x26]
	[.label]:[torch.LongTensor of size 64]

[torchtext.legacy.data.batch.Batch of size 64]
	[.id]:[torch.LongTensor of size 64]
	[.document]:[torch.LongTensor of size 64x23]


Embedding layer 어떻게 처리할 지 생각해보자

GRU Model 정의

In [225]:
from torch import nn as nn 
import torch.nn.functional as F
class GRU(nn.Module):
    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p=0.2):
        super(GRU, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(embed_dim, self.hidden_dim,
                          num_layers=self.n_layers * 2,
                          batch_first=True,
                          bidirectional = True)
        self.out = nn.Linear(self.hidden_dim * 2, n_classes)

    def forward(self, x):
        # print(x.shape)
        x = self.embed(x)
        # print(x.shape)
        # h_0 = self._init_state(batch_size=x.size(0)) # 첫번째 히든 스테이트를 0벡터로 초기화
        x, _ = self.gru(x)  # GRU의 리턴값은 (배치 크기, 시퀀스 길이, 은닉 상태의 크기)
        h_t = x[:,-1,:] # (배치 크기, 은닉 상태의 크기)의 텐서로 크기가 변경됨. 즉, 마지막 time-step의 은닉 상태만 가져온다.
        self.dropout(h_t)
        logit = self.out(h_t)  # (배치 크기, 은닉 상태의 크기) -> (배치 크기, 출력층의 크기)
        return logit

    def _init_state(self, batch_size=1):
        weight = next(self.parameters()).data
        return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_()

In [301]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        import math
        super(PositionalEncoding, self).__init__()       
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        even_div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        odd_div_term = torch.exp(torch.arange(1, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * even_div_term)
        pe[:, 1::2] = torch.cos(position * odd_div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        pe.requires_grad = False
        self.register_buffer('pe', pe)
    def forward(self, x):
        return x + self.pe[:x.size(0), :]

class Transformer(nn.Module):
    def __init__(self, n_vocab, n_embed_pre, n_embed_post, n_layers, n_head, n_linear, n_classes, dropout):
        super(Transformer, self).__init__()
        self.src_mask = None
        # self.embed = nn.Embedding(n_vocab, n_embed_post)
        self.pos_enc = PositionalEncoding(n_embed_pre) # 나중에 비교해보자 인코딩 방법
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=n_embed_post, nhead = n_head, dropout=dropout, batch_first = True)
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=n_layers)        
        self.linear = nn.Linear(n_embed_post, n_classes)
        # self.init_weights()
    # def init_weights(self):
    #     nn.init.zeros_(self.decoder.bias)
    #     nn.init.zeros_(self.decoder2.bias)
    #     nn.init.xavier_normal_(self.decoder.weight)
    #     nn.init.xavier_normal_(self.decoder2.weight)
    def forward(self, src):
        print(src.shape)
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            self.src_mask = self._generate_square_subsequent_mask(src.shape[1]).to(src.device)
        # print(src.shape)
        # src = self.embed(src)
        # print(src.shape)
        src = self.pos_enc(src)
        # print(src.shape)
        exit()
        # print(self.src_mask.shape)
        output = self.encoder(src, self.src_mask)
        # print(output.shape)
        output = self.linear(output)
        # position = src[:, 0, :]
        # src_original = torch.transpose(src_original, 0, 1)
        # src = torch.transpose(src, 0, 1)
        # feature_time = torch.transpose(feature_time, 0, 1)
        # output, attention = self.transformer_encoder(src, self.src_mask)
        # if self.type_enc == 'vanila' : 
        #     output = torch.cat((output, feature_time), 2)
        # elif self.type_enc != 'vanila' : 
        #     output = output 
        # output = self.decoder(output)
        # output = self.decoder2(output)
        # return output, attention, position 
        return output
    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

In [302]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# model = GRU(1, 256, len(TEXT.vocab), 128, 2, 0.3).to(DEVICE)
model = Transformer(n_vocab = len(TEXT.vocab),n_embed_pre = 25, n_embed_post = 128, n_layers = 1, n_head = 8, n_linear = 128, n_classes = 2, dropout = 0.3).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [307]:
def train(model, optimizer, train_iter):
    model.train()
    for b, batch in enumerate(train_iter):
        x, y = batch.document.to(DEVICE), batch.label.long().to(DEVICE)
        print(x.shape)
        print(y.shape)
        print(x)
        optimizer.zero_grad()
        print(model(x).shape)

        logit = torch.argmax(model(x), dim = 2)
        print(logit.shape)
        print(y.shape)
        loss = F.cross_entropy(logit, y)

        loss.backward()
        optimizer.step()

In [308]:
def evaluate(model, val_iter):
    """evaluate model"""
    model.eval()
    corrects, total_loss = 0, 0
    with torch.no_grad():
      for batch in val_iter:
          x, y = batch.document.to(DEVICE), batch.label.to(DEVICE)
          logit = model(x)
          loss = F.cross_entropy(logit, y, reduction='sum')
          total_loss += loss.item()
          corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()
      size = len(val_iter.dataset)
      avg_loss = total_loss / size
      avg_accuracy = 100.0 * corrects / size
      return avg_loss, avg_accuracy

In [309]:
def evaluate_test(model, test_loader, device):
    result = []
    model.to(device)
    model.eval()
    with torch.no_grad():
        for batch in tqdm(test_loader):
            x = batch.document.to(device)
            out = model(x)
            pred = torch.argmax(out, dim=1)
            result.append(pred)
    return torch.cat(result).cpu().numpy()

In [310]:
best_val_loss = None
val_accuracy_accum = 0
Epochs = 1
for e in tqdm(range(Epochs)):
    train(model, optimizer, train_loader)
    train_loss, train_accuracy = evaluate(model, train_loader)
    val_loss, val_accuracy = evaluate(model, val_loader)
    val_accuracy_accum += val_accuracy

    print("[Epoch: %d] train loss : %5.2f | train acc : %5.2f | val loss : %5.2f | val accuracy : %5.2f" % (e+1, train_loss, train_accuracy, val_loss, val_accuracy))

    # 검증 오차가 가장 적은 최적의 모델을 저장
    if not best_val_loss or val_loss < best_val_loss:
        if not os.path.isdir("/gdrive/MyDrive/ColabNotebooks/dacon_senti/snapshot"):
            os.makedirs("/gdrive/MyDrive/ColabNotebooks/dacon_senti/snapshot")
        torch.save(model.state_dict(), f'/gdrive/MyDrive/ColabNotebooks/dacon_senti/snapshot/sentiment.pt')
        best_val_loss = val_loss
print(f'final validation score : {val_accuracy_accum / Epochs}')

  0%|          | 0/1 [00:00<?, ?it/s]

torch.Size([64, 22])
torch.Size([64])
tensor([[  0,   0, 120,  ...,   1,   1,   1],
        [  0,   0,   0,  ...,   1,   1,   1],
        [  0,   0, 205,  ...,   1,   1,   1],
        ...,
        [279, 279,  46,  ...,   1,   1,   1],
        [ 23,   0,  22,  ...,   1,   1,   1],
        [284,   3, 208,  ...,   1,   1,   1]], device='cuda:0')
torch.Size([64, 22])





RuntimeError: ignored

TEST 진행

In [None]:
model.load_state_dict(torch.load(f'/gdrive/MyDrive/ColabNotebooks/dacon_senti/snapshot/sentiment.pt'))
result = evaluate_test(model, test_loader, DEVICE)

100%|██████████| 79/79 [00:01<00:00, 76.45it/s]

[0 1 0 ... 1 1 1]





In [None]:
submission = pd.read_csv(f"{my_folder}/sample_submission.csv")
submission["label"] = result
print(submission)
submission.to_csv(f"{my_folder}/submission.csv", index=False)

        id  label
0        1      0
1        2      1
2        3      0
3        4      1
4        5      0
...    ...    ...
4995  4996      0
4996  4997      0
4997  4998      1
4998  4999      1
4999  5000      1

[5000 rows x 2 columns]
