In [1]:
import os
import json
import torch
import urllib
import random
import torch.nn as nn
import numpy as np
import pandas as pd

from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import build_vocab_from_iterator
from sklearn.model_selection import train_test_split

In [2]:
SEED = 123

def seed_everything(seed=SEED):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(SEED)

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
print(device)

url = "https://storage.googleapis.com/download.tensorflow.org/data/sarcasm.json"
urllib.request.urlretrieve(url, 'sarcasm.json')

cuda


('sarcasm.json', <http.client.HTTPMessage at 0x7f70e3c18dc0>)

In [4]:
with open('/home/pervinco/Datasets/sarcasm.json') as f:
    datas = json.load(f)

df = pd.DataFrame(datas)
df.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [5]:
from torchtext.data.utils import get_tokenizer

## example
sample = "Hello, my name is minjun kim."
tokenizer = get_tokenizer("basic_english")
tokenizer(sample)

['hello', ',', 'my', 'name', 'is', 'minjun', 'kim', '.']

In [6]:
from torchtext.vocab import build_vocab_from_iterator

def yield_tokens(sentences):
    for text in sentences:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(df['headline'].tolist()),
                                  specials=["<Unknown>"], ## 어휘에 없는 단어들을 "<Unknown>"로 대체
                                  min_freq=2,
                                  max_tokens=1000,)
vocab.set_default_index(vocab['<Unknown>']) ## 생성된 어휘에서 "<UNK>" 토큰을 기본 인덱스로 설정

In [7]:
str_to_idx = vocab.get_stoi()
idx_to_str = vocab.get_itos()

print(idx_to_str)

['<Unknown>', "'", 'to', 'of', 'the', 's', 'in', ',', 'for', 'a', 'on', '.', 'and', 'with', 'is', 'trump', 'new', 'man', 'from', 'at', 'you', 't', 'it', 'about', 'this', 'by', 'after', '?', 'be', 'that', 'how', 'out', 'he', 'as', 'up', 'not', 'what', 'can', 'are', 'your', 'his', 'who', 'just', 'has', 'will', 'more', 'all', 'one', 'into', 'report', 'i', 'why', 'have', 'area', 'woman', 'over', 'donald', 'u', 'says', 'day', 'obama', 'time', 'no', 'first', 'like', 'people', 'women', 'get', 'her', 'we', 'world', 'an', 'now', 'nation', 'house', 'life', 'off', 'clinton', 'they', 'make', 'still', 'than', 'was', 'my', 'white', 'back', 'down', 'if', 'when', 'family', 'could', 'she', 'their', 'do', 'before', 'americans', 'gop', 'most', 'way', '5', 'black', 'year', 'here', 'study', 'years', 'bill', 'should', 'would', 'him', 'president', 'best', 'so', 'america', 'police', 'only', 'watch', 'school', 'show', 'american', 'really', 'being', 'but', 'know', 'home', 'mom', 'things', 'death', 'during', 'go

In [8]:
vocab(tokenizer(sample))

[0, 7, 83, 347, 14, 0, 474, 11]

In [23]:
x_train, x_test, y_train, y_test = train_test_split(df["headline"],
                                                    df['is_sarcastic'],
                                                    stratify=df['is_sarcastic'],
                                                    test_size=0.2,
                                                    random_state=SEED)

print(len(x_train[0]))
print(x_train[0])

print(len(x_train[1]))
print(x_train[1])

78
former versace store clerk sues over secret 'black code' for minority shoppers
84
the 'roseanne' revival catches up to our thorny political mood, for better and worse


In [9]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels, vocab, tokenizer):
        super().__init__()
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]

        return self.vocab(self.tokenizer(text)), label
    
def collate_batch(batch, max_sequence_length):
    label_list, text_list = [], []

    for text, label in batch:
        processed_text = torch.tensor(text[:max_sequence_length], dtype=torch.int64) ## 길이를 max_sequence_length를 넘지 못하게 만든다.
        text_list.append(processed_text)
        label_list.append(label)

    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = pad_sequence(text_list, batch_first=True, padding_value=0) ## padding을 통해 데이터의 길이를 일정하게 맞춰준다.

    return text_list.to(device), label_list.to(device)

In [10]:
train_dataset = CustomDataset(x_train, y_train, vocab=vocab, tokenizer=tokenizer)
valid_dataset = CustomDataset(x_test, y_test, vocab=vocab, tokenizer=tokenizer)

In [25]:
for data in train_dataset:
    print(data)
    break

([0, 0, 0, 262, 142, 214, 925, 186, 32, 0], 1)


In [11]:
MAX_SEQUENCE_LEN = 100
BATCH_SIZE = 32

train_dataloader = DataLoader(train_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              collate_fn=lambda x : collate_batch(x, MAX_SEQUENCE_LEN)) ## 최대 길이가 100

valid_dataloader = DataLoader(valid_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=False,
                              collate_fn=lambda x : collate_batch(x, MAX_SEQUENCE_LEN))

In [27]:
for data in train_dataloader:
    tokens, labels = data[0], data[1]
    print(tokens.shape, labels.shape)
    print(tokens)
    break

torch.Size([32, 22]) torch.Size([32])
tensor([[  0, 467,   1,   5, 863, 522,  38,   1,   0,   1,   0,   1,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0],
        [191,   0,   0,   3, 120,   0,   2,  33,   1,  29, 191,   0,   1,   0,
           0,   0,   0,   0,   0,   0,   0,   0],
        [  0,   0, 526,   0,   0,   0,   0,  19,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0],
        [147,  47,   3,  17,   1,   5,   0,   0,   2,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0],
        [ 83,   0, 622,  83,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0],
        [  0, 786,   0, 652,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0],
        [946,   0,   1,   5, 608, 522,   8,   0, 807,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0],
        [  0,   0,   0,  19,   4,   0, 483,  32,   0,   0, 

In [28]:
NUM_VOCAB = len(vocab)
print(len(vocab))

x, y = next(iter(train_dataloader))
x = x.to(device)
y = y.to(device)

1000


In [35]:
EMBEDDING_DIM = 30
embedding = nn.Embedding(len(vocab), EMBEDDING_DIM).to(device) ## len(vocab)개의 단어들을 EMBEDDING_DIM 크기의 실수 벡터로 변환하는 임베딩을 생성.
print(embedding)

Embedding(1000, 30)


In [36]:
embedding_out = embedding(x)
print(embedding_out.shape)
print(embedding_out)

torch.Size([32, 21, 30])
tensor([[[ 1.5194,  0.5919,  1.2933,  ...,  0.7421, -1.0791, -1.8894],
         [ 0.8243,  0.7813, -0.7459,  ..., -0.2022, -0.6529, -1.1443],
         [-1.9865, -0.7665,  1.3789,  ...,  0.1180,  0.6861, -1.3133],
         ...,
         [-0.4216, -0.4395, -0.5644,  ...,  0.5132,  0.0333, -1.7277],
         [ 0.8243,  0.7813, -0.7459,  ..., -0.2022, -0.6529, -1.1443],
         [ 0.8243,  0.7813, -0.7459,  ..., -0.2022, -0.6529, -1.1443]],

        [[ 0.8243,  0.7813, -0.7459,  ..., -0.2022, -0.6529, -1.1443],
         [ 0.8243,  0.7813, -0.7459,  ..., -0.2022, -0.6529, -1.1443],
         [ 0.8243,  0.7813, -0.7459,  ..., -0.2022, -0.6529, -1.1443],
         ...,
         [ 0.8243,  0.7813, -0.7459,  ..., -0.2022, -0.6529, -1.1443],
         [ 0.8243,  0.7813, -0.7459,  ..., -0.2022, -0.6529, -1.1443],
         [ 0.8243,  0.7813, -0.7459,  ..., -0.2022, -0.6529, -1.1443]],

        [[ 0.9251,  0.2349, -0.9838,  ...,  0.9991, -0.0461,  0.6726],
         [ 0.8243,  

In [31]:
HIDDEN_SIZE = 64
NUM_LAYERS = 1
BIDIRECTIONAL = 1
SEQ_LENGTH = x.size(1)

lstm = nn.LSTM(input_size=EMBEDDING_DIM, hidden_size=HIDDEN_SIZE, batch_first=True, device=device)
lstm

LSTM(30, 64, batch_first=True)

In [21]:
h_0 = torch.zeros(NUM_LAYERS * BIDIRECTIONAL, SEQ_LENGTH, HIDDEN_SIZE).to(device)
c_0 = torch.zeros(NUM_LAYERS * BIDIRECTIONAL, SEQ_LENGTH, HIDDEN_SIZE).to(device)

In [33]:
lstm_out, (hidden, cell) = lstm(embedding_out)
print(lstm_out.shape)
print(hidden.shape, cell.shape)

torch.Size([32, 21, 64])
torch.Size([1, 32, 64]) torch.Size([1, 32, 64])


In [40]:
def EmbeddingLSTM(x, vocab_size, embedding_dim, hidden_size, bidrectional, num_layers, device):
    print(f"Input Shape : {x.shape}")
    x = x.to(device)
    
    
    embedding = nn.Embedding(vocab_size, embedding_dim, device=device)
    embedding_out = embedding(x)
    print(f"Embedded Input Shape : {embedding_out.shape}")

    lstm = nn.LSTM(input_size=embedding_dim,
                   hidden_size=hidden_size,
                   num_layers=num_layers,
                   batch_first=True,
                   device=device)
    
    bidi = 2 if bidrectional else 1
    out, (h, c) = lstm(embedding_out)
    print(f"Output Shape : {out.shape}")
    print(f"Hidden Shape : {h.shape}")
    print(f"Cell State Shape : {c.shape}")

In [41]:
EmbeddingLSTM(x, vocab_size=len(vocab), embedding_dim=30, hidden_size=64, bidrectional=False, num_layers=2, device=device)

Input Shape : torch.Size([32, 21])
Embedded Input Shape : torch.Size([32, 21, 30])
Output Shape : torch.Size([32, 21, 64])
Hidden Shape : torch.Size([2, 32, 64])
Cell State Shape : torch.Size([2, 32, 64])


In [42]:
class TextClassificationModel(nn.Module):
    def __init__(self, num_classes, vocab_size, embedding_dim, hidden_size, num_layers, bidirectional=True, drop_prob=0.1):
        super(TextClassificationModel, self).__init__()
        self.num_classes = num_classes 
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = 2 if bidirectional else 1
        
        self.embedding = nn.Embedding(num_embeddings=vocab_size, 
                                      embedding_dim=embedding_dim)
        
        self.lstm = nn.LSTM(input_size=embedding_dim, 
                            hidden_size=hidden_size, 
                            num_layers=num_layers, 
                            batch_first=True,
                            bidirectional=bidirectional,
                           )
        
        self.dropout = nn.Dropout(drop_prob)
        
        self.relu = nn.ReLU()
        
        self.fc = nn.Linear(hidden_size*self.bidirectional, hidden_size)
        self.output = nn.Linear(hidden_size, num_classes)
        
    def init_hidden_and_cell_state(self, batch_size, device):
        # LSTM 입력시 초기 Cell 에 대한 가중치 초기화를 진행합니다.
        # (num_layers*bidirectional, batch_size, hidden_size)
        self.hidden_and_cell = (
            torch.zeros(self.num_layers*self.bidirectional, batch_size, self.hidden_size).to(device),
            torch.zeros(self.num_layers*self.bidirectional, batch_size, self.hidden_size).to(device),
        )
        
    def forward(self, x):
        x = self.embedding(x)
        output, (h, c) = self.lstm(x, self.hidden_and_cell)
        # (batch_size, seq_length, hidden_size*bidirectional)
        # last sequence 의 (batch_size, hidden_size*bidirectional)
        h = output[:, -1, :]
        o = self.dropout(h)
        o = self.relu(self.fc(o))
        o = self.dropout(o)
        return self.output(o)

In [43]:
config = {
    'num_classes': 2, 
    'vocab_size': len(vocab),
    'embedding_dim': 16, 
    'hidden_size': 32, 
    'num_layers': 2, 
    'bidirectional': True,
}

model = TextClassificationModel(**config)
model.to(device)

TextClassificationModel(
  (embedding): Embedding(1000, 16)
  (lstm): LSTM(16, 32, num_layers=2, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (relu): ReLU()
  (fc): Linear(in_features=64, out_features=32, bias=True)
  (output): Linear(in_features=32, out_features=2, bias=True)
)

In [44]:
# loss 정의: CrossEntropyLoss
loss_fn = nn.CrossEntropyLoss()

# 옵티마이저 정의: bert.paramters()와 learning_rate 설정
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [45]:
def train(model, data_loader, loss_fn, optimizer, device):
    # 모델을 훈련모드로 설정합니다. training mode 일 때 Gradient 가 업데이트 됩니다. 반드시 train()으로 모드 변경을 해야 합니다.
    model.train()
    
    # loss와 accuracy 계산을 위한 임시 변수 입니다. 0으로 초기화합니다.
    running_loss = 0
    corr = 0
    counts = 0
    
    # 예쁘게 Progress Bar를 출력하면서 훈련 상태를 모니터링 하기 위하여 tqdm으로 래핑합니다.
    prograss_bar = tqdm(data_loader, unit='batch', total=len(data_loader), mininterval=1)
    
    # mini-batch 학습을 시작합니다.
    for idx, (txt, lbl) in enumerate(prograss_bar):
        # txt, lbl 데이터를 device 에 올립니다. (cuda:0 혹은 cpu)
        txt = txt.to(device)
        lbl = lbl.to(device)
        
        # 누적 Gradient를 초기화 합니다.
        optimizer.zero_grad()
        
        # LSTM Weight 초기화
        model.init_hidden_and_cell_state(len(txt), device)
        
        # Forward Propagation을 진행하여 결과를 얻습니다.
        output = model(txt)
        
        # 손실함수에 output, lbl 값을 대입하여 손실을 계산합니다.
        loss = loss_fn(output, lbl)
        
        # 오차역전파(Back Propagation)을 진행하여 미분 값을 계산합니다.
        loss.backward()
        
        # 계산된 Gradient를 업데이트 합니다.
        optimizer.step()
        
        # Probability Max index 를 구합니다.
        output = output.argmax(dim=1)
        
        # 정답 개수를 구합니다.
        corr += (output == lbl).sum().item()
        counts += len(lbl)
        
        # batch 별 loss 계산하여 누적합을 구합니다.
        running_loss += loss.item()
        
        # 프로그레스바에 학습 상황 업데이트
        prograss_bar.set_description(f"training loss: {running_loss/(idx+1):.5f}, training accuracy: {corr / counts:.5f}")
        
    # 누적된 정답수를 전체 개수로 나누어 주면 정확도가 산출됩니다.
    acc = corr / len(data_loader.dataset)
    
    # 평균 손실(loss)과 정확도를 반환합니다.
    # train_loss, train_acc
    return running_loss / len(data_loader), acc

In [46]:
def evaluate(model, data_loader, loss_fn, device):
    # model.eval()은 모델을 평가모드로 설정을 바꾸어 줍니다. 
    # dropout과 같은 layer의 역할 변경을 위하여 evaluation 진행시 꼭 필요한 절차 입니다.
    model.eval()
    
    # Gradient가 업데이트 되는 것을 방지 하기 위하여 반드시 필요합니다.
    with torch.no_grad():
        # loss와 accuracy 계산을 위한 임시 변수 입니다. 0으로 초기화합니다.
        corr = 0
        running_loss = 0
        
        # 배치별 evaluation을 진행합니다.
        for txt, lbl in data_loader:
            # txt, lbl 데이터를 device 에 올립니다. (cuda:0 혹은 cpu)
            txt = txt.to(device)
            lbl = lbl.to(device)
            
            # LSTM Weight 초기화
            model.init_hidden_and_cell_state(len(txt), device)
    
            # 모델에 Forward Propagation을 하여 결과를 도출합니다.
            output = model(txt)
            
            # 검증 손실을 구합니다.
            loss = loss_fn(output, lbl)
            
            # Probability Max index 를 구합니다.
            output = output.argmax(dim=1)
            
            # 정답 개수를 구합니다.
            corr += (output == lbl).sum().item()
            
            # batch 별 loss 계산하여 누적합을 구합니다.
            running_loss += loss.item()
        
        # validation 정확도를 계산합니다.
        # 누적한 정답숫자를 전체 데이터셋의 숫자로 나누어 최종 accuracy를 산출합니다.
        acc = corr / len(data_loader.dataset)
        
        # 결과를 반환합니다.
        # val_loss, val_acc
        return running_loss / len(data_loader), acc

In [47]:
# 최대 Epoch을 지정합니다.
num_epochs = 10

# checkpoint로 저장할 모델의 이름을 정의 합니다.
model_name = 'LSTM-Text-Classification'

min_loss = np.inf

# Epoch 별 훈련 및 검증을 수행합니다.
for epoch in range(num_epochs):
    # Model Training
    # 훈련 손실과 정확도를 반환 받습니다.
    train_loss, train_acc = train(model, train_dataloader, loss_fn, optimizer, device)

    # 검증 손실과 검증 정확도를 반환 받습니다.
    val_loss, val_acc = evaluate(model, valid_dataloader, loss_fn, device)   
    
    # val_loss 가 개선되었다면 min_loss를 갱신하고 model의 가중치(weights)를 저장합니다.
    if val_loss < min_loss:
        print(f'[INFO] val_loss has been improved from {min_loss:.5f} to {val_loss:.5f}. Saving Model!')
        min_loss = val_loss
        torch.save(model.state_dict(), f'/home/pervinco/Models/LSTM/{model_name}.pth')
    
    # Epoch 별 결과를 출력합니다.
    print(f'epoch {epoch+1:02d}, loss: {train_loss:.5f}, acc: {train_acc:.5f}, val_loss: {val_loss:.5f}, val_accuracy: {val_acc:.5f}')

training loss: 0.67011, training accuracy: 0.58249: 100%|██████████| 668/668 [00:01<00:00, 438.62batch/s]


[INFO] val_loss has been improved from inf to 0.59229. Saving Model!
epoch 01, loss: 0.67011, acc: 0.58249, val_loss: 0.59229, val_accuracy: 0.69843


training loss: 0.51376, training accuracy: 0.75149: 100%|██████████| 668/668 [00:01<00:00, 478.83batch/s]


[INFO] val_loss has been improved from 0.59229 to 0.47432. Saving Model!
epoch 02, loss: 0.51376, acc: 0.75149, val_loss: 0.47432, val_accuracy: 0.76675


training loss: 0.43716, training accuracy: 0.79543: 100%|██████████| 668/668 [00:01<00:00, 468.91batch/s]


[INFO] val_loss has been improved from 0.47432 to 0.43340. Saving Model!
epoch 03, loss: 0.43716, acc: 0.79543, val_loss: 0.43340, val_accuracy: 0.79633


training loss: 0.39646, training accuracy: 0.81818: 100%|██████████| 668/668 [00:01<00:00, 470.45batch/s]


[INFO] val_loss has been improved from 0.43340 to 0.41255. Saving Model!
epoch 04, loss: 0.39646, acc: 0.81818, val_loss: 0.41255, val_accuracy: 0.80663


training loss: 0.37130, training accuracy: 0.83343: 100%|██████████| 668/668 [00:01<00:00, 488.25batch/s]


[INFO] val_loss has been improved from 0.41255 to 0.40428. Saving Model!
epoch 05, loss: 0.37130, acc: 0.83343, val_loss: 0.40428, val_accuracy: 0.81056


training loss: 0.35089, training accuracy: 0.83975: 100%|██████████| 668/668 [00:01<00:00, 493.18batch/s]


[INFO] val_loss has been improved from 0.40428 to 0.39339. Saving Model!
epoch 06, loss: 0.35089, acc: 0.83975, val_loss: 0.39339, val_accuracy: 0.81580


training loss: 0.33484, training accuracy: 0.84996: 100%|██████████| 668/668 [00:01<00:00, 510.81batch/s]


[INFO] val_loss has been improved from 0.39339 to 0.38385. Saving Model!
epoch 07, loss: 0.33484, acc: 0.84996, val_loss: 0.38385, val_accuracy: 0.82347


training loss: 0.32160, training accuracy: 0.85515: 100%|██████████| 668/668 [00:01<00:00, 507.26batch/s]


epoch 08, loss: 0.32160, acc: 0.85515, val_loss: 0.41028, val_accuracy: 0.81805


training loss: 0.30820, training accuracy: 0.86128: 100%|██████████| 668/668 [00:01<00:00, 500.46batch/s]


epoch 09, loss: 0.30820, acc: 0.86128, val_loss: 0.40018, val_accuracy: 0.83077


training loss: 0.29848, training accuracy: 0.86788: 100%|██████████| 668/668 [00:01<00:00, 470.61batch/s]


[INFO] val_loss has been improved from 0.38385 to 0.38335. Saving Model!
epoch 10, loss: 0.29848, acc: 0.86788, val_loss: 0.38335, val_accuracy: 0.82890
