# IMDB_RNN model

# 0. Setting

In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.legacy import data # No more Field class (torchtext==0.9.0)
from torchtext.legacy import datasets
import random

In [2]:
# Seed fixing: seed both Python's `random` module and PyTorch's RNG with the
# same value.
SEED = 0
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x1ecb8b5f9b0>

In [3]:
# Hyperparameters
# NOTE(review): the recorded cell outputs further down show 1024-row batches,
# so they were presumably produced with a different BATCH_SIZE — confirm.
BATCH_SIZE = 2048  # batch size handed to the data iterators
lr = 0.0001  # learning rate handed to the Adam optimizer
EPOCHS = 10  # number of iterations of the training loop

In [7]:
torch.cuda.is_available()

False

In [4]:
# Device selection: use 'cuda' when CUDA is available, otherwise 'cpu'.
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device('cuda' if USE_CUDA else 'cpu')
DEVICE  # bare expression so the notebook displays the chosen device

device(type='cpu')

---
# __Block and Server reset__
---

# 1. Data load and preprocessing

In [5]:
# Field for the review text: sequential data, lower-cased, batch-first tensors.
TEXT = data.Field(sequential=True, batch_first=True, lower=True)
# Field for the label: not sequential data (one value per example).
LABEL = data.Field(sequential=False, batch_first=True)

In [6]:
# Load the IMDB train and test sets, processed with the TEXT and LABEL fields.
# NOTE(review): no ratio is passed here, so this is not an 8:2 split; the 80/20
# train/validation split is done later via trainset.split(split_ratio=0.8).
trainset, testset = datasets.IMDB.splits(TEXT, LABEL)

In [7]:
print('trainset : \n', trainset.fields)

trainset : 
 {'text': <torchtext.legacy.data.field.Field object at 0x000001C6D5E30790>, 'label': <torchtext.legacy.data.field.Field object at 0x000001C6D5E30B50>}


In [8]:
print('test : \n', testset.fields)

test : 
 {'text': <torchtext.legacy.data.field.Field object at 0x000001C6D5E30790>, 'label': <torchtext.legacy.data.field.Field object at 0x000001C6D5E30B50>}


In [9]:
print('-'*100)
print(trainset[0].text)
print('-'*100)
print(trainset[0].label)
print('-'*100)

----------------------------------------------------------------------------------------------------
['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy.', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life,', 'such', 'as', '"teachers".', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', "high's", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"teachers".', 'the', 'scramble', 'to', 'survive', 'financially,', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', "teachers'", 'pomp,', 'the', 'pettiness', 'of', 'the', 'whole', 'situation,', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students.', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school,', 'i', 'immediately', 'recalled', '.........', 'at', '..........',

In [10]:
# Create vocabulary
# Both vocabularies are built from the training set only.
# min_freq=5: presumably tokens seen fewer than 5 times are left out of the
# text vocabulary — confirm against the torchtext documentation.
TEXT.build_vocab(trainset, min_freq=5)
LABEL.build_vocab(trainset)

In [11]:
# Vocabulary size is used as the number of embedding rows when the model is built.
vocab_size = len(TEXT.vocab)
n_classes = 2  # hard-coded number of output classes
print('Size of vocab : {}'.format(vocab_size))
print('Number of classes : {}'.format(n_classes))

Size of vocab : 46159
Number of classes : 2


# 2. Data loader

In [12]:
# Split the training set 80/20 into a (smaller) training set and a validation set.
trainset, valset = trainset.split(split_ratio=0.8)

In [13]:
# Build one batch iterator per split, all using the same batch size.
# NOTE(review): repeat=False presumably makes each iterator stop after a single
# pass over its data — confirm against the torchtext documentation.
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (trainset, valset, testset), batch_size=BATCH_SIZE, shuffle=True, repeat=False
)

In [14]:
print('Number of mini-batch of train data : {}'.format(len(train_iter)))
print('Number of mini-batch of test data : {}'.format(len(test_iter)))
print('Number of mini-batch of validate data : {}'.format(len(val_iter)))

Number of mini-batch of train data : 20
Number of mini-batch of test data : 25
Number of mini-batch of validate data : 5


In [15]:
batch = next(iter(train_iter)) # first mini-batch of a fresh iterator
print(batch.text.shape)

torch.Size([1024, 1001])


In [16]:
# iter() creates a new iterator here, so this is the first mini-batch of a new
# pass over train_iter, not the second mini-batch of the previous pass.
batch = next(iter(train_iter))
print(batch.text.shape)

torch.Size([1024, 2470])


# 3. GRU Model

In [17]:
class GRU(nn.Module):
    """GRU-based sequence classifier.

    Token ids are embedded, run through a GRU, and the GRU output at the last
    time step is passed through dropout and a linear layer that produces one
    logit per class.

    Args:
        n_layers: number of stacked GRU layers.
        hidden_dim: size of the GRU hidden state.
        n_vocab: number of rows in the embedding table.
        embed_dim: size of each embedding vector.
        n_classes: number of output logits.
        dropout_p: dropout probability applied to the last hidden state.
    """

    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p=0.2):
        super().__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(embed_dim, self.hidden_dim,
                          num_layers=self.n_layers,
                          batch_first=True)
        self.out = nn.Linear(self.hidden_dim, n_classes)

    def forward(self, x):
        """Return class logits for a batch of token-id sequences.

        Args:
            x: integer tensor of token ids, indexed (batch, time step).

        Returns:
            Tensor of shape (batch, n_classes) holding unnormalised logits.
        """
        embedded = self.embed(x)
        h_0 = self._init_state(batch_size=embedded.size(0))  # zero initial hidden state
        output, _ = self.gru(embedded, h_0)  # (batch, sequence length, hidden_dim)
        # NOTE(review): the last time step is taken for every row; if shorter
        # sequences are padded at the end, this reads the state after the
        # padding tokens — confirm this is intended.
        h_t = output[:, -1, :]
        # nn.Dropout is not in-place: its result must be assigned, otherwise
        # dropout is silently never applied.
        h_t = self.dropout(h_t)
        return self.out(h_t)

    def _init_state(self, batch_size=1):
        """Return a zero hidden state of shape (n_layers, batch_size, hidden_dim).

        The tensor is created from an existing parameter so that it has the
        same dtype and device as the model.
        """
        reference = next(self.parameters())
        return reference.new_zeros(self.n_layers, batch_size, self.hidden_dim)

In [18]:
# Single-layer GRU classifier: hidden size 256, embedding size 128, dropout 0.5,
# moved to the selected device.
model = GRU(1, 256, vocab_size, 128, n_classes, dropout_p=0.5).to(DEVICE)
# Adam over all model parameters, using the learning rate `lr`.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [19]:
def train(model, optimizer, train_iter):
    """Run one pass over ``train_iter``, taking one optimizer step per mini-batch.

    Args:
        model: module called on the batch text to produce class logits.
        optimizer: optimizer whose gradients are reset and stepped every batch.
        train_iter: iterable of batches exposing ``.text`` and ``.label``.
    """
    model.train()
    for minibatch in train_iter:
        inputs = minibatch.text.to(DEVICE)
        targets = minibatch.label.to(DEVICE)
        # Shift the label ids down by one, in place, so they become the 0/1
        # class indices that cross_entropy expects.
        targets.data.sub_(1)

        optimizer.zero_grad()
        batch_loss = F.cross_entropy(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

In [20]:
def evaluate(model, val_iter):
    """Compute the average loss and accuracy of ``model`` over ``val_iter``.

    Args:
        model: module called on the batch text to produce class logits.
        val_iter: iterable of batches exposing ``.text`` and ``.label``, with a
            ``.dataset`` attribute whose length is the number of examples.

    Returns:
        Tuple ``(avg_loss, avg_accuracy)``: the summed cross-entropy divided by
        the dataset size, and the percentage of correct predictions.
    """
    model.eval()
    corrects, total_loss = 0, 0
    # No gradients are needed for evaluation; skipping autograd bookkeeping
    # saves memory and time on every validation batch.
    with torch.no_grad():
        for batch in val_iter:
            x, y = batch.text.to(DEVICE), batch.label.to(DEVICE)
            y.data.sub_(1)  # shift label ids down by one, in place
            logit = model(x)
            # reduction='sum' so dividing by the dataset size below yields a
            # per-example average even when the last batch is smaller.
            loss = F.cross_entropy(logit, y, reduction='sum')
            total_loss += loss.item()
            corrects += (logit.argmax(dim=1) == y).sum()
    size = len(val_iter.dataset)
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy

In [None]:
# Train for EPOCHS epochs; after each epoch evaluate on the validation set and
# save the model weights whenever the validation loss improves.
best_val_loss = None
for i in range(1, EPOCHS + 1):
    train(model, optimizer, train_iter)
    val_loss, val_accuracy = evaluate(model, val_iter)

    print('[Epoch : %d] val loss : %5.2f | val accuracy : %5.2f' % (i, val_loss, val_accuracy))

    # Compare against None explicitly: a best loss of 0.0 is falsy and would
    # otherwise be treated as "no best loss recorded yet".
    if best_val_loss is None or val_loss < best_val_loss:
        os.makedirs('snapshot', exist_ok=True)
        torch.save(model.state_dict(), './snapshot/txtclassification.pt')
        best_val_loss = val_loss