<a href="https://colab.research.google.com/github/namdori61/colab-playground/blob/master/NLP_Sentiment_analysis_CNN_IMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Reference
Open source sentiment analysis tutorial: https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/4%20-%20Convolutional%20Sentiment%20Analysis.ipynb

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import torch
from torchtext import data
from torchtext import datasets
import random
import numpy as np

In [0]:
# For REPRODUCIBILITY, https://pytorch.org/docs/stable/notes/randomness.html
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [5]:
TEXT = data.Field(tokenize='spacy', batch_first=True)
LABEL = data.LabelField(dtype=torch.float)

train_data, test_data = datasets.IMDB.splits(text_field = TEXT, label_field = LABEL)

train_data, valid_data = train_data.split(random_state = random.seed(SEED))

aclImdb_v1.tar.gz:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:02<00:00, 39.8MB/s]


In [6]:
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_data,
                 max_size = MAX_VOCAB_SIZE,
                 vectors = 'glove.6B.100d',
                 unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [06:27, 2.23MB/s]                           
100%|█████████▉| 399365/400000 [00:14<00:00, 27861.51it/s]

In [0]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

In [0]:
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.convs = nn.ModuleList(
            [
             nn.Conv2d(in_channels = 1,
                       out_channels = n_filters,
                       kernel_size = (fs, embedding_dim))
             for fs in filter_sizes
            ]
        )
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, text):
        #text = [batch_size, sent_len]

        embedded = self.embedding(text)
        #embedded = [batch_size, set_len, emb_dim]
        embedded = embedded.unsqueeze(1)        
        #embedded = [batch_size, 1, sent_len, emb_dim]

        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        #conved_n = [batch_size, n_filters, set_len - filter_size[n] + 1]

        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        #pooled_n = [batch_size, n_filters]

        cat = self.dropout(torch.cat(pooled, dim=1))
        #cat = [batch_size, n_filters * len(filter_sizes)]

        return self.fc(cat)

In [0]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [20]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,620,801 trainable parameters


In [21]:
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.6476,  1.2903,  0.0906,  ..., -0.7511, -0.6667,  0.3439],
        [-0.1272, -0.3361, -0.2779,  ..., -0.8548,  0.2575, -0.1269],
        [-0.1098, -0.2944,  0.9547,  ..., -0.1711,  0.4998, -0.3169]])

In [0]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [0]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [0]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

In [0]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()

    for batch in iterator:
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)

        loss = criterion(predictions, batch.label)

        acc = binary_accuracy(predictions, batch.label)

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()
    
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)

            loss = criterion(predictions, batch.label)

            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
    
    return epoch_loss / len(iterator), epoch_acc /len(iterator)

In [27]:
from tqdm import tqdm

N_EPOCHS = 5

best_vaild_loss = float('inf')

for epoch in tqdm(range(N_EPOCHS), desc='training'):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    if valid_loss < best_vaild_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'imdb_cnn_model.pt')
    
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')


training:   0%|          | 0/5 [00:00<?, ?it/s][A
training:  20%|██        | 1/5 [00:18<01:12, 18.12s/it][A

Epoch: 01
	Train Loss: 0.654 | Train Acc: 61.16%
	 Val. Loss: 0.537 |  Val. Acc: 75.95%



training:  40%|████      | 2/5 [00:36<00:54, 18.12s/it][A

Epoch: 02
	Train Loss: 0.438 | Train Acc: 79.76%
	 Val. Loss: 0.357 |  Val. Acc: 84.76%



training:  60%|██████    | 3/5 [00:54<00:36, 18.14s/it][A

Epoch: 03
	Train Loss: 0.304 | Train Acc: 87.14%
	 Val. Loss: 0.316 |  Val. Acc: 86.33%



training:  80%|████████  | 4/5 [01:12<00:18, 18.10s/it][A

Epoch: 04
	Train Loss: 0.221 | Train Acc: 91.28%
	 Val. Loss: 0.311 |  Val. Acc: 87.05%



training: 100%|██████████| 5/5 [01:30<00:00, 18.08s/it]

Epoch: 05
	Train Loss: 0.157 | Train Acc: 94.13%
	 Val. Loss: 0.313 |  Val. Acc: 87.77%





In [28]:
model.load_state_dict(torch.load('imdb_cnn_model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.351 | Test Acc: 85.59%
