In [0]:
# tutorial: https://github.com/bentrevett/pytorch-sentiment-analysis
import torch
from torchtext import data
from torchtext import datasets

# Fix the torch RNG seed and request deterministic cuDNN kernels so that
# repeated runs of the notebook are comparable.
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# include_lengths=True: batch.text is later unpacked as (tokens, lengths);
# the lengths are what the model feeds to pack_padded_sequence.
text = data.Field(include_lengths=True, tokenize='spacy')
# Float labels, matching the BCEWithLogitsLoss criterion used for training.
label = data.LabelField(dtype=torch.float)

In [2]:
import random

train_data, test_data = datasets.IMDB.splits(text, label)

# Carve a validation set out of the training data.
# random.seed() returns None, so passing its return value as random_state only
# worked through the side effect of seeding the global RNG. Seed explicitly and
# hand over the resulting RNG state instead.
random.seed(SEED)
train_data, val_data = train_data.split(random_state=random.getstate())

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:07<00:00, 11.3MB/s]


In [3]:
# Upper bound on the number of distinct words kept in the vocabulary.
# (The printed model further down reports 25002 embedding rows, i.e. this cap
# plus two extra entries -- presumably the <unk> and <pad> tokens.)
VOCAB_SIZE = 25000

# The vocabulary is the word <-> index lookup table. It is built from the
# training split only and initialised from pretrained 100-d GloVe vectors;
# unk_init supplies normally-distributed values where needed.
text.build_vocab(
    train_data,
    max_size=VOCAB_SIZE,
    vectors='glove.6B.100d',
    unk_init=torch.Tensor.normal_,
)
label.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [06:31, 2.20MB/s]                           
100%|█████████▉| 399101/400000 [00:24<00:00, 16576.24it/s]

In [0]:
batch_size = 64

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# sort_within_batch=True because the model packs its padded input with
# pack_padded_sequence, which by default expects length-sorted sequences.
iterators = data.BucketIterator.splits(
    (train_data, val_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    device=device,
)
train_iterator, val_iterator, test_iterator = iterators

# Build Model

In [0]:
import torch.nn as nn

class RNN(nn.Module):
    """LSTM sentiment classifier: embedding -> (bi)LSTM -> dropout -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector (LSTM input size).
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output logits.
        num_layers: number of stacked LSTM layers.
        is_bidirectional: whether the LSTM runs in both directions.
        dropout_rate: dropout probability applied to the embeddings, between
            LSTM layers (only when num_layers > 1) and to the final hidden
            state.
        padding_idx: vocabulary index of the padding token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 num_layers, is_bidirectional, dropout_rate, padding_idx):
        super().__init__()

        # Remembered so forward() knows how to assemble the final hidden state.
        self.is_bidirectional = is_bidirectional

        self.embedding = nn.Embedding(vocab_size, embedding_dim,
                                      padding_idx = padding_idx)
        # Inter-layer dropout is meaningless for a single-layer LSTM and makes
        # PyTorch emit a warning, so it is only enabled for stacked LSTMs.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim,
                          num_layers = num_layers,
                          bidirectional = is_bidirectional,
                          dropout = dropout_rate if num_layers > 1 else 0.0)
        # The classifier sees one hidden vector per direction.
        num_directions = 2 if is_bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, text, text_lengths):
        """Return logits of shape (batch, output_dim).

        Args:
            text: LongTensor of token indices, laid out (seq_len, batch) as
                the LSTM is not created with batch_first.
            text_lengths: per-sequence lengths (tensor or list), sorted in
                decreasing order as pack_padded_sequence expects by default.
        """
        embedded = self.dropout(self.embedding(text))

        # pack_padded_sequence wants the lengths on the CPU even when the
        # token tensor lives on the GPU.
        if torch.is_tensor(text_lengths):
            text_lengths = text_lengths.cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)

        # Only the final hidden state is used; the per-step outputs and the
        # cell state are discarded.
        _, (hidden, _) = self.rnn(packed_embedded)

        if self.is_bidirectional:
            # The last two entries are the top layer's forward and backward
            # final states; join them along the feature dimension.
            final_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Unidirectional: the last entry is the top layer's final state.
            final_hidden = hidden[-1, :, :]

        return self.fc(self.dropout(final_hidden))

In [0]:
# Hyper-parameters
input_dim = len(text.vocab)
embedding_dim = 100  # must equal the dimension of the pretrained vectors
hidden_dim = 256
output_dim = 1
num_layers = 2
is_bidirectional = True
dropout_rate = 0.5
pad_idx = text.vocab.stoi[text.pad_token]

model = RNN(
    input_dim, embedding_dim, hidden_dim, output_dim,
    num_layers, is_bidirectional, dropout_rate, pad_idx,
)

# Start the embedding layer from the pretrained vectors held by the vocabulary.
pretrained_embeddings = text.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

# Zero the <unk> and <pad> rows, which were previously initialized via unk_init.
unk_idx = text.vocab.stoi[text.unk_token]
for special_idx in (unk_idx, pad_idx):
    model.embedding.weight.data[special_idx] = torch.zeros(embedding_dim)

In [7]:
# Inspect the embedding matrix after the <unk>/<pad> rows have been zeroed.
print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.1596,  0.0409, -0.7114,  ...,  0.7691, -0.3105, -0.1981],
        [-0.9153,  0.4699, -0.6548,  ..., -0.2684, -0.3351, -0.2960],
        [-0.1853, -0.0486, -0.1572,  ..., -0.3927, -0.0297, -0.1951]])


In [8]:
# Total number of trainable parameters (those with requires_grad set).
sum(p.numel() for p in model.parameters() if p.requires_grad)

4810857

In [9]:
# Print the module tree to double-check layer sizes and settings.
print(model)

RNN(
  (embedding): Embedding(25002, 100, padding_idx=1)
  (rnn): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


# Define Train and Eval functions

In [10]:
import torch.optim as optim
# Place the model on the target device, then build the optimizer over its
# parameters (Adam with its default settings).
model.to(device)
optimizer = optim.Adam(model.parameters())

# Binary cross-entropy on raw logits; the model itself applies no sigmoid.
criterion = nn.BCEWithLogitsLoss()
criterion.to(device)

BCEWithLogitsLoss()

In [0]:
def accuracy(y_pred, y_orig):
    """Fraction of predictions that match the labels.

    y_pred holds raw logits: they are squashed with a sigmoid and rounded to
    0/1 before being compared element-wise with y_orig. The result is a
    tensor (sum of matches divided by the length of the first dimension).
    """
    predicted_labels = torch.sigmoid(y_pred).round()
    hits = (predicted_labels == y_orig).float()
    return hits.sum() / len(hits)

In [0]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch over `iterator`.

    Each batch must expose `.text` as a (tokens, lengths) pair and `.label`.
    Returns (loss, accuracy), each averaged over the number of batches
    (not over the number of samples).
    """
    total_loss = 0
    total_acc = 0

    model.train()
    for batch in iterator:
        optimizer.zero_grad()

        tokens, lengths = batch.text

        # The model emits one logit per sample in a trailing dimension of
        # size 1; drop it so predictions line up with the labels.
        logits = model(tokens, lengths).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
        total_acc += batch_acc.item()

    return total_loss/len(iterator), total_acc/len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
    """Score the model on `iterator` without updating any weights.

    Puts the model in eval mode and disables gradient tracking. Each batch
    must expose `.text` as a (tokens, lengths) pair and `.label`.
    Returns (loss, accuracy), each averaged over the number of batches.
    """
    total_loss = 0
    total_acc = 0

    model.eval()
    with torch.no_grad():
        for batch in iterator:
            tokens, lengths = batch.text

            logits = model(tokens, lengths).squeeze(1)
            batch_loss = criterion(logits, batch.label)
            batch_acc = accuracy(logits, batch.label)

            total_loss += batch_loss.item()
            total_acc += batch_acc.item()

    return total_loss/len(iterator), total_acc/len(iterator)

In [0]:
import time
def epoch_time(s, e):
    """Split the interval between timestamps `s` and `e` (seconds) into
    whole minutes and remaining whole seconds, returned as (minutes, seconds).
    """
    elapsed = e - s
    minutes = int(elapsed / 60)
    seconds = int(elapsed - 60 * minutes)
    return minutes, seconds

# Train model

In [15]:
epochs = 5
best_val_loss = float('inf')

for epoch in range(epochs):
    # Time one full pass: training followed by validation.
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    val_loss, val_acc = evaluate(model, val_iterator, criterion)
    end_time = time.time()

    epoch_min, epoch_sec = epoch_time(start_time, end_time)

    # Checkpoint only when the validation loss improves, so the file on disk
    # always holds the best weights seen so far.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'senti-lstm.pt')

    report = (
        f'Epoch: {epoch+1:02} | Epoch Time: {epoch_min}m {epoch_sec}s',
        f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%',
        f'\t Val. Loss: {val_loss:.3f} |  Val. Acc: {val_acc*100:.2f}%',
    )
    print(*report, sep='\n')

Epoch: 01 | Epoch Time: 1m 37s
	Train Loss: 0.682 | Train Acc: 55.73%
	 Val. Loss: 0.691 |  Val. Acc: 52.84%
Epoch: 02 | Epoch Time: 1m 36s
	Train Loss: 0.647 | Train Acc: 61.92%
	 Val. Loss: 0.564 |  Val. Acc: 71.23%
Epoch: 03 | Epoch Time: 1m 37s
	Train Loss: 0.515 | Train Acc: 75.69%
	 Val. Loss: 0.396 |  Val. Acc: 82.66%
Epoch: 04 | Epoch Time: 1m 37s
	Train Loss: 0.389 | Train Acc: 82.96%
	 Val. Loss: 0.434 |  Val. Acc: 81.27%
Epoch: 05 | Epoch Time: 1m 37s
	Train Loss: 0.365 | Train Acc: 84.63%
	 Val. Loss: 0.339 |  Val. Acc: 85.32%


# Test Model

In [16]:
# Restore the best checkpoint written by the training loop. map_location
# remaps the stored tensors onto the current device, so a checkpoint saved on
# a GPU can still be loaded on a CPU-only machine.
best_state = torch.load('senti-lstm.pt', map_location=device)
model.load_state_dict(best_state)

# Final score on the held-out test split.
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.363 | Test Acc: 83.77%


# Run model on user input

In [17]:
import spacy
nlp = spacy.load('en')

# Step-by-step walkthrough of turning one raw sentence into model input.
sentence = "Hello there. How do you do?"
tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
indexed = [text.vocab.stoi[t] for t in tokenized]
length = [len(indexed)]

print(tokenized)
print(indexed)
print(length)

# The model takes (seq_len, batch) input, so add a batch dimension of size 1.
tensor = torch.LongTensor(indexed).to(device)
print(tensor.shape)
tensor = tensor.unsqueeze(1)
print(tensor.shape)
# Lengths stay on the CPU; pack_padded_sequence consumes them from there.
length_tensor = torch.LongTensor(length)

print(tensor)
print(length_tensor)

# Inference: set eval mode explicitly (rather than relying on whatever mode an
# earlier cell left the model in) so dropout is off, and skip gradient tracking.
model.eval()
with torch.no_grad():
    y_pred = torch.sigmoid(model(tensor, length_tensor))
print(y_pred, y_pred.item())

['Hello', 'there', '.', 'How', 'do', 'you', 'do', '?']
[7609, 67, 4, 572, 57, 31, 57, 58]
[8]
torch.Size([8])
torch.Size([8, 1])
tensor([[7609],
        [  67],
        [   4],
        [ 572],
        [  57],
        [  31],
        [  57],
        [  58]], device='cuda:0')
tensor([8])
tensor([[0.0558]], device='cuda:0', grad_fn=<SigmoidBackward>) 0.055780038237571716


In [0]:
# spaCy pipeline whose tokenizer predict_senti uses to split raw sentences.
import spacy
nlp = spacy.load('en')

def predict_senti(model, sentence):
    """Score a single raw sentence with the trained model.

    The sentence is tokenized with the module-level spaCy tokenizer, mapped
    to vocabulary indices, shaped as a batch of one and run through the model
    in eval mode without gradient tracking.

    Args:
        model: the trained classifier; called as model(tokens, lengths).
        sentence: raw input string.

    Returns:
        The sigmoid of the model's logit as a Python float in [0, 1].

    Raises:
        ValueError: if the sentence contains no tokens (a zero-length
            sequence cannot be packed for the LSTM).
    """
    model.eval()

    tokenized = [token.text for token in nlp.tokenizer(sentence)]
    if not tokenized:
        raise ValueError('sentence must contain at least one token')
    indexed = [text.vocab.stoi[t] for t in tokenized]

    # Shape (seq_len, 1): a batch holding just this sentence.
    tensor = torch.LongTensor(indexed).unsqueeze(1).to(device)
    # Lengths are kept on the CPU for pack_padded_sequence.
    length_tensor = torch.LongTensor([len(indexed)])

    # No gradients are needed for prediction; skipping them saves memory.
    with torch.no_grad():
        y_pred = torch.sigmoid(model(tensor, length_tensor))

    return y_pred.item()

In [20]:
# A few hand-written reviews to eyeball the model's behaviour, including
# negations and mixed-sentiment cases.
sentences = [
    'This movies is alright',
    'Pathetic hero. Anyways, the movie is good',
    'Wasted my money on this film',
    'The movie is not good, its amazing',
    'The movie is not good',
    'The movie is bad',
]

for sentence in sentences:
    score = predict_senti(model, sentence)
    print(sentence, score)

This movies is alright 0.02195347659289837
Pathetic hero. Anyways, the movie is good 0.9661369919776917
Wasted my money on this film 0.04008140787482262
The movie is not good, its amazing 0.9844872355461121
The movie is not good 0.33917102217674255
The movie is bad 0.015250181779265404
