In [1]:
## ALL IMPORTS FOR A NEW NOTEBOOK

import os, sys, random, math
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import itertools as it
import scipy
import glob
import matplotlib
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader
from torch.optim import Optimizer
import torchvision.transforms.transforms as txf
import torch.optim.lr_scheduler as lr_scheduler
from collections import OrderedDict

from sklearn import metrics
from sklearn import preprocessing as pp
from sklearn import model_selection as ms

import torch_utils
from tqdm.notebook import tqdm_notebook as tqdm
import time

# Use a larger default font size for every matplotlib figure in this notebook.
font = {'size'   : 20}

matplotlib.rc('font', **font)

In [2]:
from torchtext import data
from torchtext import datasets

# Seed through the project helper so runs are repeatable.
# NOTE(review): presumably seeds `random`, numpy and torch together — confirm in torch_utils.
torch_utils.seed_everything(1234)

In [3]:
# TEXT: review text tokenized with spaCy. include_lengths=True makes each batch
# carry a (tokens, lengths) pair, which train()/evaluate() unpack as (txt, txt_len).
TEXT = data.Field(tokenize="spacy", include_lengths=True)
# LABEL: sentiment target stored as float, as required by the BCEWithLogitsLoss
# criterion used later in the notebook.
LABEL = data.LabelField(dtype=torch.float)

In [4]:
%%time
# Load the IMDB train/test splits, processed through the TEXT and LABEL fields.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

CPU times: user 1min 7s, sys: 755 ms, total: 1min 8s
Wall time: 1min 7s


In [5]:
# Hold out 20% of the training set for validation.
#
# `random.seed(...)` returns None, so the previous `random_state=random.seed(1234)`
# actually passed `random_state=None` and only stayed reproducible through the
# side effect of seeding the global generator. Seed explicitly and hand split()
# the generator state instead, which is the documented form of `random_state`.
# NOTE(review): legacy torchtext falls back to `random.getstate()` when given
# None, so the resulting split should be unchanged — confirm on the installed version.
#
# Caution: this cell rebinds `train_data`; re-running it without reloading the
# dataset splits the already-reduced training set again.
random.seed(1234)
train_data, valid_data = train_data.split(split_ratio=0.8, random_state=random.getstate())

In [6]:
# Build the vocabularies from the training split only (after the validation
# hold-out), keeping at most MAX_VOCAB_SIZE word types and attaching pretrained
# 100-dimensional GloVe vectors. `unk_init` is applied to tokens that have no
# pretrained vector (here: filled in place from a normal distribution).
# The vector table printed further down has 25002 rows, i.e. MAX_VOCAB_SIZE plus
# two extra entries — presumably the unk and pad specials.
MAX_VOCAB_SIZE = 25000
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE, vectors = "glove.6B.100d", unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

100%|█████████▉| 398101/400000 [00:30<00:00, 21642.65it/s]

In [7]:
BATCH_SIZE = 64
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# One iterator per split, all with the same batch size and placed on `device`.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_sizes=(BATCH_SIZE, BATCH_SIZE, BATCH_SIZE),
    # RNN.forward packs each batch with pack_padded_sequence, which by default
    # expects sequences ordered by length — presumably longest first; confirm
    # against the dataset's sort_key.
    sort_within_batch=True,
    device=device
)

In [15]:
class RNN(nn.Module):
    """LSTM sentence classifier over packed, padded token-id sequences.

    Pipeline: embedding -> dropout -> (stacked, optionally bidirectional) LSTM
    -> dropout on the final hidden state of the top layer -> linear head.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output logits per example.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM reads the sequence in both directions.
        dropout: dropout probability used on the embeddings, between LSTM
            layers, and on the final hidden state.
        pad_idx: vocabulary index of the padding token (its embedding receives
            no gradient).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
        super(RNN, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            # Inter-layer dropout only exists between stacked layers; passing a
            # non-zero value with a single layer has no effect and makes
            # nn.LSTM emit a warning.
            dropout=dropout if n_layers > 1 else 0.0,
        )

        # A bidirectional LSTM yields one final state per direction, which are
        # concatenated in forward(); the head must be twice as wide.
        fc_in = hidden_dim * 2 if bidirectional else hidden_dim

        self.fc = nn.Linear(fc_in, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, txt, txt_len):
        """Return one row of ``output_dim`` logits per sequence in the batch.

        Args:
            txt: token ids, time-major ``(seq_len, batch)`` (the LSTM and the
                packing call both use the default ``batch_first=False``).
            txt_len: true length of every sequence in the batch (tensor or
                list), ordered longest first as pack_padded_sequence expects
                by default.

        Returns:
            Tensor of shape ``(batch, output_dim)`` with raw logits.
        """
        embedded = self.dropout(self.embedding(txt))

        # pack_padded_sequence wants the lengths on the CPU (mandatory in
        # recent PyTorch releases), regardless of where the batch lives.
        lengths = torch.as_tensor(txt_len).cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths)

        # Only the final hidden state is used for classification, so the
        # per-step outputs are not unpacked.
        _, (ht, _) = self.rnn(packed_embedded)

        if self.rnn.bidirectional:
            # Top layer: ht[-2] is the forward direction, ht[-1] the backward one.
            final_hidden = torch.cat((ht[-2, :, :], ht[-1, :, :]), dim=1)
        else:
            # Unidirectional: the top layer's state is the last entry. The old
            # code concatenated the last two *layers* here, which did not match
            # the width of ``self.fc`` (or failed outright with one layer).
            final_hidden = ht[-1, :, :]

        return self.fc(self.dropout(final_hidden))

In [47]:
# Model hyperparameters; the embedding width matches the 100-d GloVe vectors
# attached to TEXT's vocabulary.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_NUM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Keyword arguments make the mapping onto RNN.__init__ explicit.
model = RNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_NUM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

In [48]:
# Report the model size via the project helper.
# NOTE(review): presumably counts trainable parameters only — confirm in torch_utils.
torch_utils.count_model_params(model)

4810857

In [49]:
# Vector table attached to the vocabulary by build_vocab; its shape must match
# the model's embedding layer (len(TEXT.vocab) x EMBEDDING_DIM) for the copy below.
pretrained_embeddings = TEXT.vocab.vectors
pretrained_embeddings.shape

torch.Size([25002, 100])

In [50]:
# Start the embedding layer from the pretrained vectors.
model.embedding.weight.data.copy_(pretrained_embeddings)

# Blank out the rows of the two special tokens so they start as all-zero vectors.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
for special_idx in (UNK_IDX, PAD_IDX):
    model.embedding.weight.data[special_idx] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.2737,  0.3614,  0.3459,  ..., -1.0780, -0.2267,  0.7168],
        [-0.6632,  0.0449,  0.2853,  ...,  0.2924, -0.8625,  0.4472],
        [-0.0929,  0.4367,  0.6385,  ...,  0.6854,  0.2857, -0.4070]])


In [51]:
# Optimise every model parameter with the project's RAdam implementation,
# using its default hyperparameters.
optimizer = torch_utils.RAdam(model.parameters())

In [52]:
# Binary cross-entropy on raw logits (the model has no final sigmoid).
criterion = nn.BCEWithLogitsLoss().to(device)
# NOTE(review): the optimizer was created before this move to `device`; PyTorch's
# guidance is to move the model first — confirm RAdam keeps no device-bound state
# at construction time.
model = model.to(device)

In [53]:
def binary_accuracy(preds, y):
    """Fraction of logits whose rounded sigmoid equals the target.

    Args:
        preds: raw logits.
        y: targets with the same shape as ``preds``, holding 0.0 / 1.0.

    Returns:
        A scalar tensor: number of matches divided by ``len(...)`` of the
        batch, i.e. by the size of its first dimension.
    """
    hard_labels = torch.sigmoid(preds).round()
    hits = hard_labels.eq(y).float()
    return hits.sum() / len(hits)

In [54]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator`` with the model in train mode.

    Every batch must unpack as ``((txt, txt_len), label)``. For each batch the
    gradients are reset, the loss is back-propagated and the optimizer takes
    one step.

    Returns:
        ``(mean_loss, mean_accuracy)``: per-batch values averaged over the
        number of batches (``len(iterator)``), so each batch carries equal
        weight regardless of its size.
    """
    model.train()

    running_loss, running_acc = 0, 0
    for batch in tqdm(iterator):
        (tokens, lengths), targets = batch

        optimizer.zero_grad()

        # The model emits (batch, 1); drop the trailing axis to line up with the targets.
        logits = model(tokens, lengths).squeeze(dim=1)
        batch_loss = criterion(logits, targets)
        batch_acc = binary_accuracy(logits, targets)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [55]:
def evaluate(model, iterator, criterion):
    """Score the model on ``iterator`` in eval mode, without tracking gradients.

    Every batch must unpack as ``((txt, txt_len), label)``.

    Returns:
        ``(mean_loss, mean_accuracy)``: per-batch values averaged over the
        number of batches (``len(iterator)``).
    """
    model.eval()

    total_loss, total_acc = 0, 0
    with torch.no_grad():
        for batch in tqdm(iterator):
            (tokens, lengths), targets = batch

            # The model emits (batch, 1); drop the trailing axis to line up with the targets.
            logits = model(tokens, lengths).squeeze(dim=1)

            total_loss += criterion(logits, targets).item()
            total_acc += binary_accuracy(logits, targets).item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

In [56]:
# Train for N_EPOCHS, evaluating on the validation split after every epoch and
# checkpointing the weights whenever the validation loss reaches a new minimum.
N_EPOCHS=25
best_valid_loss = float("inf")

for e in range(N_EPOCHS):
    st = time.time()
    
    trl, tra = train(model, train_iterator, optimizer, criterion)
    vll, vla = evaluate(model, valid_iterator, criterion)
    
    # Keep only the best-so-far weights; this file is reloaded before testing.
    if vll<best_valid_loss:
        best_valid_loss = vll
        torch.save(model.state_dict(), "tut2-model.pt")
        
    # The reported time covers both the training and the validation pass.
    print("EPOCH {} time {} seconds".format(e+1, time.time()-st))
    print("EPOCH {} train_loss {} train accuracy {}".format(e+1, trl, tra))
    print("EPOCH {} valid_loss {} valid accuracy {}".format(e+1, vll, vla))
    
    # NOTE(review): presumably releases cached CUDA memory between epochs — confirm in torch_utils.
    torch_utils.clear_cuda()

HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


EPOCH 1 time 35.06717658042908 seconds
EPOCH 1 train_loss 0.6678135415997368 train accuracy 0.5764776357827476
EPOCH 1 valid_loss 0.6689833462992801 valid accuracy 0.5684335443037974


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


EPOCH 2 time 34.623358964920044 seconds
EPOCH 2 train_loss 0.5895412014886594 train accuracy 0.6919928115015974
EPOCH 2 valid_loss 0.6891329318662233 valid accuracy 0.5567642405063291


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


EPOCH 3 time 35.28020763397217 seconds
EPOCH 3 train_loss 0.5602907975451253 train accuracy 0.722444089456869
EPOCH 3 valid_loss 0.4703198448766636 valid accuracy 0.7917325949367089


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


EPOCH 4 time 34.79752564430237 seconds
EPOCH 4 train_loss 0.4743995021897764 train accuracy 0.778404552715655
EPOCH 4 valid_loss 0.43405736171746556 valid accuracy 0.8146756329113924


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


EPOCH 5 time 33.3135986328125 seconds
EPOCH 5 train_loss 0.4002790800489176 train accuracy 0.8267771565495208
EPOCH 5 valid_loss 0.3485623713158354 valid accuracy 0.8589794303797469


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


EPOCH 6 time 33.49093151092529 seconds
EPOCH 6 train_loss 0.3756969945784956 train accuracy 0.8366114217252396
EPOCH 6 valid_loss 0.3823942879710016 valid accuracy 0.8265427215189873


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


EPOCH 7 time 34.517621755599976 seconds
EPOCH 7 train_loss 0.31956646436700425 train accuracy 0.8700579073482428
EPOCH 7 valid_loss 0.3519630905570863 valid accuracy 0.848496835443038


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


EPOCH 8 time 34.231544494628906 seconds
EPOCH 8 train_loss 0.2886659193296021 train accuracy 0.8844349041533547
EPOCH 8 valid_loss 0.3340438828815388 valid accuracy 0.8659018987341772


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


EPOCH 9 time 34.34150505065918 seconds
EPOCH 9 train_loss 0.2461722497456371 train accuracy 0.9050519169329073
EPOCH 9 valid_loss 0.2705227662396582 valid accuracy 0.8935917721518988


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


EPOCH 10 time 34.76779556274414 seconds
EPOCH 10 train_loss 0.21761020556235086 train accuracy 0.9170327476038339
EPOCH 10 valid_loss 0.2701240150800234 valid accuracy 0.8943829113924051


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


EPOCH 11 time 34.360140562057495 seconds
EPOCH 11 train_loss 0.20094884817783062 train accuracy 0.9217252396166135
EPOCH 11 valid_loss 0.27353463967955566 valid accuracy 0.8975474683544303


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


EPOCH 12 time 34.412373542785645 seconds
EPOCH 12 train_loss 0.1910690041586233 train accuracy 0.9268670127795527
EPOCH 12 valid_loss 0.25574024370576764 valid accuracy 0.903876582278481


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


EPOCH 13 time 34.013755083084106 seconds
EPOCH 13 train_loss 0.18198327507121495 train accuracy 0.9300119808306709
EPOCH 13 valid_loss 0.2628028146639655 valid accuracy 0.9032832278481012


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


EPOCH 14 time 33.957863092422485 seconds
EPOCH 14 train_loss 0.14583162326639454 train accuracy 0.946785143769968
EPOCH 14 valid_loss 0.3032908025433463 valid accuracy 0.8945806962025317


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


EPOCH 15 time 33.46272921562195 seconds
EPOCH 15 train_loss 0.13112593816515927 train accuracy 0.9519269169329073
EPOCH 15 valid_loss 0.28457975477168834 valid accuracy 0.8943829113924051


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


EPOCH 16 time 34.05236077308655 seconds
EPOCH 16 train_loss 0.11895009543234929 train accuracy 0.956819089456869
EPOCH 16 valid_loss 0.3053111016420247 valid accuracy 0.901503164556962


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


EPOCH 17 time 33.76441931724548 seconds
EPOCH 17 train_loss 0.11014374682364372 train accuracy 0.9612120607028753
EPOCH 17 valid_loss 0.32084689578159326 valid accuracy 0.900118670886076


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


EPOCH 18 time 33.49815607070923 seconds
EPOCH 18 train_loss 0.1010108947152861 train accuracy 0.9637080670926518
EPOCH 18 valid_loss 0.288611830082498 valid accuracy 0.9042721518987342


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


EPOCH 19 time 33.90289640426636 seconds
EPOCH 19 train_loss 0.08800960738604632 train accuracy 0.9690495207667732
EPOCH 19 valid_loss 0.3385876460160023 valid accuracy 0.9017009493670886


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


EPOCH 20 time 34.393409967422485 seconds
EPOCH 20 train_loss 0.07962755150735949 train accuracy 0.9719448881789138
EPOCH 20 valid_loss 0.3485571901414166 valid accuracy 0.9020965189873418


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


EPOCH 21 time 33.00711703300476 seconds
EPOCH 21 train_loss 0.07358908205748366 train accuracy 0.9734424920127795
EPOCH 21 valid_loss 0.3591820115011327 valid accuracy 0.9017009493670886


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


EPOCH 22 time 33.79064226150513 seconds
EPOCH 22 train_loss 0.06656544002891777 train accuracy 0.9757887380191693
EPOCH 22 valid_loss 0.3558947813426015 valid accuracy 0.9034810126582279


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


EPOCH 23 time 33.27406597137451 seconds
EPOCH 23 train_loss 0.055985894909622476 train accuracy 0.9820786741214057
EPOCH 23 valid_loss 0.356627658074368 valid accuracy 0.9066455696202531


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


EPOCH 24 time 35.8248085975647 seconds
EPOCH 24 train_loss 0.06308447104393722 train accuracy 0.9775359424920128
EPOCH 24 valid_loss 0.352033241436805 valid accuracy 0.9020965189873418


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


EPOCH 25 time 34.722976207733154 seconds
EPOCH 25 train_loss 0.04738657427847552 train accuracy 0.983426517571885
EPOCH 25 valid_loss 0.3473031512208141 valid accuracy 0.9036787974683544


In [57]:
# NOTE(review): project helper, presumably frees cached CUDA memory after training — confirm in torch_utils.
torch_utils.clear_cuda()

In [58]:
# Restore the weights saved by the training loop at the epoch with the lowest
# validation loss, mapping them onto the current device.
model.load_state_dict(torch.load("tut2-model.pt", map_location=device))

<All keys matched successfully>

In [59]:
# NOTE(review): project helper, presumably frees cached CUDA memory before testing — confirm in torch_utils.
torch_utils.clear_cuda()

In [60]:
# Final score of the restored best checkpoint on the held-out test split.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

# Accuracy is the mean of per-batch accuracies, shown here as a percentage.
print(test_loss, 100.0*test_acc,"%")

HBox(children=(IntProgress(value=0, max=391), HTML(value='')))


0.2770649896329626 89.09606778103372 %


In [61]:
# NOTE(review): project helper, presumably frees cached CUDA memory after testing — confirm in torch_utils.
torch_utils.clear_cuda()

In [62]:
# spaCy pipeline used by predict_sentiment to tokenize free text.
# NOTE(review): this import sits outside the notebook's top import cell, and the
# "en" shortcut name is only resolvable on older spaCy installs — newer releases
# expect a full package name such as "en_core_web_sm". Confirm the installed version.
import spacy
nlp = spacy.load("en")

In [66]:
# Show the label -> index mapping; the index assigned to 'pos' determines which
# class a high sigmoid output from the model stands for.
LABEL.vocab.stoi

defaultdict(None, {'neg': 0, 'pos': 1})

In [67]:
def predict_sentiment(model, txt):
    """Score a single piece of raw text with the trained classifier.

    The text is tokenized with the notebook-level spaCy pipeline ``nlp``,
    mapped to ids through ``TEXT.vocab.stoi`` and fed to the model as a
    one-example batch. The model is switched to eval mode as a side effect.

    Args:
        model: module called as ``model(tokens, lengths)`` that returns a
            single logit for a batch of one.
        txt: raw text to classify.

    Returns:
        The sigmoid of the model's logit as a Python float in [0, 1].

    Raises:
        ValueError: if ``txt`` yields no tokens; a zero-length sequence cannot
            be packed and run through the LSTM.
    """
    model.eval()

    tokenized = [tok.text for tok in nlp.tokenizer(txt)]
    if not tokenized:
        raise ValueError("predict_sentiment requires text with at least one token")

    indexed = [TEXT.vocab.stoi[t] for t in tokenized]

    # Shape (seq_len, 1): a time-major batch holding one example.
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(dim=1)
    # Lengths stay on the CPU, which is what sequence packing expects.
    length_tensor = torch.LongTensor([len(indexed)])

    # Pure inference: do not record an autograd graph.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor, length_tensor))

    return prediction.item()

In [98]:
# Try the classifier on a hand-written review; the result is the model's sigmoid
# output, to be read against the label mapping shown above ('neg': 0, 'pos': 1).
review="""Imagine a horse who watches a marathon of all the episodes of Game of Thrones, Harry Potter and Power Rangers in one go. Then the horse gets drunk and takes some tranquilizers. The horse has a nightmare. This is it."""
predict_sentiment(model, review)

0.6744261384010315