In [1]:
# Part of my study of: https://github.com/bentrevett/pytorch-sentiment-analysis

In [2]:
## ALL IMPORTS FOR A NEW NOTEBOOK

import os, sys, random, math
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import itertools as it
import scipy
import glob
import matplotlib
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader
from torch.optim import Optimizer
import torchvision.transforms.transforms as txf
import torch.optim.lr_scheduler as lr_scheduler
from collections import OrderedDict

from sklearn import metrics
from sklearn import preprocessing as pp
from sklearn import model_selection as ms

import torch_utils
from tqdm.notebook import tqdm_notebook as tqdm
import time

# Global matplotlib font settings: every figure in this notebook uses size-20 text.
font = dict(size=20)
matplotlib.rc('font', **font)

In [3]:
# Single seed value reused wherever this notebook needs reproducible randomness.
SEED = 947
# Project-local helper; presumably seeds the Python/NumPy/PyTorch RNGs — confirm in torch_utils.
torch_utils.seed_everything(SEED)

In [4]:
from torchtext import data

In [5]:
# Field describing how review text is processed; tokenization is delegated to spaCy.
TEXT = data.Field(tokenize="spacy")
# Field for the sentiment label, stored as float tensors (the loss used later in
# this notebook, BCEWithLogitsLoss, takes float targets).
LABEL = data.LabelField(dtype=torch.float)

In [8]:
from torchtext import datasets
# Load the IMDB train/test splits, processing review text with TEXT and labels with LABEL.
train_data, test_data = datasets.IMDB.splits(text_field=TEXT, label_field=LABEL)

In [9]:
# Report how many examples each of the two official IMDB splits contains.
print(
    "Number of training examples: {}".format(len(train_data)),
    "Number of test examples: {}".format(len(test_data)),
    sep="\n",
)

Number of training examples: 25000
Number of test examples: 25000


In [10]:
# Inspect one raw training example: `vars` exposes the attributes stored on the
# Example object (e.g. the tokenized 'text' list).
print(vars(train_data.examples[0]))

{'text': ['Action', '.', 'Comedy', '.', 'Suspense', '.', 'This', 'movie', 'has', 'it', 'all.<br', '/><br', '/>The', 'Plot', 'goes', 'that', '4', 'would', 'be', 'professional', 'thieves', 'are', 'invited', 'to', 'take', 'part', 'in', 'a', 'heist', 'in', 'a', 'small', 'town', 'in', 'Montana', '.', 'every', 'type', 'of', 'crime', 'movie', 'archetype', 'character', 'is', 'here', '.', 'Frank', ',', 'the', 'master', 'mind', '.', 'Carlos', ',', 'the', 'weapons', 'expert', '.', 'Max', ',', 'the', 'explosives', 'expert', '.', 'Nick', ',', 'the', 'safe', 'cracker', 'and', 'Ray', ',', 'the', 'car', 'man', '.', 'Unfortunately', 'for', 'Frank', ',', 'he', 'is', 'apprehended', 'by', '2', 'bumbling', 'detectives', '(', 'portrayed', 'very', 'well', 'by', 'Ed', "O'Niel", 'and', 'Daniel', 'Roebuck', ')', 'that', 'have', 'been', 'chasing', 'him', 'from', 'New', 'Jersey', 'write', 'after', 'he', 'sends', 'out', 'the', 'letters', 'to', 'the', 'other', '4.<br', '/><br', '/>Our', '4', 'characters', 'meet', '

In [11]:
# Hold out 20% of the training reviews as a validation set.
#
# `random.seed()` returns None, so passing its result as `random_state` hands
# torchtext None and only works through the side effect of seeding the global
# generator. Seed explicitly and pass the generator state, which is what
# `Dataset.split` documents for `random_state`.
#
# NOTE: this cell rebinds `train_data`; re-running it without re-running the
# loading cell splits the already-reduced training set again.
random.seed(SEED)
train_data, valid_data = train_data.split(split_ratio=0.8, random_state=random.getstate())

In [12]:
# Sizes of the three splits after carving the validation set out of the training data.
print(
    "train len {}".format(len(train_data)),
    "valid len {}".format(len(valid_data)),
    "test len {}".format(len(test_data)),
    sep="\n",
)

train len 20000
valid len 5000
test len 25000


In [13]:
# Upper bound on the number of distinct tokens kept in the text vocabulary.
MAX_VOCAB_SIZE = 25000

# Both vocabularies are built from the training split only; the validation and
# test splits are not passed in here.
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

In [14]:
# Vocabulary sizes; the text vocab holds the capped tokens plus the '<unk>' and '<pad>' specials.
len(TEXT.vocab), len(LABEL.vocab)

(25002, 2)

In [15]:
# The 20 most frequent tokens seen while building the text vocabulary, with their counts.
TEXT.vocab.freqs.most_common(20)

[('the', 231625),
 (',', 219352),
 ('.', 189108),
 ('a', 125105),
 ('and', 125098),
 ('of', 115281),
 ('to', 106932),
 ('is', 87194),
 ('in', 70061),
 ('I', 61871),
 ('it', 60885),
 ('that', 56096),
 ('"', 50728),
 ("'s", 49374),
 ('this', 48373),
 ('-', 42226),
 ('/><br', 40588),
 ('was', 39579),
 ('as', 34646),
 ('with', 34340)]

In [16]:
# Index -> token lookup table; show the first ten entries.
TEXT.vocab.itos[:10]

['<unk>', '<pad>', 'the', ',', '.', 'a', 'and', 'of', 'to', 'is']

In [17]:
# Label string -> integer index mapping; check which class is encoded as 1
# before interpreting the model's predictions.
LABEL.vocab.stoi

defaultdict(None, {'pos': 0, 'neg': 1})

In [18]:
BATCH_SIZE = 64

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# One BucketIterator per split, all sharing the same batch size and placing
# their batches on `device`.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)

In [21]:
class RNN(nn.Module):
    """Embedding -> single-layer ``nn.RNN`` -> linear classifier.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the recurrent hidden state.
        output_dim: Number of values produced per input sequence.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        # Attribute names are part of the saved state_dict keys; keep them stable.
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, txt):
        """Map a tensor of token indices to one output vector per sequence.

        ``nn.RNN`` is used with its default ``batch_first=False``, so ``txt``
        is expected as (seq_len, batch).
        """
        embedded = self.embedding(txt)
        # Only the final hidden state is used; the per-step outputs are discarded.
        _, final_hidden = self.rnn(embedded)
        # Drop the leading (num_layers * num_directions == 1) dimension.
        sentence_repr = final_hidden.squeeze(0)
        return self.fc(sentence_repr)

In [22]:
# Model hyperparameters.
INPUT_DIM = len(TEXT.vocab)  # one embedding row per vocabulary entry
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1               # a single logit per review

model = RNN(
    input_dim=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
)

In [23]:
def count_model_params(model):
    """Return the total number of trainable scalar parameters in ``model``.

    Parameters whose ``requires_grad`` flag is False are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [24]:
# Number of trainable parameters in the model defined above.
count_model_params(model)

2592105

In [26]:
# Plain stochastic gradient descent over all model parameters.
optimizer = optim.SGD(model.parameters(), lr=1e-3)
# Binary cross-entropy computed on raw logits (the sigmoid is applied inside the loss).
criterion = nn.BCEWithLogitsLoss()

In [27]:
# Place the model and the loss module on the selected device (GPU when available).
model = model.to(device)
criterion = criterion.to(device)

In [28]:
def binary_accuracy(preds, y):
    """Fraction of predictions that match the targets, as a 0-dim tensor.

    Args:
        preds: Raw logits; they are passed through a sigmoid and rounded to 0/1.
        y: Target values compared element-wise against the rounded predictions.

    Returns:
        Number of matches divided by ``len`` of the comparison result
        (i.e. the size of its first dimension).
    """
    hard_preds = torch.sigmoid(preds).round()
    hits = (hard_preds == y).float()
    return hits.sum() / len(hits)

In [29]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch over ``iterator``.

    Each batch must expose ``.text`` (model input) and ``.label`` (targets).
    The model output is squeezed along dim 1 before being scored, so the model
    is expected to return one value per example in a trailing size-1 dimension.

    Returns:
        ``(mean_loss, mean_accuracy)`` — the per-batch loss and accuracy summed
        and divided by the number of batches (not weighted by batch size).
    """
    model.train()

    running_loss = 0
    running_acc = 0

    for batch in iterator:
        # Gradients accumulate by default; reset them before every step.
        optimizer.zero_grad()

        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        # .item() converts the 0-dim tensors to plain Python floats.
        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [30]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on every batch of ``iterator`` without updating it.

    The model is switched to eval mode and gradient tracking is disabled for
    the whole pass. Each batch must expose ``.text`` and ``.label``.

    Returns:
        ``(mean_loss, mean_accuracy)`` — per-batch values summed and divided
        by the number of batches (not weighted by batch size).
    """
    model.eval()

    total_loss = 0
    total_acc = 0

    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            targets = batch.label

            batch_loss = criterion(logits, targets)
            batch_acc = binary_accuracy(logits, targets)

            total_loss += batch_loss.item()
            total_acc += batch_acc.item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

In [31]:
N_EPOCHS = 5
# Lowest validation loss seen so far; starts at +inf so the first epoch always checkpoints.
best_valid_loss = float("inf")

for epoch in range(N_EPOCHS):
    start_time = time.time()

    # One pass over the training data, then score the held-out validation split.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    # Checkpoint only when the validation loss improves on the best so far.
    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "tut1-model.pt")

    # Wall-clock time covers training, validation and (if any) checkpointing.
    elapsed = time.time() - start_time
    print("EPOCH: {} TIME: {} s".format(epoch+1, elapsed))
    print("EPOCH: {} Train Loss{} Train ACC {}%".format(epoch+1, train_loss, 100.0*train_acc))
    print("EPOCH: {} Valid Loss{} Valid ACC {}%".format(epoch+1, valid_loss, 100.0*valid_acc))

EPOCH: 1 TIME: 17.019481897354126 s
EPOCH: 1 Train Loss0.693676843810767 Train ACC 49.555710862619804%
EPOCH: 1 Valid Loss0.6931708115565626 Valid ACC 50.29667721518988%
EPOCH: 2 TIME: 16.98063373565674 s
EPOCH: 2 Train Loss0.6932437218035372 Train ACC 50.14976038338658%
EPOCH: 2 Valid Loss0.6930699484257759 Valid ACC 51.04825949367089%
EPOCH: 3 TIME: 17.080568313598633 s
EPOCH: 3 Train Loss0.6932315735009532 Train ACC 50.07987220447284%
EPOCH: 3 Valid Loss0.6935107330732708 Valid ACC 49.723101265822784%
EPOCH: 4 TIME: 17.226268529891968 s
EPOCH: 4 Train Loss0.6931580048018751 Train ACC 50.41932907348243%
EPOCH: 4 Valid Loss0.6931160114988496 Valid ACC 50.51424050632911%
EPOCH: 5 TIME: 17.0508713722229 s
EPOCH: 5 Train Loss0.6931761535592734 Train ACC 50.35942492012779%
EPOCH: 5 Valid Loss0.6934082115752788 Valid ACC 49.960443037974684%


In [32]:
# Project-local helper; presumably releases cached GPU memory — confirm in torch_utils.
torch_utils.clear_cuda()

In [33]:
# Restore the checkpoint written during training (the epoch with the lowest
# validation loss), then score it on the test split.
best_state = torch.load("tut1-model.pt", map_location=device)
model.load_state_dict(best_state)
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(test_loss, 100.0*test_acc)

0.689970259013993 55.19181585982632
