In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import torch
from torch import nn
import numpy as np
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

In [2]:
# Load the raw corpus: reviews.txt holds the review text and labels.txt the
# sentiment labels. Both are split on '\n' further down the notebook.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default (assumes the files are ASCII/UTF-8 -- TODO confirm).
with open('../data/IMDB/reviews.txt', encoding='utf-8') as f:
    reviews = f.read()
with open('../data/IMDB/labels.txt', encoding='utf-8') as f:
    labels = f.read()

In [3]:
# Peek at the first 100 characters of the raw review text.
reviews[:100]

'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life'

In [4]:
# Show the first 10,000 characters of the raw text; individual reviews are
# separated by '\n' inside this single string.
reviews[:10000]

'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   \nstory of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is tu

In [5]:
from string import punctuation
print(punctuation)

# Normalise case, then delete every ASCII punctuation character.
# str.translate with a deletion table removes exactly the characters listed in
# `punctuation` (same result as filtering character by character) but makes a
# single pass over the corpus instead of a Python-level loop.
reviews = reviews.lower()
all_text = reviews.translate(str.maketrans('', '', punctuation))

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [6]:
# First 100 characters after lower-casing and punctuation removal.
all_text[:100]

'bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life '

In [7]:
# One string per line of the cleaned text (each line is one review).
reviews_split = all_text.split('\n')
# Flat list of every whitespace-separated token in the whole corpus.
words = all_text.split()

In [8]:
# Show the first cleaned review.
reviews_split[:1]

['bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers   the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students  when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled          at           high  a classic line inspector i  m here to sack one of your teachers  student welcome to bromwell high  i expect that many adults of my age think that bromwell high is far fetched  what a pity that it isn  t   ']

In [9]:
# Show the first 20 tokens of the corpus.
words[:20]

['bromwell',
 'high',
 'is',
 'a',
 'cartoon',
 'comedy',
 'it',
 'ran',
 'at',
 'the',
 'same',
 'time',
 'as',
 'some',
 'other',
 'programs',
 'about',
 'school',
 'life',
 'such']

In [10]:
from collections import Counter

# Rank every distinct word by how often it occurs, most frequent first.
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)

# Word ids are assigned in rank order starting at 1, so no word maps to 0.
int2vocab = dict(enumerate(vocab, 1))
vocab2int = {word: idx for idx, word in int2vocab.items()}

# Encode each review as the list of ids of its whitespace-separated tokens.
review_ints = [
    [vocab2int[word] for word in review.split()]
    for review in reviews_split
]

In [11]:
# Round-trip check: map the first encoded review back to words.
' '.join([int2vocab[c] for c in review_ints[:1][0]])

'bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my years in the teaching profession lead me to believe that bromwell high s satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled at high a classic line inspector i m here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isn t'

In [12]:
# One label string per line; 'positive' becomes 1 and anything else becomes 0.
labels_split = labels.split('\n')
encoded_labels = np.array([int(label == 'positive') for label in labels_split])

In [13]:
# Display the encoded label array (1 = positive, 0 = otherwise).
encoded_labels

array([1, 0, 1, ..., 1, 0, 0])

In [14]:
# Show the first 10 raw label strings.
labels_split[:10]

['positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative']

In [15]:
# Histogram of review lengths: key = number of tokens, value = number of reviews.
review_lens = Counter(map(len, review_ints))

# How many reviews are empty, and how long the longest review is.
print('Zero size lengths: {}'.format(review_lens[0]))
print('Max size lengths: {}'.format(max(review_lens.keys())))

Zero size lengths: 1
Max size lengths: 2514


In [16]:
print("Number of review lengths before removing outliners: {}".format(len(review_ints)))

# Record the positions of the non-empty reviews BEFORE filtering, then use the
# same positions for both the reviews and the labels. Filtering the reviews
# first and then enumerating the filtered list would keep the first N labels
# regardless of which review was dropped, silently misaligning the two.
non_empty_idx = [ii for ii, consider in enumerate(review_ints) if len(consider) > 0]
review_ints = [review_ints[ii] for ii in non_empty_idx]
encoded_labels = np.array([encoded_labels[ii] for ii in non_empty_idx])
print("Number of review lengths After removing outliners: {}".format(len(review_ints)))

Number of review lengths before removing outliners: 25001
Number of review lengths After removing outliners: 25000


In [17]:
def pad_features(review_ints, seq_length):
    """Return a 2-D integer array with one fixed-length row per review.

    Reviews shorter than ``seq_length`` are left-padded with zeros; longer
    reviews are truncated to their first ``seq_length`` tokens.

    Args:
        review_ints: sequence of token-id sequences.
        seq_length: number of columns in the result.

    Returns:
        ``numpy`` array of shape ``(len(review_ints), seq_length)``.
    """
    padded = np.zeros((len(review_ints), seq_length), dtype=int)
    for row, tokens in enumerate(review_ints):
        kept = np.array(tokens[:seq_length])
        # Right-align the kept tokens; any leading columns stay zero.
        padded[row, seq_length - len(kept):] = kept
    return padded
    

In [18]:
# Every review is padded/truncated to this many tokens.
seq_length = 200

features=pad_features(review_ints,seq_length=seq_length)

# Sanity checks: one row per review, each row exactly seq_length wide.
assert len(features)==len(review_ints),"Different rows"
assert len(features[0])==seq_length,"Each row should be seq_length"

In [19]:
# (number of reviews, seq_length)
features.shape

(25000, 200)

In [20]:
# First 10 columns of the first 10 rows; leading zeros are left-padding.
print(features[:10,:10])

[[    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [22382    42 46418    15   706 17139  3389    47    77    35]
 [ 4505   505    15     3  3342   162  8312  1652     6  4819]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [   54    10    14   116    60   798   552    71   364     5]]


In [21]:
# Fraction of the data used for training; the remainder is halved into
# validation and test sets. Rows are taken in their existing order.
split_frac = 0.8

n_samples = len(features)
indices = int(n_samples * split_frac)
half_holdout = int((n_samples - indices) / 2)
valid_indices = indices + half_holdout
test_indices = valid_indices + half_holdout

train_x = features[:indices]
train_y = encoded_labels[:indices]

valid_x = features[indices:valid_indices]
valid_y = encoded_labels[indices:valid_indices]

test_x = features[valid_indices:test_indices]
test_y = encoded_labels[valid_indices:test_indices]

In [22]:
# Last 10 tokens of the row 2501 from the end of `features`; with a 2500-row
# test set this is expected to be the final validation row.
features[-2501,-10:]

array([ 713,   15,   10,  301,  464,   11,   18,  114, 2418,   21])

In [23]:
# Last 10 tokens of the final validation row, for comparison with the
# corresponding slice of `features`.
valid_x[-1,-10:]

array([ 713,   15,   10,  301,  464,   11,   18,  114, 2418,   21])

In [24]:
# Report the (rows, seq_length) shape of each split.
print("\t\t\t Feature Shapes:")
print("Train set: \t \t{}".format(train_x.shape),
      "\nValidation set: \t{}".format(valid_x.shape),
      "\nTest set: \t \t{}".format(test_x.shape))

			 Feature Shapes:
Train set: 	 	(20000, 200) 
Validation set: 	(2500, 200) 
Test set: 	 	(2500, 200)


In [25]:
# Wrap each (features, labels) pair of numpy arrays in a TensorDataset.
train_data, valid_data, test_data = (
    TensorDataset(torch.from_numpy(split_x), torch.from_numpy(split_y))
    for split_x, split_y in (
        (train_x, train_y),
        (valid_x, valid_y),
        (test_x, test_y),
    )
)

batch_size = 50

# All three loaders shuffle and use the same batch size.
train_loader, valid_loader, test_loader = (
    DataLoader(dataset, batch_size=batch_size, shuffle=True)
    for dataset in (train_data, valid_data, test_data)
)

In [26]:
# Pull a single batch from the training loader to inspect it.
# The built-in next() is used because the iterator's .next() method is a
# Python 2 convention that newer PyTorch DataLoader iterators do not provide.
data_iter = iter(train_loader)
sample_x, sample_y = next(data_iter)

print(sample_x[:10,:8])
print()
print(sample_y[:10])

tensor([[     0,      0,      0,      0,     10,     43,    329,      1],
        [    10,     68,   7150,  10895,      1,     84,   2778,      4],
        [    10,    143,    165,    756,     32,      1,   1140,      4],
        [    11,   1242,    426,    262,    707,     45,      4,      1],
        [     1,   1693,     13,  21599,      6,     60,    330,    578],
        [     0,      0,      0,      0,      0,      0,      0,      0],
        [     0,      0,      0,      0,      0,      0,      0,      0],
        [     0,      0,      0,      0,      0,      0,      0,      0],
        [     0,     11,     20,      6,    605,     48,     92,    421],
        [     0,      0,      0,      0,      0,      0,      0,      0]])

tensor([ 1,  0,  0,  0,  1,  1,  0,  1,  1,  1])


In [27]:
# True when a CUDA device is available; later cells use this flag to decide
# whether to move the model and batches to the GPU.
train_on_gpu = torch.cuda.is_available()

print("Training on GPU!" if train_on_gpu else "Training on CPU")

Training on CPU


In [28]:
class SentimentRNN(nn.Module):
    """Embedding -> stacked LSTM -> dropout -> linear -> sigmoid classifier.

    Args:
        vocab_size: number of rows in the embedding table.
        output_size: number of output units of the final linear layer.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
        n_layers: number of stacked LSTM layers.
        drop_prob: dropout probability applied between LSTM layers.
    """

    def __init__(self,vocab_size,output_size,embedding_dim,hidden_dim,
                 n_layers,drop_prob=0.5):
        super(SentimentRNN,self).__init__()

        self.output_size=output_size
        self.n_layers=n_layers
        self.hidden_dim=hidden_dim

        self.embedding=nn.Embedding(vocab_size,embedding_dim)
        # batch_first=True: inputs/outputs are laid out (batch, seq, feature).
        self.lstm=nn.LSTM(input_size=embedding_dim,hidden_size=self.hidden_dim,
                         num_layers=self.n_layers,batch_first=True,
                         dropout=drop_prob)
        # Dropout on the LSTM outputs; fixed at 0.3, independent of drop_prob.
        self.dropout = nn.Dropout(0.3)

        self.fc = nn.Linear(self.hidden_dim,output_size)
        self.sig = nn.Sigmoid()

    def forward(self,x,hidden):
        """Run a batch of token-id sequences through the network.

        Args:
            x: integer tensor whose first dimension is the batch.
            hidden: ``(h, c)`` tuple of LSTM states, e.g. from ``init_hidden``.

        Returns:
            ``(sig_out, hidden)``: ``sig_out`` holds one sigmoid value per
            sample (the last value produced for that sample) and ``hidden`` is
            the updated LSTM state.
        """
        batch_size=x.size(0)
        embeds = self.embedding(x)
        out,hidden = self.lstm(embeds,hidden)

        out=self.dropout(out)

        # Flatten all time steps so the linear layer is applied to each one.
        out=out.contiguous().view(-1,self.hidden_dim)
        out=self.fc(out)
        sig_out = self.sig(out)

        # Regroup per sample and keep only the final value of each row.
        sig_out=sig_out.view(batch_size,-1)
        sig_out=sig_out[:,-1]

        return sig_out,hidden

    def init_hidden(self,batch_size):
        """Return zeroed ``(h, c)`` LSTM states for ``batch_size`` sequences.

        Each tensor has shape ``(n_layers, batch_size, hidden_dim)`` and is
        allocated with the dtype and device of the model's own parameters, so
        it matches wherever the model currently lives without consulting any
        notebook-level GPU flag.
        """
        weight=next(self.parameters()).data
        state_shape=(self.n_layers, batch_size, self.hidden_dim)

        hidden = (weight.new_zeros(state_shape),
                  weight.new_zeros(state_shape))

        return hidden

In [29]:
# Hyperparameters. vocab_size is one larger than the vocabulary because word
# ids start at 1 and id 0 is used for padding.
vocab_size = len(vocab2int) + 1
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2

model = SentimentRNN(
    vocab_size=vocab_size,
    output_size=output_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
)
print(model)

SentimentRNN(
  (embedding): Embedding(74073, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [30]:
# Learning rate for Adam.
lr=0.001
# Binary cross-entropy on probabilities; the model already ends in a sigmoid.
criterion=nn.BCELoss()
optimizer=torch.optim.Adam(model.parameters(),lr=lr)

In [31]:
epochs = 4

counter = 0        # number of training batches processed so far
print_every = 10   # run validation and report every this many batches
clip = 5           # maximum gradient norm

if train_on_gpu:
    model.cuda()

model.train()

for e in range(epochs):

    # Start every epoch from a zero hidden state.
    h = model.init_hidden(batch_size)

    # The batch targets are named `targets` (not `labels`) so the raw `labels`
    # string loaded at the top of the notebook is not overwritten.
    for inputs, targets in train_loader:
        counter += 1

        if train_on_gpu:
            inputs, targets = inputs.cuda(), targets.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        model.zero_grad()

        # Call the module itself rather than .forward() so hooks are honoured.
        output, h = model(inputs, h)

        loss = criterion(output.squeeze(), targets.float())
        loss.backward()

        # Clip gradients before the optimizer step.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        if counter % print_every == 0:

            val_h = model.init_hidden(batch_size)
            val_losses = []
            model.eval()
            # No gradients are needed for validation; skipping graph
            # construction saves memory and time.
            with torch.no_grad():
                for val_inputs, val_targets in valid_loader:

                    val_h = tuple([each.data for each in val_h])

                    if train_on_gpu:
                        val_inputs, val_targets = val_inputs.cuda(), val_targets.cuda()

                    val_output, val_h = model(val_inputs, val_h)
                    val_loss = criterion(val_output.squeeze(), val_targets.float())

                    val_losses.append(val_loss.item())

            model.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

Epoch: 1/4... Step: 10... Loss: 0.696259... Val Loss: 0.691355
Epoch: 1/4... Step: 20... Loss: 0.689714... Val Loss: 0.689483
Epoch: 1/4... Step: 30... Loss: 0.743144... Val Loss: 0.687450
Epoch: 1/4... Step: 40... Loss: 0.709624... Val Loss: 0.680319
Epoch: 1/4... Step: 50... Loss: 0.733990... Val Loss: 0.666152
Epoch: 1/4... Step: 60... Loss: 0.656674... Val Loss: 0.669443
Epoch: 1/4... Step: 70... Loss: 0.638460... Val Loss: 0.640823
Epoch: 1/4... Step: 80... Loss: 0.616885... Val Loss: 0.659592
Epoch: 1/4... Step: 90... Loss: 0.657624... Val Loss: 0.665507
Epoch: 1/4... Step: 100... Loss: 0.648893... Val Loss: 0.647210
Epoch: 1/4... Step: 110... Loss: 0.651835... Val Loss: 0.628727
Epoch: 1/4... Step: 120... Loss: 0.681216... Val Loss: 0.628602
Epoch: 1/4... Step: 130... Loss: 0.533658... Val Loss: 0.611060
Epoch: 1/4... Step: 140... Loss: 0.568968... Val Loss: 0.635869
Epoch: 1/4... Step: 150... Loss: 0.577348... Val Loss: 0.630646
Epoch: 1/4... Step: 160... Loss: 0.624711... Val 

KeyboardInterrupt: 

In [None]:
## getting test loss and accuracy

In [32]:
# Get test data loss and accuracy

test_losses = [] # track loss
num_correct = 0

# init hidden state
h = model.init_hidden(batch_size)

model.eval()
# Evaluation needs no gradients, so the whole loop runs under no_grad.
with torch.no_grad():
    # The targets are named `targets` (not `labels`) so the raw `labels`
    # string loaded at the top of the notebook is not overwritten.
    for inputs, targets in test_loader:

        # Carry the hidden state forward as plain tensors.
        h = tuple([each.data for each in h])

        if train_on_gpu:
            inputs, targets = inputs.cuda(), targets.cuda()

        # get predicted outputs
        output, h = model(inputs, h)

        # calculate loss
        test_loss = criterion(output.squeeze(), targets.float())
        test_losses.append(test_loss.item())

        # convert output probabilities to predicted class (0 or 1)
        pred = torch.round(output.squeeze())  # rounds to the nearest integer

        # compare predictions to true label
        correct_tensor = pred.eq(targets.float().view_as(pred))
        # .cpu() is a no-op for CPU tensors, so one code path covers both devices.
        correct = np.squeeze(correct_tensor.cpu().numpy())
        num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: 0.558
Test accuracy: 0.705


Now predicting on any given text

In [33]:
# Hand-written negative review used to try out the prediction function.
test_review_neg = 'The worst movie I have seen; acting was terrible and I want my money back. This movie had bad acting and the dialogue was slow.'


In [37]:
from string import punctuation

def tokenize_review(test_review, vocab=None):
    """Convert a raw review string into a one-element batch of token ids.

    The text is lower-cased, stripped of ASCII punctuation and split on
    whitespace. Words missing from the vocabulary are skipped instead of
    raising ``KeyError``, so arbitrary user text can be tokenized.

    Args:
        test_review: raw review text.
        vocab: optional word -> id mapping; defaults to the notebook-level
            ``vocab2int`` built from the training corpus.

    Returns:
        A list containing one list of integer token ids.
    """
    if vocab is None:
        vocab = vocab2int

    # lowercase, then get rid of punctuation
    test_text = ''.join(c for c in test_review.lower() if c not in punctuation)

    # splitting by whitespace
    test_words = test_text.split()

    # tokens (known words only), wrapped in an outer list as a batch of one
    return [[vocab[word] for word in test_words if word in vocab]]

In [38]:
def predict(net, test_review, sequence_length=200):
    """Print the sentiment that ``net`` predicts for a raw review string.

    Args:
        net: trained model exposing ``init_hidden(batch_size)`` and returning
            ``(output, hidden)`` when called with ``(features, hidden)``.
        test_review: raw review text.
        sequence_length: length the tokenized review is padded/truncated to.

    Prints the raw model output and a positive/negative verdict; returns None.
    """
    # Use the model that was passed in, not the notebook-level `model`.
    net.eval()

    # tokenize review
    test_ints = tokenize_review(test_review)

    # pad tokenized sequence
    features = pad_features(test_ints, sequence_length)

    # convert to tensor to pass into the model
    feature_tensor = torch.from_numpy(features)

    batch_size = feature_tensor.size(0)

    # initialize hidden state
    h = net.init_hidden(batch_size)

    # Put the input on whichever device the model's parameters live on.
    feature_tensor = feature_tensor.to(next(net.parameters()).device)

    # Inference only: no gradient bookkeeping needed.
    with torch.no_grad():
        output, h = net(feature_tensor, h)

    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())
    # printing output value, before rounding
    print('Prediction value, pre-rounding: {:.6f}'.format(output.item()))

    # print custom response
    if pred.item() == 1:
        print("Positive review detected!")
    else:
        print("Negative review detected.")

In [39]:
# Same padded length that was used to build the training features.
seq_length = 200

# Classify the sample negative review with the trained model.
predict(model,test_review_neg,seq_length)

Prediction value, pre-rounding: 0.126142
Negative review detected.
