In [1]:
!wget 'https://raw.githubusercontent.com/omyfish/deep-learning/master/SentimentRNN/data/labels.txt'
!wget 'https://raw.githubusercontent.com/omyfish/deep-learning/master/SentimentRNN/data/reviews.txt'

import torch

In [2]:
with open('reviews.txt', 'r') as t:
    reviews = t.read()

with open('labels.txt', 'r') as t:
    labels = t.read()

In [3]:
print(reviews[:100])
print(labels[:50])

bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life
positive
negative
positive
negative
positive
negat


In [4]:
from string import punctuation
from collections import Counter
import numpy as np

reviews = reviews.lower()
reviews = ''.join(r for r in reviews if r not in punctuation)

char = sorted(Counter(reviews.split()))
reviewToInt = {ch: ii + 1 for ii, ch in enumerate(char)}

reviews_split = [r for r in reviews.split('\n')]

reviews_int = np.array([[reviewToInt[r] for r in review.split() ] for review in reviews_split])

print(reviews_int.size)
 


25001


In [5]:
labels_split = labels.split('\n')
labels_int = np.array([1 if label == 'positive' else 0 for label in labels_split])
print(labels_int.size)

25001


In [6]:
reviews_counter = Counter([len(r) for r in reviews_int])
print(reviews_counter[0]) # remove review with zero length

1


In [7]:
review_idx = [ii for ii, review in enumerate(reviews_int) if len(review) > 0]
reviews_int = np.array([reviews_int[idx] for idx in review_idx])
labels_int = np.array([labels_int[idx] for idx in review_idx])

print(reviews_int.size)
print(labels_int.size)

25000
25000


In [8]:
def padding_features(reviews_int, seq_length=200):
    features = np.zeros((len(reviews_int), seq_length), dtype=int)
    for i, row in enumerate(reviews_int):
        features[i, -len(row):] = np.array(row)[:seq_length]
    
    return features

In [9]:
seq_length = 200
features = padding_features(reviews_int, seq_length)
print(features.shape)

(25000, 200)


In [10]:
from torch.utils.data import DataLoader

frac = 0.8
train_count = int(len(features) * 0.8)
train_x, remaining_x = features[:train_count], features[train_count:]
train_y, remaining_y = labels_int[:train_count], labels_int[train_count:]

remaining_count = int(len(reviews_int) * 0.1)
valid_x, test_x = remaining_x[:remaining_count], remaining_x[:remaining_count] 
valid_y, test_y = remaining_y[:remaining_count], remaining_y[:remaining_count]

print(train_x.shape)
print(valid_x.shape)
print(test_x.shape)

(20000, 200)
(2500, 200)
(2500, 200)


In [11]:
from torch.utils.data import TensorDataset

train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(valid_x), torch.from_numpy(valid_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

batch_size = 50
train_loader = DataLoader(train_data, batch_size = batch_size, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size = batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size = batch_size, shuffle=True)


In [12]:
train_on_gpu = torch.cuda.is_available()
print('Train on gpu: {}'.format(train_on_gpu))

Train on gpu: False


In [13]:
import torch.nn as nn

class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, output, embed_dim, hidden_dim, n_layers, dropoutrate):
        super(SentimentRNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers, batch_first=True, dropout=dropoutrate)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim, output)
        self.sigmoid = nn.Sigmoid()
    def forward(self, x, hidden):
        batch_size = x.size(0)
        
        x = x.long()
        x = self.embedding(x)
        x, hidden = self.lstm(x, hidden)
        
        out = x.contiguous().view(-1, self.hidden_dim)
        
        out = self.dropout(out)
        out = self.fc(out)
        sigout = self.sigmoid(out)
        
        sigout = sigout.view(batch_size, -1)
        sigout = sigout[:, -1] #Get the label of each batch
        
        return sigout, hidden

    def hidden_init(self, batch_size):
        weight = next(self.parameters()).data
        if train_on_gpu:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        return hidden
    

In [14]:
input_size = len(reviews_int)
output = 1 # 0~1
hidden = 256
embed_dim= 400
n_layers = 2
rnn = SentimentRNN(input_size, output, embed_dim, hidden,n_layers, 0.2)
h = rnn.hidden_init(batch_size)



In [15]:
vocab_size = len(reviewToInt) + 1
output = 1 # 0~1
hidden = 256
embed_dim= 400
n_layers = 2
net = SentimentRNN(vocab_size, output, embed_dim, hidden,n_layers, 0.2)
print(net)

SentimentRNN(
  (embedding): Embedding(74073, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [16]:
import torch.optim as optim

optimizer = optim.Adam(net.parameters(), lr=0.003)
criterion = nn.BCELoss() #Binary cross entrophy loss

In [17]:
epochs = 4
show_every = 100
counter = 0
clip = 5 

if train_on_gpu:
    net.cuda()

net.train()
for e in range(epochs):
    h = net.hidden_init(batch_size)
    for inputs, labels in train_loader:
        counter+=1
        net.zero_grad()
        
        if train_on_gpu:
            inputs, labels = inputs.cuda(), labels.cuda()

        h = tuple([each.data for each in h])
        
        out, h = net(inputs, h)
        loss = criterion(out.squeeze(), labels.float())
        
        loss.backward()
        
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        
        optimizer.step()
        if counter % show_every == 0:
            net.eval()
            val_losses = [];
            val_h = net.hidden_init(batch_size)
            for inputs, labels in valid_loader:
                 if train_on_gpu:
                    inputs, labels = inputs.cuda(), labels.cuda()
                    val_h = tuple([each.data for each in val_h])
                    
                    out, val_h = net(inputs, val_h)
                    loss = criterion(out.squeeze(), labels.float())
                    val_losses.append(loss.item())
                    
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))      
            net.train()
        

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Epoch: 1/4... Step: 100... Loss: 0.665965... Val Loss: nan
Epoch: 1/4... Step: 200... Loss: 0.701921... Val Loss: nan
Epoch: 1/4... Step: 300... Loss: 0.701650... Val Loss: nan
Epoch: 1/4... Step: 400... Loss: 0.696841... Val Loss: nan
Epoch: 2/4... Step: 500... Loss: 0.665835... Val Loss: nan
Epoch: 2/4... Step: 600... Loss: 0.385749... Val Loss: nan
Epoch: 2/4... Step: 700... Loss: 0.362624... Val Loss: nan
Epoch: 2/4... Step: 800... Loss: 0.364873... Val Loss: nan
Epoch: 3/4... Step: 900... Loss: 0.314485... Val Loss: nan
Epoch: 3/4... Step: 1000... Loss: 0.324695... Val Loss: nan
Epoch: 3/4... Step: 1100... Loss: 0.287880... Val Loss: nan
Epoch: 3/4... Step: 1200... Loss: 0.339451... Val Loss: nan
Epoch: 4/4... Step: 1300... Loss: 0.048610... Val Loss: nan
Epoch: 4/4... Step: 1400... Loss: 0.283166... Val Loss: nan
Epoch: 4/4... Step: 1500... Loss: 0.200042... Val Loss: nan
Epoch: 4/4... Step: 1600... Loss: 0.161544... Val Loss: nan


In [None]:
losses = []
num_correct = 0

net.eval()
for inputs, labels in test_loader:
    if train_on_gpu:
        inputs, labels = inputs.cuda(), labels.cuda()

    test_h = net.hidden_init(batch_size)
    test_h = tuple([each.data for each in test_h])
    
    out, test_val = net(inputs, test_h)
    loss = criterion(out.squeeze(), labels.float())
    losses.append(loss.item())
    prep = torch.round(out.squeeze())
    correct_tensor = prep.eq(labels.float().view_as(prep))
    correct = np.squeeze(correct_tensor.numpy()) if train_on_gpu == False else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)
    
# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)

print("Test accuracy: {:.3f}".format(test_acc))

In [None]:
test_sentence = "I dont like it"
test_sentence = test_sentence.lower()
test_sentence = ''.join([r for r  in test_sentence if r not in punctuation])
embeded_review = np.array([[reviewToInt[r] for r in test_sentence.split()]])

padded_feature = padding_features(embeded_review)
padded_tensor = torch.from_numpy(padded_feature)
if train_on_gpu:
  padded_tensor = padded_tensor.cuda()

batch_size = padded_tensor.size(0)

h = net.hidden_init(batch_size)
out, h = net(padded_tensor, h)

pred = torch.round(out.squeeze())

 # print custom response
if(pred.item()==1):
    print("Positive review detected!")
else:
    print("Negative review detected.")
