In [4]:
# data preprocessing
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
tqdm.pandas()

# model 
import torch
import torch.nn as nn
import torch.utils.data as data
import torchtext

  from pandas import Panel


### Source: 
- https://colab.research.google.com/github/agungsantoso/deep-learning-v2-pytorch/blob/master/sentiment-rnn/Sentiment_RNN_Exercise.ipynb#scrollTo=bpZdIItNpmwU

#### Configuration

In [6]:
vocab_size = 50000
max_len = 100
batch_size = 32

#### Preprocessing Pipeline

##### Some part of the cleaing code is taken from here:  https://www.kaggle.com/hengzheng/pytorch-starter
#### Cleaning steps
- Remove Punctuation
- Remote numbers and special chars
- Padding and Tokenizer

In [7]:
from cleaning import clean_numbers, clean_text, replace_typical_misspell

In [8]:
def load_and_clean():
    train = pd.read_csv("data/quora/train.csv", nrows=100000, usecols=["question_text", "target"])
    
    # convert it into lower 
    train["text"] = train["question_text"].progress_apply(lambda x : x.lower())
    
    # clean the text by removing special chars, numbers etc
    train["text"] = train["text"].progress_apply(lambda x : clean_text(x))
    train["text"] = train["text"].progress_apply(lambda x: clean_numbers(x))
    
    # fix the spelling
    train["text"] = train["text"].progress_apply(lambda x: replace_typical_misspell(x))
    
    return train

In [9]:
train = load_and_clean()

100%|██████████| 100000/100000 [00:00<00:00, 1041724.06it/s]
100%|██████████| 100000/100000 [00:02<00:00, 38794.41it/s]
100%|██████████| 100000/100000 [00:01<00:00, 99822.29it/s]
100%|██████████| 100000/100000 [00:02<00:00, 45013.64it/s]


In [10]:
train.columns

Index(['question_text', 'target', 'text'], dtype='object')

In [11]:
text = train.text.values
targets = train.target.values
corpus = " ".join(text)
words = corpus.split()

In [12]:
words[:10]

['how',
 'did',
 'quebec',
 'nationalists',
 'see',
 'their',
 'province',
 'as',
 'a',
 'nation']

#### Encoding the words

In [13]:
from collections import Counter

In [14]:
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
word2idx = {word: ii for ii, word in enumerate(vocab, 1)}
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

In [15]:
encoded = []
for ques in text:
    encoded.append([word2idx[word] for word in ques.split()])

In [16]:
vocab_size = len(word2idx)
print(f"Unique words : {vocab_size}")

Unique words : 50475


In [17]:
ques = ""
for token in encoded[0]:
    ques = ques + " " + idx2word[token]
print(ques)

 how did quebec nationalists see their province as a nation in the ####s ?


In [18]:
#### Min and Max len of Sequence so that we can pad the encoded sequence
seq_lens = [len(seq) for seq in encoded]
print(f"min len {min(seq_lens)}\nmax len {max(seq_lens)}")

min len 1
max len 143


In [19]:
def pad_features(sequences, seq_length=100):
    ''' Return features of review_ints, where each review is padded with 0's 
        or truncated to the input seq_length.
    '''
    ## getting the correct rows x cols shape
    features = np.zeros((len(sequences), seq_length), dtype=int)
    
    ## for each review, I grab that review
    for i, seq in enumerate(sequences):
        features[i, -len(seq):] = np.array(seq)[:seq_length]
    
    return features

In [20]:
seq_length = 100
padded_sequence = pad_features(encoded, seq_length=100)

In [21]:
## train test split
X_train, X_test, y_train, y_test = train_test_split(padded_sequence, targets, test_size=0.05)

In [22]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(95000, 100) (5000, 100) (95000,) (5000,)


In [23]:
train_data = torch.utils.data.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
valid_data = torch.utils.data.TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))

In [24]:
train_loader = torch.utils.data.DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(dataset=valid_data, batch_size=batch_size, shuffle=False)

#### Model

In [25]:
# First checking if GPU is available
train_on_gpu=torch.cuda.is_available()

if(train_on_gpu):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')

Training on GPU.


In [26]:
import torch.nn as nn

class SentimentRNN(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super(SentimentRNN, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)
        
        # linear and sigmoid layer
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)
        
        # embeddings and lstm_out
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        
        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # dropout and fully connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        
        # sigmoid function
        sig_out = self.sig(out)
        
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] # get last batch of labels
        
        # return last sigmoid output and hidden state
        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if(train_on_gpu):
          hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                   weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
          hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                   weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden
        

In [30]:
# Instantiate the model w/ hyperparams
vocab_size = len(word2idx) + 1 # +1 for zero padding + our word tokens
output_size = 1
embedding_dim = 400 
hidden_dim = 256
n_layers = 1

net = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

print(net)

SentimentRNN(
  (embedding): Embedding(50476, 400)
  (lstm): LSTM(400, 256, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


  "num_layers={}".format(dropout, num_layers))


In [31]:
# loss and optimization functions
lr=0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)


In [35]:
# training params

epochs = 4 # 3-4 is approx where I noticed the validation loss stop decreasing

counter = 0
print_every = 100
clip=5 # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        output, h = net(inputs, None)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                if(train_on_gpu):
                    inputs, labels = inputs.cuda(), labels.cuda()

                output, val_h = net(inputs, None)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

Epoch: 1/4... Step: 100... Loss: 0.123443... Val Loss: 0.118740
Epoch: 1/4... Step: 200... Loss: 0.088895... Val Loss: 0.125022
Epoch: 1/4... Step: 300... Loss: 0.041370... Val Loss: 0.120382
Epoch: 1/4... Step: 400... Loss: 0.121991... Val Loss: 0.120787
Epoch: 1/4... Step: 500... Loss: 0.038624... Val Loss: 0.126568
Epoch: 1/4... Step: 600... Loss: 0.102510... Val Loss: 0.122921
Epoch: 1/4... Step: 700... Loss: 0.088225... Val Loss: 0.122281
Epoch: 1/4... Step: 800... Loss: 0.063609... Val Loss: 0.121687
Epoch: 1/4... Step: 900... Loss: 0.068324... Val Loss: 0.121364
Epoch: 1/4... Step: 1000... Loss: 0.032588... Val Loss: 0.122497
Epoch: 1/4... Step: 1100... Loss: 0.040477... Val Loss: 0.120436
Epoch: 1/4... Step: 1200... Loss: 0.197443... Val Loss: 0.118960
Epoch: 1/4... Step: 1300... Loss: 0.224305... Val Loss: 0.122955
Epoch: 1/4... Step: 1400... Loss: 0.064327... Val Loss: 0.124899
Epoch: 1/4... Step: 1500... Loss: 0.091575... Val Loss: 0.121040
Epoch: 1/4... Step: 1600... Loss: 