## Encoder-Decoder Architecture 
Dataset: Tweets | Task : Sentiment Analysis

*Submitted: Pavithra Solai* on June 10 2021

#### Load the Datasets into DataFrames

In [121]:
# Import Library
import random
import torch, torchtext
#from torchtext import data 
from torchtext.legacy import data

# Manual Seed
SEED = 43
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fc5375009b0>

In [2]:
#Upload tweets csv file
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving augmented_tweets_train.csv to augmented_tweets_train.csv
Saving train_tweets.csv to train_tweets.csv
Saving tweets.csv to tweets.csv
Saving valid_tweets.csv to valid_tweets.csv
User uploaded file "augmented_tweets_train.csv" with length 282397 bytes
User uploaded file "train_tweets.csv" with length 117565 bytes
User uploaded file "tweets.csv" with length 160041 bytes
User uploaded file "valid_tweets.csv" with length 20312 bytes


In [122]:
import pandas as pd
df_train = pd.read_csv('train_tweets.csv')
print("Length of train dataset: ",len(df_train))
#df_train = pd.read_csv('augmented_tweets_train.csv')
#print("Length of augmented train dataset: ",len(df_train))
df_valid = pd.read_csv('valid_tweets.csv')
print("Length of Validation dataset: ",len(df_valid))

Length of train dataset:  1159
Length of Validation dataset:  205


In [123]:
df_train["tweets"]=df_train["tweets"].astype(str)
df_train["labels"] = df_train["labels"].astype(int)

In [124]:
df_train.labels.value_counts()

0    791
1    299
2     69
Name: labels, dtype: int64

In [125]:
#Shuffling the dataset - Useful for Data augmentation
import pandas as pd
from sklearn.utils import shuffle

df_train = shuffle(df_train)
df_train.reset_index(inplace=True, drop=True)

#### Create a PyTorch Dataset

- Associate the data.Field and data.LabelField to Tweets and Labels
- We use Spacy for tokenization
- fields is a list of tuples that associate DataFrame columns to data.Field and data.LabelField

In [126]:
Tweet = data.Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)
Label = data.LabelField(tokenize ='spacy', is_target=True, batch_first =True, sequential =False)
fields = [('tweets', Tweet),('labels',Label)]

In [127]:
pd.set_option('display.max_colwidth', None)

df_train[:10]

Unnamed: 0.1,Unnamed: 0,tweets,labels
0,444,Lady well then who s the president,1
1,849,Think the GOP is engaged in a War on Women Why is Obama meeting with Women s worst enemy the Muslim Bro. tomorrow at OUR whitehouse,0
2,778,LOOOOOOOOOOOOOOOOOOOOOOOOOOOOL This is why i rate obama vsOYFhSa,0
3,894,Obama in Boca saywhatttt,0
4,947,Obama will portray Romney as a rich white guy outta touch w the working class whos policies are similar to his so why not keep the hip cat,2
5,940,We want a president with a spine not a president with fecklessness. America can do better than Obama. resist44 tcot if you agree,1
6,739,Harry Styles describe a Michelle Obama como una mujer muy atractiva. OBAMA ESCONDE A TU ESPOSA Y A TUS HIJOS STYLES VIENE EN CAMINO.,0
7,776,LIMBAUGH Obama Puts Out Figurative Bounty on Supreme Court... Z552mkRl,1
8,606,Major Obama Donor Accused Of Fraud I hope no one vets me. I donated 5 and one time I stole a bracelet from Target. 91ecaJAE,0
9,703,Righties call Pres. Obama a thug and bully for comments on health care law hearings. Comment on FB or tweet so we can share on edshow,0


**data.Example** shows how each record of the Dataset will be read from the DataFrames. We create train and validation dataset

In [128]:

example = [data.Example.fromlist([df_train.tweets[i],df_train.labels[i]], fields) for i in range(df_train.shape[0])] 
twitter_train_dataset = data.Dataset(example, fields)

example = [data.Example.fromlist([df_valid.tweets[i],df_valid.labels[i]], fields) for i in range(df_valid.shape[0])] 
twitter_valid_dataset = data.Dataset(example, fields)

Example of the Train Dataset for Tweets:

In [129]:
vars(twitter_train_dataset.examples[5])

{'labels': 1,
 'tweets': ['We',
  'want',
  'a',
  'president',
  'with',
  'a',
  'spine',
  'not',
  'a',
  'president',
  'with',
  'fecklessness',
  '.',
  'America',
  'can',
  'do',
  'better',
  'than',
  'Obama',
  '.',
  'resist44',
  'tcot',
  'if',
  'you',
  'agree']}

#### Building The Vocabulary based on Spacy Tokens

We build the vocab based on the distinct number of non-repetitive tokens and store it in Pickle file

In [130]:
Tweet.build_vocab(twitter_train_dataset)
Label.build_vocab(twitter_train_dataset)

print('Size of input vocab : ', len(Tweet.vocab))
print('Size of label vocab : ', len(Label.vocab))
print('Top 10 words appreared repeatedly :', list(Tweet.vocab.freqs.most_common(10)))
print('Labels : ', Label.vocab.stoi)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_iterator = data.BucketIterator(twitter_train_dataset, batch_size = 32, 
                                                            sort_key = lambda x: len(x.tweets),
                                                            sort_within_batch=True, device = device)
import os, pickle
with open('train_tokenizer.pkl', 'wb') as tokens: 
    pickle.dump(Tweet.vocab.stoi, tokens)

Size of input vocab :  4236
Size of label vocab :  3
Top 10 words appreared repeatedly : [('Obama', 1050), ('.', 781), ('the', 528), ('to', 403), ('s', 307), ('of', 242), ('a', 233), ('you', 219), ('with', 214), ('is', 208)]
Labels :  defaultdict(None, {0: 0, 1: 1, 2: 2})


In [131]:
Tweet.build_vocab(twitter_valid_dataset)
Label.build_vocab(twitter_valid_dataset)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
valid_iterator = data.BucketIterator(twitter_valid_dataset, batch_size = 32, 
                                                            sort_key = lambda x: len(x.tweets),
                                                            sort_within_batch=True, device = device)

with open('valid_tokenizer.pkl', 'wb') as tokens: 
    pickle.dump(Tweet.vocab.stoi, tokens)

In [132]:
# For printing tensors - This makes it look legible
torch.set_printoptions(profile="short", precision=2, sci_mode=False, linewidth=150)

#### Encoder Decoder Model Architecture

Here we use a multi-step LSTM cell to simulate a encoder that reads words from a sentence one by one and create feature-rich embedding (h_t) which is set to a single-step LSTM cell that acts as a decoder. 

Using the Decoder's outputs, we send it to a Fully connected layer and then to softmax followed by Cross Entropy loss

**Points to Note**
1. The embedding tensor has all the embeddings of the tokens in a sentence for a given a batch of sentences
2. We iterate over each token embedding and send it to a LSTM cell. i.e. For a batch of 32 sentences, we send the 1st token of all the 32 sentences, use the hidden layer (h_1) output from that step and feed the 2nd word along with (h_1) and so on and so forth
3. When we start with the first token, the hidden and cell state have to be randomly initialized.
4. This is done for both Encoder and Decoder
5. For Decoder, we need to set a hidden layer dimension and in my case, I have used 150 for hidden_dim for Decoder


In [147]:
import torch.nn as nn
import torch.nn.functional as F

class EncoderDecoder(nn.Module):

    # Define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_enc_dim, hidden_dec_dim, output_dim, n_layers, dropout):
        
        super().__init__()     
        self.hidden_dim = hidden_enc_dim
        self.hidden_dec_dim = hidden_dec_dim     
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # Encoder-Decoder Architecture
        self.encoder = nn.LSTMCell(embedding_dim, self.hidden_dim)

        self.decoder = nn.LSTMCell(self.hidden_dim, self.hidden_dec_dim)
     
        # Dense layer
        self.fc = nn.Linear(150, output_dim)

             
    def forward(self, text, text_lengths, verbatim=False):
        
        # Input is batch of text vocab indices - 32 x longest_sentence_length
        # text = [batch size, sentence length]

        embedded = self.embedding(text)
        #embedded = [batch size, sent_len, emb dim] = [32, longest_sentence_length_in_batch, 300]

        ########## ENCODER ########

        batch_size = embedded.size()[0]
      
        hidden = torch.randn(batch_size, self.hidden_dim,device="cuda", requires_grad=True)
        cell_state = torch.randn(batch_size, self.hidden_dim, device="cuda", requires_grad=True)
        #hidden = [32,100]
        #cell_state = [32,100]

        sentence_len = text.size()[1]
        if verbatim:
          print("Length of the sentence : ",sentence_len)

        for i in range(0,sentence_len):
          if verbatim:
            print(f"\nFeeding word %d of the sentence with length %d "%((i+1),sentence_len))

          hidden, cell_state = self.encoder(embedded[:,i,:],(hidden, cell_state))

          if verbatim:
            print(f"\nHidden state -- h_%d -- for batch size of %d: \n" % ((i+1),batch_size))
            print(hidden, hidden.size())

        output_encoder =  hidden #Last Hidden Layer #[32 x 100]
        if verbatim:
          print("\nOutput of Encoder : \n******************\n ", hidden, hidden.size())
        

        ######### DECODER ##########
        hidden = torch.randn(batch_size, self.hidden_dec_dim,device="cuda", requires_grad=True)
        cell_state = torch.randn(batch_size, self.hidden_dec_dim, device="cuda", requires_grad=True)
        #hidden = [32,150]
        #cell_state = [32,150]

        output_decoder, cell_decoder = self.decoder(output_encoder,(hidden, cell_state)) #[32 x 150]
        if verbatim:
          print("\nOutput of Decoder : \n******************\n",output_decoder, output_decoder.size())
        
        
        dense_outputs = self.fc(output_decoder)   
        
        # Final activation function softmax
        output = F.softmax(dense_outputs, dim=1)
           
        return output


In [148]:
""
# Define hyperparameters
size_of_vocab = len(Tweet.vocab)
embedding_dim = 300
num_hidden_nodes_encoder = 100
num_hidden_nodes_decoder = 150
num_output_nodes = 3
num_layers = 1
dropout = 0.2
batch_size = 32

# Instantiate the model
model = EncoderDecoder(size_of_vocab, 
                       embedding_dim, 
                       num_hidden_nodes_encoder, 
                       num_hidden_nodes_decoder, 
                       num_output_nodes, 
                       num_layers, 
                       dropout = dropout)

In [149]:
print(model)

#No. of trainable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

EncoderDecoder(
  (embedding): Embedding(1368, 300)
  (encoder): LSTMCell(300, 100)
  (decoder): LSTMCell(100, 150)
  (fc): Linear(in_features=150, out_features=3, bias=True)
)
The model has 722,853 trainable parameters


In [150]:
import torch.optim as optim

# define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

# define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    _, predictions = torch.max(preds, 1)
    
    correct = (predictions == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

#### Model Training and Evaluation

In [151]:
def train(model, iterator, optimizer, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        # resets the gradients after every batch
        optimizer.zero_grad()   
        
        # retrieve text and no. of words
        tweet, tweet_lengths = batch.tweets  
        
        # convert to 1D tensor
        predictions = model(tweet, tweet_lengths).squeeze()  
        
        # compute the loss
        loss = criterion(predictions, batch.labels)        
        
        # compute the binary accuracy
        acc = binary_accuracy(predictions, batch.labels)   
        
        # backpropage the loss and compute the gradients
        loss.backward()       
        
        # update the weights
        optimizer.step()      
        
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [152]:
def evaluate(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and no. of words
            tweet, tweet_lengths = batch.tweets
            
            # convert to 1d tensor
            predictions = model(tweet, tweet_lengths).squeeze()
            
            # compute loss and accuracy
            loss = criterion(predictions, batch.labels)
            acc = binary_accuracy(predictions, batch.labels)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [153]:
import time
N_EPOCHS = 25
best_valid_loss = float('inf')
val_losses = []
train_losses = []

val_accuracy = []
train_accuracy = []

for epoch in range(N_EPOCHS):
    start_time = time.time()
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    train_losses.append(train_loss)
    train_accuracy.append(train_acc)

    # evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    val_losses.append(valid_loss)
    val_accuracy.append(valid_acc)


    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'Epoch {epoch+1} | Time Taken: {(time.time() - start_time):.2f}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% \n')
     

Epoch 1 | Time Taken: 0.30s
	Train Loss: 1.083 | Train Acc: 49.29%
	 Val. Loss: 1.074 |  Val. Acc: 56.11% 

Epoch 2 | Time Taken: 0.28s
	Train Loss: 1.061 | Train Acc: 61.31%
	 Val. Loss: 1.046 |  Val. Acc: 68.99% 

Epoch 3 | Time Taken: 0.28s
	Train Loss: 1.017 | Train Acc: 67.05%
	 Val. Loss: 0.983 |  Val. Acc: 68.75% 

Epoch 4 | Time Taken: 0.28s
	Train Loss: 0.948 | Train Acc: 68.65%
	 Val. Loss: 0.912 |  Val. Acc: 70.98% 

Epoch 5 | Time Taken: 0.28s
	Train Loss: 0.890 | Train Acc: 68.74%
	 Val. Loss: 0.868 |  Val. Acc: 71.88% 

Epoch 6 | Time Taken: 0.29s
	Train Loss: 0.869 | Train Acc: 68.82%
	 Val. Loss: 0.853 |  Val. Acc: 71.43% 

Epoch 7 | Time Taken: 0.27s
	Train Loss: 0.858 | Train Acc: 69.08%
	 Val. Loss: 0.853 |  Val. Acc: 70.98% 

Epoch 8 | Time Taken: 0.27s
	Train Loss: 0.850 | Train Acc: 69.58%
	 Val. Loss: 0.845 |  Val. Acc: 70.98% 

Epoch 9 | Time Taken: 0.28s
	Train Loss: 0.846 | Train Acc: 70.26%
	 Val. Loss: 0.844 |  Val. Acc: 70.98% 

Epoch 10 | Time Taken: 0.30s

In [145]:
#load weights and tokenizer

path='./saved_weights.pt'
model.load_state_dict(torch.load(path));
model.eval();
tokenizer_file = open('./valid_tokenizer.pkl', 'rb')
tokenizer = pickle.load(tokenizer_file)

#inference 

import spacy
nlp = spacy.load('en')

def classify_tweet(tweet):
    
    categories = {0: "Negative", 1:"Positive", 2:"Neutral"}
    
    # tokenize the tweet 
    tokenized = [tok.text for tok in nlp.tokenizer(tweet)] 
    # convert to integer sequence using predefined tokenizer dictionary
    indexed = [tokenizer[t] for t in tokenized]        
    # compute no. of words        
    length = [len(indexed)]
    # convert to tensor                                    
    tensor = torch.LongTensor(indexed).to(device)   
    # reshape in form of batch, no. of words           
    tensor = tensor.unsqueeze(1).T  
    # convert to tensor                          
    length_tensor = torch.LongTensor(length)
    # Get the model prediction                  
    prediction = model(tensor, length_tensor, verbatim=True)

    _, pred = torch.max(prediction, 1) 
    
    return categories[pred.item()]

#### Hidden Layer States Along With Output of Encoder and Decoder

In [146]:
sentiment = classify_tweet("A valid explanation for why Trump won't let women on the golf course.")
print("Sentiment Classified as " + sentiment)

Length of the sentence :  15

Feeding word 1 of the sentence with length 15 

Hidden state -- h_1 -- for batch size of 1: 

tensor([[ 0.22, -0.55, -0.15, -0.35, -0.05, -0.13,  0.15,  0.19, -0.09,  0.50, -0.20, -0.31,  0.37, -0.38, -0.40, -0.20,  0.31, -0.45,  0.18,  0.07,
          0.36, -0.68,  0.47,  0.36, -0.03,  0.26, -0.46,  0.37, -0.26, -0.42,  0.26,  0.04,  0.02,  0.18,  0.09,  0.02,  0.20, -0.09,  0.16, -0.03,
          0.36, -0.07, -0.00, -0.36, -0.36, -0.06, -0.14, -0.21, -0.02, -0.12,  0.50, -0.16,  0.03, -0.03,  0.02, -0.19, -0.06,  0.01, -0.02, -0.09,
         -0.17, -0.45, -0.16, -0.05,  0.13,  0.27,  0.45, -0.07,  0.08,  0.03, -0.12,  0.01,  0.16, -0.09,  0.17, -0.15, -0.46,  0.05, -0.12, -0.23,
          0.25,  0.46, -0.27,  0.35, -0.02, -0.04, -0.03, -0.05,  0.06,  0.09, -0.10, -0.52,  0.04,  0.53, -0.25,  0.45,  0.04, -0.04,  0.20,  0.01]],
       device='cuda:0', grad_fn=<ThnnFusedLstmCellBackward>) torch.Size([1, 100])

Feeding word 2 of the sentence with length 15 