Importing the required Libraries

In [None]:
import pandas as pd
import numpy as np
import random
import torch, torchtext
from torchtext import data 

Loading the Stanford Sentiment Treebank files and extracting the labels

In [None]:
# Sentence table: tab-separated, one row per sentence.
dataset_sentences = pd.read_csv('/content/datasetSentences.txt', sep='\t')
print(dataset_sentences.shape, '\n', dataset_sentences.head())

# Phrase-level sentiment table: pipe-separated. The header names contain
# spaces, so they are renamed to identifiers usable as attributes.
sentiment_labels = (
    pd.read_csv('/content/sentiment_labels.txt', sep='|')
    .rename(columns={'phrase ids': 'phrase_id',
                     'sentiment values': 'sentiment_value'})
)
print(sentiment_labels.shape, '\n', sentiment_labels.head())

(11855, 2) 
    sentence_index                                           sentence
0               1  The Rock is destined to be the 21st Century 's...
1               2  The gorgeously elaborate continuation of `` Th...
2               3                     Effective but too-tepid biopic
3               4  If you sometimes like to go to the movies to h...
4               5  Emerges as something rare , an issue movie tha...
(239232, 2) 
    phrase_id  sentiment_value
0          0          0.50000
1          1          0.50000
2          2          0.44444
3          3          0.50000
4          4          0.42708


Preparing the dataset

In [None]:
# Work on a copy: a label column is added to `sentence_sentiment` later, and
# a plain assignment would make that write land in `dataset_sentences` too.
sentence_sentiment = dataset_sentences.copy()

# phrase id -> sentiment value, as listed in sentiment_labels.txt
phrase_sentiment = dict(zip(sentiment_labels.phrase_id,
                            sentiment_labels.sentiment_value))

# Each line of STree.txt is a '|'-separated list of integers; the per-line
# results are later assigned as a column of the sentence table, so the file
# is expected to have one line per sentence row.
with open('/content/STree.txt') as file:
    stree = [line.strip().split('|') for line in file]

Phrase to Sentence mapping

In [None]:
from statistics import mean


def _sentence_label(phrase_ids):
    """Turn one STree line (a list of id strings) into a class label.

    The ids are looked up in `phrase_sentiment` and averaged (averaging
    rather than summing keeps long sentences from scoring higher).
    Averages in [0.5, 0.55) become the neutral class 2; every other
    average is rounded to 0 (negative) or 1 (positive).
    """
    score = mean(phrase_sentiment[int(pid)] for pid in phrase_ids)
    if 0.5 <= score < 0.55:
        return 2
    return int(round(score))


# NOTE(review): the SST README appears to describe STree.txt as a
# parent-pointer encoding of the parse trees, not a list of phrase ids.
# Confirm that indexing `phrase_sentiment` with these integers is meaningful.
sentiments = [_sentence_label(phrase_ids) for phrase_ids in stree]

# attach the labels to the corresponding sentence texts
sentence_sentiment['sentiment_value'] = sentiments

sentence_sentiment

Unnamed: 0,sentence_index,sentence,sentiment_value
0,1,The Rock is destined to be the 21st Century 's...,0
1,2,The gorgeously elaborate continuation of `` Th...,0
2,3,Effective but too-tepid biopic,0
3,4,If you sometimes like to go to the movies to h...,0
4,5,"Emerges as something rare , an issue movie tha...",0
...,...,...,...
11850,11851,A real snooze .,0
11851,11852,No surprises .,0
11852,11853,We 've seen the hippie-turned-yuppie plot befo...,0
11853,11854,Her fans walked out muttering words like `` ho...,0


In [None]:
# Keep only the text and label columns, under the names the torchtext
# fields are registered with further down.
sentence_sentiment = (
    sentence_sentiment
    .rename(columns={'sentence': 'tweets', 'sentiment_value': 'labels'})
    .loc[:, ['tweets', 'labels']]
)


In [None]:
# Show the frame after the rename/column selection above.
sentence_sentiment

Unnamed: 0,tweets,labels
0,The Rock is destined to be the 21st Century 's...,0
1,The gorgeously elaborate continuation of `` Th...,0
2,Effective but too-tepid biopic,0
3,If you sometimes like to go to the movies to h...,0
4,"Emerges as something rare , an issue movie tha...",0
...,...,...
11850,A real snooze .,0
11851,No surprises .,0
11852,We 've seen the hippie-turned-yuppie plot befo...,0
11853,Her fans walked out muttering words like `` ho...,0


In [None]:
SEED = 1234
# Seed every RNG the notebook can draw from: Python's `random` drives the
# augmentation functions and the dataset split, numpy is imported alongside,
# and torch initialises the model weights.
random.seed(SEED)
np.random.seed(SEED)
# Kept as the last expression so the cell still displays the torch Generator.
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fb3967e6ab0>

Defining Tweet & Label fields

In [None]:
# Text field: spaCy-tokenised, batch-first tensors. `include_lengths=True`
# makes every batch a (token_ids, lengths) pair, which the model unpacks.
Tweet = torchtext.legacy.data.Field(
    sequential=True,
    tokenize='spacy',
    batch_first=True,
    include_lengths=True,
)
# Label field: LabelField always forces sequential=False and is_target=True,
# and a non-sequential field is never tokenised, so no tokenizer is requested
# here (this avoids loading a second spaCy pipeline for nothing).
Label = torchtext.legacy.data.LabelField(batch_first=True)

In [None]:
# (attribute name, field) pairs; the order matches the [tweet, label] lists
# passed to Example.fromlist below.
fields = [('tweets', Tweet),('labels',Label)]

In [None]:
# Build one torchtext Example per row. Iterating the two columns in lockstep
# avoids a label-based Series lookup per row and does not depend on the
# frame's index labels being 0..n-1.
example = [
    torchtext.legacy.data.Example.fromlist([tweet, label], fields)
    for tweet, label in zip(sentence_sentiment.tweets, sentence_sentiment.labels)
]


In [None]:
# Wrap the examples and their field definitions in a torchtext Dataset so it
# can be split and batched below.
stanfordDataset = torchtext.legacy.data.Dataset(example, fields)

In [None]:
Data split for train, test & validate

In [None]:
# Seed Python's RNG and hand its state to split() explicitly.
# `random.seed()` returns None, so passing its result as `random_state`
# only worked through the seeding side effect.
random.seed(SEED)
# NOTE(review): torchtext documents a three-element split_ratio as
# train/test/valid while the returned tuple is (train, valid, test); the two
# 15% shares are equal here, so the sizes are unaffected — confirm if changed.
(train, valid, test) = stanfordDataset.split(split_ratio=[70, 15, 15],
                                             random_state=random.getstate())

In [None]:
# Number of examples in each split.
(len(train), len(valid), len(test))

(8298, 1779, 1778)

In [None]:
# Inspect one training example: its label and its token list.
vars(train.examples[10])

{'labels': 0,
 'tweets': ['Not',
  'only',
  'are',
  'the',
  'film',
  "'s",
  'Sopranos',
  'gags',
  'incredibly',
  'dated',
  'and',
  'unfunny',
  ',',
  'they',
  'also',
  'demonstrate',
  'how',
  'desperate',
  'the',
  'makers',
  'of',
  'this',
  '`',
  'we',
  "'re",
  '-',
  'doing',
  '-',
  'it',
  '-',
  'for',
  '-',
  'the',
  '-',
  'cash',
  "'",
  'sequel',
  'were',
  '.']}

Vocabulary build-up

In [None]:
# Initial vocabulary build from the training split.
# NOTE: both vocabularies are built again after the augmentation step further
# down, so this first build is superseded.
Tweet.build_vocab(train)
Label.build_vocab(train)

In [None]:
!pip install googletrans



In [None]:
import random

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import googletrans


Data Augmentation using back_translate

In [None]:
def back_translate(sentence, p=0.1):
  """Paraphrase a tokenised sentence by translating it to a random language and back.

  Args:
    sentence: list of string tokens.
    p: probability of applying the augmentation; otherwise `sentence` is
      returned unchanged.

  Returns:
    The original token list when the augmentation is skipped (or the input is
    empty), else the back-translated English text re-tokenised with
    `word_tokenize`.
  """
  # nothing to translate for empty input; skip with probability (1 - p)
  if not sentence or random.uniform(0, 1) > p:
    return sentence

  # combine tokenized sentence into one string
  text = ' '.join(sentence)

  # instantiate translator
  translator = googletrans.Translator()

  # choose a target language; English is excluded because an en -> en round
  # trip would not change the sentence
  candidate_langs = [lang for lang in googletrans.LANGUAGES if lang != 'en']
  trans_lang = random.choice(candidate_langs)

  # googletrans' translate() takes `dest`/`src` keywords and returns a result
  # object whose `.text` attribute holds the translated string
  forward = translator.translate(text, dest=trans_lang)

  # translate back to the original language
  backward = translator.translate(forward.text, dest='en', src=trans_lang)

  return word_tokenize(backward.text)

Data Augmentation using random_deletion

In [None]:
def random_deletion(sentence, p=0.5):
    """Randomly drop tokens from a sentence.

    Args:
        sentence: list of tokens.
        p: per-token probability of deletion.

    Returns:
        `sentence` itself when it has zero or one token; otherwise a new list
        holding the surviving tokens in their original order. If every token
        is deleted, a single randomly chosen token is returned so the result
        is never empty.
    """
    # nothing to delete from an empty or single-token sentence
    # (an empty list would otherwise reach random.choice([]) below)
    if len(sentence) <= 1:
        return sentence
    # keep each token with probability (1 - p)
    remaining = [token for token in sentence if random.uniform(0, 1) > p]
    # if nothing left, sample a random word
    if not remaining:
        return [random.choice(sentence)]
    return remaining

Data Augmentation using random_swap

In [None]:
def random_swap(sentence, n=5):
    """Swap two randomly chosen positions of `sentence`, `n` times.

    The list is modified in place and the same list object is returned.
    Sentences with fewer than two tokens are returned untouched.
    """
    size = len(sentence)
    if size >= 2:
        positions = range(size)
        swaps_done = 0
        while swaps_done < n:
            # two distinct positions, drawn without replacement
            first, second = random.sample(positions, 2)
            held = sentence[first]
            sentence[first] = sentence[second]
            sentence[second] = held
            swaps_done += 1
    return sentence

In [None]:
# Augment the training split only (validation and test stay untouched):
# random deletion followed by a single random swap. back_translate is
# deliberately not applied here.
# The loop variable has its own name so it does not rebind `example`, the
# list of all examples built earlier.
# NOTE: this cell is not idempotent — every re-run deletes/swaps again.
for train_example in train.examples:
    train_example.tweets = random_deletion(train_example.tweets, p=0.5)
    train_example.tweets = random_swap(train_example.tweets, n=1)

In [None]:
# Upper bound passed to build_vocab as `max_size` for the text vocabulary.
MAX_VOCAB_SIZE = 25_000

# Rebuild both vocabularies from the training split after augmentation.
Tweet.build_vocab(train, max_size = MAX_VOCAB_SIZE)
Label.build_vocab(train)

In [None]:
# Vocabulary sanity check: sizes, the ten most frequent tokens, and the
# label -> index mapping used for the targets.
print('Size of input vocab : ', len(Tweet.vocab))
print('Size of label vocab : ', len(Label.vocab))
print('Top 10 words appreared repeatedly :', list(Tweet.vocab.freqs.most_common(10)))
print('Labels : ', Label.vocab.stoi)

Size of input vocab :  12015
Size of label vocab :  3
Top 10 words appreared repeatedly : [('.', 3918), (',', 3444), ('the', 3034), ('and', 2198), ('of', 2156), ('a', 2088), ('to', 1523), ('-', 1406), ('is', 1239), ("'s", 1221)]
Labels :  defaultdict(None, {0: 0, 1: 1, 2: 2})


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Bucketed batches: examples of similar token count are grouped together, and
# each batch is sorted by length as the packed-sequence encoder expects.
BATCH_SIZE = 16
train_iterator, valid_iterator, test_iterator = torchtext.legacy.data.BucketIterator.splits(
    (train, valid, test),
    batch_size=BATCH_SIZE,
    sort_key=lambda ex: len(ex.tweets),
    sort_within_batch=True,
    device=device,
)

In [None]:
# Peek at one training batch: `.tweets` is a (token ids, lengths) pair and
# `.labels` holds one class index per example.
next(iter(train_iterator))


[torchtext.legacy.data.batch.Batch of size 16]
	[.tweets]:('[torch.LongTensor of size 16x17]', '[torch.LongTensor of size 16]')
	[.labels]:[torch.LongTensor of size 16]

In [None]:
import os, pickle

# Persist the token -> index mapping so inference can numericalise text
# without rebuilding the vocabulary.
with open('tokenizer.pkl', 'wb') as vocab_file:
    pickle.dump(Tweet.vocab.stoi, vocab_file)

Model

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class classifier(nn.Module):
    """Embedding -> multi-layer LSTM -> linear layer over token-id sequences.

    forward() returns unnormalised class scores (logits) of shape
    [batch size, output_dim]. No softmax is applied: nn.CrossEntropyLoss
    performs the log-softmax itself, and the arg-max used for accuracy and
    inference is the same with or without it.
    """

    # Define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers, dropout):
        """
        Args:
            vocab_size: number of rows in the embedding table.
            embedding_dim: size of each token embedding.
            hidden_dim: size of the LSTM hidden state.
            output_dim: number of classes.
            n_layers: number of stacked LSTM layers.
            dropout: dropout probability passed to nn.LSTM.
        """
        super().__init__()

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # LSTM layer
        self.encoder = nn.LSTM(embedding_dim,
                               hidden_dim,
                               num_layers=n_layers,
                               dropout=dropout,
                               batch_first=True)
        # try using nn.GRU or nn.RNN here and compare their performances
        # try bidirectional and compare their performances

        # Dense layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text, text_lengths):
        """
        Args:
            text: LongTensor [batch size, sent_length] of token ids.
            text_lengths: LongTensor [batch size] with the unpadded length of
                each row.

        Returns:
            Tensor [batch size, output_dim] of class logits.
        """
        # text = [batch size, sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]

        # packed sequence; enforce_sorted=False lets callers pass batches that
        # are not pre-sorted by decreasing length
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths.cpu(), batch_first=True, enforce_sorted=False)

        _, (hidden, _) = self.encoder(packed_embedded)
        # hidden = [num layers * num directions, batch size, hid dim]
        # (batch_first only affects the input/output tensors, not the states)

        # classify from the final state of the top LSTM layer
        return self.fc(hidden[-1])

Hyper-Parameters

In [None]:
# Model hyperparameters
size_of_vocab = len(Tweet.vocab)   # one embedding row per vocabulary entry
embedding_dim = 300
num_hidden_nodes = 100
num_output_nodes = 3               # three sentiment classes
num_layers = 2
dropout = 0.2

# Build the model; keyword arguments make the mapping to the constructor's
# parameters explicit
model = classifier(
    vocab_size=size_of_vocab,
    embedding_dim=embedding_dim,
    hidden_dim=num_hidden_nodes,
    output_dim=num_output_nodes,
    n_layers=num_layers,
    dropout=dropout,
)

print(model)

classifier(
  (embedding): Embedding(12015, 300)
  (encoder): LSTM(300, 100, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=100, out_features=3, bias=True)
)


In [None]:
# No. of trainable parameters
def count_parameters(model):
    """Return the total element count of the parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
    
# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 3,846,403 trainable parameters


In [None]:
import torch.optim as optim

# define optimizer and loss
# Adam updates every model parameter; CrossEntropyLoss is given the model
# output and the integer class labels of each batch.
optimizer = optim.Adam(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()

# define metric
def binary_accuracy(preds, y):
    """Fraction of rows whose highest-scoring class equals the target.

    Despite the name this works for any number of classes: the predicted
    class is the arg-max along dimension 1 of `preds`.
    """
    predicted_classes = torch.max(preds, 1)[1]
    hits = predicted_classes.eq(y).float()
    return hits.sum() / len(hits)
    
# push to cuda if available
# (`device` was chosen earlier; on a CPU-only machine these calls keep
# everything on the CPU)
model = model.to(device)
criterion = criterion.to(device)

Train the Model

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return (mean batch loss, mean batch accuracy).

    Args:
        model: module called as model(token_ids, lengths), returning one row of
            class scores per example.
        iterator: sized iterable of batches exposing `.tweets`
            (a (token_ids, lengths) pair) and `.labels`.
        optimizer: optimizer over the model's parameters.
        criterion: loss called as criterion(predictions, labels).

    NOTE: this function shares its name with the `train` dataset split created
    earlier in the notebook; once this cell has run, `train` refers to the
    function, so cells that use the split must be run before it.
    """
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # set the model in training phase
    model.train()

    for batch in iterator:

        # resets the gradients after every batch
        optimizer.zero_grad()

        # retrieve text and no. of words
        tweet, tweet_lengths = batch.tweets

        # predictions = [batch size, n classes]; kept two-dimensional on
        # purpose — squeezing would drop the batch dimension for a batch of
        # one and break both the loss and the accuracy computation
        predictions = model(tweet, tweet_lengths)

        # compute the loss
        loss = criterion(predictions, batch.labels)

        # compute the accuracy
        acc = binary_accuracy(predictions, batch.labels)

        # backpropagate the loss and compute the gradients
        loss.backward()

        # update the weights
        optimizer.step()

        # loss and accuracy
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

Evaluate the Model

In [None]:
def evaluate(model, iterator, criterion):
    """Score the model on `iterator` without updating it.

    Args:
        model: module called as model(token_ids, lengths), returning one row of
            class scores per example.
        iterator: sized iterable of batches exposing `.tweets`
            (a (token_ids, lengths) pair) and `.labels`.
        criterion: loss called as criterion(predictions, labels).

    Returns:
        (mean batch loss, mean batch accuracy) over the iterator.
    """
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()

    # deactivates autograd
    with torch.no_grad():

        for batch in iterator:

            # retrieve text and no. of words
            tweet, tweet_lengths = batch.tweets

            # predictions = [batch size, n classes]; not squeezed, so a batch
            # holding a single example keeps its batch dimension
            predictions = model(tweet, tweet_lengths)

            # compute loss and accuracy
            loss = criterion(predictions, batch.labels)
            acc = binary_accuracy(predictions, batch.labels)

            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

Run the Model

In [None]:
# Train for N_EPOCHS epochs, checkpointing whenever the validation loss
# reaches a new minimum (lowest validation loss, not highest accuracy).
N_EPOCHS = 20
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    # the epoch index is not printed; the blocks appear in epoch order
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% \n')

	Train Loss: 0.981 | Train Acc: 55.17%
	 Val. Loss: 0.948 |  Val. Acc: 55.69% 

	Train Loss: 0.867 | Train Acc: 67.57%
	 Val. Loss: 0.960 |  Val. Acc: 57.76% 

	Train Loss: 0.823 | Train Acc: 72.83%
	 Val. Loss: 0.978 |  Val. Acc: 56.47% 

	Train Loss: 0.797 | Train Acc: 75.24%
	 Val. Loss: 0.989 |  Val. Acc: 54.91% 

	Train Loss: 0.779 | Train Acc: 77.32%
	 Val. Loss: 0.949 |  Val. Acc: 59.43% 

	Train Loss: 0.765 | Train Acc: 79.08%
	 Val. Loss: 0.948 |  Val. Acc: 59.65% 

	Train Loss: 0.753 | Train Acc: 80.08%
	 Val. Loss: 0.945 |  Val. Acc: 60.10% 

	Train Loss: 0.743 | Train Acc: 81.39%
	 Val. Loss: 0.958 |  Val. Acc: 58.82% 

	Train Loss: 0.736 | Train Acc: 82.14%
	 Val. Loss: 0.964 |  Val. Acc: 58.31% 

	Train Loss: 0.730 | Train Acc: 82.84%
	 Val. Loss: 0.954 |  Val. Acc: 59.65% 

	Train Loss: 0.723 | Train Acc: 83.45%
	 Val. Loss: 0.948 |  Val. Acc: 59.99% 

	Train Loss: 0.719 | Train Acc: 83.68%
	 Val. Loss: 0.957 |  Val. Acc: 59.21% 

	Train Loss: 0.715 | Train Acc: 83.95%
	

In [None]:
# Restore the checkpoint saved at the lowest validation loss before scoring
# the held-out test split. map_location remaps the stored tensors onto the
# current device, so weights saved on a GPU also load on a CPU-only machine.
model.load_state_dict(torch.load('saved_weights.pt', map_location=device))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.972 | Test Acc: 57.65%


In [None]:
#load weights and tokenizer

path='./saved_weights.pt'
# map_location remaps the stored tensors onto the current device
model.load_state_dict(torch.load(path, map_location=device));
model.eval();
# The context manager closes the file once the mapping is loaded.
# NOTE: only unpickle files you created yourself — pickle.load can execute
# arbitrary code from the file.
with open('./tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

#inference 

import spacy
# NOTE(review): 'en' is a spaCy shortcut name; newer spaCy releases may
# require the full package name (e.g. 'en_core_web_sm') — confirm against
# the installed version.
nlp = spacy.load('en')

def classify_tweet(tweet):
    """Classify one raw text string as "Negative", "Positive" or "Neutral".

    The text is tokenised with `nlp.tokenizer`, mapped to ids through the
    loaded `tokenizer` mapping, and passed to `model` as a batch of one.

    NOTE(review): `categories` is indexed by the model's output position,
    which follows the label vocabulary's index order; the vocabulary printed
    during training maps 0/1/2 to themselves — re-check if the data changes.
    """
    categories = {0: "Negative", 1:"Positive", 2:"Neutral"}

    # tokenize the tweet
    tokenized = [tok.text for tok in nlp.tokenizer(tweet)]
    # convert to integer sequence using predefined tokenizer dictionary
    indexed = [tokenizer[t] for t in tokenized]
    # shape [1, no. of words]: a batch holding this single sequence
    tensor = torch.LongTensor(indexed).unsqueeze(0).to(device)
    # no. of words, as a one-element tensor
    length_tensor = torch.LongTensor([len(indexed)])
    # Get the model prediction; no gradients are needed for inference
    with torch.no_grad():
        prediction = model(tensor, length_tensor)

    _, pred = torch.max(prediction, 1)

    return categories[pred.item()]

User Input Testing

In [None]:
# Manual spot check on free-form text.
classify_tweet("A valid explanation for why Trump won't let women on the golf course.")

'Negative'

In [None]:
# Second manual spot check.
classify_tweet("In his teen years, Obama has been known to use marijuana and cocaine.")

'Negative'