## Downloading Stanford Data

In [11]:
!nvidia-smi

Thu Jun  3 18:12:35 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [1]:
!wget http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip

--2021-06-03 17:44:41--  http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip [following]
--2021-06-03 17:44:41--  https://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6372817 (6.1M) [application/zip]
Saving to: ‘stanfordSentimentTreebank.zip’


2021-06-03 17:44:42 (6.89 MB/s) - ‘stanfordSentimentTreebank.zip’ saved [6372817/6372817]



In [2]:
!unzip /content/stanfordSentimentTreebank.zip

Archive:  /content/stanfordSentimentTreebank.zip
   creating: stanfordSentimentTreebank/
  inflating: stanfordSentimentTreebank/datasetSentences.txt  
   creating: __MACOSX/
   creating: __MACOSX/stanfordSentimentTreebank/
  inflating: __MACOSX/stanfordSentimentTreebank/._datasetSentences.txt  
  inflating: stanfordSentimentTreebank/datasetSplit.txt  
  inflating: __MACOSX/stanfordSentimentTreebank/._datasetSplit.txt  
  inflating: stanfordSentimentTreebank/dictionary.txt  
  inflating: __MACOSX/stanfordSentimentTreebank/._dictionary.txt  
  inflating: stanfordSentimentTreebank/original_rt_snippets.txt  
  inflating: __MACOSX/stanfordSentimentTreebank/._original_rt_snippets.txt  
  inflating: stanfordSentimentTreebank/README.txt  
  inflating: __MACOSX/stanfordSentimentTreebank/._README.txt  
  inflating: stanfordSentimentTreebank/sentiment_labels.txt  
  inflating: __MACOSX/stanfordSentimentTreebank/._sentiment_labels.txt  
  inflating: stanfordSentimentTreebank/SOStr.txt  
  inflatin

## Processing Stanford Sentimental Data

In [50]:
import os
import sys

import pandas


def get_phrase_sentiments(base_directory):
    def group_labels(label):
        if label in ["very negative", "negative"]:
            return "negative"
        elif label in ["positive", "very positive"]:
            return "positive"
        else:
            return "neutral"

    dictionary = pandas.read_csv(os.path.join(base_directory, "dictionary.txt"), sep="|")
    dictionary.columns = ["phrase", "id"]
    dictionary = dictionary.set_index("id")

    sentiment_labels = pandas.read_csv(os.path.join(base_directory, "sentiment_labels.txt"), sep="|")
    sentiment_labels.columns = ["id", "sentiment"]
    sentiment_labels = sentiment_labels.set_index("id")

    phrase_sentiments = dictionary.join(sentiment_labels)

    phrase_sentiments["fine"] = pandas.cut(phrase_sentiments.sentiment, [0, 0.2, 0.4, 0.6, 0.8, 1.0],
                                           include_lowest=True,
                                           labels=["very negative", "negative", "neutral", "positive", "very positive"])
    phrase_sentiments["coarse"] = phrase_sentiments.fine.apply(group_labels)
    return phrase_sentiments


def get_sentence_partitions(base_directory):
    sentences = pandas.read_csv(os.path.join(base_directory, "datasetSentences.txt"), index_col="sentence_index",
                                sep="\t")
    splits = pandas.read_csv(os.path.join(base_directory, "datasetSplit.txt"), index_col="sentence_index")
    return sentences.join(splits)


def partition(base_directory):
    phrase_sentiments = get_phrase_sentiments(base_directory).reset_index(level=0)
    sentence_partitions = get_sentence_partitions(base_directory)
    # noinspection PyUnresolvedReferences
    data = sentence_partitions.join(phrase_sentiments.set_index("phrase"), on="sentence")
    data["splitset_label"] = data["splitset_label"].fillna(1).astype(int)
    # data["sentence"] = data["sentence"].str.replace(r"\s('s|'d|'re|'ll|'m|'ve|n't)\b", lambda m: m.group(1))
    return data.groupby("splitset_label")

In [51]:
base_directory, output_directory = '/content/stanfordSentimentTreebank','/content/Dataset/';
os.makedirs(output_directory, exist_ok=True)
for splitset, partition in partition(base_directory):
    split_name = {1: "train", 2: "test", 3: "dev"}[splitset]
    filename = os.path.join(output_directory, "stanford-sentiment-treebank.%s.csv" % split_name)
    del partition["splitset_label"]
    partition.to_csv(filename)

## Reading the refined CSV's

In [52]:
import pandas as pd
train_data = pd.read_csv('/content/Dataset/stanford-sentiment-treebank.train.csv')
test_data = pd.read_csv('/content/Dataset/stanford-sentiment-treebank.test.csv')
dev_data = pd.read_csv('/content/Dataset/stanford-sentiment-treebank.dev.csv')

## Shape of the Data

In [58]:
print(train_data.shape)
print(test_data.shape)
print(dev_data.shape)

(8544, 6)
(2210, 6)
(1101, 6)


## Checking the Labels

In [59]:
print(train_data.sentiment.value_counts())
print(test_data.sentiment.value_counts())
print(dev_data.sentiment.value_counts())

0.777780    219
0.722220    215
0.277780    206
0.833330    200
0.222220    198
           ... 
0.027778     17
0.958330     16
0.000000     14
0.972220     12
0.986110     11
Name: sentiment, Length: 73, dtype: int64
0.27778    72
0.83333    65
0.77778    62
0.16667    55
0.19444    49
           ..
0.72917     1
0.61458     1
0.68750     1
0.78125     1
0.71875     1
Name: sentiment, Length: 82, dtype: int64
0.833330    32
0.277780    30
0.750000    30
0.722220    29
0.791670    29
            ..
0.041667     2
0.069444     2
0.958330     2
0.986110     1
0.083333     1
Name: sentiment, Length: 72, dtype: int64


Upscale the labels to fit into 25 classes of sentiments according to the paper. Currently the maximum Categories are 82.

## Analysis of Null values and interpolate them.

In [80]:
print("The Total null values in Train Data:- ",train_data['sentiment'].isnull().sum())
print("The Total null values in Test Data:- ",test_data['sentiment'].isnull().sum())
print("The Total null values in Dev Data:- ",dev_data['sentiment'].isnull().sum())

The Total null values in Train Data:-  427
The Total null values in Test Data:-  85
The Total null values in Dev Data:-  57


In [82]:
train_data.sentiment.interpolate(method ='linear', limit_direction ='forward',inplace=True)
test_data.sentiment.interpolate(method ='linear', limit_direction ='forward',inplace=True)
dev_data.sentiment.interpolate(method ='linear', limit_direction ='forward',inplace=True)

In [83]:
print("The Total null values in Train Data:- ",train_data['sentiment'].isnull().sum())
print("The Total null values in Test Data:- ",test_data['sentiment'].isnull().sum())
print("The Total null values in Dev Data:- ",dev_data['sentiment'].isnull().sum())

The Total null values in Train Data:-  0
The Total null values in Test Data:-  0
The Total null values in Dev Data:-  0


## Upscaling labels to make them in 25 categories

In [84]:
train_data['label'] = (train_data['sentiment'] * 24).clip(0, 24).astype('int')
test_data['label'] = (test_data['sentiment'] * 24).clip(0, 24).astype('int')
dev_data['label'] =  (dev_data['sentiment'] * 24).clip(0, 24).astype('int')

In [91]:
print(len(train_data.label.unique()))
print(len(test_data.label.unique()))
print(len(dev_data.label.unique()))

25
25
25


## Data Augmentation Functions

In [6]:
## Radom Insertion
def random_insertion(sentence, n): 
    words = remove_stopwords(sentence) 
    for _ in range(n):
        new_synonym = get_synonyms(random.choice(words))
        sentence.insert(randrange(len(sentence)+1), new_synonym) 
    return sentence

In [7]:
## Random Deletion
def random_deletion(words, p=0.5): 
    if len(words) == 1: # return if single word
        return words
    remaining = list(filter(lambda x: random.uniform(0,1) > p,words)) 
    if len(remaining) == 0: # if not left, sample a random word
        return [random.choice(words)] 
    else:
        return remaining

In [8]:
## Random Swap
def random_swap(sentence, n=5): 
    length = range(len(sentence)) 
    for _ in range(n):
        idx1, idx2 = random.sample(length, 2)
        sentence[idx1], sentence[idx2] = sentence[idx2], sentence[idx1] 
    return sentence

In [9]:
!pip install googletrans==3.1.0a0



In [10]:
## Back Translation
import random
import googletrans
from googletrans import Translator

translator = Translator()
sentence = ['The dog slept on the rug']

available_langs = list(googletrans.LANGUAGES.keys()) 
trans_lang = random.choice(available_langs) 
print(f"Translating to {googletrans.LANGUAGES[trans_lang]}")

translations = translator.translate(sentence, dest=trans_lang) 
t_text = [t.text for t in translations]
print(t_text)

translations_en_random = translator.translate(t_text, src=trans_lang, dest='en') 
en_text = [t.text for t in translations_en_random]
print(en_text)

Translating to igbo
['Nkịta rahụrụ n’elu ute']
['The dog slept on the mat']


## TorchText

In [92]:
# Import Library
import random
import torch, torchtext
from torchtext.legacy import data 

# Manual Seed
SEED = 43
torch.manual_seed(SEED)

<torch._C.Generator at 0x7ff693ed3810>

In [93]:
Sentence = data.Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)
Label = data.LabelField(tokenize ='spacy', is_target=True, batch_first =True, sequential =False)

In [94]:
fields = [('sentence', Sentence),('label',Label)]

In [95]:
train_example = [data.Example.fromlist([train_data.sentence[i],train_data.label[i]], fields) for i in range(train_data.shape[0])]
test_example = [data.Example.fromlist([test_data.sentence[i],test_data.label[i]], fields) for i in range(test_data.shape[0])]
dev_example = [data.Example.fromlist([dev_data.sentence[i],dev_data.label[i]], fields) for i in range(dev_data.shape[0])] 

In [96]:
# Creating dataset
#twitterDataset = data.TabularDataset(path="tweets.csv", format="CSV", fields=fields, skip_header=True)

train_dataset = data.Dataset(train_example, fields)
test_dataset = data.Dataset(test_example, fields)
dev_dataset = data.Dataset(dev_example, fields)

In [97]:
vars(train_dataset.examples[10])

{'label': 21,
 'sentence': ['Good',
  'fun',
  ',',
  'good',
  'action',
  ',',
  'good',
  'acting',
  ',',
  'good',
  'dialogue',
  ',',
  'good',
  'pace',
  ',',
  'good',
  'cinematography',
  '.']}

In [99]:
Sentence.build_vocab(train_dataset)
Label.build_vocab(train_dataset)

In [100]:
print('Size of input vocab : ', len(Sentence.vocab))
print('Size of label vocab : ', len(Label.vocab))
print('Top 10 words appreared repeatedly :', list(Sentence.vocab.freqs.most_common(10)))
print('Labels : ', Sentence.vocab.stoi)

Size of input vocab :  17212
Size of label vocab :  25
Top 10 words appreared repeatedly : [('.', 8041), (',', 7131), ('the', 6087), ('and', 4474), ('of', 4446), ('a', 4423), ('to', 3024), ('-', 2737), ("'s", 2544), ('is', 2540)]


In [101]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [102]:
train_iterator, test_iterator, dev_iterator = data.BucketIterator.splits((train_dataset, test_dataset, dev_dataset), batch_size = 32, 
                                                            sort_key = lambda x: len(x.sentence),
                                                            sort_within_batch=True, device = device)

In [104]:
import os, pickle
with open('tokenizer.pkl', 'wb') as tokens: 
    pickle.dump(Sentence.vocab.stoi, tokens)

In [106]:
import torch.nn as nn
import torch.nn.functional as F

class classifier(nn.Module):
    
    # Define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        
        super().__init__()          
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer
        self.encoder = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           dropout=dropout,
                           batch_first=True)
        # try using nn.GRU or nn.RNN here and compare their performances
        # try bidirectional and compare their performances
        
        # Dense layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text, text_lengths):
        
        # text = [batch size, sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
      
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True)
        
        packed_output, (hidden, cell) = self.encoder(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
    
        # Hidden = [batch size, hid dim * num directions]
        dense_outputs = self.fc(hidden)   
        
        # Final activation function softmax
        output = F.softmax(dense_outputs[0], dim=1)
            
        return output

In [107]:
# Define hyperparameters
size_of_vocab = len(Sentence.vocab)
embedding_dim = 300
num_hidden_nodes = 100
num_output_nodes = len(Label.vocab)
num_layers = 2
dropout = 0.2

# Instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers, dropout = dropout)

In [108]:
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

classifier(
  (embedding): Embedding(17212, 300)
  (encoder): LSTM(300, 100, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=100, out_features=25, bias=True)
)
The model has 5,407,725 trainable parameters


In [109]:
import torch.optim as optim

# define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()

# define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    _, predictions = torch.max(preds, 1)
    
    correct = (predictions == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [120]:
def train(model, iterator, optimizer, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        # resets the gradients after every batch
        optimizer.zero_grad()   
        
        # retrieve text and no. of words
        sentence, sentence_lengths = batch.sentence   
        
        # convert to 1D tensor
        predictions = model(sentence, sentence_lengths).squeeze()  
        
        # print(predictions)
        # compute the loss
        loss = criterion(predictions, batch.label)        
        
        # compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)   
        
        # backpropage the loss and compute the gradients
        loss.backward()       
        
        # update the weights
        optimizer.step()      
        
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [121]:
def evaluate(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and no. of words
            sentence, sentence_lengths = batch.sentence
            
            # convert to 1d tensor
            predictions = model(sentence, sentence_lengths).squeeze()
            
            # compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [123]:
N_EPOCHS = 20
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    dev_loss, dev_acc = evaluate(model, dev_iterator, criterion)
    
    # save the best model
    if test_loss < best_valid_loss:
        best_valid_loss = test_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}% \n')
    print(f'\t Dev Loss: {dev_loss:.3f} |  Dev Acc: {dev_acc*100:.2f}% \n')

	Train Loss: 3.178 | Train Acc: 11.06%
	 Test Loss: 3.177 |  Test Acc: 10.54% 

	 Dev Loss: 3.182 |  Dev Acc: 9.81% 

	Train Loss: 3.161 | Train Acc: 12.48%
	 Test Loss: 3.174 |  Test Acc: 10.80% 

	 Dev Loss: 3.177 |  Dev Acc: 10.66% 

	Train Loss: 3.136 | Train Acc: 15.65%
	 Test Loss: 3.173 |  Test Acc: 11.25% 

	 Dev Loss: 3.180 |  Dev Acc: 9.68% 

	Train Loss: 3.111 | Train Acc: 18.24%
	 Test Loss: 3.174 |  Test Acc: 10.18% 

	 Dev Loss: 3.176 |  Dev Acc: 9.95% 

	Train Loss: 3.089 | Train Acc: 19.94%
	 Test Loss: 3.173 |  Test Acc: 10.67% 

	 Dev Loss: 3.177 |  Dev Acc: 9.77% 

	Train Loss: 3.072 | Train Acc: 21.50%
	 Test Loss: 3.174 |  Test Acc: 10.22% 

	 Dev Loss: 3.176 |  Dev Acc: 9.86% 

	Train Loss: 3.051 | Train Acc: 24.67%
	 Test Loss: 3.171 |  Test Acc: 10.18% 

	 Dev Loss: 3.173 |  Dev Acc: 9.64% 

	Train Loss: 3.031 | Train Acc: 27.28%
	 Test Loss: 3.173 |  Test Acc: 10.71% 

	 Dev Loss: 3.176 |  Dev Acc: 10.17% 

	Train Loss: 3.013 | Train Acc: 29.25%
	 Test Loss: 3.

In [124]:
N_EPOCHS = 50
# best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    dev_loss, dev_acc = evaluate(model, dev_iterator, criterion)
    
    # save the best model
    if test_loss < best_valid_loss:
        best_valid_loss = test_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}% \n')
    print(f'\t Dev Loss: {dev_loss:.3f} |  Dev Acc: {dev_acc*100:.2f}% \n')

	Train Loss: 2.877 | Train Acc: 40.91%
	 Test Loss: 3.162 |  Test Acc: 11.83% 

	 Dev Loss: 3.177 |  Dev Acc: 10.40% 

	Train Loss: 2.874 | Train Acc: 41.26%
	 Test Loss: 3.164 |  Test Acc: 12.05% 

	 Dev Loss: 3.170 |  Dev Acc: 11.33% 

	Train Loss: 2.869 | Train Acc: 41.82%
	 Test Loss: 3.166 |  Test Acc: 11.43% 

	 Dev Loss: 3.171 |  Dev Acc: 11.15% 

	Train Loss: 2.862 | Train Acc: 42.51%
	 Test Loss: 3.162 |  Test Acc: 11.70% 

	 Dev Loss: 3.168 |  Dev Acc: 11.42% 

	Train Loss: 2.856 | Train Acc: 43.22%
	 Test Loss: 3.168 |  Test Acc: 11.25% 

	 Dev Loss: 3.171 |  Dev Acc: 11.24% 

	Train Loss: 2.850 | Train Acc: 43.62%
	 Test Loss: 3.167 |  Test Acc: 11.16% 

	 Dev Loss: 3.174 |  Dev Acc: 10.22% 

	Train Loss: 2.842 | Train Acc: 44.62%
	 Test Loss: 3.166 |  Test Acc: 11.52% 

	 Dev Loss: 3.172 |  Dev Acc: 10.53% 

	Train Loss: 2.835 | Train Acc: 45.51%
	 Test Loss: 3.167 |  Test Acc: 11.43% 

	 Dev Loss: 3.178 |  Dev Acc: 9.73% 

	Train Loss: 2.828 | Train Acc: 46.30%
	 Test Los

In [125]:
N_EPOCHS = 100
# best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    dev_loss, dev_acc = evaluate(model, dev_iterator, criterion)
    
    # save the best model
    if test_loss < best_valid_loss:
        best_valid_loss = test_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}% \n')
    print(f'\t Dev Loss: {dev_loss:.3f} |  Dev Acc: {dev_acc*100:.2f}% \n')

	Train Loss: 2.595 | Train Acc: 68.88%
	 Test Loss: 3.193 |  Test Acc: 8.30% 

	 Dev Loss: 3.186 |  Dev Acc: 8.75% 

	Train Loss: 2.592 | Train Acc: 69.25%
	 Test Loss: 3.192 |  Test Acc: 8.17% 

	 Dev Loss: 3.186 |  Dev Acc: 8.93% 

	Train Loss: 2.590 | Train Acc: 69.64%
	 Test Loss: 3.190 |  Test Acc: 8.75% 

	 Dev Loss: 3.191 |  Dev Acc: 8.48% 

	Train Loss: 2.586 | Train Acc: 70.08%
	 Test Loss: 3.183 |  Test Acc: 9.51% 

	 Dev Loss: 3.189 |  Dev Acc: 8.39% 

	Train Loss: 2.584 | Train Acc: 70.61%
	 Test Loss: 3.187 |  Test Acc: 9.15% 

	 Dev Loss: 3.191 |  Dev Acc: 8.70% 

	Train Loss: 2.577 | Train Acc: 70.99%
	 Test Loss: 3.186 |  Test Acc: 9.29% 

	 Dev Loss: 3.197 |  Dev Acc: 8.04% 

	Train Loss: 2.571 | Train Acc: 71.51%
	 Test Loss: 3.190 |  Test Acc: 8.75% 

	 Dev Loss: 3.192 |  Dev Acc: 8.57% 

	Train Loss: 2.566 | Train Acc: 72.14%
	 Test Loss: 3.186 |  Test Acc: 9.33% 

	 Dev Loss: 3.193 |  Dev Acc: 8.12% 

	Train Loss: 2.562 | Train Acc: 72.65%
	 Test Loss: 3.190 |  Tes

In [126]:
path='./saved_weights.pt'
model.load_state_dict(torch.load(path));
model.eval();
tokenizer_file = open('./tokenizer.pkl', 'rb')
tokenizer = pickle.load(tokenizer_file)

#inference 

import spacy
nlp = spacy.load('en')

def classify_tweet(tweet):
    
    #categories = {0: "Negative", 1:"Positive", 2:"Neutral"}
    
    # tokenize the tweet 
    tokenized = [tok.text for tok in nlp.tokenizer(tweet)] 
    # convert to integer sequence using predefined tokenizer dictionary
    indexed = [tokenizer[t] for t in tokenized]        
    # compute no. of words        
    length = [len(indexed)]
    # convert to tensor                                    
    tensor = torch.LongTensor(indexed).to(device)   
    # reshape in form of batch, no. of words           
    tensor = tensor.unsqueeze(1).T  
    # convert to tensor                          
    length_tensor = torch.LongTensor(length)
    # Get the model prediction                  
    prediction = model(tensor, length_tensor)

    _, pred = torch.max(prediction, 1) 
    
    #return categories[pred.item()]
    return pred.item()

## Setimental Analysis of Train, Test and Dev Data

In [128]:
train_data['Predicted_Label'] = train_data['sentence'].apply(classify_tweet)

In [129]:
test_data['Predicted_Label'] = test_data['sentence'].apply(classify_tweet)

In [130]:
dev_data['Predicted_Label'] = test_data['sentence'].apply(classify_tweet)

## Postive Tweets

In [131]:
import numpy as np
train_data['Flag'] = np.where(train_data['Predicted_Label']==train_data['label'], 1, 0)
test_data['Flag'] = np.where(test_data['Predicted_Label']==test_data['label'], 1, 0)
dev_data['Flag'] = np.where(dev_data['Predicted_Label']==dev_data['label'], 1, 0)

## Train Postive Tweets

In [132]:
train_data[train_data['Flag']==1]

Unnamed: 0,sentence_index,sentence,id,sentiment,fine,coarse,label,Predicted_Label,Flag
397,2011,Imperfect ?,45647.0,0.277780,negative,negative,6,6,1
989,2603,"Collateral Damage is trash , but it earns extr...",64756.0,0.194440,very negative,negative,4,4,1
1541,3155,"`` White Oleander , '' the movie , is akin to ...",227645.0,0.236110,negative,negative,5,5,1
1849,3463,... less a story than an inexplicable nightmar...,62594.0,0.222220,negative,negative,5,5,1
2755,4369,"While not all that bad of a movie , it 's nowh...",70892.0,0.222220,negative,negative,5,5,1
...,...,...,...,...,...,...,...,...,...
8501,11813,... unbearably lame .,221849.0,0.097222,very negative,negative,2,2,1
8511,11823,Boring and meandering .,222656.0,0.125000,very negative,negative,3,3,1
8514,11826,A reality-snubbing hodgepodge .,222072.0,0.222220,negative,negative,5,5,1
8522,11834,An opportunity missed .,222315.0,0.222220,negative,negative,5,5,1


## Test Set Postive Tweets

In [133]:
test_data[test_data['Flag']==1]

Unnamed: 0,sentence_index,sentence,id,sentiment,fine,coarse,label,Predicted_Label,Flag
265,283,Has it ever been possible to say that Williams...,223594.0,0.36111,negative,negative,8,8,1
713,756,"If you 're not into the Pokemon franchise , th...",223951.0,0.12500,very negative,negative,3,3,1
745,788,"At nearly three hours , the whole of Safe Cond...",24744.0,0.23611,negative,negative,5,5,1
840,884,It 's neither as romantic nor as thrilling as ...,25755.0,0.19444,very negative,negative,4,4,1
1081,7919,The ending is a cop-out .,226239.0,0.18056,very negative,negative,4,4,1
...,...,...,...,...,...,...,...,...,...
2137,9005,"Gangs of New York is an unapologetic mess , wh...",145530.0,0.19444,very negative,negative,4,4,1
2146,9015,About as cutting-edge as Pet Rock : The Movie .,143478.0,0.16667,very negative,negative,4,4,1
2157,9027,An atonal estrogen opera that demonizes femini...,143712.0,0.16667,very negative,negative,4,4,1
2180,9050,`` An entire film about researchers quietly re...,227414.0,0.16667,very negative,negative,4,4,1


## Dev Set Postive Tweets

In [134]:
dev_data[dev_data['Flag']==1]

Unnamed: 0,sentence_index,sentence,id,sentiment,fine,coarse,label,Predicted_Label,Flag
39,1158,Here 's yet another studio horror franchise mu...,25527.0,0.19444,very negative,negative,4,4,1
571,7358,-LRB- T -RRB- here 's only so much anyone can ...,,0.231483,,,5,5,1
621,7408,"Despite all evidence to the contrary , this cl...",144793.0,0.20833,negative,negative,4,4,1
640,7427,"Try as I may , I ca n't think of a single good...",226829.0,0.13889,very negative,negative,3,3,1
643,7430,It treats women like idiots .,146753.0,0.19444,very negative,negative,4,4,1
647,7434,Impostor has a handful of thrilling moments an...,146278.0,0.22222,negative,negative,5,5,1
667,7455,You really have to wonder how on earth anyone ...,151139.0,0.041667,very negative,negative,1,1,1
675,7463,"Unfortunately , it 's not silly fun unless you...",226870.0,0.125,very negative,negative,3,3,1
749,7537,"In an effort , I suspect , not to offend by ap...",146291.0,0.23611,negative,negative,5,5,1
762,7550,So unremittingly awful that labeling it a dog ...,148685.0,0.0,very negative,negative,0,0,1


## Negative Tweets

## Train Set Negative Tweets

In [135]:
train_data[train_data['Flag']==0]

Unnamed: 0,sentence_index,sentence,id,sentiment,fine,coarse,label,Predicted_Label,Flag
0,1,The Rock is destined to be the 21st Century 's...,226166.0,0.69444,positive,positive,16,1,0
1,2,The gorgeously elaborate continuation of `` Th...,226300.0,0.83333,very positive,positive,19,0,0
2,61,Singer\/composer Bryan Adams contributes a sle...,225801.0,0.62500,positive,positive,15,3,0
3,62,You 'd think by now America would have had eno...,14646.0,0.50000,neutral,neutral,12,3,0
4,63,Yet the act is still charming here .,14644.0,0.72222,positive,positive,17,1,0
...,...,...,...,...,...,...,...,...,...
8539,11851,A real snooze .,222071.0,0.11111,very negative,negative,2,1,0
8540,11852,No surprises .,225165.0,0.22222,negative,negative,5,2,0
8541,11853,We 've seen the hippie-turned-yuppie plot befo...,226985.0,0.75000,positive,positive,18,3,0
8542,11854,Her fans walked out muttering words like `` ho...,223632.0,0.13889,very negative,negative,3,5,0


## Test Set Negative Tweets

In [136]:
test_data[test_data['Flag']==0]

Unnamed: 0,sentence_index,sentence,id,sentiment,fine,coarse,label,Predicted_Label,Flag
0,3,Effective but too-tepid biopic,13995.0,0.51389,neutral,neutral,12,2,0
1,4,If you sometimes like to go to the movies to h...,14123.0,0.73611,positive,positive,17,3,0
2,5,"Emerges as something rare , an issue movie tha...",13999.0,0.86111,very positive,positive,20,3,0
3,6,The film provides some great insight into the ...,14498.0,0.59722,neutral,neutral,14,3,0
4,7,Offers that rare combination of entertainment ...,14351.0,0.83333,very positive,positive,19,3,0
...,...,...,...,...,...,...,...,...,...
2205,11621,An imaginative comedy\/thriller .,13851.0,0.77778,positive,positive,18,5,0
2206,11623,"-LRB- A -RRB- rare , beautiful film .",13691.0,0.95833,very positive,positive,22,3,0
2207,11626,-LRB- An -RRB- hilarious romantic comedy .,,0.95833,,,22,0,0
2208,11628,Never -LRB- sinks -RRB- into exploitation .,,0.95833,,,22,2,0


## Dev Set Negative Tweets

In [137]:
dev_data[dev_data['Flag']==0]

Unnamed: 0,sentence_index,sentence,id,sentiment,fine,coarse,label,Predicted_Label,Flag
0,1117,It 's a lovely film with lovely performances b...,25730.0,0.79167,positive,positive,19,2,0
1,1118,"No one goes unindicted here , which is probabl...",225163.0,0.51389,neutral,neutral,12,3,0
2,1119,And if you 're not nearly moved to tears by a ...,222358.0,0.76389,positive,positive,18,3,0
3,1120,"A warm , funny , engaging film .",24502.0,0.88889,very positive,positive,21,3,0
4,1121,Uses sharp humor and insight into human nature...,27115.0,0.80556,very positive,positive,19,3,0
...,...,...,...,...,...,...,...,...,...
1093,7898,Even horror fans will most likely not find wha...,145161.0,0.12500,very negative,negative,3,2,0
1096,7901,it seems to me the film is about the art of ri...,163906.0,0.29167,negative,negative,7,2,0
1097,7902,It 's just disappointingly superficial -- a mo...,146522.0,0.33333,negative,negative,7,2,0
1099,7904,Sometimes it feels as if it might have been ma...,148760.0,0.44444,neutral,neutral,10,1,0


In [139]:
dev_data.to_csv('/content/Dataset/final_dev.csv',index=False)
train_data.to_csv('/content/Dataset/final_train.csv',index=False)
test_data.to_csv('/content/Dataset/final_test.csv',index=False)

In [140]:
!zip Dataset.zip /content/Dataset/*

  adding: content/Dataset/final_dev.csv (deflated 63%)
  adding: content/Dataset/final_test.csv (deflated 63%)
  adding: content/Dataset/final_train.csv (deflated 64%)
  adding: content/Dataset/stanford-sentiment-treebank.dev.csv (deflated 63%)
  adding: content/Dataset/stanford-sentiment-treebank.test.csv (deflated 63%)
  adding: content/Dataset/stanford-sentiment-treebank.train.csv (deflated 63%)


In [142]:
print('Labels : ', Label.vocab.stoi)

Labels :  defaultdict(None, {19: 0, 16: 1, 7: 2, 18: 3, 6: 4, 4: 5, 10: 6, 17: 7, 12: 8, 9: 9, 13: 10, 15: 11, 5: 12, 21: 13, 3: 14, 8: 15, 14: 16, 20: 17, 1: 18, 2: 19, 11: 20, 22: 21, 0: 22, 23: 23, 24: 24})
