## Downloading Stanford Data

In [None]:
!nvidia-smi

Fri Jun  4 10:29:11 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   66C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!wget http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip

--2021-06-04 10:29:18--  http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip [following]
--2021-06-04 10:29:18--  https://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6372817 (6.1M) [application/zip]
Saving to: ‘stanfordSentimentTreebank.zip’


2021-06-04 10:29:19 (6.87 MB/s) - ‘stanfordSentimentTreebank.zip’ saved [6372817/6372817]



In [None]:
!unzip /content/stanfordSentimentTreebank.zip

Archive:  /content/stanfordSentimentTreebank.zip
   creating: stanfordSentimentTreebank/
  inflating: stanfordSentimentTreebank/datasetSentences.txt  
   creating: __MACOSX/
   creating: __MACOSX/stanfordSentimentTreebank/
  inflating: __MACOSX/stanfordSentimentTreebank/._datasetSentences.txt  
  inflating: stanfordSentimentTreebank/datasetSplit.txt  
  inflating: __MACOSX/stanfordSentimentTreebank/._datasetSplit.txt  
  inflating: stanfordSentimentTreebank/dictionary.txt  
  inflating: __MACOSX/stanfordSentimentTreebank/._dictionary.txt  
  inflating: stanfordSentimentTreebank/original_rt_snippets.txt  
  inflating: __MACOSX/stanfordSentimentTreebank/._original_rt_snippets.txt  
  inflating: stanfordSentimentTreebank/README.txt  
  inflating: __MACOSX/stanfordSentimentTreebank/._README.txt  
  inflating: stanfordSentimentTreebank/sentiment_labels.txt  
  inflating: __MACOSX/stanfordSentimentTreebank/._sentiment_labels.txt  
  inflating: stanfordSentimentTreebank/SOStr.txt  
  inflatin

## Processing Stanford Sentimental Data

In [None]:
import os
import sys

import pandas


def get_phrase_sentiments(base_directory):
    def group_labels(label):
        if label in ["very negative", "negative"]:
            return "negative"
        elif label in ["positive", "very positive"]:
            return "positive"
        else:
            return "neutral"

    dictionary = pandas.read_csv(os.path.join(base_directory, "dictionary.txt"), sep="|")
    dictionary.columns = ["phrase", "id"]
    dictionary = dictionary.set_index("id")

    sentiment_labels = pandas.read_csv(os.path.join(base_directory, "sentiment_labels.txt"), sep="|")
    sentiment_labels.columns = ["id", "sentiment"]
    sentiment_labels = sentiment_labels.set_index("id")

    phrase_sentiments = dictionary.join(sentiment_labels)

    phrase_sentiments["fine"] = pandas.cut(phrase_sentiments.sentiment, [0, 0.2, 0.4, 0.6, 0.8, 1.0],
                                           include_lowest=True,
                                           labels=["very negative", "negative", "neutral", "positive", "very positive"])
    phrase_sentiments["coarse"] = phrase_sentiments.fine.apply(group_labels)
    return phrase_sentiments


def get_sentence_partitions(base_directory):
    sentences = pandas.read_csv(os.path.join(base_directory, "datasetSentences.txt"), index_col="sentence_index",
                                sep="\t")
    splits = pandas.read_csv(os.path.join(base_directory, "datasetSplit.txt"), index_col="sentence_index")
    return sentences.join(splits)


def partition(base_directory):
    phrase_sentiments = get_phrase_sentiments(base_directory).reset_index(level=0)
    sentence_partitions = get_sentence_partitions(base_directory)
    # noinspection PyUnresolvedReferences
    data = pandas.merge(sentence_partitions, phrase_sentiments, right_on="phrase", left_on="sentence", how='left')
    data["splitset_label"] = data["splitset_label"].fillna(1).astype(int)
    data["phrase"] = data["phrase"].str.replace(r"\s('s|'d|'re|'ll|'m|'ve|n't)\b", lambda m: m.group(1))
    return data.groupby("splitset_label")

In [None]:
base_directory, output_directory = '/content/stanfordSentimentTreebank','/content/Dataset/';
os.makedirs(output_directory, exist_ok=True)
for splitset, partition in partition(base_directory):
    split_name = {1: "train", 2: "test", 3: "dev"}[splitset]
    filename = os.path.join(output_directory, "stanford-sentiment-treebank.%s.csv" % split_name)
    del partition["splitset_label"]
    partition.to_csv(filename)

## Reading the refined CSV's

In [None]:
import pandas as pd
train_data = pd.read_csv('/content/Dataset/stanford-sentiment-treebank.train.csv')
test_data = pd.read_csv('/content/Dataset/stanford-sentiment-treebank.test.csv')
dev_data = pd.read_csv('/content/Dataset/stanford-sentiment-treebank.dev.csv')

## Shape of the Data

In [None]:
print(train_data.shape)
print(test_data.shape)
print(dev_data.shape)

(8544, 7)
(2210, 7)
(1101, 7)


## Checking the Labels

In [None]:
print(train_data.sentiment.value_counts())
print(test_data.sentiment.value_counts())
print(dev_data.sentiment.value_counts())

0.777780    219
0.722220    215
0.277780    206
0.833330    200
0.222220    198
           ... 
0.027778     17
0.958330     16
0.000000     14
0.972220     12
0.986110     11
Name: sentiment, Length: 73, dtype: int64
0.27778    72
0.83333    65
0.77778    62
0.16667    55
0.19444    49
           ..
0.72917     1
0.61458     1
0.68750     1
0.78125     1
0.71875     1
Name: sentiment, Length: 82, dtype: int64
0.833330    32
0.277780    30
0.750000    30
0.722220    29
0.791670    29
            ..
0.041667     2
0.069444     2
0.958330     2
0.986110     1
0.083333     1
Name: sentiment, Length: 72, dtype: int64


Upscale the labels to fit into 25 classes of sentiments according to the paper. Currently the maximum Categories are 82.

## Analysis of Null values and interpolate them.

In [None]:
print("The Total null values in Train Data:- ",train_data['sentiment'].isnull().sum())
print("The Total null values in Test Data:- ",test_data['sentiment'].isnull().sum())
print("The Total null values in Dev Data:- ",dev_data['sentiment'].isnull().sum())

The Total null values in Train Data:-  427
The Total null values in Test Data:-  85
The Total null values in Dev Data:-  57


In [None]:
train_data.sentiment.interpolate(method ='linear', limit_direction ='forward',inplace=True)
test_data.sentiment.interpolate(method ='linear', limit_direction ='forward',inplace=True)
dev_data.sentiment.interpolate(method ='linear', limit_direction ='forward',inplace=True)

In [None]:
print("The Total null values in Train Data:- ",train_data['sentiment'].isnull().sum())
print("The Total null values in Test Data:- ",test_data['sentiment'].isnull().sum())
print("The Total null values in Dev Data:- ",dev_data['sentiment'].isnull().sum())

The Total null values in Train Data:-  0
The Total null values in Test Data:-  0
The Total null values in Dev Data:-  0


## Upscaling labels to make them in 25 categories

In [None]:
train_data['label'] = (train_data['sentiment'] * 24).clip(0, 24).astype('int')
test_data['label'] = (test_data['sentiment'] * 24).clip(0, 24).astype('int')
dev_data['label'] =  (dev_data['sentiment'] * 24).clip(0, 24).astype('int')

In [None]:
print(len(train_data.label.unique()))
print(len(test_data.label.unique()))
print(len(dev_data.label.unique()))

25
25
25


## Data Augmentation Functions

In [None]:
## Radom Insertion
def random_insertion(sentence): 
    words = remove_stopwords(sentence)
    n = random.randint(0, len(words)) 
    for _ in range(n):
        new_synonym = get_synonyms(random.choice(words))
        sentence.insert(randrange(len(sentence)+1), new_synonym) 
    return sentence

In [None]:
## Random Deletion
def random_deletion(words, p=0.5): 
    if len(words) == 1: # return if single word
        return words
    remaining = list(filter(lambda x: random.uniform(0,1) > p,words)) 
    if len(remaining) == 0: # if not left, sample a random word
        return [random.choice(words)] 
    else:
        return remaining

In [None]:
## Random Swap
def random_swap(sentence): 
    length = range(len(sentence))
    n = random.randint(0, len(sentence))  
    for _ in range(n):
        idx1, idx2 = random.sample(length, 2)
        sentence[idx1], sentence[idx2] = sentence[idx2], sentence[idx1] 
    return sentence

In [None]:
!pip install googletrans==3.1.0a0

Collecting googletrans==3.1.0a0
  Downloading https://files.pythonhosted.org/packages/19/3d/4e3a1609bf52f2f7b00436cc751eb977e27040665dde2bd57e7152989672/googletrans-3.1.0a0.tar.gz
Collecting httpx==0.13.3
[?25l  Downloading https://files.pythonhosted.org/packages/54/b4/698b284c6aed4d7c2b4fe3ba5df1fcf6093612423797e76fbb24890dd22f/httpx-0.13.3-py3-none-any.whl (55kB)
[K     |████████████████████████████████| 61kB 8.6MB/s 
[?25hCollecting rfc3986<2,>=1.3
  Downloading https://files.pythonhosted.org/packages/c4/e5/63ca2c4edf4e00657584608bee1001302bbf8c5f569340b78304f2f446cb/rfc3986-1.5.0-py2.py3-none-any.whl
Collecting httpcore==0.9.*
[?25l  Downloading https://files.pythonhosted.org/packages/dd/d5/e4ff9318693ac6101a2095e580908b591838c6f33df8d3ee8dd953ba96a8/httpcore-0.9.1-py3-none-any.whl (42kB)
[K     |████████████████████████████████| 51kB 10.0MB/s 
Collecting hstspreload
[?25l  Downloading https://files.pythonhosted.org/packages/dd/50/606213e12fb49c5eb667df0936223dcaf461f94e215ea

In [None]:
## Back Translation
import random
import googletrans
from googletrans import Translator

translator = Translator()
sentence = ['Passion , melodrama , sorrow , laugther , and tears cascade over the screen effortlessly ...']

available_langs = list(googletrans.LANGUAGES.keys()) 
trans_lang = random.choice(available_langs) 
print(f"Translating to {googletrans.LANGUAGES[trans_lang]}")

translations = translator.translate(sentence, dest=trans_lang) 
t_text = [t.text for t in translations]
print(t_text)

translations_en_random = translator.translate(t_text, src=trans_lang, dest='en') 
en_text = [t.text for t in translations_en_random]
print(en_text)

Translating to cebuano
['Ang hilig, melodrama, kasubo, katawa, ug mga luha nag-cascade sa screen nga wala’y mahimo ...']
['Passion, melodrama, sadness, laughter, and tears cascade across the screen helplessly ...']


In [None]:
def back_translate(sentence):
  if len(sentence)<=1:
    return sentence
  sentence=[sentence]
  available_langs = list(googletrans.LANGUAGES.keys()) 
  trans_lang = random.choice(available_langs)
  translations = translator.translate(sentence, dest=trans_lang)
  t_text = [t.text for t in translations]
  translations_en_random = translator.translate(t_text, src=trans_lang, dest='en')
  en_text = [t.text for t in translations_en_random]
  return en_text[0]

## Sample Size from the Dataset to Augment

In [None]:
Augmented_Data = train_data.sample(frac=0.4, random_state=42).copy()
Augmented_Data['Translate'] =  Augmented_Data['sentence'].apply(back_translate)

In [None]:
Augmented_Data.to_csv('/content/Dataset/stanfored_Retranslaated.csv', index=False)

In [None]:
Augmented_Data[['Translate','sentence']].head(10)

Unnamed: 0,Translate,sentence
4046,"Passion, melodrama, sadness, laughter and tear...","Passion , melodrama , sorrow , laugther , and ..."
1870,The film's thesis - a magnificent technology f...,The movie 's thesis -- elegant technology for ...
2029,The birthday girl walks on a tricky tightrope ...,Birthday Girl walks a tricky tightrope between...
453,Director David Jacobson gives Dahmer a conside...,Director David Jacobson gives Dahmer a conside...
748,Life on the rez is not a journey: this picture...,Life on the rez is no picnic : this picture sh...
2145,Tedpole may be one of the most adorable movies...,Tadpole may be one of the most appealing movie...
2063,The slow pace of the film is actually one of i...,The film 's unhurried pace is actually one of ...
2829,"If vile vampires are your cup of blood, Blade ...","If villainous vampires are your cup of blood ,..."
2115,"For the first time, a movie doesn't proclaim t...","For once , a movie does not proclaim the truth..."
7694,The thriller without a lot of excitement.,A thriller without a lot of thrills .


In [None]:
!git clone https://github.com/jasonwei20/eda_nlp
from eda_nlp.code import eda

fatal: destination path 'eda_nlp' already exists and is not an empty directory.


In [None]:
alpha_sr = 0.2 #percent of words in each sentence to be replaced by synonyms
alpha_ri=0.2 #percent of words in each sentence to be inserted
alpha_rs=0.2 #percent of words in each sentence to be swapped
alpha_rd=0.2 #percent of words in each sentence to be deleted
num_aug=5 #number of augmented sentences per original sentence

In [None]:
import nltk; nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
Augmented_Data.drop(columns='sentence', inplace=True)
Augmented_Data.rename(columns={'Translate':'sentence'}, inplace=True)

In [None]:
train_data = train_data.append(Augmented_Data, ignore_index=True)

## Used EDA Package to Augment the Data

In [None]:
eda.eda('Passion , melodrama , sorrow , laugther , and tears cascade over the screen effortlessly ...', alpha_sr=alpha_sr, alpha_ri=alpha_ri, alpha_rs=alpha_rs, p_rd=alpha_rd, num_aug=num_aug)
# return five sentences as sr, ri, rs, rd

['passion melodrama sorrow tears and laugther cascade the over screen effortlessly',
 'melodrama sorrow laugther and tears cascade over the screen effortlessly',
 'warmth passion melodrama sorrow laugther and tears cascade filmdom over the screen effortlessly',
 'passion melodrama sorrowfulness laugther and crying cascade over the screen effortlessly',
 'passion melodrama sorrow filmdom laugther and tears warmth cascade over the screen effortlessly',
 'passion melodrama sorrow laugther and tears cascade over the screen effortlessly ']

In [None]:
Augmented_Data = train_data.copy()
Augmented_Data['Translate'] = Augmented_Data.apply(lambda row: eda.eda(row['sentence'], alpha_sr=alpha_sr, alpha_ri=alpha_ri, alpha_rs=alpha_rs, p_rd=alpha_rd, num_aug=num_aug), axis=1)

In [None]:
Augmented_Data = Augmented_Data.explode('Translate')

In [None]:
Augmented_Data[['Translate','sentence']].head(10)

Unnamed: 0,Translate,sentence
0,the rock is destined to be the st century s un...,The Rock is destined to be the 21st Century 's...
0,the rock is destined to going the arnold centu...,The Rock is destined to be the 21st Century 's...
0,the rock is destined to the century conan that...,The Rock is destined to be the 21st Century 's...
0,the s and destined to be claud st century s ne...,The Rock is destined to be the 21st Century 's...
0,the rock is destined to be the st progress to ...,The Rock is destined to be the 21st Century 's...
0,the rock is destined to be the st century s ne...,The Rock is destined to be the 21st Century 's...
1,the resplendently elaborate law of continuatio...,The gorgeously elaborate continuation of `` Th...
1,the gorgeously elaborate continuation of the l...,The gorgeously elaborate continuation of `` Th...
1,vision gorgeously elaborate continuation of of...,The gorgeously elaborate continuation of `` Th...
1,the capital of mississippi gorgeously elaborat...,The gorgeously elaborate continuation of `` Th...


In [None]:
Augmented_Data.to_csv('/content/Dataset/stanfored_augmented.csv', index=False)

In [None]:
Augmented_Data.drop(columns='sentence', inplace=True)
Augmented_Data.rename(columns={'Translate':'sentence'}, inplace=True)

In [None]:
train_data = train_data.append(Augmented_Data, ignore_index=True)

In [None]:
train_data.to_csv('/content/Dataset/stanfored_final_train.csv')

## TorchText

In [None]:
# Import Library
import random
import torch, torchtext
from torchtext.legacy import data 

# Manual Seed
SEED = 43
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f4585d83750>

In [None]:
Sentence = data.Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)
Label = data.LabelField(tokenize ='spacy', is_target=True, batch_first =True, sequential =False)

In [None]:
fields = [('sentence', Sentence),('label',Label)]

In [None]:
train_example = [data.Example.fromlist([train_data.sentence[i],train_data.label[i]], fields) for i in range(train_data.shape[0])]
test_example = [data.Example.fromlist([test_data.sentence[i],test_data.label[i]], fields) for i in range(test_data.shape[0])]
dev_example = [data.Example.fromlist([dev_data.sentence[i],dev_data.label[i]], fields) for i in range(dev_data.shape[0])] 

In [None]:
# Creating dataset
#twitterDataset = data.TabularDataset(path="tweets.csv", format="CSV", fields=fields, skip_header=True)

train_dataset = data.Dataset(train_example, fields)
test_dataset = data.Dataset(test_example, fields)
dev_dataset = data.Dataset(dev_example, fields)

In [None]:
vars(train_dataset.examples[10])

{'label': 21,
 'sentence': ['Good',
  'fun',
  ',',
  'good',
  'action',
  ',',
  'good',
  'acting',
  ',',
  'good',
  'dialogue',
  ',',
  'good',
  'pace',
  ',',
  'good',
  'cinematography',
  '.']}

In [None]:
Sentence.build_vocab(train_dataset)
Label.build_vocab(train_dataset)

In [None]:
print('Size of input vocab : ', len(Sentence.vocab))
print('Size of label vocab : ', len(Label.vocab))
print('Top 10 words appreared repeatedly :', list(Sentence.vocab.freqs.most_common(10)))
print('Labels : ', Sentence.vocab.stoi)

Size of input vocab :  27404
Size of label vocab :  25
Top 10 words appreared repeatedly : [('the', 68311), ('a', 50258), ('and', 42769), ('of', 42635), ('to', 29822), ('is', 24150), ('it', 22622), ('s', 21157), ('in', 19090), ('that', 18148)]


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
train_iterator, test_iterator, dev_iterator = data.BucketIterator.splits((train_dataset, test_dataset, dev_dataset), batch_size = 32, 
                                                            sort_key = lambda x: len(x.sentence),
                                                            sort_within_batch=True, device = device)

In [None]:
import os, pickle
with open('tokenizer.pkl', 'wb') as tokens: 
    pickle.dump(Sentence.vocab.stoi, tokens)

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class classifier(nn.Module):
    
    # Define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        
        super().__init__()          
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer
        self.encoder = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           dropout=dropout,
                           batch_first=True)
        # try using nn.GRU or nn.RNN here and compare their performances
        # try bidirectional and compare their performances
        
        # Dense layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text, text_lengths):
        
        # text = [batch size, sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
      
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True)
        
        packed_output, (hidden, cell) = self.encoder(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
    
        # Hidden = [batch size, hid dim * num directions]
        dense_outputs = self.fc(hidden)   
        
        # Final activation function softmax
        output = F.softmax(dense_outputs[0], dim=1)
            
        return output

In [None]:
# Define hyperparameters
size_of_vocab = len(Sentence.vocab)
embedding_dim = 300
num_hidden_nodes = 100
num_output_nodes = len(Label.vocab)
num_layers = 2
dropout = 0.2

# Instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers, dropout = dropout)

In [None]:
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

classifier(
  (embedding): Embedding(27404, 300)
  (encoder): LSTM(300, 100, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=100, out_features=25, bias=True)
)
The model has 8,465,325 trainable parameters


In [None]:
import torch.optim as optim

# define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()

# define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    _, predictions = torch.max(preds, 1)
    
    correct = (predictions == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def train(model, iterator, optimizer, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        # resets the gradients after every batch
        optimizer.zero_grad()   
        
        # retrieve text and no. of words
        sentence, sentence_lengths = batch.sentence   
        
        # convert to 1D tensor
        predictions = model(sentence, sentence_lengths).squeeze()  
        
        # print(predictions)
        # compute the loss
        loss = criterion(predictions, batch.label)        
        
        # compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)   
        
        # backpropage the loss and compute the gradients
        loss.backward()       
        
        # update the weights
        optimizer.step()      
        
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and no. of words
            sentence, sentence_lengths = batch.sentence
            
            # convert to 1d tensor
            predictions = model(sentence, sentence_lengths).squeeze()
            
            # compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
N_EPOCHS = 20
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    dev_loss, dev_acc = evaluate(model, dev_iterator, criterion)
    
    # save the best model
    if test_loss < best_valid_loss:
        best_valid_loss = test_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}% \n')
    print(f'\t Dev Loss: {dev_loss:.3f} |  Dev Acc: {dev_acc*100:.2f}% \n')

	Train Loss: 3.152 | Train Acc: 13.81%
	 Test Loss: 3.174 |  Test Acc: 10.13% 

	 Dev Loss: 3.181 |  Dev Acc: 8.93% 

	Train Loss: 3.029 | Train Acc: 26.88%
	 Test Loss: 3.174 |  Test Acc: 10.18% 

	 Dev Loss: 3.187 |  Dev Acc: 8.30% 

	Train Loss: 2.956 | Train Acc: 34.21%
	 Test Loss: 3.171 |  Test Acc: 10.80% 

	 Dev Loss: 3.168 |  Dev Acc: 10.71% 

	Train Loss: 2.905 | Train Acc: 39.09%
	 Test Loss: 3.170 |  Test Acc: 10.85% 

	 Dev Loss: 3.163 |  Dev Acc: 11.96% 

	Train Loss: 2.866 | Train Acc: 43.07%
	 Test Loss: 3.177 |  Test Acc: 10.04% 

	 Dev Loss: 3.168 |  Dev Acc: 11.11% 

	Train Loss: 2.827 | Train Acc: 47.10%
	 Test Loss: 3.179 |  Test Acc: 9.78% 

	 Dev Loss: 3.171 |  Dev Acc: 10.76% 

	Train Loss: 2.797 | Train Acc: 49.77%
	 Test Loss: 3.184 |  Test Acc: 9.20% 

	 Dev Loss: 3.172 |  Dev Acc: 11.02% 

	Train Loss: 2.774 | Train Acc: 51.77%
	 Test Loss: 3.187 |  Test Acc: 8.79% 

	 Dev Loss: 3.175 |  Dev Acc: 10.09% 

	Train Loss: 2.741 | Train Acc: 55.50%
	 Test Loss: 3

In [None]:
N_EPOCHS = 50
# best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    dev_loss, dev_acc = evaluate(model, dev_iterator, criterion)
    
    # save the best model
    if test_loss < best_valid_loss:
        best_valid_loss = test_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}% \n')
    print(f'\t Dev Loss: {dev_loss:.3f} |  Dev Acc: {dev_acc*100:.2f}% \n')

	Train Loss: 2.436 | Train Acc: 85.44%
	 Test Loss: 3.204 |  Test Acc: 7.28% 

	 Dev Loss: 3.204 |  Dev Acc: 6.91% 

	Train Loss: 2.423 | Train Acc: 86.72%
	 Test Loss: 3.200 |  Test Acc: 7.77% 

	 Dev Loss: 3.207 |  Dev Acc: 7.09% 

	Train Loss: 2.410 | Train Acc: 87.99%
	 Test Loss: 3.202 |  Test Acc: 7.68% 

	 Dev Loss: 3.209 |  Dev Acc: 6.24% 

	Train Loss: 2.398 | Train Acc: 89.19%
	 Test Loss: 3.204 |  Test Acc: 7.37% 

	 Dev Loss: 3.211 |  Dev Acc: 6.46% 

	Train Loss: 2.387 | Train Acc: 90.27%
	 Test Loss: 3.207 |  Test Acc: 7.01% 

	 Dev Loss: 3.206 |  Dev Acc: 7.18% 

	Train Loss: 2.378 | Train Acc: 91.13%
	 Test Loss: 3.205 |  Test Acc: 7.28% 

	 Dev Loss: 3.213 |  Dev Acc: 6.29% 

	Train Loss: 2.370 | Train Acc: 91.81%
	 Test Loss: 3.208 |  Test Acc: 6.96% 

	 Dev Loss: 3.215 |  Dev Acc: 5.76% 

	Train Loss: 2.364 | Train Acc: 92.41%
	 Test Loss: 3.210 |  Test Acc: 6.52% 

	 Dev Loss: 3.209 |  Dev Acc: 6.92% 

	Train Loss: 2.359 | Train Acc: 92.89%
	 Test Loss: 3.208 |  Tes

In [None]:
torch.save(model.state_dict(), 'last_saved_weights.pt')

In [103]:
path='./last_saved_weights.pt'
model.load_state_dict(torch.load(path));
model.eval();
tokenizer_file = open('./tokenizer.pkl', 'rb')
tokenizer = pickle.load(tokenizer_file)

#inference 

import spacy
nlp = spacy.load('en')

def classify_tweet(tweet):
    
    #categories = {0: "Negative", 1:"Positive", 2:"Neutral"}
    
    # tokenize the tweet 
    tokenized = [tok.text for tok in nlp.tokenizer(tweet)] 
    # convert to integer sequence using predefined tokenizer dictionary
    indexed = [tokenizer[t] for t in tokenized]        
    # compute no. of words        
    length = [len(indexed)]
    # convert to tensor                                    
    tensor = torch.LongTensor(indexed).to(device)   
    # reshape in form of batch, no. of words           
    tensor = tensor.unsqueeze(1).T  
    # convert to tensor                          
    length_tensor = torch.LongTensor(length)
    # Get the model prediction                  
    prediction = model(tensor, length_tensor)

    _, pred = torch.max(prediction, 1) 
    
    #return categories[pred.item()]
    return pred.item()

## Setimental Analysis of Train, Test and Dev Data

In [None]:
# train_data['Predicted_Label'] = train_data['sentence'].apply(classify_tweet)

In [104]:
test_data['Predicted_Label'] = test_data['sentence'].apply(classify_tweet)

In [105]:
dev_data['Predicted_Label'] = test_data['sentence'].apply(classify_tweet)

## Postive Tweets

In [106]:
import numpy as np
# train_data['Flag'] = np.where(train_data['Predicted_Label']==train_data['label'], 1, 0)
test_data['Flag'] = np.where(test_data['Predicted_Label']==test_data['label'], 1, 0)
dev_data['Flag'] = np.where(dev_data['Predicted_Label']==dev_data['label'], 1, 0)

## Train Postive Tweets

In [107]:
# train_data[train_data['Flag']==1]

## Test Set Postive Tweets

In [108]:
test_data[test_data['Flag']==1]

Unnamed: 0.1,Unnamed: 0,sentence,id,phrase,sentiment,fine,coarse,label,Predicted_Label,Flag
6,8,Steers turns in a snappy screenplay that curls...,225968.0,Steers turns in a snappy screenplay that curls...,0.777780,positive,positive,18,18,1
9,11,"This is a film well worth seeing , talking and...",14534.0,"This is a film well worth seeing , talking and...",0.902780,very positive,positive,21,21,1
53,55,Hugh Grant and Sandra Bullock are two such lik...,14092.0,Hugh Grant and Sandra Bullock are two such lik...,0.875000,very positive,positive,21,21,1
98,107,Steve Irwin 's method is Ernest Hemmingway at ...,14457.0,Steve Irwin's method is Ernest Hemmingway at a...,0.666670,positive,positive,16,16,1
103,112,A romantic comedy that operates by the rules o...,13795.0,A romantic comedy that operates by the rules o...,0.625000,positive,positive,15,15,1
...,...,...,...,...,...,...,...,...,...,...
2133,9000,Big Fat Liar is just futile silliness looking ...,144150.0,Big Fat Liar is just futile silliness looking ...,0.236110,negative,negative,5,5,1
2154,9022,It 's painful .,146599.0,It's painful .,0.097222,very negative,negative,2,2,1
2168,9037,"A woozy , roisterous , exhausting mess , and t...",143445.0,"A woozy , roisterous , exhausting mess , and t...",0.361110,negative,negative,8,8,1
2186,9055,I regret to report that these ops are just not...,146106.0,I regret to report that these ops are just not...,0.361110,negative,negative,8,8,1


## Dev Set Postive Tweets

In [109]:
dev_data[dev_data['Flag']==1]

Unnamed: 0.1,Unnamed: 0,sentence,id,phrase,sentiment,fine,coarse,label,Predicted_Label,Flag
11,1127,You 'll gasp appalled and laugh outraged and p...,27292.0,You'll gasp appalled and laugh outraged and po...,0.66667,positive,positive,16,16,1
17,1133,The film serves as a valuable time capsule to ...,26827.0,The film serves as a valuable time capsule to ...,0.44444,neutral,neutral,10,10,1
21,1137,"It provides the grand , intelligent entertainm...",25800.0,"It provides the grand , intelligent entertainm...",0.90278,very positive,positive,21,21,1
39,1157,Here 's yet another studio horror franchise mu...,25527.0,Here's yet another studio horror franchise muc...,0.19444,very negative,negative,4,4,1
56,1174,"Ultimately feels empty and unsatisfying , like...",27101.0,"Ultimately feels empty and unsatisfying , like...",0.11111,very negative,negative,2,2,1
71,1189,A rigorously structured and exquisitely filmed...,24451.0,A rigorously structured and exquisitely filmed...,0.72222,positive,positive,17,17,1
74,1192,A quiet treasure -- a film to be savored .,24426.0,A quiet treasure -- a film to be savored .,0.93056,very positive,positive,22,22,1
133,1253,If director Michael Dowse only superficially u...,25650.0,If director Michael Dowse only superficially u...,0.38889,negative,negative,9,9,1
155,1277,"Kinnear does n't aim for our sympathy , but ra...",25901.0,"Kinnear doesn't aim for our sympathy , but rat...",0.73611,positive,positive,17,17,1
214,1336,If I had been thinking about the visual medium...,223897.0,If I had been thinking about the visual medium...,0.43056,neutral,neutral,10,10,1


Unnamed: 0.1,Unnamed: 0,sentence,id,phrase,sentiment,fine,coarse,label,Predicted_Label,Flag
11,1127,You 'll gasp appalled and laugh outraged and p...,27292.0,You'll gasp appalled and laugh outraged and po...,0.66667,positive,positive,16,16,1
17,1133,The film serves as a valuable time capsule to ...,26827.0,The film serves as a valuable time capsule to ...,0.44444,neutral,neutral,10,10,1
21,1137,"It provides the grand , intelligent entertainm...",25800.0,"It provides the grand , intelligent entertainm...",0.90278,very positive,positive,21,21,1
39,1157,Here 's yet another studio horror franchise mu...,25527.0,Here's yet another studio horror franchise muc...,0.19444,very negative,negative,4,4,1
56,1174,"Ultimately feels empty and unsatisfying , like...",27101.0,"Ultimately feels empty and unsatisfying , like...",0.11111,very negative,negative,2,2,1
71,1189,A rigorously structured and exquisitely filmed...,24451.0,A rigorously structured and exquisitely filmed...,0.72222,positive,positive,17,17,1
74,1192,A quiet treasure -- a film to be savored .,24426.0,A quiet treasure -- a film to be savored .,0.93056,very positive,positive,22,22,1
133,1253,If director Michael Dowse only superficially u...,25650.0,If director Michael Dowse only superficially u...,0.38889,negative,negative,9,9,1
155,1277,"Kinnear does n't aim for our sympathy , but ra...",25901.0,"Kinnear doesn't aim for our sympathy , but rat...",0.73611,positive,positive,17,17,1
214,1336,If I had been thinking about the visual medium...,223897.0,If I had been thinking about the visual medium...,0.43056,neutral,neutral,10,10,1


## Negative Tweets

## Train Set Negative Tweets

In [110]:
# train_data[train_data['Flag']==0]

## Test Set Negative Tweets

In [111]:
test_data[test_data['Flag']==0]

Unnamed: 0.1,Unnamed: 0,sentence,id,phrase,sentiment,fine,coarse,label,Predicted_Label,Flag
0,2,Effective but too-tepid biopic,13995.0,Effective but too-tepid biopic,0.51389,neutral,neutral,12,4,0
1,3,If you sometimes like to go to the movies to h...,14123.0,If you sometimes like to go to the movies to h...,0.73611,positive,positive,17,8,0
2,4,"Emerges as something rare , an issue movie tha...",13999.0,"Emerges as something rare , an issue movie tha...",0.86111,very positive,positive,20,21,0
3,5,The film provides some great insight into the ...,14498.0,The film provides some great insight into the ...,0.59722,neutral,neutral,14,17,0
4,6,Offers that rare combination of entertainment ...,14351.0,Offers that rare combination of entertainment ...,0.83333,very positive,positive,19,13,0
...,...,...,...,...,...,...,...,...,...,...
2204,9075,I hate this movie,146071.0,I hate this movie,0.30556,negative,negative,7,18,0
2205,11620,An imaginative comedy\/thriller .,13851.0,An imaginative comedy\/thriller .,0.77778,positive,positive,18,10,0
2206,11622,"-LRB- A -RRB- rare , beautiful film .",13691.0,"-LRB- A -RRB- rare , beautiful film .",0.95833,very positive,positive,22,3,0
2207,11625,-LRB- An -RRB- hilarious romantic comedy .,,,0.95833,,,22,6,0


Unnamed: 0.1,Unnamed: 0,sentence,id,phrase,sentiment,fine,coarse,label,Predicted_Label,Flag
0,2,Effective but too-tepid biopic,13995.0,Effective but too-tepid biopic,0.51389,neutral,neutral,12,4,0
1,3,If you sometimes like to go to the movies to h...,14123.0,If you sometimes like to go to the movies to h...,0.73611,positive,positive,17,8,0
2,4,"Emerges as something rare , an issue movie tha...",13999.0,"Emerges as something rare , an issue movie tha...",0.86111,very positive,positive,20,21,0
3,5,The film provides some great insight into the ...,14498.0,The film provides some great insight into the ...,0.59722,neutral,neutral,14,17,0
4,6,Offers that rare combination of entertainment ...,14351.0,Offers that rare combination of entertainment ...,0.83333,very positive,positive,19,13,0
...,...,...,...,...,...,...,...,...,...,...
2204,9075,I hate this movie,146071.0,I hate this movie,0.30556,negative,negative,7,18,0
2205,11620,An imaginative comedy\/thriller .,13851.0,An imaginative comedy\/thriller .,0.77778,positive,positive,18,10,0
2206,11622,"-LRB- A -RRB- rare , beautiful film .",13691.0,"-LRB- A -RRB- rare , beautiful film .",0.95833,very positive,positive,22,3,0
2207,11625,-LRB- An -RRB- hilarious romantic comedy .,,,0.95833,,,22,6,0


## Dev Set Negative Tweets

In [100]:
dev_data[dev_data['Flag']==0]

Unnamed: 0.1,Unnamed: 0,sentence,id,phrase,sentiment,fine,coarse,label,Predicted_Label,Flag
0,1116,It 's a lovely film with lovely performances b...,25730.0,It's a lovely film with lovely performances by...,0.79167,positive,positive,19,4,0
1,1117,"No one goes unindicted here , which is probabl...",225163.0,"No one goes unindicted here , which is probabl...",0.51389,neutral,neutral,12,8,0
2,1118,And if you 're not nearly moved to tears by a ...,222358.0,And if you're not nearly moved to tears by a c...,0.76389,positive,positive,18,21,0
3,1119,"A warm , funny , engaging film .",24502.0,"A warm , funny , engaging film .",0.88889,very positive,positive,21,17,0
4,1120,Uses sharp humor and insight into human nature...,27115.0,Uses sharp humor and insight into human nature...,0.80556,very positive,positive,19,13,0
...,...,...,...,...,...,...,...,...,...,...
1096,7900,it seems to me the film is about the art of ri...,163906.0,it seems to me the film is about the art of ri...,0.29167,negative,negative,7,10,0
1097,7901,It 's just disappointingly superficial -- a mo...,146522.0,It's just disappointingly superficial -- a mov...,0.33333,negative,negative,7,16,0
1098,7902,The title not only describes its main characte...,149944.0,The title not only describes its main characte...,0.23611,negative,negative,5,21,0
1099,7903,Sometimes it feels as if it might have been ma...,148760.0,Sometimes it feels as if it might have been ma...,0.44444,neutral,neutral,10,7,0


In [112]:
dev_data.to_csv('/content/Dataset/final_dev.csv',index=False)
train_data.to_csv('/content/Dataset/final_train.csv',index=False)
test_data.to_csv('/content/Dataset/final_test.csv',index=False)

In [113]:
!zip Dataset.zip /content/Dataset/*

updating: content/Dataset/final_dev.csv (deflated 74%)
updating: content/Dataset/final_test.csv (deflated 74%)
updating: content/Dataset/final_train.csv (deflated 86%)
updating: content/Dataset/stanford-sentiment-treebank.dev.csv (deflated 74%)
updating: content/Dataset/stanford-sentiment-treebank.test.csv (deflated 75%)
updating: content/Dataset/stanford-sentiment-treebank.train.csv (deflated 75%)
updating: content/Dataset/stanfored_augmented.csv (deflated 91%)
updating: content/Dataset/stanfored_final_train.csv (deflated 85%)
updating: content/Dataset/stanfored_Retranslaated.csv (deflated 79%)
