## Downloading Stanford Data

In [1]:
!nvidia-smi 

Thu Jun 17 19:19:20 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   66C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# !wget http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip

In [3]:
# !unzip /content/stanfordSentimentTreebank.zip

## Processing Stanford Sentimental Data

In [4]:
import os
import sys

import pandas
def clean_data(x):
  char_dict = {
          '-LRB-' : '(',
          '-RRB-' : ')',
          '\xa0' : ' ',
          '\xc2' : '',
          '\xc3\x83\xc2\xa0' : 'a',
          'à' : 'a',
          'Â' : '',
          'â' : 'a',
          'ã' : 'a',
          'Ã¡' : 'a',
          'Ã¢' : 'a',
          'Ã£' : 'a',
          'Ã¦' : 'ae',
          'Ã§' : 'c',
          'Ã¨' : 'e',
          'Ã©' : 'e',
          'Ã­' : 'i',
          'Ã¯' : 'i',
          'Ã±' : 'n',
          'Ã³' : 'o',
          'Ã´' : 'o',
          'Ã¶' : 'o',
          'Ã»' : 'u',
          'Ã¼' : 'u',
          'æ' : 'ae',
          'ç' : 'c',
          'è' : 'e',
          'é' : 'e',
          'í' : 'i',
          'ï' : 'i',
          'ñ' : 'n',
          'ó' : 'o',
          'ô' : 'o',
          'ö' : 'o',
          'û' : 'u',
          'ü' : 'u'
      }
  for keys in char_dict.keys():
    x = x.replace(keys, char_dict[keys])
  return x

def get_phrase_sentiments(base_directory):
    def group_labels(label):
        if label in ["very negative", "negative"]:
            return "negative"
        elif label in ["positive", "very positive"]:
            return "positive"
        else:
            return "neutral"

    dictionary = pandas.read_csv(os.path.join(base_directory, "dictionary.txt"), sep="|")
    dictionary.columns = ["phrase", "id"]
    dictionary = dictionary.set_index("id")

    sentiment_labels = pandas.read_csv(os.path.join(base_directory, "sentiment_labels.txt"), sep="|")
    sentiment_labels.columns = ["id", "sentiment"]
    sentiment_labels = sentiment_labels.set_index("id")

    phrase_sentiments = dictionary.join(sentiment_labels)

    phrase_sentiments["fine"] = pandas.cut(phrase_sentiments.sentiment, [0, 0.2, 0.4, 0.6, 0.8, 1.0],
                                           include_lowest=True,
                                           labels=["very negative", "negative", "neutral", "positive", "very positive"])
    phrase_sentiments["coarse"] = phrase_sentiments.fine.apply(group_labels)
    return phrase_sentiments


def get_sentence_partitions(base_directory):
    sentences = pandas.read_csv(os.path.join(base_directory, "datasetSentences.txt"), index_col="sentence_index",
                                sep="\t")
    splits = pandas.read_csv(os.path.join(base_directory, "datasetSplit.txt"), index_col="sentence_index")
    return sentences.join(splits)


def partition(base_directory):
    phrase_sentiments = get_phrase_sentiments(base_directory).reset_index(level=0)
    sentence_partitions = get_sentence_partitions(base_directory)
    # noinspection PyUnresolvedReferences
    phrase_sentiments['phrase'] = phrase_sentiments['phrase'].apply(lambda x : clean_data(x))
    sentence_partitions['sentence'] = sentence_partitions['sentence'].apply(lambda x : clean_data(x))
    data = pandas.merge(sentence_partitions, phrase_sentiments, right_on="phrase", left_on="sentence", how='left')
    data["splitset_label"] = data["splitset_label"].fillna(1).astype(int)
    data["sentence"] = data["sentence"].str.replace(r"\s('s|'d|'re|'ll|'m|'ve|n't)\b", lambda m: m.group(1))
    return data.groupby("splitset_label")

In [5]:
base_directory, output_directory = '/content/stanfordSentimentTreebank','/content/Dataset/';
os.makedirs(output_directory, exist_ok=True)
for splitset, partition in partition(base_directory):
    split_name = {1: "train", 2: "test", 3: "dev"}[splitset]
    filename = os.path.join(output_directory, "stanford-sentiment-treebank.%s.csv" % split_name)
    del partition["splitset_label"]
    partition.to_csv(filename)

## Reading the refined CSV's

In [6]:
import pandas as pd
train_data = pd.read_csv('/content/Dataset/stanford-sentiment-treebank.train.csv')
test_data = pd.read_csv('/content/Dataset/stanford-sentiment-treebank.test.csv')
dev_data = pd.read_csv('/content/Dataset/stanford-sentiment-treebank.dev.csv')

## Shape of the Data

In [7]:
print(train_data.shape)
print(test_data.shape)
print(dev_data.shape)

(8544, 7)
(2217, 7)
(1101, 7)


## Checking the Labels

In [8]:
print(train_data.fine.value_counts())
print(test_data.fine.value_counts())
print(dev_data.fine.value_counts())

positive         2321
negative         2218
neutral          1623
very positive    1287
very negative    1092
Name: fine, dtype: int64
negative         633
positive         515
very positive    400
neutral          390
very negative    279
Name: fine, dtype: int64
negative         289
positive         279
neutral          229
very positive    164
very negative    139
Name: fine, dtype: int64


Upscale the labels to fit into 25 classes of sentiments according to the paper. Currently the maximum Categories are 82.

## Analysis of Null values and interpolate them.

In [9]:
print("The Total null values in Train Data:- ",train_data['fine'].isnull().sum())
print("The Total null values in Test Data:- ",test_data['fine'].isnull().sum())
print("The Total null values in Dev Data:- ",dev_data['fine'].isnull().sum())

The Total null values in Train Data:-  3
The Total null values in Test Data:-  0
The Total null values in Dev Data:-  1


In [10]:
train_data.dropna(subset=['fine'], inplace=True)
# test_data.sentiment.interpolate(method ='linear', limit_direction ='forward',inplace=True)
dev_data.dropna(subset=['fine'], inplace=True)

In [11]:
print("The Total null values in Train Data:- ",train_data['fine'].isnull().sum())
print("The Total null values in Test Data:- ",test_data['fine'].isnull().sum())
print("The Total null values in Dev Data:- ",dev_data['fine'].isnull().sum())

The Total null values in Train Data:-  0
The Total null values in Test Data:-  0
The Total null values in Dev Data:-  0


## Appending Data

In [12]:
train_data = train_data.append(test_data, ignore_index=True)
train_data = train_data.append(dev_data, ignore_index=True)

In [13]:
train_data.to_csv('/content/Dataset/stanfored_final_data.csv')

## TorchText

In [14]:
# Import Library
import random
import torch, torchtext
from torchtext.legacy import data 

# Manual Seed
SEED = 43
torch.manual_seed(SEED)

<torch._C.Generator at 0x7faa0fd512b0>

In [15]:
Sentence = data.Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)
Label = data.LabelField(tokenize ='spacy', is_target=True, batch_first =True, sequential =False)

In [16]:
fields = [('sentence', Sentence),('label',Label)]

In [17]:
train_example = [data.Example.fromlist([train_data.sentence[i],train_data.fine[i]], fields) for i in range(train_data.shape[0])]

In [18]:
# Creating dataset

train_dataset = data.Dataset(train_example, fields)

In [19]:
vars(train_dataset.examples[10])

{'label': 'very positive',
 'sentence': ['Good',
  'fun',
  ',',
  'good',
  'action',
  ',',
  'good',
  'acting',
  ',',
  'good',
  'dialogue',
  ',',
  'good',
  'pace',
  ',',
  'good',
  'cinematography',
  '.']}

In [20]:
(train, test) = train_dataset.split(split_ratio=[80, 20], random_state = random.seed(SEED))

In [21]:
len(train), len(test)

(9486, 2372)

In [22]:
# MAX_VOCAB_SIZE = 25000
Sentence.build_vocab(train)
Label.build_vocab(train)

In [23]:
print('Size of input vocab : ', len(Sentence.vocab))
print('Size of label vocab : ', len(Label.vocab))
print('Top 10 words appreared repeatedly :', list(Sentence.vocab.freqs.most_common(10)))
print('Labels : ', Label.vocab.stoi)

Size of input vocab :  18173
Size of label vocab :  5
Top 10 words appreared repeatedly : [('.', 8951), (',', 8010), ('the', 6734), ('and', 4976), ('a', 4895), ('of', 4891), ('to', 3421), ('-', 3088), ("'s", 2881), ('is', 2867)]
Labels :  defaultdict(None, {'negative': 0, 'positive': 1, 'neutral': 2, 'very positive': 3, 'very negative': 4})


In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [25]:
train_iterator, test_iterator = data.BucketIterator.splits((train, test), batch_size = 32, 
                                                            sort_key = lambda x: len(x.sentence),
                                                            sort_within_batch=True, device = device)

In [26]:
import os, pickle
with open('tokenizer.pkl', 'wb') as tokens: 
    pickle.dump({'input_stoi':Sentence.vocab.stoi, 'label_itos':Label.vocab.itos}, tokens)

In [27]:
import torch.nn as nn
import torch.nn.functional as F

class classifier(nn.Module):
    
    # Define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout, pad_idx):
        
        super().__init__()          
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        # LSTM layer
        self.encoder = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           dropout=dropout,
                           batch_first=True,
                           bidirectional=True)
        # try using nn.GRU or nn.RNN here and compare their performances
        # try bidirectional and compare their performances
        self.projection = nn.Sequential(nn.Linear(2 * hidden_dim,hidden_dim),nn.BatchNorm1d(hidden_dim),nn.ReLU(), nn.Dropout(dropout)) 
        # Dense layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text, text_lengths):
        
        # text = [batch size, sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
      
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True)
        
        packed_output, (hidden, cell) = self.encoder(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]

        projection = self.projection(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        # Hidden = [batch size, hid dim * num directions]
        dense_outputs = self.fc(projection)   
        
        # Final activation function softmax
        output = F.softmax(dense_outputs, dim=1)
            
        return output

In [28]:
# Define hyperparameters
size_of_vocab = len(Sentence.vocab)
embedding_dim = 100
num_hidden_nodes = 100
num_output_nodes = len(Label.vocab)
num_layers = 2
dropout = 0.5
PAD_IDX = Sentence.vocab.stoi[Sentence.pad_token]

# Instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers, dropout, PAD_IDX)

In [29]:
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

classifier(
  (embedding): Embedding(18173, 100, padding_idx=1)
  (encoder): LSTM(100, 100, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (projection): Sequential(
    (0): Linear(in_features=200, out_features=100, bias=True)
    (1): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.5, inplace=False)
  )
  (fc): Linear(in_features=100, out_features=5, bias=True)
)
The model has 2,241,305 trainable parameters


In [30]:
import torch.optim as optim

# define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=5e-4)
criterion = nn.CrossEntropyLoss()

# define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    _, predictions = torch.max(preds, 1)
    correct = (predictions == y).float()
    acc = correct.sum() / len(correct)
    return acc
    
# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [31]:
def train(model, iterator, optimizer, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        # resets the gradients after every batch
        optimizer.zero_grad()   
        
        # retrieve text and no. of words
        sentence, sentence_lengths = batch.sentence   
        
        # convert to 1D tensor
        predictions = model(sentence, sentence_lengths).squeeze()  
        
        # print(predictions)
        # compute the loss
        loss = criterion(predictions, batch.label)        
        
        # compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)   
        
        # backpropage the loss and compute the gradients
        loss.backward()       
        
        # update the weights
        optimizer.step()      
        
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [32]:
def evaluate(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and no. of words
            sentence, sentence_lengths = batch.sentence
            
            # convert to 1d tensor
            predictions = model(sentence, sentence_lengths).squeeze()
            
            # compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [33]:
N_EPOCHS = 20
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    # dev_loss, dev_acc = evaluate(model, dev_iterator, criterion)
    
    # save the best model
    if test_loss < best_valid_loss:
        best_valid_loss = test_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}% \n')
    # print(f'\t Dev Loss: {dev_loss:.3f} |  Dev Acc: {dev_acc*100:.2f}% \n')

	Train Loss: 1.579 | Train Acc: 28.53%
	 Test Loss: 1.561 |  Test Acc: 31.62% 

	Train Loss: 1.543 | Train Acc: 34.01%
	 Test Loss: 1.545 |  Test Acc: 33.38% 

	Train Loss: 1.502 | Train Acc: 38.36%
	 Test Loss: 1.540 |  Test Acc: 33.92% 

	Train Loss: 1.463 | Train Acc: 42.87%
	 Test Loss: 1.525 |  Test Acc: 36.04% 

	Train Loss: 1.420 | Train Acc: 48.17%
	 Test Loss: 1.528 |  Test Acc: 34.92% 

	Train Loss: 1.375 | Train Acc: 52.56%
	 Test Loss: 1.530 |  Test Acc: 35.00% 

	Train Loss: 1.341 | Train Acc: 56.16%
	 Test Loss: 1.538 |  Test Acc: 34.46% 

	Train Loss: 1.303 | Train Acc: 60.01%
	 Test Loss: 1.527 |  Test Acc: 35.58% 

	Train Loss: 1.277 | Train Acc: 62.69%
	 Test Loss: 1.529 |  Test Acc: 35.75% 

	Train Loss: 1.257 | Train Acc: 64.65%
	 Test Loss: 1.536 |  Test Acc: 34.83% 

	Train Loss: 1.233 | Train Acc: 66.84%
	 Test Loss: 1.531 |  Test Acc: 35.83% 

	Train Loss: 1.216 | Train Acc: 68.75%
	 Test Loss: 1.541 |  Test Acc: 34.71% 

	Train Loss: 1.207 | Train Acc: 69.44%
	

In [34]:
# torch.save(model.state_dict(), 'last_saved_weights.pt')

In [35]:
path='./saved_weights.pt'
model.load_state_dict(torch.load(path));
model.eval();
with open('./tokenizer.pkl', 'rb') as f:
  meta_data = pickle.load(f)
tokenizer = meta_data['input_stoi']
categories = meta_data['label_itos']

#inference 

import spacy
nlp = spacy.load('en')

def classify_tweet(tweet):
    
    #categories = {0: "Negative", 1:"Positive", 2:"Neutral"}
    
    # tokenize the tweet 
    tokenized = [tok.text for tok in nlp.tokenizer(tweet)] 
    # convert to integer sequence using predefined tokenizer dictionary
    indexed = [tokenizer[t] for t in tokenized]        
    # compute no. of words        
    length = [len(indexed)]
    # convert to tensor                                    
    tensor = torch.LongTensor(indexed).to(device)   
    # reshape in form of batch, no. of words           
    tensor = tensor.unsqueeze(1).T  
    # convert to tensor                          
    length_tensor = torch.LongTensor(length)
    # Get the model prediction                  
    prediction = model(tensor, length_tensor)

    _, pred = torch.max(prediction, 1) 
    
    return categories[pred.item()]
    # return pred.item()

## Setimental Analysis of Train, Test and Dev Data

In [36]:
train_data['Predicted_Label'] = train_data['sentence'].apply(classify_tweet)

In [37]:
# test_data['Predicted_Label'] = test_data['sentence'].apply(classify_tweet)

In [38]:
# dev_data['Predicted_Label'] = test_data['sentence'].apply(classify_tweet)

## Postive Tweets

In [39]:
import numpy as np
train_data['Flag'] = np.where(train_data['Predicted_Label']==train_data['fine'], 1, 0)
# test_data['Flag'] = np.where(test_data['Predicted_Label']==test_data['label'], 1, 0)
# dev_data['Flag'] = np.where(dev_data['Predicted_Label']==dev_data['label'], 1, 0)

## Train Postive Tweets

In [40]:
train_data[train_data['Flag']==1]

Unnamed: 0.1,Unnamed: 0,sentence,id,phrase,sentiment,fine,coarse,Predicted_Label,Flag
2,63,Singer\/composer Bryan Adams contributes a sle...,225801.0,Singer\/composer Bryan Adams contributes a sle...,0.62500,positive,positive,positive,1
3,64,You'd think by now America would have had enou...,14646.0,You 'd think by now America would have had eno...,0.50000,neutral,neutral,neutral,1
4,65,Yet the act is still charming here .,14644.0,Yet the act is still charming here .,0.72222,positive,positive,positive,1
7,74,Part of the charm of Satin Rouge is that it av...,225402.0,Part of the charm of Satin Rouge is that it av...,0.72222,positive,positive,positive,1
14,138,"Still , this flick is fun , and host to some t...",225973.0,"Still , this flick is fun , and host to some t...",0.81944,very positive,positive,very positive,1
...,...,...,...,...,...,...,...,...,...
11848,7901,But it could have been worse .,222770.0,But it could have been worse .,0.36111,negative,negative,negative,1
11849,7902,"Some of their jokes work , but most fail miser...",148746.0,"Some of their jokes work , but most fail miser...",0.20833,negative,negative,negative,1
11852,7905,... Designed to provide a mix of smiles and te...,221766.0,... Designed to provide a mix of smiles and te...,0.22222,negative,negative,negative,1
11853,7906,it seems to me the film is about the art of ri...,163906.0,it seems to me the film is about the art of ri...,0.29167,negative,negative,negative,1


## Train Set Negative Tweets

In [41]:
train_data[train_data['Flag']==0]

Unnamed: 0.1,Unnamed: 0,sentence,id,phrase,sentiment,fine,coarse,Predicted_Label,Flag
0,0,The Rock is destined to be the 21st Century's ...,226166.0,The Rock is destined to be the 21st Century 's...,0.69444,positive,positive,negative,0
1,1,The gorgeously elaborate continuation of `` Th...,226300.0,The gorgeously elaborate continuation of `` Th...,0.83333,very positive,positive,negative,0
5,66,Whether or not you're enlightened by any of De...,227114.0,Whether or not you 're enlightened by any of D...,0.83333,very positive,positive,positive,0
6,70,Just the labour involved in creating the layer...,224508.0,Just the labour involved in creating the layer...,0.87500,very positive,positive,positive,0
8,84,a screenplay more ingeniously constructed than...,228134.0,a screenplay more ingeniously constructed than...,0.83333,very positive,positive,negative,0
...,...,...,...,...,...,...,...,...,...
11850,7903,Even horror fans will most likely not find wha...,145161.0,Even horror fans will most likely not find wha...,0.12500,very negative,negative,positive,0
11851,7904,comes off like a rejected ABC Afterschool Spec...,229921.0,comes off like a rejected ABC Afterschool Spec...,0.16667,very negative,negative,negative,0
11854,7907,It's just disappointingly superficial -- a mov...,146522.0,It 's just disappointingly superficial -- a mo...,0.33333,negative,negative,positive,0
11855,7908,The title not only describes its main characte...,149944.0,The title not only describes its main characte...,0.23611,negative,negative,positive,0


In [42]:
# dev_data.to_csv('/content/Dataset/final_dev.csv',index=False)
train_data.to_csv('/content/Dataset/final_train.csv',index=False)
# test_data.to_csv('/content/Dataset/final_test.csv',index=False)

In [43]:
!mv /content/tokenizer.pkl /content/Dataset/
!mv /content/saved_weights.pt /content/Dataset/

In [44]:
!zip Dataset.zip /content/Dataset/*

updating: content/Dataset/final_train.csv (deflated 76%)
updating: content/Dataset/saved_weights.pt (deflated 8%)
updating: content/Dataset/stanford-sentiment-treebank.dev.csv (deflated 75%)
updating: content/Dataset/stanford-sentiment-treebank.test.csv (deflated 75%)
updating: content/Dataset/stanford-sentiment-treebank.train.csv (deflated 75%)
updating: content/Dataset/stanfored_final_data.csv (deflated 74%)
updating: content/Dataset/tokenizer.pkl (deflated 48%)
