<a href="https://colab.research.google.com/github/nikshrimali/TSAI_END/blob/main/S7_HandsOn/Sentiment_Analysis_LSTM_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Sentiment analysis using LSTM Model

import os
import sys

import pandas
import random
import torch, torchtext
from torchtext import data 

import spacy
spacy.load("en")

# Manual Seed
SEED = 43
torch.manual_seed(SEED)

# Selecting the device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [None]:
!wget nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip

--2020-12-23 01:34:45--  http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip [following]
--2020-12-23 01:34:45--  https://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6372817 (6.1M) [application/zip]
Saving to: ‘stanfordSentimentTreebank.zip.2’


2020-12-23 01:34:50 (1.17 MB/s) - ‘stanfordSentimentTreebank.zip.2’ saved [6372817/6372817]



In [None]:
!unzip /content/stanfordSentimentTreebank.zip

Archive:  /content/stanfordSentimentTreebank.zip
replace stanfordSentimentTreebank/datasetSentences.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [None]:

# Put all the Stanford Sentiment Treebank phrase data into test, training, and dev CSVs.

def get_phrase_sentiments(base_directory):
    def group_labels(label):
        if label in ["very negative", "negative"]:
            return "negative"
        elif label in ["positive", "very positive"]:
            return "positive"
        else:
            return "neutral"

    dictionary = pandas.read_csv(os.path.join(base_directory, "dictionary.txt"), sep="|")
    dictionary.columns = ["phrase", "id"]
    dictionary = dictionary.set_index("id")

    sentiment_labels = pandas.read_csv(os.path.join(base_directory, "sentiment_labels.txt"), sep="|")
    sentiment_labels.columns = ["id", "sentiment"]
    sentiment_labels = sentiment_labels.set_index("id")

    phrase_sentiments = dictionary.join(sentiment_labels)

    phrase_sentiments["fine"] = pandas.cut(phrase_sentiments.sentiment, [0, 0.2, 0.4, 0.6, 0.8, 1.0],
                                           include_lowest=True,
                                           labels=["very negative", "negative", "neutral", "positive", "very positive"])
    phrase_sentiments["coarse"] = phrase_sentiments.fine.apply(group_labels)
    return phrase_sentiments


def get_sentence_partitions(base_directory):
    sentences = pandas.read_csv(os.path.join(base_directory, "datasetSentences.txt"), index_col="sentence_index",
                                sep="\t")
    splits = pandas.read_csv(os.path.join(base_directory, "datasetSplit.txt"), index_col="sentence_index")
    return sentences.join(splits).set_index("sentence")


def partition(base_directory):
    phrase_sentiments = get_phrase_sentiments(base_directory)
    sentence_partitions = get_sentence_partitions(base_directory)
    # noinspection PyUnresolvedReferences
    data = phrase_sentiments.join(sentence_partitions, on="phrase")
    data["splitset_label"] = data["splitset_label"].fillna(1).astype(int)
    data["phrase"] = data["phrase"].str.replace(r"\s('s|'d|'re|'ll|'m|'ve|n't)\b", lambda m: m.group(1))
    return data.groupby("splitset_label")


base_directory = r'/content/stanfordSentimentTreebank'
output_directory = r'/content/output'
os.makedirs(output_directory, exist_ok=True)
for splitset, partition in partition(base_directory):
    split_name = {1: "train", 2: "test", 3: "dev"}[splitset]
    filename = os.path.join(output_directory, "stanford-sentiment-treebank.%s.csv" % split_name)
    del partition["splitset_label"]
    partition.to_csv(filename)

In [None]:
valid = pandas.read_csv(r'/content/output/stanford-sentiment-treebank.dev.csv')
test = pandas.read_csv(r'/content/output/stanford-sentiment-treebank.test.csv')
train = pandas.read_csv(r'/content/output/stanford-sentiment-treebank.train.csv')

In [None]:
print(train.shape)
train.head(10)

(236076, 5)


Unnamed: 0,id,phrase,sentiment,fine,coarse
0,22935,! ',0.52778,neutral,neutral
1,18235,! '',0.5,neutral,neutral
2,179257,! Alas,0.44444,neutral,neutral
3,22936,! Brilliant,0.86111,very positive,positive
4,40532,! Brilliant !,0.93056,very positive,positive
5,22937,! Brilliant ! ',1.0,very positive,positive
6,60624,! C'mon,0.47222,neutral,neutral
7,13402,! Gollum's ` performance ' is incredible,0.76389,positive,positive
8,179258,"! Oh , look at that clever angle ! Wow , a jum...",0.27778,negative,negative
9,140882,! Romething,0.5,neutral,neutral


In [None]:
# Removing stop words and numbers

import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from string import punctuation 
from nltk.corpus import stopwords
import string

stopwords = set(stopwords.words('english') + ['AT_USER','URL'])
    
def processTweet(tweet):
    # tweet is the text we will pass for preprocessing
    # convert passed tweet to lower case 
    tweet = str(tweet).lower()
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet) # remove URLs
    tweet = re.sub('@[^\s]+', 'AT_USER', tweet) # remove usernames
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) # remove the # in #hashtag
    tweet = re.sub(r'#([\s]+)', r'\1', tweet) # remove the # space
    tweet = tweet.replace("'", "")
    tweet = re.sub('[^A-Za-z0-9]+', ' ', tweet) # remove # and numbers
    # use work_tokenize imported above to tokenize the tweet
    tweet =  word_tokenize(tweet)
    return [word for word in tweet if word not in stopwords or word not in list(punctuation)]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# processed=[]

# for phrase in train['phrase']:
#     # process all tweets using processTweet function above - store in variable 'cleaned' 
#     cleaned=processTweet(phrase)
#     processed.append(' '.join(cleaned))
    

In [None]:
train = train[train['phrase'].map(len) > 4]

In [None]:
train.head(5)

Unnamed: 0,id,phrase,sentiment,fine,coarse
2,179257,! Alas,0.44444,neutral,neutral
3,22936,! Brilliant,0.86111,very positive,positive
4,40532,! Brilliant !,0.93056,very positive,positive
5,22937,! Brilliant ! ',1.0,very positive,positive
6,60624,! C'mon,0.47222,neutral,neutral


In [None]:
train.sample(5)

Unnamed: 0,id,phrase,sentiment,fine,coarse
56820,189697,While the new film is much more eye-catching t...,0.56944,neutral,neutral
114163,41956,fresh and,0.91667,very positive,positive
206205,96298,the ongoing - and unprecedented - construction...,0.56944,neutral,neutral
152802,206073,message-movie,0.5,neutral,neutral
5388,141819,( T ) hose same extremes prevent us from takin...,0.375,negative,negative


In [None]:
def fine_to_label(fine):
  if fine == 'very negative':
    return 0
  elif fine == 'negative':
    return 1
  elif fine == 'neutral':
    return 2
  elif fine == 'positive':
    return 3
  elif fine == 'very positive':
    return 4

train['label'] = train.apply(lambda row: fine_to_label(row['fine']), axis=1)

In [None]:
!pip install google_trans_new



# Data Augmentations

# Back Translations

Translate text to a langauge to again translate back to english

In [None]:
import random
import google_trans_new
from google_trans_new import google_translator

def backTranslation(sentence):
  translator = google_translator()
  
  available_langs = list(google_trans_new.LANGUAGES.keys()) 
  trans_lang = random.choice(available_langs) 
  
  translations = translator.translate(sentence, lang_tgt=trans_lang) 

  translations_en_random = translator.translate(translations, lang_src=trans_lang, lang_tgt='en') 
  return translations_en_random

## Random Deletion

Randomly deletes words from sentences given a probability parameter 'p'

In [None]:
def random_deletion(words, p=0.2):
    
    if len(words) == 1: # return if single word
        return words
    
    remaining = list(filter(lambda x: random.uniform(0,1) > p,words)) 
    if len(remaining) == 0: # if not left, sample a random word
        return [random.choice(words)] 
    else:
        return remaining

In [None]:
sent = ['I', 'am a lucky guy']
print(random_deletion(sent))

['I', 'am a lucky guy']


## Random Swap

Takes sentence and then swaps words within it n times

In [None]:
def random_swap(sentence, n=5): 
    length = range(len(sentence)) 
    for _ in range(n):
        idx1, idx2 = random.sample(length, 2)
        sentence[idx1], sentence[idx2] = sentence[idx2], sentence[idx1]
    return sentence

In [None]:
def augment_data(text):
    # Takes a random no and applies augmentation based upon that
    aug = random.randint(0,9)
    text = processTweet(text)

    if len(text) > 3:
        if aug == 1:
            text = random_swap(text)
            # print('random swap', text)
        elif aug == 2:
            text = random_deletion(text)
            # print('Random deletion', text)
        elif aug == 3:
            text = backTranslation(text)
        #     # print('back translation', text)
    
    return text

In [None]:
train.head(10)

Unnamed: 0,id,phrase,sentiment,fine,coarse,label
2,179257,! Alas,0.44444,neutral,neutral,2
3,22936,! Brilliant,0.86111,very positive,positive,4
4,40532,! Brilliant !,0.93056,very positive,positive,4
5,22937,! Brilliant ! ',1.0,very positive,positive,4
6,60624,! C'mon,0.47222,neutral,neutral,2
7,13402,! Gollum's ` performance ' is incredible,0.76389,positive,positive,3
8,179258,"! Oh , look at that clever angle ! Wow , a jum...",0.27778,negative,negative,1
9,140882,! Romething,0.5,neutral,neutral,2
10,179259,! Run,0.43056,neutral,neutral,2
11,60625,! The Movie,0.5,neutral,neutral,2


In [None]:
train = train[:30000]

In [None]:
train

Unnamed: 0,id,phrase,sentiment,fine,coarse,label
2,179257,! Alas,0.44444,neutral,neutral,2
3,22936,! Brilliant,0.86111,very positive,positive,4
4,40532,! Brilliant !,0.93056,very positive,positive,4
5,22937,! Brilliant ! ',1.00000,very positive,positive,4
6,60624,! C'mon,0.47222,neutral,neutral,2
...,...,...,...,...,...,...
30683,184621,I can't remember the last time I saw an audien...,0.26389,negative,negative,1
30684,184622,I can't remember the last time I saw an audien...,0.44444,neutral,neutral,2
30685,106422,I can't remember the last time I saw worse stu...,0.36111,negative,negative,1
30686,184623,I can't say for sure,0.44444,neutral,neutral,2


In [None]:
train.shape

(30000, 6)

In [None]:
import time
from tqdm import tqdm
time_start = time.time()
for i, sent in enumerate(train.phrase):
    train.phrase[i] = ' '.join(augment_data(sent))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [None]:
train.sample(20)

Unnamed: 0,id,phrase,sentiment,fine,coarse,label
3051,179802,s kind of,0.33333,negative,negative,1
28916,106105,here on earth,0.27778,negative,negative,1
2018,179607,"[ ' s ' , ' V ' , ' A ' , ' F a i l u r ...",0.41667,neutral,neutral,2
19437,104511,audacious impossible,0.5,neutral,neutral,2
782,60709,ll only put you to sleep,0.77778,positive,positive,3
3234,101754,s much too big for its britches,0.55556,neutral,neutral,2
3215,179842,most film thought s provoking,0.40278,neutral,neutral,2
4064,43318,s some centered go along with all the weird stuff,0.70833,positive,positive,3
6216,180377,i could feel my eyelids getting very heavy,0.18056,very negative,negative,0
11009,62409,while at the same time being a most touching r...,0.56944,neutral,neutral,2


In [None]:
# train = train.drop(columns=['id','sentiment', 'fine', 'coarse'])
# train.reset_index(drop=True, inplace=True)

# Loading the data into dataloader
# Import Library

Phrase = data.Field(tokenize='spacy',sequential = True, batch_first =True, include_lengths=True)
Label = data.LabelField(tokenize = 'spacy', is_target=True, batch_first =True, sequential =False)

Phrase.build_vocab(train)
Label.build_vocab(train)

fields = [('Phrase', Phrase),('Label',Label)]

In [None]:
vars(Phrase.vocab)

{'freqs': Counter({'a': 2,
          'b': 1,
          'e': 2,
          'h': 1,
          'l': 2,
          'p': 1,
          'r': 1,
          's': 1}),
 'itos': ['<unk>', '<pad>', 'a', 'e', 'l', 'b', 'h', 'p', 'r', 's'],
 'stoi': defaultdict(<function torchtext.vocab._default_unk_index>,
             {'<pad>': 1,
              '<unk>': 0,
              'a': 2,
              'b': 5,
              'e': 3,
              'h': 6,
              'l': 4,
              'p': 7,
              'r': 8,
              's': 9}),
 'vectors': None}

In [None]:
example = [data.Example.fromlist([train.phrase[i],train.label[i]], fields) for i in range(train.shape[0])]

In [None]:
vars(Phrase.vocab)

{'freqs': Counter({'a': 2,
          'b': 1,
          'e': 2,
          'h': 1,
          'l': 2,
          'p': 1,
          'r': 1,
          's': 1}),
 'itos': ['<unk>', '<pad>', 'a', 'e', 'l', 'b', 'h', 'p', 'r', 's'],
 'stoi': defaultdict(<function torchtext.vocab._default_unk_index>,
             {'<pad>': 1,
              '<unk>': 0,
              'a': 2,
              'b': 5,
              'e': 3,
              'h': 6,
              'l': 4,
              'p': 7,
              'r': 8,
              's': 9}),
 'vectors': None}

In [None]:
print('Size of input vocab : ', len(Phrase.vocab))
print('Size of label vocab : ', len(Label.vocab))
print('Top 10 words appreared repeatedly :', list(Phrase.vocab.freqs.most_common(10)))
print('Labels : ', Label.vocab.stoi)

In [None]:
train_iterator, valid_iterator = data.BucketIterator.splits((train, valid), batch_size = 32, 
                                                            sort_key = lambda x: len(x.tweets),
                                                            sort_within_batch=True, device = device)

In [None]:
# import os, pickle
# with open('tokenizer.pkl', 'wb') as tokens: 
#     pickle.dump(Tweet.vocab.stoi, tokens)

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class classifier(nn.Module):
    
    # Define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        
        super().__init__()          
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer
        self.encoder = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           dropout=dropout,
                           batch_first=True)
        # try using nn.GRU or nn.RNN here and compare their performances
        # try bidirectional and compare their performances
        
        # Dense layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text, text_lengths):
        
        # text = [batch size, sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
      
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True)
        
        packed_output, (hidden, cell) = self.encoder(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
    
        # Hidden = [batch size, hid dim * num directions]
        dense_outputs = self.fc(hidden)   
        
        # Final activation function softmax
        output = F.softmax(dense_outputs[0], dim=1)
            
        return output

In [None]:
# Define hyperparameters
size_of_vocab = len(Tweet.vocab)
embedding_dim = 300
num_hidden_nodes = 100
num_output_nodes = 3
num_layers = 2
dropout = 0.2

# Instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers, dropout = dropout)

In [None]:
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

In [None]:
import torch.optim as optim

# define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()

# define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    _, predictions = torch.max(preds, 1)
    
    correct = (predictions == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def train(model, iterator, optimizer, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        # resets the gradients after every batch
        optimizer.zero_grad()   
        
        # retrieve text and no. of words
        tweet, tweet_lengths = batch.tweets   
        
        # convert to 1D tensor
        predictions = model(tweet, tweet_lengths).squeeze()  
        
        # compute the loss
        loss = criterion(predictions, batch.labels)        
        
        # compute the binary accuracy
        acc = binary_accuracy(predictions, batch.labels)   
        
        # backpropage the loss and compute the gradients
        loss.backward()       
        
        # update the weights
        optimizer.step()      
        
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and no. of words
            tweet, tweet_lengths = batch.tweets
            
            # convert to 1d tensor
            predictions = model(tweet, tweet_lengths).squeeze()
            
            # compute loss and accuracy
            loss = criterion(predictions, batch.labels)
            acc = binary_accuracy(predictions, batch.labels)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% \n')

In [None]:
#load weights and tokenizer

path='./saved_weights.pt'
model.load_state_dict(torch.load(path));
model.eval();
tokenizer_file = open('./tokenizer.pkl', 'rb')
tokenizer = pickle.load(tokenizer_file)

#inference 

import spacy
nlp = spacy.load('en')

def classify_tweet(tweet):
    
    categories = {0: "Negative", 1:"Positive", 2:"Neutral"}
    
    # tokenize the tweet 
    tokenized = [tok.text for tok in nlp.tokenizer(tweet)] 
    # convert to integer sequence using predefined tokenizer dictionary
    indexed = [tokenizer[t] for t in tokenized]        
    # compute no. of words        
    length = [len(indexed)]
    # convert to tensor                                    
    tensor = torch.LongTensor(indexed).to(device)   
    # reshape in form of batch, no. of words           
    tensor = tensor.unsqueeze(1).T  
    # convert to tensor                          
    length_tensor = torch.LongTensor(length)
    # Get the model prediction                  
    prediction = model(tensor, length_tensor)

    _, pred = torch.max(prediction, 1) 
    
    return categories[pred.item()]

In [None]:
def random_insertion(sentence, n): 
    words = remove_stopwords(sentence) 
    for _ in range(n):
        new_synonym = get_synonyms(random.choice(words))
        sentence.insert(randrange(len(sentence)+1), new_synonym) 
    return sentence

In [None]:
def random_deletion(words, p=0.5): 
    if len(words) == 1: # return if single word
        return words
    remaining = list(filter(lambda x: random.uniform(0,1) > p,words)) 
    if len(remaining) == 0: # if not left, sample a random word
        return [random.choice(words)] 
    else:
        return remaining

In [None]:
def random_swap(sentence, n=5): 
    length = range(len(sentence)) 
    for _ in range(n):
        idx1, idx2 = random.sample(length, 2)
        sentence[idx1], sentence[idx2] = sentence[idx2], sentence[idx1] 
    return sentence