In [1]:
import pandas as pd

sentences = pd.read_csv("data/stanfordSentimentTreebank/datasetSentences.txt", sep="	")

dataset_split = pd.read_csv("data/stanfordSentimentTreebank/datasetSplit.txt")
split_data = sentences.merge(dataset_split, on="sentence_index", how="left")

split_data.columns = ["sentence_index", "phrase", "splitset_label"]

split_data.head(5)

Unnamed: 0,sentence_index,phrase,splitset_label
0,1,The Rock is destined to be the 21st Century 's...,1
1,2,The gorgeously elaborate continuation of `` Th...,1
2,3,Effective but too-tepid biopic,2
3,4,If you sometimes like to go to the movies to h...,2
4,5,"Emerges as something rare , an issue movie tha...",2


In [2]:
dictionary = pd.read_csv("data/stanfordSentimentTreebank/dictionary.txt", sep="|")
dictionary.columns = ["phrase", "phrase ids"]
# dictionary = dictionary.set_index("phrase ids")

sentiment_labels = pd.read_csv("data/stanfordSentimentTreebank/sentiment_labels.txt", sep="|")

sentiment_labels["overall sentiment"] = pd.cut(sentiment_labels["sentiment values"], [0, 0.2, 0.4, 0.6, 0.8, 1.0], include_lowest=True ,labels=["very negative", "negative", "neutral", "positive", "very positive"])

sentiment_labels["sentiment_value"] = pd.cut(sentiment_labels["sentiment values"], [0, 0.2, 0.4, 0.6, 0.8, 1.0], include_lowest=True ,labels=[0, 1, 2, 3, 4])

phrase_sentiments = dictionary.merge(sentiment_labels, on="phrase ids")

phrase_sentiments.head(5)

Unnamed: 0,phrase,phrase ids,sentiment values,overall sentiment,sentiment_value
0,! ',22935,0.52778,neutral,2
1,! '',18235,0.5,neutral,2
2,! Alas,179257,0.44444,neutral,2
3,! Brilliant,22936,0.86111,very positive,4
4,! Brilliant !,40532,0.93056,very positive,4


In [3]:
main_dataframe = phrase_sentiments.merge(split_data, on="phrase")
main_dataframe["splitset_label"] = main_dataframe["splitset_label"].fillna(1).astype(int)
main_dataframe = main_dataframe[main_dataframe.phrase.map(len)>4]

main_dataframe = main_dataframe.groupby("splitset_label")

main_dataframe.head(5)

Unnamed: 0,phrase,phrase ids,sentiment values,overall sentiment,sentiment_value,sentence_index,splitset_label
0,", The Sum of All Fears is simply a well-made a...",102340,0.88889,very positive,4,4860,1
1,", `` They 're out there ! ''",221244,0.61111,positive,3,7251,1
2,", is a temporal inquiry that shoulders its phi...",221388,0.69444,positive,3,5477,1
3,- I also wanted a little alien as a friend !,221714,0.69444,positive,3,5576,1
4,"- West Coast rap wars , this modern mob music ...",221716,0.76389,positive,3,2338,1
13,"-LRB- A -RRB- rare , beautiful film .",13691,0.95833,very positive,4,11623,2
14,-LRB- Drumline -RRB- is entertaining for what ...,13695,0.73611,positive,3,224,2
15,-LRB- Schweiger is -RRB- talented and terribly...,13696,0.77778,positive,3,145,2
16,-LRB- Wendigo is -RRB- why we go to the cinema...,13697,0.65278,positive,3,14,2
17,... Blade II is more enjoyable than the origin...,24114,0.91667,very positive,4,961,2


In [4]:
from utils.augumentations import back_translation


train_dataframe = main_dataframe.get_group(1)
train_dataframe = train_dataframe.set_index(pd.Index(range(train_dataframe.shape[0])))
# train_dataframe = train_dataframe.append(
#     train_dataframe.apply(
#         lambda row: {
#             'phrase': back_translation(row['phrase']), 'phrase ids':  row['phrase ids'], 
#             'sentiment values': row['sentiment values'], 'overall sentiment': row['overall sentiment'], 
#             'sentiment_value': row['sentiment_value'], 'sentence_index': row['sentence_index'], 
#             'splitset_label': row['splitset_label']
#         }, 
#         axis=1)[0],
    # ignore_index = True)



In [5]:
test_dataframe = main_dataframe.get_group(2)
test_dataframe = test_dataframe.set_index(pd.Index(range(test_dataframe.shape[0])))

validaion_dataframe = main_dataframe.get_group(3)
validaion_dataframe = validaion_dataframe.set_index(pd.Index(range(validaion_dataframe.shape[0])))

train_dataframe.head(20)

Unnamed: 0,phrase,phrase ids,sentiment values,overall sentiment,sentiment_value,sentence_index,splitset_label
0,", The Sum of All Fears is simply a well-made a...",102340,0.88889,very positive,4,4860,1
1,", `` They 're out there ! ''",221244,0.61111,positive,3,7251,1
2,", is a temporal inquiry that shoulders its phi...",221388,0.69444,positive,3,5477,1
3,- I also wanted a little alien as a friend !,221714,0.69444,positive,3,5576,1
4,"- West Coast rap wars , this modern mob music ...",221716,0.76389,positive,3,2338,1
5,- greaseballs mob action-comedy .,142685,0.36111,negative,1,7166,1
6,- spy action flick with Antonio Banderas and L...,221720,0.16667,very negative,0,11305,1
7,- style cross-country adventure ... it has spo...,221722,0.70833,positive,3,11409,1
8,-- but certainly hard to hate .,221739,0.61111,positive,3,7163,1
9,-- but it makes for one of the most purely enj...,221741,0.81944,very positive,4,2732,1


In [6]:
# Import Library
import random
import torch, torchtext
from torchtext.legacy import data

# Manual Seed
SEED = 43
torch.manual_seed(SEED)

<torch._C.Generator at 0x1cb8271c390>

In [7]:
ReviewComment = data.Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)
Rating = data.LabelField(tokenize ='spacy', is_target=True, batch_first =True, sequential =False)

In [8]:
fields = [('review', ReviewComment),('rating', Rating)]

In [9]:
train_dataset = data.Dataset([data.Example.fromlist([train_dataframe.phrase[i],train_dataframe.sentiment_value[i]], fields) for i in range(train_dataframe.shape[0])], fields)

In [10]:
test_dataset = data.Dataset([data.Example.fromlist([test_dataframe.phrase[i],test_dataframe.sentiment_value[i]], fields) for i in range(test_dataframe.shape[0])], fields)

In [11]:
vars(test_dataset.examples[10])

{'review': ['...',
  'a',
  'haunting',
  'vision',
  ',',
  'with',
  'images',
  'that',
  'seem',
  'more',
  'like',
  'disturbing',
  'hallucinations',
  '.'],
 'rating': 0}

In [12]:
valid_dataset = data.Dataset([data.Example.fromlist([validaion_dataframe.phrase[i],validaion_dataframe.sentiment_value[i]], fields) for i in range(validaion_dataframe.shape[0])], fields)

In [13]:
ReviewComment.build_vocab(train_dataset)
Rating.build_vocab(train_dataset)

In [14]:
print('Size of input vocab : ', len(ReviewComment.vocab))
print('Size of label vocab : ', len(Rating.vocab))
print('Top 10 words appreared repeatedly :', list(ReviewComment.vocab.freqs.most_common(10)))
print('Labels : ', Rating.vocab.stoi)

Size of input vocab :  16523
Size of label vocab :  5
Top 10 words appreared repeatedly : [('.', 7633), (',', 6706), ('the', 5714), ('of', 4170), ('and', 4152), ('a', 4147), ('to', 2844), ('-', 2566), ('is', 2403), ("'s", 2353)]
Labels :  defaultdict(None, {3: 0, 1: 1, 2: 2, 4: 3, 0: 4})


In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [16]:
train_iterator, test_iterator = data.BucketIterator.splits((train_dataset, test_dataset), batch_size = 32, 
                                                            sort_key = lambda x: len(x.review),
                                                            sort_within_batch=True, device = device)

In [17]:
import os, pickle
with open('tokenizer.pkl', 'wb') as tokens: 
    pickle.dump(ReviewComment.vocab.stoi, tokens)

In [18]:
import torch.nn as nn
import torch.nn.functional as F

class classifier(nn.Module):
    
    # Define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        
        super().__init__()          
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer
        self.encoder = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           dropout=dropout,
                           batch_first=True)
        # try using nn.GRU or nn.RNN here and compare their performances
        # try bidirectional and compare their performances
        
        # Dense layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text, text_lengths):
        
        # text = [batch size, sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
      
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True)
        
        packed_output, (hidden, cell) = self.encoder(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
    
        # Hidden = [batch size, hid dim * num directions]
        dense_outputs = self.fc(hidden)   
        
        # Final activation function softmax
        output = F.softmax(dense_outputs[0], dim=1)
            
        return output

In [19]:
# Define hyperparameters
size_of_vocab = len(ReviewComment.vocab)
embedding_dim = 50
num_hidden_nodes = 100
num_output_nodes = 5
num_layers = 2
dropout = 0.5

# Instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers, dropout = dropout)

In [20]:
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

classifier(
  (embedding): Embedding(16523, 50)
  (encoder): LSTM(50, 100, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=100, out_features=5, bias=True)
)
The model has 968,255 trainable parameters


In [21]:
import torch.optim as optim

# define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()

# define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    _, predictions = torch.max(preds, 1)
    
    correct = (predictions == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [22]:
def train(model, iterator, optimizer, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        # resets the gradients after every batch
        optimizer.zero_grad()   
        
        # retrieve text and no. of words
        review, review_lengths = batch.review   
        
        # convert to 1D tensor
        predictions = model(review, review_lengths).squeeze()
        
        # compute the loss
        loss = criterion(predictions, batch.rating)        
        
        # compute the binary accuracy
        acc = binary_accuracy(predictions, batch.rating)   
        
        # backpropage the loss and compute the gradients
        loss.backward()       
        
        # update the weights
        optimizer.step()      
        
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [23]:
def evaluate(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and no. of words
            
            review, review_lengths = batch.review   
            # convert to 1D tensor
            predictions = model(review, review_lengths).squeeze()  
            
            # compute loss and accuracy
            loss = criterion(predictions, batch.rating)
            acc = binary_accuracy(predictions, batch.rating)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [24]:
N_EPOCHS = 100
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    valid_loss, valid_acc = evaluate(model, test_iterator, criterion)
    
    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% \n')

	Train Loss: 1.587 | Train Acc: 27.31%
	 Val. Loss: 1.580 |  Val. Acc: 26.48% 

	Train Loss: 1.571 | Train Acc: 29.02%
	 Val. Loss: 1.576 |  Val. Acc: 28.98% 

	Train Loss: 1.562 | Train Acc: 31.48%
	 Val. Loss: 1.571 |  Val. Acc: 29.73% 

	Train Loss: 1.551 | Train Acc: 33.04%
	 Val. Loss: 1.572 |  Val. Acc: 30.63% 

	Train Loss: 1.539 | Train Acc: 34.46%
	 Val. Loss: 1.567 |  Val. Acc: 30.14% 

	Train Loss: 1.527 | Train Acc: 35.93%
	 Val. Loss: 1.561 |  Val. Acc: 31.49% 

	Train Loss: 1.516 | Train Acc: 37.28%
	 Val. Loss: 1.558 |  Val. Acc: 32.15% 

	Train Loss: 1.504 | Train Acc: 38.44%
	 Val. Loss: 1.553 |  Val. Acc: 32.19% 

	Train Loss: 1.489 | Train Acc: 39.96%
	 Val. Loss: 1.553 |  Val. Acc: 32.08% 

	Train Loss: 1.476 | Train Acc: 41.42%
	 Val. Loss: 1.548 |  Val. Acc: 32.75% 

	Train Loss: 1.462 | Train Acc: 43.60%
	 Val. Loss: 1.559 |  Val. Acc: 31.52% 

	Train Loss: 1.449 | Train Acc: 45.05%
	 Val. Loss: 1.550 |  Val. Acc: 31.71% 

	Train Loss: 1.439 | Train Acc: 46.31%
	

In [25]:
#load weights and tokenizer

path='./saved_weights.pt'
model.load_state_dict(torch.load(path));
model.eval();
tokenizer_file = open('./tokenizer.pkl', 'rb')
tokenizer = pickle.load(tokenizer_file)

#inference 

import spacy
nlp = spacy.load('en')

def classify_review_comment(review):
    
    categories = {0: "very negative", 1:  "negative", 2 : "neutral", 3: "positive", 4: "very positive"}
    
    # tokenize the tweet 
    tokenized = [tok.text for tok in nlp.tokenizer(review)] 
    # convert to integer sequence using predefined tokenizer dictionary
    indexed = [tokenizer[t] for t in tokenized]        
    # compute no. of words        
    length = [len(indexed)]
    # convert to tensor                                    
    tensor = torch.LongTensor(indexed).to(device)   
    # reshape in form of batch, no. of words           
    tensor = tensor.unsqueeze(1).T  
    # convert to tensor                          
    length_tensor = torch.LongTensor(length)
    # Get the model prediction                  
    prediction = model(tensor, length_tensor)

    _, pred = torch.max(prediction, 1) 
    
    return categories[pred.item()]

In [26]:
classify_review_comment(" !!!!!!")

'neutral'

In [27]:
from utils.augumentations import *
back_translation(["One by one we'll make many one", "Two by two is equals to 4"])


Translating to latin


'[ " You are one of many one by one \', \' two by two in pairs at the 4 \'] '