In [1]:
import pandas as pd
from gensim import models
import numpy as np
import gensim
import gensim.downloader
from tqdm import tqdm
import time

## Dataset

In [2]:
df_sentences = pd.read_csv('data/stanfordSentimentTreebank/datasetSentences.txt', header=0, delimiter='\t', index_col=0)
df_labels = pd.read_csv('data/stanfordSentimentTreebank/datasetSplit.txt', header=0, delimiter=',', index_col='sentence_index')

In [3]:
sentences = df_sentences['sentence'].to_list()
sentences = [sentence.lower().split() for sentence in sentences]
labels = df_labels['splitset_label'].to_list()

In [4]:
max_length = -np.inf
for sentence in sentences:
    if len(sentence) > max_length:
        max_length = len(sentence)

In [5]:
max_length

56

## Word Embeddings

In [6]:
vector_size = 50
window_size = 5
negative_size = 15
sentence_size = 60

wv_model_file = 'wv_model.pth'

In [7]:
# wv_model = gensim.downloader.load('glove-wiki-gigaword-50')
# wv_model = models.Word2Vec(sentences=sentences, vector_size=vector_size, window=window_size, negative=negative_size).wv
# wv_model = models.Word2Vec(corpus_file='data/corpus.txt', vector_size=vector_size, window=window_size, negative=negative_size).wv

# wv_model.save(wv_model_file)
# del wv_model

In [8]:
def vectorize_sentences(sentences, wv, sentence_size):
    vec_sentences = []
    for sentence in sentences:
        vec_sentence = []
        for token in sentence:
            if token in wv:
                vec_sentence.append(wv[token])
            else:
                vec_sentence.append(wv['<unk>'])
        
        
        while len(vec_sentence) < sentence_size:
            vec_sentence.append(wv['<eos>'])
        
        vec_sentence = vec_sentence[:sentence_size]
        vec_sentences.append(vec_sentence)
        
    return vec_sentences

In [9]:
wv = models.KeyedVectors.load(wv_model_file)

In [10]:
wv.add_vectors(
    ['<unk>', '<eos>'],
    [np.zeros(wv.vector_size), np.ones(wv.vector_size)]
)

## BiLSTM

In [11]:
import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [12]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [13]:
def save_model(model, file_name):
    torch.save(model.state_dict(), file_name)
def load_model(model, file_name):
    return model.load_state_dict(torch.load(file_name))

In [14]:
hidden_size = 512
num_layers = 6
bidirectional = True

batch_size = 128

lr = 0.0001
num_epochs = 20


eval_rate = 0.2

model_file = 'lstm_sentiment_analysis.pth'

In [15]:
vec_sentences = vectorize_sentences(sentences, wv, sentence_size)

vec_sentences = np.array(vec_sentences)

In [16]:
vectors = vec_sentences.reshape((-1, 50))

mu = vectors.mean(axis=0)
sigma = np.sqrt(((vectors - mu) ** 2).mean(axis=0))

In [17]:
normalized_vec_sentences = (vec_sentences - mu) / sigma

In [18]:
eval_index = int(len(normalized_vec_sentences) * eval_rate)


train_x = vec_sentences[eval_index:]
eval_x = vec_sentences[:eval_index]

train_y = labels[eval_index:]
eval_y = labels[:eval_index]

In [19]:
train_x = torch.tensor(train_x, dtype=torch.float)
eval_x = torch.tensor(eval_x, dtype=torch.float)

train_y = torch.tensor(train_y, dtype=torch.long) - 1
eval_y = torch.tensor(eval_y, dtype=torch.long) - 1

train_loader = torch.utils.data.DataLoader(list(zip(train_x, train_y)), batch_size, shuffle=True)
eval_loader = torch.utils.data.DataLoader(list(zip(eval_x, eval_y)), batch_size)

In [20]:
class Classifier(nn.Module):
    
    def __init__(self):
        super(Classifier, self).__init__()
        lstm_dim = hidden_size * 2 * (2 if bidirectional else 1)
        
        
        self.lstm = nn.LSTM(input_size=vector_size,
                              hidden_size=hidden_size,
                              num_layers=num_layers,
                              bidirectional=bidirectional,
                              batch_first=True
                             )
        self.dropout = nn.Dropout(p=0.5)

        
        self.fcnn_1 = nn.Linear(in_features=lstm_dim, out_features=64)
        self.b_norm_1 = nn.BatchNorm1d(64)
        
        self.fcnn_2 = nn.Linear(in_features=64, out_features=3)
        
        
    def forward(self, sentences):
        # sentences = pack_padded_sequence(sentences, sentence_size, batch_first=True, enforce_sorted=False)
        h_lstm, _ = self.lstm(sentences)
        # output, _ = pad_packed_sequence(h_lstm, batch_first=True)
        
        avg_pool = torch.mean(h_lstm, 1)
        max_pool, _ = torch.max(h_lstm, 1)
        
        
        output = torch.cat([avg_pool, max_pool], 1)
        output = self.dropout(output)
        
        output = self.fcnn_1(output)
        output = torch.relu(output)
        
        output = self.fcnn_2(output)
        
        return output


In [21]:
classifier = Classifier().to(device)

In [22]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(classifier.parameters(), lr=lr)

In [23]:
min_loss = np.inf

for i in range(num_epochs):
    print(f'---> Epoch {i} <---')
    time.sleep(0.5)
    
    classifier.train()
    loader = tqdm(train_loader, postfix={'Epoch': i})
    train_losses = []
    
    for sentences, targets in loader:
        sentences = sentences.to(device)
        targets = targets.to(device)
        
        optimizer.zero_grad()
        
        outputs = classifier(sentences)
        
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        
        train_losses.append(loss.item())
        loader.set_postfix({
            'Epoch': i,
            'Train loss': np.mean(train_losses)
        }, refresh=True)
    
    
    time.sleep(0.5)
    
    classifier.eval()
    loader = tqdm(eval_loader, postfix={'Epoch': i,}, colour='green')
    eval_losses = []
    eval_scores = []
    
    for sentences, targets in loader:
        sentences = sentences.to(device)
        targets = targets.to(device)
        
        outputs = classifier(sentences)
        loss = criterion(outputs, targets)
        
        score = (outputs.argmax(dim=1) == targets).detach().cpu().numpy()
        eval_scores.append(score)
        eval_losses.append(loss.item())
        loader.set_postfix({
            'Epoch': i,
            'Eval loss': np.mean(eval_losses),
            'Eval score': np.concatenate(eval_scores).mean()
        }, refresh=True)
        
    
    eval_loss = np.mean(eval_losses)
    if eval_loss <= min_loss:
        min_loss = eval_loss
        save_model(classifier, model_file)
        loader.write('*** save ***')
        
    time.sleep(0.5)

---> Epoch 0 <---


100%|██████████| 75/75 [00:32<00:00,  2.31it/s, Epoch=0, Train loss=0.646]
100%|[32m██████████[0m| 19/19 [00:02<00:00,  6.88it/s, Epoch=0, Eval loss=1.77, Eval score=0.32]  


*** save ***
---> Epoch 1 <---


100%|██████████| 75/75 [00:33<00:00,  2.25it/s, Epoch=1, Train loss=0.585]
100%|[32m██████████[0m| 19/19 [00:02<00:00,  6.89it/s, Epoch=1, Eval loss=1.68, Eval score=0.32]  


*** save ***
---> Epoch 2 <---


100%|██████████| 75/75 [00:33<00:00,  2.24it/s, Epoch=2, Train loss=0.587]
100%|[32m██████████[0m| 19/19 [00:02<00:00,  6.40it/s, Epoch=2, Eval loss=1.57, Eval score=0.32]  


*** save ***
---> Epoch 3 <---


100%|██████████| 75/75 [00:33<00:00,  2.25it/s, Epoch=3, Train loss=0.584]
100%|[32m██████████[0m| 19/19 [00:02<00:00,  6.41it/s, Epoch=3, Eval loss=1.69, Eval score=0.32]  


---> Epoch 4 <---


100%|██████████| 75/75 [00:34<00:00,  2.18it/s, Epoch=4, Train loss=0.583]
100%|[32m██████████[0m| 19/19 [00:02<00:00,  6.89it/s, Epoch=4, Eval loss=1.64, Eval score=0.32]  


---> Epoch 5 <---


100%|██████████| 75/75 [00:34<00:00,  2.19it/s, Epoch=5, Train loss=0.585]
100%|[32m██████████[0m| 19/19 [00:02<00:00,  6.88it/s, Epoch=5, Eval loss=1.63, Eval score=0.32]  


---> Epoch 6 <---


100%|██████████| 75/75 [00:33<00:00,  2.24it/s, Epoch=6, Train loss=0.593]
100%|[32m██████████[0m| 19/19 [00:02<00:00,  6.83it/s, Epoch=6, Eval loss=1.68, Eval score=0.32]  


---> Epoch 7 <---


100%|██████████| 75/75 [00:34<00:00,  2.16it/s, Epoch=7, Train loss=0.584]
100%|[32m██████████[0m| 19/19 [00:02<00:00,  6.81it/s, Epoch=7, Eval loss=1.67, Eval score=0.32]  


---> Epoch 8 <---


100%|██████████| 75/75 [00:34<00:00,  2.19it/s, Epoch=8, Train loss=0.583]
100%|[32m██████████[0m| 19/19 [00:02<00:00,  6.43it/s, Epoch=8, Eval loss=1.76, Eval score=0.32]  


---> Epoch 9 <---


100%|██████████| 75/75 [00:33<00:00,  2.24it/s, Epoch=9, Train loss=0.577]
100%|[32m██████████[0m| 19/19 [00:02<00:00,  6.73it/s, Epoch=9, Eval loss=1.81, Eval score=0.32]  


---> Epoch 10 <---


100%|██████████| 75/75 [00:33<00:00,  2.25it/s, Epoch=10, Train loss=0.582]
100%|[32m██████████[0m| 19/19 [00:02<00:00,  6.72it/s, Epoch=10, Eval loss=1.7, Eval score=0.32]   


---> Epoch 11 <---


100%|██████████| 75/75 [00:34<00:00,  2.20it/s, Epoch=11, Train loss=0.578]
100%|[32m██████████[0m| 19/19 [00:02<00:00,  6.58it/s, Epoch=11, Eval loss=1.61, Eval score=0.32]  


---> Epoch 12 <---


100%|██████████| 75/75 [00:34<00:00,  2.18it/s, Epoch=12, Train loss=0.578]
100%|[32m██████████[0m| 19/19 [00:02<00:00,  6.62it/s, Epoch=12, Eval loss=1.81, Eval score=0.32]  


---> Epoch 13 <---


100%|██████████| 75/75 [00:34<00:00,  2.16it/s, Epoch=13, Train loss=0.575]
100%|[32m██████████[0m| 19/19 [00:02<00:00,  6.70it/s, Epoch=13, Eval loss=1.8, Eval score=0.32]   


---> Epoch 14 <---


100%|██████████| 75/75 [00:33<00:00,  2.23it/s, Epoch=14, Train loss=0.573]
100%|[32m██████████[0m| 19/19 [00:02<00:00,  6.56it/s, Epoch=14, Eval loss=1.83, Eval score=0.32]  


---> Epoch 15 <---


100%|██████████| 75/75 [00:36<00:00,  2.08it/s, Epoch=15, Train loss=0.575]
100%|[32m██████████[0m| 19/19 [00:02<00:00,  6.51it/s, Epoch=15, Eval loss=1.84, Eval score=0.32]  


---> Epoch 16 <---


100%|██████████| 75/75 [00:35<00:00,  2.14it/s, Epoch=16, Train loss=0.57] 
100%|[32m██████████[0m| 19/19 [00:02<00:00,  6.68it/s, Epoch=16, Eval loss=1.84, Eval score=0.32]  


---> Epoch 17 <---


100%|██████████| 75/75 [00:34<00:00,  2.14it/s, Epoch=17, Train loss=0.572]
100%|[32m██████████[0m| 19/19 [00:02<00:00,  6.74it/s, Epoch=17, Eval loss=1.75, Eval score=0.32]  


---> Epoch 18 <---


100%|██████████| 75/75 [00:34<00:00,  2.17it/s, Epoch=18, Train loss=0.567]
100%|[32m██████████[0m| 19/19 [00:02<00:00,  6.40it/s, Epoch=18, Eval loss=1.74, Eval score=0.32]  


---> Epoch 19 <---


 55%|█████▍    | 41/75 [00:20<00:17,  1.99it/s, Epoch=19, Train loss=0.565]


KeyboardInterrupt: 