In [1]:
import pandas as pd
from gensim import models
import numpy as np
import gensim
import gensim.downloader
from tqdm import tqdm
import time

## Dataset

In [2]:
# Sentence texts: tab-separated file, header on the first row, first column used as the index.
df_sentences = pd.read_csv(
    'data/stanfordSentimentTreebank/datasetSentences.txt',
    sep='\t',
    header=0,
    index_col=0,
)

# Per-sentence labels: comma-separated file indexed by its `sentence_index` column.
df_labels = pd.read_csv(
    'data/stanfordSentimentTreebank/datasetSplit.txt',
    sep=',',
    header=0,
    index_col='sentence_index',
)

In [3]:
# Lower-case every sentence and split it on whitespace into a list of tokens.
sentences = [text.lower().split() for text in df_sentences['sentence'].to_list()]

# One label per row of datasetSplit.txt, in file order.
# NOTE(review): `splitset_label` looks like the train/test/dev partition id of each
# sentence rather than a sentiment class — confirm this is the intended target.
# NOTE(review): sentences and labels are paired purely by position; this assumes both
# files list the same sentence indices in the same order — verify.
labels = df_labels['splitset_label'].to_list()

In [4]:
# Token count of the longest sentence; stays at -inf when `sentences` is empty.
max_length = max((len(tokens) for tokens in sentences), default=-np.inf)

In [5]:
# Display the longest sentence length (in whitespace-separated tokens).
max_length

56

## Word Embeddings

In [6]:
# Dimensionality of each word vector; also used as the LSTM's input size.
vector_size = 50
# Only referenced by the commented-out Word2Vec training calls
# (passed as `window=` and `negative=` respectively).
window_size = 5
negative_size = 15
# Every sentence is padded with '<eos>' vectors up to this many tokens.
sentence_size = 60

# File the word vectors (gensim KeyedVectors) are loaded from.
wv_model_file = 'wv_model.pth'

In [7]:
# One-off creation of the embedding file that is loaded from `wv_model_file`.
# Uncomment exactly one of the `wv_model = ...` options plus the save/del lines
# to (re)generate it.
# NOTE(review): `train_sentences` is not defined in any visible cell — confirm
# before using the second option.
# wv_model = gensim.downloader.load('glove-wiki-gigaword-50')
# wv_model = models.Word2Vec(sentences=train_sentences, vector_size=vector_size, window=window_size, negative=negative_size).wv
# wv_model = models.Word2Vec(corpus_file='data/corpus.txt', vector_size=vector_size, window=window_size, negative=negative_size).wv

# wv_model.save(wv_model_file)
# del wv_model

In [8]:
def vectorize_sentences(sentences, wv, sentence_size):
    """Turn tokenized sentences into fixed-length lists of word vectors.

    Each token is replaced by ``wv[token]``; tokens missing from ``wv`` are
    replaced by ``wv['<unk>']``. Every sentence is then brought to exactly
    ``sentence_size`` vectors: longer sentences are truncated and shorter
    ones are right-padded with ``wv['<eos>']``. Truncating guarantees a
    rectangular result, so it can always be converted to a single tensor.

    Args:
        sentences: Iterable of token lists (each sentence must support slicing).
        wv: Mapping-like object supporting ``token in wv`` and ``wv[token]``
            that contains the ``'<unk>'`` and ``'<eos>'`` keys.
        sentence_size: Non-negative number of vectors per output sentence.

    Returns:
        A list with one entry per input sentence; each entry is a list of
        exactly ``sentence_size`` vectors.

    Raises:
        KeyError: If ``'<unk>'`` or ``'<eos>'`` is needed but absent from ``wv``.
    """
    vec_sentences = []
    for sentence in sentences:
        # Cut over-long sentences first so no row can exceed `sentence_size`.
        vec_sentence = [
            wv[token] if token in wv else wv['<unk>']
            for token in sentence[:sentence_size]
        ]

        # Right-pad short sentences with the end-of-sentence vector.
        missing = sentence_size - len(vec_sentence)
        vec_sentence.extend(wv['<eos>'] for _ in range(missing))

        vec_sentences.append(vec_sentence)

    return vec_sentences

In [9]:
# Load the previously saved gensim KeyedVectors from `wv_model_file`.
wv = models.KeyedVectors.load(wv_model_file)

In [10]:
# Register the two special tokens that `vectorize_sentences` looks up:
# '<unk>' is an all-zeros vector, '<eos>' an all-ones vector.
wv.add_vectors(
    keys=['<unk>', '<eos>'],
    weights=[np.zeros(wv.vector_size), np.ones(wv.vector_size)],
)

## BiLSTM

In [11]:
import torch
from torch import nn

In [12]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [13]:
def save_model(model, file_name):
    """Write the model's parameters (its ``state_dict``) to ``file_name``."""
    weights = model.state_dict()
    torch.save(weights, file_name)
def load_model(model, file_name, map_location='cpu'):
    """Load parameters saved by ``save_model`` into ``model`` in place.

    The checkpoint is deserialized onto ``map_location`` (CPU by default) so
    that a file written on a GPU machine can still be read where CUDA is
    unavailable. ``load_state_dict`` then copies the values into the model's
    existing parameters, so the model stays on whatever device it is on.

    Args:
        model: The ``nn.Module`` whose parameters are overwritten.
        file_name: Path of the checkpoint file.
        map_location: Forwarded to ``torch.load``.

    Returns:
        The result of ``model.load_state_dict`` (missing/unexpected keys).
    """
    state_dict = torch.load(file_name, map_location=map_location)
    return model.load_state_dict(state_dict)

In [14]:
# Width and depth of the LSTM (passed to nn.LSTM inside Classifier).
hidden_size = 1024
num_layers = 6

# When True the LSTM runs in both directions and the linear head's input width doubles.
bidirectional = False

# Mini-batch size used by both the training and the evaluation DataLoader.
batch_size = 32

# Adam learning rate and number of passes over the training set.
lr = 0.0001
num_epochs = 20


# Fraction of the sentences, taken from the front of the list, held out for evaluation.
eval_rate = 0.2

# Checkpoint path; written whenever the mean eval loss does not exceed the best seen so far.
model_file = 'lstm_sentiment_analysis.pth'

In [15]:
# Replace every token by its word vector and pad each sentence to `sentence_size`.
vec_sentences = vectorize_sentences(sentences, wv, sentence_size)

# The first `eval_rate` share of the data is held out for evaluation,
# everything after that index is used for training.
# NOTE(review): the data is split in file order without shuffling — confirm the
# two parts have comparable label distributions.
eval_index = int(eval_rate * len(vec_sentences))

eval_x, train_x = vec_sentences[:eval_index], vec_sentences[eval_index:]
eval_y, train_y = labels[:eval_index], labels[eval_index:]

In [16]:
# Build the tensors directly from `vec_sentences` / `labels` instead of from
# `train_x` / `train_y` themselves, so re-running this cell is idempotent
# (previously every re-run subtracted 1 from the labels again).
# Going through np.array first avoids the very slow element-by-element
# conversion torch performs on nested lists of numpy arrays.
train_x = torch.tensor(np.array(vec_sentences[eval_index:]), dtype=torch.float)
eval_x = torch.tensor(np.array(vec_sentences[:eval_index]), dtype=torch.float)

# Shift the labels down by one so they can be used as 0-based class indices.
train_y = torch.tensor(labels[eval_index:], dtype=torch.long) - 1
eval_y = torch.tensor(labels[:eval_index], dtype=torch.long) - 1

# Only the training batches are shuffled; evaluation keeps the original order.
train_loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(train_x, train_y), batch_size, shuffle=True
)
eval_loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(eval_x, eval_y), batch_size
)

In [17]:
class Classifier(nn.Module):
    """Multi-layer LSTM encoder followed by a linear layer with 3 output logits.

    The sentence representation is the final hidden state of the last LSTM
    layer. For a bidirectional LSTM the final states of both directions are
    concatenated; taking the output at the last time step would instead pair
    the forward summary with a backward state that has seen only one token.
    For a unidirectional LSTM this equals the output at the last time step.

    Args:
        embedding_size: Size of each input vector; defaults to the
            notebook-level ``vector_size``.
        lstm_hidden_size: LSTM hidden size; defaults to ``hidden_size``.
        lstm_num_layers: Number of stacked LSTM layers; defaults to ``num_layers``.
        lstm_bidirectional: Whether the LSTM is bidirectional; defaults to
            ``bidirectional``.
    """

    def __init__(self, embedding_size=None, lstm_hidden_size=None,
                 lstm_num_layers=None, lstm_bidirectional=None):
        super().__init__()

        # Fall back to the notebook-level hyperparameters so that a plain
        # `Classifier()` call behaves exactly as before.
        if embedding_size is None:
            embedding_size = vector_size
        if lstm_hidden_size is None:
            lstm_hidden_size = hidden_size
        if lstm_num_layers is None:
            lstm_num_layers = num_layers
        if lstm_bidirectional is None:
            lstm_bidirectional = bidirectional

        # Attribute names are kept so existing checkpoints still load.
        self.bilstm = nn.LSTM(input_size=embedding_size,
                              hidden_size=lstm_hidden_size,
                              num_layers=lstm_num_layers,
                              bidirectional=lstm_bidirectional,
                             )

        num_directions = 2 if lstm_bidirectional else 1
        self.fcnn = nn.Linear(in_features=lstm_hidden_size * num_directions, out_features=3)

    def forward(self, sentences):
        """Return class logits for a batch of vectorized sentences.

        Args:
            sentences: Tensor whose first two axes are swapped before being
                fed to the LSTM (the training loop passes batches laid out as
                batch, sequence, feature).

        Returns:
            Tensor with one row of 3 logits per batch element.
        """
        # nn.LSTM (batch_first=False) expects the sequence axis first.
        x = sentences.transpose(1, 0)

        _, (h_n, _) = self.bilstm(x)

        if self.bilstm.bidirectional:
            # Last layer: h_n[-2] is the forward direction, h_n[-1] the backward one.
            features = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            features = h_n[-1]

        return self.fcnn(features)


In [18]:
# Instantiate the model and move its parameters to the selected device.
classifier = Classifier().to(device)

In [19]:
# Cross-entropy loss applied to the raw logits returned by the classifier.
criterion = nn.CrossEntropyLoss()
# Adam over all classifier parameters with learning rate `lr`.
optimizer = torch.optim.Adam(classifier.parameters(), lr=lr)

In [20]:
# Train for `num_epochs` epochs; after each epoch evaluate on the held-out data
# and checkpoint the model whenever the mean eval loss is at least as good as
# the best one seen so far.
min_loss = np.inf

for epoch in range(num_epochs):
    print(f'---> Epoch {epoch} <---')
    time.sleep(0.5)  # presumably gives the print time to flush before tqdm draws

    # ---- training pass ----
    classifier.train()
    loader = tqdm(train_loader, postfix={'Epoch': epoch})
    train_losses = []

    # The batch is named `batch_x` rather than `sentences` so the tokenized
    # corpus stored in `sentences` is not overwritten; otherwise the
    # vectorizing cell cannot be re-run after training.
    for batch_x, targets in loader:
        batch_x = batch_x.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()

        outputs = classifier(batch_x)

        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())
        loader.set_postfix({
            'Epoch': epoch,
            'Train loss': np.mean(train_losses)
        }, refresh=True)

    time.sleep(0.5)

    # ---- evaluation pass ----
    classifier.eval()
    loader = tqdm(eval_loader, postfix={'Epoch': epoch}, colour='green')
    eval_losses = []
    eval_scores = []

    # No gradients are needed here; skipping the autograd graph saves memory and time.
    with torch.no_grad():
        for batch_x, targets in loader:
            batch_x = batch_x.to(device)
            targets = targets.to(device)

            outputs = classifier(batch_x)
            loss = criterion(outputs, targets)

            # Per-example correctness flags; their running mean is the accuracy.
            score = (outputs.argmax(dim=1) == targets).cpu().numpy()
            eval_scores.append(score)
            eval_losses.append(loss.item())
            loader.set_postfix({
                'Epoch': epoch,
                'Eval loss': np.mean(eval_losses),
                'Eval score': np.concatenate(eval_scores).mean()
            }, refresh=True)

    eval_loss = np.mean(eval_losses)
    if eval_loss <= min_loss:
        min_loss = eval_loss
        save_model(classifier, model_file)
        loader.write('*** save ***')

    time.sleep(0.5)

---> Epoch 0 <---


100%|██████████| 297/297 [00:50<00:00,  5.90it/s, Epoch=0, Train loss=0.602]
100%|[32m██████████[0m| 75/75 [00:04<00:00, 16.30it/s, Epoch=0, Eval loss=1.63, Eval score=0.32]  


*** save ***
---> Epoch 1 <---


100%|██████████| 297/297 [00:50<00:00,  5.84it/s, Epoch=1, Train loss=0.587]
100%|[32m██████████[0m| 75/75 [00:04<00:00, 16.06it/s, Epoch=1, Eval loss=1.63, Eval score=0.32]  


*** save ***
---> Epoch 2 <---


100%|██████████| 297/297 [00:51<00:00,  5.79it/s, Epoch=2, Train loss=0.585]
100%|[32m██████████[0m| 75/75 [00:04<00:00, 16.17it/s, Epoch=2, Eval loss=1.76, Eval score=0.32]  


---> Epoch 3 <---


100%|██████████| 297/297 [00:51<00:00,  5.75it/s, Epoch=3, Train loss=0.585]
100%|[32m██████████[0m| 75/75 [00:04<00:00, 16.09it/s, Epoch=3, Eval loss=1.64, Eval score=0.32]  


---> Epoch 4 <---


 57%|█████▋    | 168/297 [00:29<00:22,  5.64it/s, Epoch=4, Train loss=0.58] 


KeyboardInterrupt: 