In [1]:
import pandas as pd
from gensim import models
import numpy as np
import gensim
import gensim.downloader
from tqdm import tqdm
import time

## Dataset

In [2]:
df_sentences = pd.read_csv('data/stanfordSentimentTreebank/datasetSentences.txt', header=0, delimiter='\t', index_col=0)
df_labels = pd.read_csv('data/stanfordSentimentTreebank/datasetSplit.txt', header=0, delimiter=',', index_col='sentence_index')

In [3]:
df = pd.concat([df_labels, df_sentences], axis=1)
df

Unnamed: 0_level_0,splitset_label,sentence
sentence_index,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,The Rock is destined to be the 21st Century 's...
2,1,The gorgeously elaborate continuation of `` Th...
3,2,Effective but too-tepid biopic
4,2,If you sometimes like to go to the movies to h...
5,2,"Emerges as something rare , an issue movie tha..."
...,...,...
11851,1,A real snooze .
11852,1,No surprises .
11853,1,We 've seen the hippie-turned-yuppie plot befo...
11854,1,Her fans walked out muttering words like `` ho...


In [4]:
df = df.sample(frac=1).reset_index(drop=True)

In [5]:
sentences = df['sentence'].to_list()
sentences = [sentence.lower().split() for sentence in sentences]
labels = df['splitset_label'].to_list()

In [6]:
max_length = -np.inf
for sentence in sentences:
    if len(sentence) > max_length:
        max_length = len(sentence)

In [7]:
max_length

56

## Word Embeddings

In [8]:
vector_size = 50
window_size = 5
negative_size = 15
sentence_size = 60

wv_model_file = 'wv_model.pth'

In [9]:
# wv_model = gensim.downloader.load('glove-wiki-gigaword-50')
wv_model = models.Word2Vec(sentences=sentences, vector_size=vector_size, window=window_size, negative=negative_size).wv
# wv_model = models.Word2Vec(corpus_file='data/corpus.txt', vector_size=vector_size, window=window_size, negative=negative_size).wv

wv_model.save(wv_model_file)
del wv_model

In [10]:
def vectorize_sentences(sentences, wv, sentence_size):
    vec_sentences = []
    lengths = []
    for sentence in sentences:
        vec_sentence = []
        for token in sentence:
            if token in wv:
                vec_sentence.append(wv[token])
            else:
                vec_sentence.append(wv['<unk>'])
        
        lengths.append(len(vec_sentence))
        while len(vec_sentence) < sentence_size:
            vec_sentence.append(wv['<eos>'])
        
        vec_sentence = vec_sentence[:sentence_size]
        vec_sentences.append(vec_sentence)
        
    return vec_sentences, lengths

In [11]:
wv = models.KeyedVectors.load(wv_model_file)

In [12]:
wv.add_vectors(
    ['<unk>', '<eos>'],
    [np.zeros(wv.vector_size), np.ones(wv.vector_size)]
)

## BiLSTM

In [13]:
import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [14]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [15]:
def save_model(model, file_name):
    torch.save(model.state_dict(), file_name)
def load_model(model, file_name):
    return model.load_state_dict(torch.load(file_name))

In [16]:
hidden_size = 50
num_layers = 2
bidirectional = True

batch_size = 128

lr = 0.001
num_epochs = 20


eval_rate = 0.1

model_file = 'lstm_sentiment_analysis.pth'

In [17]:
vec_sentences, lengths = vectorize_sentences(sentences, wv, sentence_size)

vec_sentences = np.array(vec_sentences)
lengths = np.array(lengths)

labels = np.array(labels) - 1

In [18]:
vec_labels = np.zeros((labels.size, labels.max()+1))
vec_labels[np.arange(labels.size), labels] = 1

In [19]:
vectors = vec_sentences.reshape((-1, 50))

mu = vectors.mean(axis=0)
sigma = np.sqrt(((vectors - mu) ** 2).mean(axis=0))

vec_sentences = (vec_sentences - mu) / sigma

In [20]:
eval_index = int(len(vec_sentences) * eval_rate)

train_x = vec_sentences[eval_index:]
eval_x = vec_sentences[:eval_index]

train_len = lengths[eval_index:]
eval_len = lengths[:eval_index]

train_y = vec_labels[eval_index:]
eval_y = vec_labels[:eval_index]

In [21]:
train_x = torch.tensor(train_x, dtype=torch.float)
eval_x = torch.tensor(eval_x, dtype=torch.float)

train_len = torch.tensor(train_len, dtype=torch.int)
eval_len = torch.tensor(eval_len, dtype=torch.int)

train_y = torch.tensor(train_y, dtype=torch.float)
eval_y = torch.tensor(eval_y, dtype=torch.float)


train_loader = torch.utils.data.DataLoader(list(zip(train_x, train_len, train_y)), batch_size, shuffle=True)
eval_loader = torch.utils.data.DataLoader(list(zip(eval_x, eval_len, eval_y)), batch_size, shuffle=True)

In [22]:
class Classifier(nn.Module):
    
    def __init__(self):
        super(Classifier, self).__init__()
        lstm_dim = hidden_size * 2 * (2 if bidirectional else 1)
        
        
        self.lstm = nn.LSTM(input_size=vector_size,
                              hidden_size=hidden_size,
                              num_layers=num_layers,
                              bidirectional=bidirectional,
                              batch_first=True
                             )
        self.dropout = nn.Dropout(p=0.5)

        
        self.fcnn_1 = nn.Linear(in_features=lstm_dim, out_features=64)
        
        self.fcnn_2 = nn.Linear(in_features=64, out_features=3)
        
        self.leaky_relu = nn.LeakyReLU()
        self.relu = nn.ReLU()
        
    def forward(self, sentences, lengths):
        sentences = pack_padded_sequence(sentences, lengths.cpu(), batch_first=True, enforce_sorted=False)
        h_lstm, _ = self.lstm(sentences)
        output, _ = pad_packed_sequence(h_lstm, batch_first=True)
        
        avg_pool = torch.mean(output, 1)
        max_pool, _ = torch.max(output, 1)
        
        
        output = torch.cat([avg_pool, max_pool], 1)
        output = self.dropout(output)
        
        output = self.fcnn_1(output)
        output = torch.sigmoid(output)
        
        output = self.fcnn_2(output)
        output = torch.sigmoid(output)
        
        return output

In [23]:
classifier = Classifier().to(device)

In [24]:
weights = torch.log(1/(train_y.sum(dim=0) / train_y.sum()))
weights

tensor([0.3278, 1.6783, 2.3775])

In [25]:
weights = weights.detach().to(device)

In [26]:
criterion = nn.BCEWithLogitsLoss(weights)
optimizer = torch.optim.Adam(classifier.parameters(), lr=lr)

In [27]:
min_loss = np.inf

for i in range(num_epochs):
    print(f'---> Epoch {i} <---')
    time.sleep(0.5)
    
    classifier.train()
    loader = tqdm(train_loader, postfix={'Epoch': i})
    train_losses = []
    
    for sentences, lengths, targets in loader:
        sentences = sentences.to(device)
        targets = targets.to(device)
        lengths = lengths.to(device)
        
        optimizer.zero_grad()
        
        outputs = classifier(sentences, lengths)
        
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        
        with torch.no_grad():
            train_losses.append(loss.item())
            loader.set_postfix({
                'Epoch': i,
                'Train loss': np.mean(train_losses)
            }, refresh=True)
    
    
    time.sleep(0.5)
    
    with torch.no_grad():
        classifier.eval()
        loader = tqdm(eval_loader, postfix={'Epoch': i,}, colour='green')
        eval_losses = []
        eval_scores = []

        for sentences, lengths, targets in loader:
            sentences = sentences.to(device)
            targets = targets.to(device)
            lengths = lengths.to(device)

            outputs = classifier(sentences, lengths)
            loss = criterion(outputs, targets)
            
            score = (outputs.argmax(dim=1) == targets.argmax(dim=1)).detach().cpu().numpy()
            eval_scores.append(score)
            eval_losses.append(loss.item())
            loader.set_postfix({
                'Epoch': i,
                'Eval loss': np.mean(eval_losses),
                'Eval score': np.concatenate(eval_scores).mean()
            }, refresh=True)


        eval_loss = np.mean(eval_losses)
        if eval_loss <= min_loss:
            min_loss = eval_loss
            save_model(classifier, model_file)
            loader.write('*** save ***')
        
    time.sleep(0.5)

---> Epoch 0 <---


100%|██████████| 84/84 [00:02<00:00, 32.99it/s, Epoch=0, Train loss=1.06]
100%|[32m██████████[0m| 10/10 [00:00<00:00, 93.50it/s, Epoch=0, Eval loss=1.01, Eval score=0.722]


*** save ***
---> Epoch 1 <---


100%|██████████| 84/84 [00:02<00:00, 34.54it/s, Epoch=1, Train loss=1.01]
100%|[32m██████████[0m| 10/10 [00:00<00:00, 91.74it/s, Epoch=1, Eval loss=1.01, Eval score=0.722]


*** save ***
---> Epoch 2 <---


100%|██████████| 84/84 [00:02<00:00, 36.05it/s, Epoch=2, Train loss=1]   
100%|[32m██████████[0m| 10/10 [00:00<00:00, 91.41it/s, Epoch=2, Eval loss=1, Eval score=0.722]


*** save ***
---> Epoch 3 <---


100%|██████████| 84/84 [00:02<00:00, 39.44it/s, Epoch=3, Train loss=1]   
100%|[32m██████████[0m| 10/10 [00:00<00:00, 99.98it/s, Epoch=3, Eval loss=1, Eval score=0.722]


*** save ***
---> Epoch 4 <---


 74%|███████▍  | 62/84 [00:01<00:00, 35.58it/s, Epoch=4, Train loss=1]


KeyboardInterrupt: 