In [1]:
import pandas as pd
from gensim import models
import numpy as np
import gensim
import gensim.downloader
from tqdm import tqdm
import time

## Dataset

In [2]:
df_sentences = pd.read_csv('data/stanfordSentimentTreebank/datasetSentences.txt', header=0, delimiter='\t', index_col=0)
df_labels = pd.read_csv('data/stanfordSentimentTreebank/datasetSplit.txt', header=0, delimiter=',', index_col='sentence_index')

In [3]:
sentences = df_sentences['sentence'].to_list()
sentences = [sentence.lower().split() for sentence in sentences]
labels = df_labels['splitset_label'].to_list()

In [4]:
max_length = -np.inf
for sentence in sentences:
    if len(sentence) > max_length:
        max_length = len(sentence)

In [5]:
max_length

56

## Word Embeddings

In [6]:
vector_size = 50
window_size = 5
negative_size = 15
sentence_size = 60

wv_model_file = 'wv_model.pth'

In [7]:
# wv_model = gensim.downloader.load('glove-wiki-gigaword-50')
# wv_model = models.Word2Vec(sentences=sentences, vector_size=vector_size, window=window_size, negative=negative_size).wv
# wv_model = models.Word2Vec(corpus_file='data/corpus.txt', vector_size=vector_size, window=window_size, negative=negative_size).wv

# wv_model.save(wv_model_file)
# del wv_model

In [8]:
def vectorize_sentences(sentences, wv, sentence_size):
    vec_sentences = []
    for sentence in sentences:
        vec_sentence = []
        for token in sentence:
            if token in wv:
                vec_sentence.append(wv[token])
            else:
                vec_sentence.append(wv['<unk>'])
        
        
        while len(vec_sentence) < sentence_size:
            vec_sentence.append(wv['<eos>'])
        
        vec_sentence = vec_sentence[:sentence_size]
        vec_sentences.append(vec_sentence)
        
    return vec_sentences

In [9]:
wv = models.KeyedVectors.load(wv_model_file)

In [10]:
wv.add_vectors(
    ['<unk>', '<eos>'],
    [np.zeros(wv.vector_size), np.ones(wv.vector_size)]
)

## BiLSTM

In [11]:
import torch
from torch import nn

from transformers import BertTokenizer, BertForSequenceClassification



In [12]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [13]:
def save_model(model, file_name):
    torch.save(model.state_dict(), file_name)
def load_model(model, file_name):
    return model.load_state_dict(torch.load(file_name))

In [14]:
hidden_size = 512
num_layers = 6
bidirectional = True

batch_size = 128

lr = 0.0001
num_epochs = 20


eval_rate = 0.2

model_file = 'lstm_sentiment_analysis.pth'

In [15]:
vec_sentences = vectorize_sentences(sentences, wv, sentence_size)

vec_sentences = np.array(vec_sentences)

In [16]:
vectors = vec_sentences.reshape((-1, 50))

mu = vectors.mean(axis=0)
sigma = np.sqrt(((vectors - mu) ** 2).mean(axis=0))

In [17]:
normalized_vec_sentences = (vec_sentences - mu) / sigma

In [18]:
eval_index = int(len(normalized_vec_sentences) * eval_rate)


train_x = vec_sentences[eval_index:]
eval_x = vec_sentences[:eval_index]

train_y = labels[eval_index:]
eval_y = labels[:eval_index]

In [19]:
train_x = torch.tensor(train_x, dtype=torch.float)
eval_x = torch.tensor(eval_x, dtype=torch.float)

train_y = torch.tensor(train_y, dtype=torch.long) - 1
eval_y = torch.tensor(eval_y, dtype=torch.long) - 1

train_loader = torch.utils.data.DataLoader(list(zip(train_x, train_y)), batch_size, shuffle=True)
eval_loader = torch.utils.data.DataLoader(list(zip(eval_x, eval_y)), batch_size)

In [20]:
class Classifier(nn.Module):
    
    def __init__(self):
        super(Classifier, self).__init__()
        options_name = "bert-base-uncased"
        self.encoder = BertForSequenceClassification.from_pretrained(options_name)

        
        
    def forward(self, text, label):
        loss, text_fea = self.encoder(text, labels=label)[:2]
        
        return loss, text_fea


In [24]:
classifier = Classifier().to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [22]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(classifier.parameters(), lr=lr)

In [23]:
min_loss = np.inf

for i in range(num_epochs):
    print(f'---> Epoch {i} <---')
    time.sleep(0.5)
    
    classifier.train()
    loader = tqdm(train_loader, postfix={'Epoch': i})
    train_losses = []
    
    for sentences, targets in loader:
        sentences = sentences.to(device)
        targets = targets.to(device)
        
        optimizer.zero_grad()
        
        outputs = classifier(sentences)
        
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        
        train_losses.append(loss.item())
        loader.set_postfix({
            'Epoch': i,
            'Train loss': np.mean(train_losses)
        }, refresh=True)
    
    
    time.sleep(0.5)
    
    classifier.eval()
    loader = tqdm(eval_loader, postfix={'Epoch': i,}, colour='green')
    eval_losses = []
    eval_scores = []
    
    for sentences, targets in loader:
        sentences = sentences.to(device)
        targets = targets.to(device)
        
        outputs = classifier(sentences)
        loss = criterion(outputs, targets)
        
        score = (outputs.argmax(dim=1) == targets).detach().cpu().numpy()
        eval_scores.append(score)
        eval_losses.append(loss.item())
        loader.set_postfix({
            'Epoch': i,
            'Eval loss': np.mean(eval_losses),
            'Eval score': np.concatenate(eval_scores).mean()
        }, refresh=True)
        
    
    eval_loss = np.mean(eval_losses)
    if eval_loss <= min_loss:
        min_loss = eval_loss
        save_model(classifier, model_file)
        loader.write('*** save ***')
        
    time.sleep(0.5)

---> Epoch 0 <---


  0%|          | 0/75 [00:00<?, ?it/s, Epoch=0]


TypeError: forward() missing 1 required positional argument: 'label'