In [1]:
import pandas as pd
from gensim import models
import numpy as np
import gensim
import gensim.downloader
from tqdm import tqdm
import time
import os

## Dataset

In [2]:
train_df = pd.read_csv('data/cola_public/tokenized/in_domain_train.tsv', header=None, delimiter='\t')
train_df = train_df.sample(frac=1).reset_index(drop=True)

eval_df = pd.read_csv('data/cola_public/tokenized/in_domain_dev.tsv', header=None, delimiter='\t')
eval_df = eval_df.sample(frac=1).reset_index(drop=True)

In [3]:
train_sentences = train_df[3].to_list()
train_sentences = [sentence.split() for sentence in train_sentences]
train_y = train_df[1].to_list()

eval_sentences = eval_df[3].to_list()
eval_sentences = [sentence.split() for sentence in eval_sentences]
eval_y = eval_df[1].to_list()

In [4]:
max_length = -np.inf
for sentence in train_sentences:
    if len(sentence) > max_length:
        max_length = len(sentence)

In [5]:
max_length

44

## Word Embeddings

In [6]:
root_dir = 'logs/cola'
os.makedirs(root_dir, exist_ok=True)

In [7]:
vector_size = 50
window_size = 5
negative_size = 15
sentence_size = 50

wv_model_file = root_dir + '/' + 'wv_bilstm.pth'

In [8]:
# wv_model = gensim.downloader.load('glove-wiki-gigaword-50')
# wv_model = models.Word2Vec(sentences=train_sentences, vector_size=vector_size, window=window_size, negative=negative_size).wv
# wv_model = models.Word2Vec(corpus_file='data/corpus.txt', vector_size=vector_size, window=window_size, negative=negative_size).wv

# wv_model.save(wv_model_file)
# del wv_model

In [9]:
def vectorize_sentences(sentences, wv, sentence_size):
    voc = wv.key_to_index.keys()
    unk = wv['<unk>']
    eos = wv['<eos>']
    lengths = []
    for i, sentence in enumerate(sentences):
        lengths.append(len(sentence))
        
        for i, token in enumerate(sentence):
            if token in voc:
                sentence[i] = wv[token]
            else:
                sentence[i] = unk
        
        
        while len(sentence) < sentence_size:
            sentence.append(eos)
        
        
    return sentences, lengths

In [10]:
wv = models.KeyedVectors.load(wv_model_file)

In [11]:
wv.add_vectors(
    ['<unk>', '<eos>'],
    [np.zeros(wv.vector_size), np.ones(wv.vector_size)]
)

## BiLSTM

In [12]:
import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [13]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [14]:
def save_model(model, file_name):
    torch.save(model.state_dict(), file_name)
def load_model(model, file_name):
    return model.load_state_dict(torch.load(file_name))

In [15]:
hidden_size = 128
num_layers = 6

bidirectional = True

batch_size = 32

lr = 0.001
num_epochs = 20

model_file = root_dir + '/' +'lstm_model.pth'

In [16]:
train_x, train_len_x = vectorize_sentences(train_sentences, wv, sentence_size)
eval_x, eval_len_x = vectorize_sentences(eval_sentences, wv, sentence_size)


train_x = torch.tensor(np.array(train_x), dtype=torch.float)
eval_x = torch.tensor(np.array(eval_x), dtype=torch.float)

train_len_x = torch.tensor(train_len_x, dtype=torch.int)
eval_len_x = torch.tensor(eval_len_x, dtype=torch.int)



train_y = torch.tensor(train_y, dtype=torch.long)
eval_y = torch.tensor(eval_y, dtype=torch.long)

train_loader = torch.utils.data.DataLoader(list(zip(train_x, train_len_x, train_y)), batch_size, shuffle=True)
eval_loader = torch.utils.data.DataLoader(list(zip(eval_x, train_len_x, eval_y)), batch_size)

In [17]:
class Classifier(nn.Module):
    
    def __init__(self):
        super(Classifier, self).__init__()
        
        lstm_dim = hidden_size * 2 * (2 if bidirectional else 1)
        
        self.lstm = nn.LSTM(input_size=vector_size,
                              hidden_size=hidden_size,
                              num_layers=num_layers,
                              bidirectional=bidirectional,
                              batch_first=True
                             )
        
        self.dropout = nn.Dropout(p=0.5)

        
        self.fcnn_1 = nn.Linear(in_features=lstm_dim, out_features=64)
        
        self.fcnn_2 = nn.Linear(in_features=64, out_features=2)
        
        self.leakyrely = nn.LeakyReLU()
        
        
        
    def forward(self, sentences, lengths):
        sentences = pack_padded_sequence(sentences, lengths.cpu(), batch_first=True, enforce_sorted=False)
        h_lstm, _ = self.lstm(sentences)
        output, _ = pad_packed_sequence(h_lstm, batch_first=True)
        
        avg_pool = torch.mean(output, 1)
        max_pool, _ = torch.max(output, 1)
        
        
        output = torch.cat([avg_pool, max_pool], 1)
        output = self.dropout(output)
        
        output = self.fcnn_1(output)
        output = torch.relu(output)
        
        output = self.fcnn_2(output)
        
        return output


In [18]:
classifier = Classifier().to(device)

In [19]:
weights = torch.tensor([train_y[train_y==0].shape[0], train_y[train_y==1].shape[0]])
weights = 1/(weights/weights.sum())
weights = weights.to(device)
weights

tensor([3.3825, 1.4197], device='cuda:0')

In [20]:
criterion = nn.CrossEntropyLoss(weights)
optimizer = torch.optim.Adam(classifier.parameters(), lr=lr)

In [21]:
min_loss = np.inf

for i in range(num_epochs):
    print(f'---> Epoch {i} <---')
    time.sleep(0.5)
    
    classifier.train()
    loader = tqdm(train_loader, postfix={'Epoch': i})
    train_losses = []
    
    for sentences, lengths, targets in loader:
        sentences = sentences.to(device)
        targets = targets.to(device)
        
        optimizer.zero_grad()
        
        outputs = classifier(sentences, lengths)
        
        loss = criterion(outputs, targets)
        train_losses.append(loss.item())
        
        loss.backward()
        
        optimizer.step()
        
        
        loader.set_postfix({
            'Epoch': i,
            'Train loss': np.mean(train_losses)
        }, refresh=True)
    
    
    time.sleep(0.5)
    
    classifier.eval()
    loader = tqdm(eval_loader, postfix={'Epoch': i,}, colour='green')
    eval_losses = []
    eval_scores = []
    
    for sentences, lengths, targets in loader:
        sentences = sentences.to(device)
        targets = targets.to(device)
        
        outputs = classifier(sentences, lengths)
        loss = criterion(outputs, targets)
        
        score = (outputs.argmax(dim=1) == targets).detach().cpu().numpy()
        eval_scores.append(score)
        
        eval_losses.append(loss.item())
        
        loader.set_postfix({
            'Epoch': i,
            'Eval loss': np.mean(eval_losses),
            'Eval score': np.concatenate(eval_scores).mean()
        }, refresh=True)
        
    
    eval_loss = np.mean(eval_losses)
    if eval_loss <= min_loss:
        min_loss = eval_loss
        save_model(classifier, model_file)
        loader.write('*** save ***')
        
    time.sleep(0.5)

---> Epoch 0 <---


100%|██████████| 268/268 [00:08<00:00, 32.93it/s, Epoch=0, Train loss=0.694]
100%|[32m██████████[0m| 17/17 [00:00<00:00, 101.42it/s, Epoch=0, Eval loss=0.693, Eval score=0.307]


*** save ***
---> Epoch 1 <---


100%|██████████| 268/268 [00:08<00:00, 32.23it/s, Epoch=1, Train loss=0.693]
100%|[32m██████████[0m| 17/17 [00:00<00:00, 108.34it/s, Epoch=1, Eval loss=0.695, Eval score=0.693]


---> Epoch 2 <---


100%|██████████| 268/268 [00:08<00:00, 32.40it/s, Epoch=2, Train loss=0.694]
100%|[32m██████████[0m| 17/17 [00:00<00:00, 95.84it/s, Epoch=2, Eval loss=0.694, Eval score=0.662]


---> Epoch 3 <---


100%|██████████| 268/268 [00:08<00:00, 32.62it/s, Epoch=3, Train loss=0.693]
100%|[32m██████████[0m| 17/17 [00:00<00:00, 88.84it/s, Epoch=3, Eval loss=0.693, Eval score=0.636]


*** save ***
---> Epoch 4 <---


100%|██████████| 268/268 [00:08<00:00, 33.12it/s, Epoch=4, Train loss=0.693]
100%|[32m██████████[0m| 17/17 [00:00<00:00, 100.13it/s, Epoch=4, Eval loss=0.694, Eval score=0.307]


---> Epoch 5 <---


100%|██████████| 268/268 [00:08<00:00, 32.45it/s, Epoch=5, Train loss=0.693]
100%|[32m██████████[0m| 17/17 [00:00<00:00, 111.43it/s, Epoch=5, Eval loss=0.693, Eval score=0.655]


---> Epoch 6 <---


100%|██████████| 268/268 [00:08<00:00, 32.28it/s, Epoch=6, Train loss=0.693]
100%|[32m██████████[0m| 17/17 [00:00<00:00, 101.82it/s, Epoch=6, Eval loss=0.694, Eval score=0.662]


---> Epoch 7 <---


100%|██████████| 268/268 [00:07<00:00, 33.66it/s, Epoch=7, Train loss=0.693]
100%|[32m██████████[0m| 17/17 [00:00<00:00, 94.06it/s, Epoch=7, Eval loss=0.694, Eval score=0.662]


---> Epoch 8 <---


100%|██████████| 268/268 [00:08<00:00, 32.03it/s, Epoch=8, Train loss=0.693]
100%|[32m██████████[0m| 17/17 [00:00<00:00, 86.19it/s, Epoch=8, Eval loss=0.694, Eval score=0.662]


---> Epoch 9 <---


  9%|▉         | 24/268 [00:00<00:07, 32.69it/s, Epoch=9, Train loss=0.692]


KeyboardInterrupt: 