In [1]:
import os
import sys
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

import src.pytorch_utils as ptu
import src.dataset as dset

import warnings
warnings.filterwarnings('ignore')

# Reproducibility: seed NumPy and PyTorch with the same value, ask cuDNN for
# deterministic algorithms and switch its autotuner off.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Directory name passed to the checkpoint helper further down.
models_path = 'models'

cuda


In [2]:
# The training split is built first because the held-out split receives it
# through the `train_dataset=` keyword.
train_dataset = dset.DataSet(
    'data/train.labeled',
    tqdm_bar=True,
)
# train_dataset = dset.DataSet('data/test.labeled', tqdm_bar=True)

test_dataset = dset.DataSet(
    'data/test.labeled',
    train_dataset=train_dataset,
    tqdm_bar=True,
)
# comp_dataset = dset.DataSet('data/comp.unlabeled', train_dataset=train_dataset, tagged=False, tqdm_bar=True)

100%|██████████| 125430/125430 [00:17<00:00, 7108.94it/s]
100%|██████████| 25325/25325 [00:03<00:00, 7357.59it/s]


In [3]:
class BiLSTM(nn.Module):
    """Bidirectional LSTM scorer over concatenated word and tag embeddings.

    Every token is represented as ``[word_embedding ; tag_embedding]``, run through a
    bidirectional LSTM, then through ``Linear -> tanh -> Linear`` and finally a
    log-softmax over ``mlp2_dim`` classes.

    Args:
        train_dataset: object exposing ``special_dict`` (indexed by ``dset.PAD`` and
            ``dset.y_PAD``), ``words_num`` and ``tags_num``.
        word_embed_dim: size of the word embedding vectors.
        tag_embed_dim: size of the tag embedding vectors.
        hidden_dim: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        bias: whether the LSTM and both linear layers use bias terms.
        mlp1_dim: width of the first linear layer.
        mlp2_dim: width of the second linear layer (number of output classes).
        p_dropout: dropout probability between stacked LSTM layers.
    """

    def __init__(self, train_dataset, word_embed_dim, tag_embed_dim, hidden_dim, num_layers, bias, mlp1_dim, mlp2_dim, p_dropout):
        super(BiLSTM, self).__init__()

        # Padding indices for the inputs and for the labels.
        self.pad = int(train_dataset.special_dict[dset.PAD])
        self.y_pad = int(train_dataset.special_dict[dset.y_PAD])

        self.word_embedding_layer = nn.Embedding(num_embeddings=train_dataset.words_num,
                                                 embedding_dim=word_embed_dim,
                                                 padding_idx=self.pad)

        # NOTE(review): the tag table reuses the word PAD index as its padding_idx;
        # confirm that words and tags really share that index.
        self.tag_embedding_layer = nn.Embedding(num_embeddings=train_dataset.tags_num,
                                                embedding_dim=tag_embed_dim,
                                                padding_idx=self.pad)

        self.lstm = nn.LSTM(input_size=word_embed_dim + tag_embed_dim,
                            hidden_size=hidden_dim,
                            num_layers=num_layers,
                            bias=bias,
                            # Inter-layer dropout only exists when layers are stacked;
                            # passing it for a single layer just triggers a warning.
                            dropout=p_dropout if num_layers > 1 else 0.0,
                            batch_first=True,
                            bidirectional=True)

        # The LSTM is bidirectional, so it emits 2 * hidden_dim features per token.
        self.mlp1 = nn.Linear(2 * hidden_dim,
                              mlp1_dim,
                              bias=bias)

        self.mlp2 = nn.Linear(mlp1_dim,
                              mlp2_dim,
                              bias=bias)

    @staticmethod
    def _debug_shape(enabled, label, tensor):
        """Print ``label`` followed by ``tensor.shape`` when debugging is enabled."""
        if enabled:
            print(label, tensor.shape)

    def forward(self, words, tags, lens, prints=False):
        """Score every token position of a padded batch.

        Args:
            words: word indices, ``[batch_size, max_sentence_len]``.
            tags: tag indices, ``[batch_size, max_sentence_len]``.
            lens: true sentence lengths, one per batch element (a CPU tensor or a
                list, as required by ``pack_padded_sequence``).
            prints: when True, print the shape of every intermediate tensor.

        Returns:
            Log-probabilities of shape ``[batch_size, mlp2_dim, max_sentence_len]``.
            Positions beyond a sentence's length are computed from an all-zero
            LSTM output and carry no information.
        """
        max_len = words.shape[1]

        self._debug_shape(prints, 'words', words)
        self._debug_shape(prints, 'tags', tags)
        if prints:
            print('lens', len(lens))

        words = self.word_embedding_layer(words)
        self._debug_shape(prints, 'word_embeds', words)
        # [batch_size, max_sentence_len, word_embed_dim]

        tags = self.tag_embedding_layer(tags)
        self._debug_shape(prints, 'tag_embeds', tags)
        # [batch_size, max_sentence_len, tag_embed_dim]

        x = torch.cat((words, tags), -1)
        self._debug_shape(prints, 'cat', x)
        # [batch_size, max_sentence_len, word_embed_dim + tag_embed_dim]

        # Pack so the LSTM skips padded positions; the result is a PackedSequence
        # and therefore has no `.shape` to print.
        x = nn.utils.rnn.pack_padded_sequence(x, lens, batch_first=True, enforce_sorted=False)

        x, _ = self.lstm(x)

        # Unpack back to a dense tensor. Padded positions are filled with 0.0: these
        # are activations, not token ids, so the PAD *index* is not a meaningful fill.
        x, _ = nn.utils.rnn.pad_packed_sequence(x, batch_first=True, padding_value=0.0, total_length=max_len)
        self._debug_shape(prints, 'pad_packed_sequence', x)
        # [batch_size, max_sentence_len, 2 * hidden_dim]

        x = self.mlp1(x)
        self._debug_shape(prints, 'mlp1', x)
        # [batch_size, max_sentence_len, mlp1_dim]

        # torch.tanh replaces the deprecated F.tanh alias.
        x = torch.tanh(x)
        self._debug_shape(prints, 'mlp1_tanh', x)
        # [batch_size, max_sentence_len, mlp1_dim]

        x = self.mlp2(x)
        self._debug_shape(prints, 'mlp2', x)
        # [batch_size, max_sentence_len, mlp2_dim]

        x = F.log_softmax(x, dim=2)
        self._debug_shape(prints, 'log_softmax', x)
        # [batch_size, max_sentence_len, mlp2_dim]

        # Class dimension second, which is the layout nn.NLLLoss expects.
        x = x.transpose(2, 1)
        self._debug_shape(prints, 'transpose', x)
        # [batch_size, mlp2_dim, max_sentence_len]

        return x

In [4]:
def loss_decision_func(obj, device, batch, prints=False):
    """Run one batch through ``obj.model`` and compute the loss on real tokens only.

    Args:
        obj: object exposing ``model`` (a module with a ``y_pad`` attribute) and
            ``criterion`` (a callable ``criterion(scores, targets)``).
        device: device the word and tag tensors are moved to before the forward pass.
        batch: tuple ``(words, tags, lens, y)``.
        prints: forwarded to the model to enable its shape printing.

    Returns:
        ``(loss, y, out)`` where ``y`` holds the labels whose value is greater than
        ``obj.model.y_pad`` (flattened to 1-D, on the device ``y`` arrived on) and
        ``out`` holds the matching rows of model scores, one row per kept label.
    """
    words, tags, lens, y = batch

    # Call the module itself rather than `.forward` so registered hooks still run.
    out = obj.model(words.to(device), tags.to(device), lens, prints=prints)

    # Boolean mask of positions that carry a real label.
    keep = y > obj.model.y_pad

    # Swap the last two axes so the label mask lines up with the first two,
    # then keep one score row per real token.
    out = out.transpose(2, 1)[keep.to(out.device)]
    y = y[keep]

    loss = obj.criterion(out, y.to(out.device).long())
    return loss, y, out

In [8]:
version = 'V1_1.0'

# Architecture settings are gathered in one place and unpacked into the constructor.
# The trailing `# n` notes are carried over unchanged from the original cell.
model_params = dict(
    word_embed_dim=100,  # 100
    tag_embed_dim=25,  # 25
    hidden_dim=125,  # 125
    num_layers=2,  # 2
    bias=True,
    mlp1_dim=100,  # 100
    mlp2_dim=250,  # 100
    p_dropout=0.5,  # 0.1
)
model = BiLSTM(train_dataset=train_dataset, **model_params)

# The checkpoint helper receives the model together with the callables it uses for
# scoring, for computing the loss and for turning model outputs into predictions.
checkpoint = ptu.Checkpoint(
    models_path=models_path,
    version=version,
    model=model,
    score=lambda y_true, y_pred: (np.array(y_true) == np.array(y_pred)).mean(),
    loss_decision_func=loss_decision_func,
    out_decision_func=lambda x: x.argmax(axis=1),
    seed=42,
    optimizer=torch.optim.Adam,
    criterion=nn.NLLLoss,
    save=True,
    prints=True,
)

model version: V1_1.0
Number of parameters 2096925 trainable 2096925


In [14]:
# One dict per training session; its keys are forwarded verbatim to `checkpoint.train`.
# Sessions that are currently disabled are kept as comments for reference.
hyperparam_list = [
    # {'train_epochs': 5, 'batch_size': 32, 'optimizer_params': {'lr': 1e-3,}, 'lr_decay': 0.0},
    # {'train_epochs': 5, 'batch_size': 64, 'optimizer_params': {'lr': 4e-4}, 'lr_decay': 0.07},
    # {'train_epochs': 5, 'batch_size': 128, 'optimizer_params': {}, 'lr_decay': 0.07},
    # {'train_epochs': 5, 'batch_size': 1024, 'optimizer_params': {'weight_decay': 1e-6}, 'lr_decay': 0.07},
    dict(train_epochs=50, batch_size=2048, optimizer_params=dict(weight_decay=1e-6), lr_decay=0.07),
    # {'train_epochs': 50, 'batch_size': 5000, 'optimizer_params': {'weight_decay': 0}, 'lr_decay': 0.05},
    # {'train_epochs': 10, 'batch_size': 32, 'optimizer_params': {'lr': 1e-4, 'weight_decay': 0}, 'p_dropout': 0.5, 'lr_decay': 0.07},
    # {'train_epochs': 10, 'batch_size': 64, 'optimizer_params': {'lr': 1e-5, 'weight_decay': 0}, 'p_dropout': 0.5, 'lr_decay': 0.07},
    # {'train_epochs': 10, 'batch_size': 128, 'optimizer_params': {'lr': 1e-6, 'weight_decay': 0}, 'p_dropout': 0.5, 'lr_decay': 0.07},
]

# Arguments that are identical for every session.
shared_train_kwargs = dict(
    device=device,
    train_dataset=train_dataset.dataset,
    val_dataset=test_dataset.dataset,
    prints=True,
    epochs_save=5,
    save=True,
)

for session in hyperparam_list:
    checkpoint.train(**shared_train_kwargs, **session)

epoch  38/ 87 | train_loss 1.20024 | val_loss 1.35806 | train_score 0.70074 | val_score 0.64785 | train_time   3.34 min
epoch  39/ 87 | train_loss 1.19707 | val_loss 1.35706 | train_score 0.70378 | val_score 0.64962 | train_time   3.40 min
epoch  40/ 87 | train_loss 1.19196 | val_loss 1.35303 | train_score 0.70341 | val_score 0.64822 | train_time   3.51 min
epoch  41/ 87 | train_loss 1.18825 | val_loss 1.35143 | train_score 0.70789 | val_score 0.65180 | train_time   3.61 min
epoch  42/ 87 | train_loss 1.18508 | val_loss 1.34978 | train_score 0.71030 | val_score 0.65287 | train_time   3.66 min
epoch  43/ 87 | train_loss 1.18216 | val_loss 1.34754 | train_score 0.71097 | val_score 0.65266 | train_time   3.76 min
epoch  44/ 87 | train_loss 1.17999 | val_loss 1.34642 | train_score 0.71398 | val_score 0.65538 | train_time   3.81 min
epoch  45/ 87 | train_loss 1.17790 | val_loss 1.34565 | train_score 0.71329 | val_score 0.65521 | train_time   3.91 min
epoch  46/ 87 | train_loss 1.17650 | val

KeyboardInterrupt: 

In [None]:
import hyperopt as hpo


def Counter():
    """Yield 0, 1, 2, ... with no upper bound."""
    i = 0
    while True:
        yield i
        i += 1


# Search space for model-construction hyperparameters (still empty).
init_space = {

}

# Search space for per-session training hyperparameters (still empty).
session_space = {

}


def init_objective(space, save=False):
    """Placeholder objective: returns the negated score, currently a constant."""
    score = 0.0
    return -score


def session_objective(space, save=False):
    """Placeholder objective: returns the negated score, currently a constant."""
    score = 0.0
    return -score


# Evaluation budget for the search. TODO: placeholder value, tune once the
# objectives above do real work.
max_evals = 50

# `hpo.fmin()` needs at least an objective and a search space; calling it with no
# arguments raises TypeError. Run the search only once a space has been filled in,
# so the cell stays runnable while the scaffold is incomplete.
best_params = None
if session_space:
    best_params = hpo.fmin(fn=session_objective,
                           space=session_space,
                           algo=hpo.tpe.suggest,
                           max_evals=max_evals)

In [20]:
# Smoke test: push a few shuffled training batches through the model with shape
# printing enabled, collecting the loss, the predictions and the matching labels.
checkpoint.model = checkpoint.model.to(device)
checkpoint.model.train()

loss_sum = np.array([])
y_pred = np.array([])
y_true = np.array([])

max_batches = 1  # how many mini-batches to inspect
counter = 0
loader = torch.utils.data.DataLoader(dataset=train_dataset.dataset, batch_size=32, shuffle=True)

# Nothing here calls backward(), so skip building the autograd graph.
with torch.no_grad():
    for batch in loader:
        loss, y, out = loss_decision_func(checkpoint, device, batch, prints=True)

        loss_sum = np.append(loss_sum, loss.item())

        y_pred = np.append(y_pred, checkpoint.out_decision_func(out.detach().cpu().numpy()))

        # Use the masked labels returned by loss_decision_func: they line up
        # one-to-one with the rows of `out`, unlike the raw padded batch[-1].
        y_true = np.append(y_true, y.detach().cpu().numpy())

        counter += 1
        if counter >= max_batches:
            break

words torch.Size([32, 249])
tags torch.Size([32, 249])
lens 32
word_embeds torch.Size([32, 249, 300])
tag_embeds torch.Size([32, 249, 25])
cat torch.Size([32, 249, 325])
pad_packed_sequence torch.Size([32, 249, 500])
mlp1 torch.Size([32, 249, 100])
mlp1_tanh torch.Size([32, 249, 100])
mlp2 torch.Size([32, 249, 250])
log_softmax torch.Size([32, 249, 250])
transpose torch.Size([32, 250, 249])


In [13]:
# (mask.sum(dim=1) != lens).sum()

In [14]:
# mask.shape

In [15]:
# out.shape

In [16]:
# mask[mask == 1.].shape

In [17]:
# out.transpose(2, 1)[mask == 1.].shape

In [None]:
#### Matan's code below

In [None]:
## Dataset class; needs review -- keep only the relevant parts
class Dependency_Parser_Dataset(nn.Module):
    """Index-encoded sentences read from a ``<subset>.wtag`` file.

    Work in progress: the word-embedding initialiser is still a placeholder, so the
    class can currently only be built when ``word_embeddings`` is supplied.

    Args:
        word_dict: passed through to ``PosDataReader``.
        pos_dict: passed through to ``PosDataReader``.
        dir_path: directory containing the ``.wtag`` files.
        subset: file stem; the original comment lists ``train`` and ``test``.
        padding: accepted but currently unused (the padding code path is disabled).
        word_embeddings: optional ``(word_idx_mappings, idx_word_mappings,
            word_vectors)`` triple to reuse instead of building a new one.
    """

    def __init__(self, word_dict, pos_dict, dir_path: str, subset: str,
                 padding=False, word_embeddings=None):
        super().__init__()
        self.subset = subset  # One of the following: [train, test]
        # os.path.join gives the same path as plain concatenation when dir_path
        # ends with a separator, and a correct one when it does not.
        self.file = os.path.join(dir_path, subset + ".wtag")
        self.datareader = PosDataReader(self.file, word_dict, pos_dict)
        self.vocab_size = len(self.datareader.word_dict)
        if word_embeddings:
            self.word_idx_mappings, self.idx_word_mappings, self.word_vectors = word_embeddings
        else:
            # TODO: plug the project's own word embeddings in here.
            self.word_idx_mappings, self.idx_word_mappings, self.word_vectors = \
                self.init_word_embeddings(self.datareader.word_dict)
        self.pos_idx_mappings, self.idx_pos_mappings = self.init_pos_vocab(self.datareader.pos_dict)

        self.pad_idx = self.word_idx_mappings.get(PAD_TOKEN)
        self.unknown_idx = self.word_idx_mappings.get(UNKNOWN_TOKEN)
        self.word_vector_dim = self.word_vectors.size(-1)
        self.sentence_lens = [len(sentence) for sentence in self.datareader.sentences]
        self.max_seq_len = max(self.sentence_lens)
        self.sentences_dataset = self.convert_sentences_to_dataset(padding)

    def __len__(self):
        """Number of encoded sentences."""
        return len(self.sentences_dataset)

    def __getitem__(self, index):
        """Return the ``(word_idx_tensor, pos_idx_tensor, sentence_len)`` triple at ``index``."""
        return self.sentences_dataset[index]

    def get_word_embeddings(self):
        """Return the ``(word_idx_mappings, idx_word_mappings, word_vectors)`` triple."""
        return self.word_idx_mappings, self.idx_word_mappings, self.word_vectors

    def init_word_embeddings(self, word_dict):
        """Placeholder for building the word vocabulary and vectors.

        Raises:
            NotImplementedError: always, until the project's embeddings are wired in.
        """
        raise NotImplementedError(
            "init_word_embeddings is not implemented yet; pass `word_embeddings` explicitly"
        )

    def init_pos_vocab(self, pos_dict):
        """Build the POS lookup tables from the special tokens.

        Returns:
            ``(pos_idx_mappings, idx_pos_mappings)``: a token -> index dict and the
            sorted list of those indices.

        NOTE(review): ``pos_dict`` is not used yet, so only the special tokens get
        an index and real POS tags will look up as ``None``.
        """
        idx_pos_mappings = sorted([self.word_idx_mappings.get(token) for token in SPECIAL_TOKENS])
        pos_idx_mappings = {self.idx_word_mappings[idx]: idx for idx in idx_pos_mappings}
        return pos_idx_mappings, idx_pos_mappings

    def convert_sentences_to_dataset(self, padding):
        """Encode every sentence as index tensors.

        Args:
            padding: currently ignored; sentences keep their own length.

        Returns:
            Dict mapping sentence position to
            ``(word_idx_tensor, pos_idx_tensor, sentence_len)``.
        """
        sentence_word_idx_list = list()
        sentence_pos_idx_list = list()
        sentence_len_list = list()
        for sentence_idx, sentence in enumerate(self.datareader.sentences):
            words_idx_list = []
            pos_idx_list = []
            for word, pos in sentence:
                # Out-of-vocabulary words fall back to the unknown-token index
                # instead of None, which torch.tensor cannot encode.
                words_idx_list.append(self.word_idx_mappings.get(word, self.unknown_idx))
                pos_idx_list.append(self.pos_idx_mappings.get(pos))
            sentence_len = len(words_idx_list)
            sentence_word_idx_list.append(torch.tensor(words_idx_list, dtype=torch.long, requires_grad=False))
            sentence_pos_idx_list.append(torch.tensor(pos_idx_list, dtype=torch.long, requires_grad=False))
            sentence_len_list.append(sentence_len)

        return {i: sample_tuple for i, sample_tuple in enumerate(zip(sentence_word_idx_list,
                                                                     sentence_pos_idx_list,
                                                                     sentence_len_list))}

In [None]:
## our model
class Dnn_Dependency_Parser(nn.Module):
    """Embedding -> 2-layer BiLSTM -> linear tagger for one sentence at a time.

    Args:
        word_embeddings: 2-D tensor of word vectors; only its second dimension is
            used, to size the embedding table and the LSTM input.
        hidden_dim: LSTM hidden size per direction.
        word_vocab_size: number of rows in the embedding table.
        tag_vocab_size: number of output classes.
    """

    def __init__(self, word_embeddings, hidden_dim, word_vocab_size, tag_vocab_size):
        super(Dnn_Dependency_Parser, self).__init__()
        emb_dim = word_embeddings.shape[1]
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Trainable embedding table sized like the supplied vectors.
        # TODO: swap in the project's own embeddings, e.g.
        # nn.Embedding.from_pretrained(word_embeddings, freeze=False)
        self.word_embedding = nn.Embedding(word_vocab_size, emb_dim)
        self.lstm = nn.LSTM(input_size=emb_dim, hidden_size=hidden_dim, num_layers=2, bidirectional=True, batch_first=False)
        self.hidden2tag = nn.Linear(hidden_dim*2, tag_vocab_size)

    def forward(self, word_idx_tensor):
        """Return per-token log-probabilities of shape ``[seq_length, tag_vocab_size]``.

        Args:
            word_idx_tensor: word indices of shape ``[1, seq_length]``. The reshapes
                below hard-code a batch of one sentence.
        """
        # Follow the weights rather than `self.device`, so the forward pass works
        # whether or not the module has been moved to the GPU.
        weights_device = self.word_embedding.weight.device
        embeds = self.word_embedding(word_idx_tensor.to(weights_device))  # [1, seq_length, emb_dim]
        seq_length = embeds.shape[1]
        lstm_out, _ = self.lstm(embeds.view(seq_length, 1, -1))           # [seq_length, 1, 2*hidden_dim]
        tag_space = self.hidden2tag(lstm_out.view(seq_length, -1))        # [seq_length, tag_dim]
        tag_scores = F.log_softmax(tag_space, dim=1)                      # [seq_length, tag_dim]
        return tag_scores


In [None]:
#CUDA_LAUNCH_BLOCKING=1  

EPOCHS = 15
WORD_EMBEDDING_DIM = 100  # TODO: decide whether this is the right dimension
HIDDEN_DIM = 1000  # TODO: decide whether this is the right dimension
word_vocab_size = len(train.word_idx_mappings)
tag_vocab_size = len(train.pos_idx_mappings)

# TODO: decide which constructor parameters are actually relevant.
# NOTE(review): this rebinds the notebook-level name `model`.
model = Dnn_Dependency_Parser(train_dataloader.dataset.word_vectors, HIDDEN_DIM, word_vocab_size, tag_vocab_size)

# `device` is a torch.device, so comparing it with the string "cuda" is unreliable;
# `.to(device)` is correct for both CPU and GPU.
model.to(device)

# NOTE(review): the loss below is the standard negative log-likelihood
# (F.nll_loss) on the model's log-probabilities. The hand-written
# Negative_log_Likelihood_Loss was being called with names that do not exist in
# this notebook (`dataset`, `network_parameters`); switch back once it is finished.

# TODO: confirm Adam (rather than plain SGD) is the optimizer we want.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
acumulate_grad_steps = 50  # This is the actual batch_size, while we officially use batch_size=1

# Training start
print("Training Started")
accuracy_list = []
loss_list = []
epochs = EPOCHS
for epoch in range(epochs):
    acc = 0.0  # running sum of per-sentence accuracy
    printable_loss = 0.0  # running sum of per-sentence (unscaled) loss
    i = 0
    model.zero_grad()
    for batch_idx, input_data in enumerate(train_dataloader):
        i += 1
        words_idx_tensor, pos_idx_tensor, sentence_length = input_data

        sentence_scores = model(words_idx_tensor)                        # [seq_length, tag_dim]
        sentence_scores = sentence_scores.unsqueeze(0).permute(0, 2, 1)  # [1, tag_dim, seq_length]
        gold = pos_idx_tensor.to(sentence_scores.device)

        loss = F.nll_loss(sentence_scores, gold)
        # Scale only the gradient contribution; the logged loss stays unscaled.
        (loss / acumulate_grad_steps).backward()

        if i % acumulate_grad_steps == 0:
            optimizer.step()
            model.zero_grad()

        printable_loss += loss.item()
        _, indices = torch.max(sentence_scores, 1)
        acc += (gold == indices).float().mean().item()

    # Apply gradients left over from an incomplete accumulation window.
    if i % acumulate_grad_steps != 0:
        optimizer.step()
        model.zero_grad()

    n_batches = max(i, 1)  # guard against an empty loader
    printable_loss = printable_loss / n_batches
    acc = acc / n_batches
    loss_list.append(float(printable_loss))
    accuracy_list.append(float(acc))
    # NOTE(review): `test_dataloader` must be defined before this cell runs.
    test_acc = evaluate(test_dataloader)
    # Report this epoch's own averages.
    print("Epoch {} Completed,\tLoss {}\tAccuracy: {}\t Test Accuracy: {}".format(epoch + 1, loss_list[-1], accuracy_list[-1], test_acc))
  

In [None]:
## loss function
def Negative_log_Likelihood_Loss(dataset, network_parameters):
    """Sum over sentences of the mean negative log-likelihood of the gold edges.

    For each sentence the loss adds ``-(1 / |y_i|) * sum(log p(head | modifier))``
    over its gold ``(head, modifier)`` pairs.

    Args:
        dataset: iterable of ``(x_i, y_i)`` pairs, where ``y_i`` is a sequence of
            ``(head, modifier)`` index pairs.
        network_parameters: NOTE(review): assumed to be a callable that maps ``x_i``
            to a 2-D score tensor indexed ``[head, modifier]`` -- confirm against
            the intended model interface.

    Returns:
        The accumulated loss (a 0-dim tensor, or the int ``0`` when no sentence
        contributes). Sentences with no gold edges are skipped.
    """
    loss = 0
    for x_i, y_i in dataset:
        n_edges = absoulte_y_i(y_i)
        if n_edges == 0:
            # Nothing to score; also avoids dividing by zero.
            continue
        # Normalise over candidate heads (dim 0) separately for every modifier.
        log_probs = F.log_softmax(network_parameters(x_i), dim=0)
        for head, modifier in y_i:
            loss = loss - log_probs[head, modifier] / n_edges
    return loss


def absoulte_y_i(y_i):
    """Return ``|y_i|``, the number of gold edges in the sentence."""
    return len(y_i)

In [None]:
## Evaluate function - I used the function they shared with us; it looks OK to me
def evaluate(test_dataloader, net=None):
    """Return the mean per-batch tagging accuracy over ``test_dataloader``.

    Args:
        test_dataloader: iterable of ``(words_idx_tensor, pos_idx_tensor,
            sentence_length)`` batches.
        net: module to evaluate; defaults to the notebook-level ``model``.

    Returns:
        The average of the per-batch accuracies (a 0-dim tensor), or ``0.0`` when
        the loader yields no batches.
    """
    net = model if net is None else net

    # Evaluate in eval mode, then put the module back in whatever mode it was in.
    was_training = net.training
    net.eval()

    acc = 0
    n_batches = 0
    try:
        with torch.no_grad():
            for batch_idx, input_data in enumerate(test_dataloader):
                words_idx_tensor, pos_idx_tensor, sentence_length = input_data
                tag_scores = net(words_idx_tensor)
                tag_scores = tag_scores.unsqueeze(0).permute(0, 2, 1)  # [1, tag_dim, seq_length]

                _, indices = torch.max(tag_scores, 1)
                # `.float()` replaces torch.tensor(<tensor>, dtype=...), which
                # re-wraps an existing tensor and triggers a copy-construct warning.
                acc += (pos_idx_tensor.to("cpu") == indices.to("cpu")).float().mean()
                n_batches += 1
    finally:
        net.train(was_training)

    # Average over the batches actually seen instead of a global dataset length.
    return acc / n_batches if n_batches else 0.0