In [1]:
import os
import sys
import time

from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
# import torch.nn.Softmax as softmax

import src.pytorch_utils as ptu
import src.dataset as dset
from src import bilstm

# Seed both NumPy's and PyTorch's global random generators with the same value.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# Directory name handed to the checkpoint helper in a later cell.
models_path = 'models'
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
%%time
train_dataset = dset.DataSet('data/train.labeled', tqdm_bar=True)
test_dataset = dset.DataSet('data/test.labeled', tags=train_dataset.tags, words=train_dataset.words, tqdm_bar=True)
comp_dataset = dset.DataSet('data/comp.unlabeled', tags=train_dataset.tags, words=train_dataset.words, tqdm_bar=True, tagged=False)

100%|██████████| 125430/125430 [00:12<00:00, 10131.22it/s]
100%|██████████| 25325/25325 [00:02<00:00, 10491.83it/s]
100%|██████████| 24744/24744 [00:02<00:00, 11888.64it/s]

CPU times: user 16.9 s, sys: 281 ms, total: 17.2 s
Wall time: 17.1 s





In [4]:
## Using the BiLSTM we found on the internet.
## TODO: review the model and compare it to ours.
vocab_size = len(train_dataset.words)
embedding_dim = 300
hidden_dim = 256
version = 0.0

# NOTE(review): `tag_to_ix` is not defined in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel. It presumably
# should be derived from `train_dataset.tags` — confirm what
# `bilstm.BiLSTM_CRF` expects before wiring it in.
checkpoint = ptu.Checkpoint(models_path=models_path,
                            version=version,
                            model=bilstm.BiLSTM_CRF(vocab_size, tag_to_ix, embedding_dim, hidden_dim),
                            # Fraction of positions where prediction and gold label agree.
                            score=lambda y_true, y_pred: (np.array(y_true) == np.array(y_pred)).mean(),
                            # Reuse the notebook-wide seed instead of repeating the literal.
                            seed=seed,
                            optimizer=torch.optim.Adam,
                            criterion=nn.NLLLoss,
                            save=False,
                            prints=False,
                           )

In [None]:
#### matan's code below

In [None]:
## Dataset class - TODO: review and keep only the relevant parts
class Dependency_Parser_Dataset(nn.Module):
    """Index-encoded sentences of one ``.wtag`` split.

    The file ``dir_path + subset + ".wtag"`` is read through ``PosDataReader``
    (defined outside this cell; it is expected to expose ``word_dict``,
    ``pos_dict`` and ``sentences``). Every sentence is converted into a
    ``(word_idx_tensor, pos_idx_tensor, sentence_len)`` triple stored in
    ``self.sentences_dataset``.

    NOTE(review): the class derives from ``nn.Module`` although it registers no
    parameters; ``__len__``/``__getitem__`` are provided so that it can be
    indexed like a map-style dataset.

    Args:
        word_dict: forwarded to ``PosDataReader``.
        pos_dict: forwarded to ``PosDataReader``.
        dir_path: prefix of the data file path (concatenated as-is, so it must
            already end with a path separator).
        subset: name of the split, e.g. ``"train"`` or ``"test"``.
        padding: forwarded to ``convert_sentences_to_dataset``; currently ignored.
        word_embeddings: optional ``(word_idx_mappings, idx_word_mappings,
            word_vectors)`` triple, e.g. the result of ``get_word_embeddings()``
            of another split. When omitted, ``init_word_embeddings`` is used.
    """

    def __init__(self, word_dict, pos_dict, dir_path: str, subset: str,
                 padding=False, word_embeddings=None):
        super().__init__()
        self.subset = subset  # One of the following: [train, test]
        self.file = dir_path + subset + ".wtag"
        self.datareader = PosDataReader(self.file, word_dict, pos_dict)
        self.vocab_size = len(self.datareader.word_dict)
        if word_embeddings:
            self.word_idx_mappings, self.idx_word_mappings, self.word_vectors = word_embeddings
        else:
            # TODO: need to add our word embeddings here.
            (self.word_idx_mappings,
             self.idx_word_mappings,
             self.word_vectors) = self.init_word_embeddings(self.datareader.word_dict)
        self.pos_idx_mappings, self.idx_pos_mappings = self.init_pos_vocab(self.datareader.pos_dict)

        self.pad_idx = self.word_idx_mappings.get(PAD_TOKEN)
        self.unknown_idx = self.word_idx_mappings.get(UNKNOWN_TOKEN)
        self.word_vector_dim = self.word_vectors.size(-1)
        self.sentence_lens = [len(sentence) for sentence in self.datareader.sentences]
        self.max_seq_len = max(self.sentence_lens)
        self.sentences_dataset = self.convert_sentences_to_dataset(padding)

    def __len__(self):
        """Number of encoded sentences."""
        return len(self.sentences_dataset)

    def __getitem__(self, index):
        """Return the ``(word_idx_tensor, pos_idx_tensor, sentence_len)`` triple at ``index``."""
        return self.sentences_dataset[index]

    def get_word_embeddings(self):
        """Return the word vocabulary triple so another split can reuse it."""
        return self.word_idx_mappings, self.idx_word_mappings, self.word_vectors

    def init_word_embeddings(self, word_dict):
        """Build ``(word_idx_mappings, idx_word_mappings, word_vectors)`` from ``word_dict``.

        Not implemented yet: the notebook still has to decide which embeddings
        to use. Raising here gives a clear message instead of the
        AttributeError the missing method would otherwise produce.
        """
        raise NotImplementedError(
            "init_word_embeddings is not implemented yet; "
            "pass `word_embeddings` to the constructor instead"
        )

    def init_pos_vocab(self, pos_dict):
        """Return ``(pos_idx_mappings, idx_pos_mappings)`` seeded with the special tokens.

        ``idx_pos_mappings`` is the sorted list of word-vocabulary indices of
        ``SPECIAL_TOKENS``; ``pos_idx_mappings`` maps each of those tokens back
        to its index.

        NOTE(review): ``pos_dict`` is not consumed yet, so no real POS tag is
        registered and ``convert_sentences_to_dataset`` will get ``None`` for
        every non-special tag. The tags still have to be added here.
        """
        idx_pos_mappings = sorted(self.word_idx_mappings.get(token) for token in SPECIAL_TOKENS)
        pos_idx_mappings = {self.idx_word_mappings[idx]: idx for idx in idx_pos_mappings}
        return pos_idx_mappings, idx_pos_mappings

    def convert_sentences_to_dataset(self, padding):
        """Encode every sentence of the reader as index tensors.

        Args:
            padding: currently ignored (no padding is applied; every sentence
                keeps its own length).

        Returns:
            dict mapping the sentence position to a
            ``(word_idx_tensor, pos_idx_tensor, sentence_len)`` triple, where
            both tensors are 1-D ``torch.long`` tensors.
        """
        samples = {}
        for sentence_idx, sentence in enumerate(self.datareader.sentences):
            word_ids = []
            pos_ids = []
            for word, pos in sentence:
                # Words missing from the vocabulary fall back to the unknown-token index.
                word_ids.append(self.word_idx_mappings.get(word, self.unknown_idx))
                pos_ids.append(self.pos_idx_mappings.get(pos))
            samples[sentence_idx] = (
                torch.tensor(word_ids, dtype=torch.long),
                torch.tensor(pos_ids, dtype=torch.long),
                len(word_ids),
            )
        return samples

In [None]:
## our model
class Dnn_Dependency_Parser(nn.Module):
    """Two-layer bidirectional LSTM that scores every tag for every token.

    Args:
        word_embeddings: 2-D tensor/array; only ``shape[1]`` is read, to size
            the embedding layer.
        hidden_dim: hidden size of each LSTM direction.
        word_vocab_size: number of rows of the embedding table.
        tag_vocab_size: number of output classes per token.
    """

    def __init__(self, word_embeddings, hidden_dim, word_vocab_size, tag_vocab_size):
        super(Dnn_Dependency_Parser, self).__init__()
        emb_dim = word_embeddings.shape[1]
        # Kept for callers that read it; `forward` follows the weights' device instead.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Trainable embedding table with the same width as `word_embeddings`.
        # TODO: switch to nn.Embedding.from_pretrained(word_embeddings, freeze=False)
        # once our own embeddings are available.
        self.word_embedding = nn.Embedding(word_vocab_size, emb_dim)
        self.lstm = nn.LSTM(input_size=emb_dim, hidden_size=hidden_dim, num_layers=2,
                            bidirectional=True, batch_first=False)
        self.hidden2tag = nn.Linear(hidden_dim * 2, tag_vocab_size)

    def forward(self, word_idx_tensor):
        """Return per-token log-probabilities of shape ``[seq_length, tag_vocab_size]``.

        Args:
            word_idx_tensor: integer tensor of shape ``[1, seq_length]``; the
                reshapes below hard-code a batch size of 1.
        """
        # Send the indices to wherever the weights live, so the model works
        # whether or not the caller moved it to the GPU.
        weights_device = self.word_embedding.weight.device
        embeds = self.word_embedding(word_idx_tensor.to(weights_device))  # [batch_size, seq_length, emb_dim]
        seq_length = embeds.shape[1]
        lstm_out, _ = self.lstm(embeds.view(seq_length, 1, -1))           # [seq_length, batch_size, 2*hidden_dim]
        tag_space = self.hidden2tag(lstm_out.view(seq_length, -1))        # [seq_length, tag_dim]
        tag_scores = F.log_softmax(tag_space, dim=1)                      # [seq_length, tag_dim]
        return tag_scores


In [None]:
#CUDA_LAUNCH_BLOCKING=1

EPOCHS = 15
WORD_EMBEDDING_DIM = 100  # TODO: decide whether this is the right dimension
HIDDEN_DIM = 1000  # TODO: decide whether this is the right dimension
word_vocab_size = len(train.word_idx_mappings)
tag_vocab_size = len(train.pos_idx_mappings)

# TODO: decide which parameters are relevant.
model = Dnn_Dependency_Parser(train_dataloader.dataset.word_vectors, HIDDEN_DIM, word_vocab_size, tag_vocab_size)

# `device` is a torch.device, which never compares equal to the string "cuda",
# so move the model unconditionally (a no-op when `device` is the CPU).
model.to(device)

# nn.NLLLoss is intentionally not used here; the loss comes from
# Negative_log_Likelihood_Loss.

# TODO: confirm Adam is the optimizer we want.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
acumulate_grad_steps = 50  # This is the actual batch_size, while we officially use batch_size=1

# Training start
print("Training Started")
accuracy_list = []
loss_list = []
epochs = EPOCHS
for epoch in range(epochs):
    acc = 0  # running sum of per-sentence accuracy
    printable_loss = 0  # running sum of the (scaled) loss values
    for step, input_data in enumerate(train_dataloader, start=1):
        words_idx_tensor, pos_idx_tensor, sentence_length = input_data

        sentence_scores = model(words_idx_tensor)
        # [seq_length, tag_dim] -> [1, tag_dim, seq_length], same layout as in `evaluate`.
        sentence_scores = sentence_scores.unsqueeze(0).permute(0, 2, 1)
        # NOTE(review): `dataset` and `network_parameters` are not defined in any
        # visible cell, and the loss function lives in a later cell; this call
        # still has to be wired to the current sentence.
        loss = Negative_log_Likelihood_Loss(dataset, network_parameters)  # need to fix network_parameters
        # Scale so that the accumulated gradient is an average over the virtual batch.
        loss = loss / acumulate_grad_steps
        loss.backward()

        if step % acumulate_grad_steps == 0:
            optimizer.step()
            model.zero_grad()
        printable_loss += loss.item()
        _, indices = torch.max(sentence_scores, 1)
        acc += (pos_idx_tensor.to("cpu") == indices.to("cpu")).float().mean().item()
    printable_loss = printable_loss / len(train)
    acc = acc / len(train)
    loss_list.append(float(printable_loss))
    accuracy_list.append(float(acc))
    # `evaluate` requires the loader as its argument; `test_dataloader` is
    # assumed to be built next to `train_dataloader` — TODO confirm.
    test_acc = evaluate(test_dataloader)
    # Report this epoch's own values rather than a mean over all epochs so far.
    print("Epoch {} Completed,\tLoss {}\tAccuracy: {}\t Test Accuracy: {}".format(epoch + 1, loss_list[-1], accuracy_list[-1], float(test_acc)))
  

In [None]:
## loss function
def Negative_log_Likelihood_Loss(dataset, network_parameters):
    """Sum over samples of the mean negative log-probability of the gold edges.

    For every ``(x_i, y_i)`` in ``dataset`` and every ``(head, modifier)`` pair
    in ``y_i`` the term ``-(1 / |y_i|) * log(softmax_score(head, modifier))`` is
    added, where ``softmax_score = softmax(y_i)``. Samples with no pairs
    contribute nothing.

    NOTE(review): ``softmax`` is not defined in any visible cell (its import at
    the top of the notebook is commented out). As written it must return a
    callable that yields a torch tensor for ``(head, modifier)``. Neither
    ``x_i`` nor ``network_parameters`` is used yet, so the scores do not depend
    on the network — this still has to be connected to the model output.

    Args:
        dataset: iterable of ``(x_i, y_i)`` pairs; ``y_i`` is a sized iterable
            of ``(head, modifier)`` pairs.
        network_parameters: currently unused.

    Returns:
        The accumulated loss (``0`` when no sample has any pair).
    """
    loss = 0
    for x_i, y_i in dataset:
        n_edges = absoulte_y_i(y_i)
        if n_edges == 0:
            # Nothing to score; also avoids dividing by zero.
            continue
        softmax_score = softmax(y_i)
        weight = 1 / n_edges
        for head, modifier in y_i:
            # torch.log keeps the result a tensor so the caller can backpropagate.
            loss -= weight * torch.log(softmax_score(head, modifier))
    return loss


def absoulte_y_i(y_i):
    """Return ``|y_i|``, the number of ``(head, modifier)`` pairs in ``y_i``."""
    return len(y_i)

In [None]:
## Evaluate function - I used the function they shared with us; it looks OK to me
def evaluate(test_dataloader):
    """Return the mean per-batch tagging accuracy of the global ``model``.

    Each batch's accuracy is the fraction of positions where the arg-max tag
    equals the gold tag; the result is the average of those fractions over the
    batches actually yielded by ``test_dataloader``.

    Args:
        test_dataloader: iterable of
            ``(words_idx_tensor, pos_idx_tensor, sentence_length)`` triples.

    Returns:
        0-dim float tensor with the average accuracy (``0`` for an empty loader).
    """
    total = torch.zeros(())
    n_batches = 0
    with torch.no_grad():
        for input_data in test_dataloader:
            words_idx_tensor, pos_idx_tensor, sentence_length = input_data
            tag_scores = model(words_idx_tensor)
            # [seq_length, tag_dim] -> [1, tag_dim, seq_length]
            tag_scores = tag_scores.unsqueeze(0).permute(0, 2, 1)

            _, indices = torch.max(tag_scores, 1)
            total += (pos_idx_tensor.to("cpu") == indices.to("cpu")).float().mean()
            n_batches += 1
    if n_batches == 0:
        return total
    # Normalise by what was iterated rather than by an unrelated global.
    return total / n_batches