In [None]:
import os
import sys
import time
import math

# import dill
# from tqdm import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from src import utils
from src import bilstm
import src.dataset as dset
import src.pytorch_utils as ptu
import src.chu_liu_edmonds as chu

import warnings
warnings.filterwarnings('ignore')

# --- Reproducibility -------------------------------------------------------
# One seed drives both the NumPy and the PyTorch random number generators.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

# cuDNN: request deterministic kernels and turn off the benchmark-based
# algorithm auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# --- Runtime configuration -------------------------------------------------
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Directory name handed to ptu.Checkpoint as `versions_dir`.
versions_dir = 'models'

In [2]:
# Options shared by both splits: show a tqdm progress bar while loading and
# enable the `use_glove` flag of dset.DataSet.
dataset_options = dict(tqdm_bar=True, use_glove=True)

# The test split receives the already-built training DataSet
# (presumably so it can reuse what was fitted on the training data -- TODO confirm in src/dataset.py).
train_dataset = dset.DataSet('data/train.labeled', **dataset_options)
test_dataset = dset.DataSet('data/test.labeled', train_dataset=train_dataset, **dataset_options)

100%|██████████| 125430/125430 [00:16<00:00, 7426.27it/s]
100%|██████████| 25325/25325 [00:03<00:00, 7434.43it/s]


In [3]:
# Run configuration consumed by the cells below.
# `save` is forwarded to ptu.Checkpoint(...) and checkpoint.train(...)
# (presumably toggles writing checkpoints to disk -- TODO confirm in src/pytorch_utils.py).
save = False
# `version` is the identifier passed to ptu.Checkpoint for this experiment.
version = 'V2_1.10'

In [4]:
def token_accuracy(y_true, y_pred):
    """Return the fraction of positions at which ``y_pred`` equals ``y_true``.

    Both arguments are converted with ``np.array`` and compared element-wise;
    the mean of the resulting boolean array is returned.
    """
    return (np.array(y_true) == np.array(y_pred)).mean()


# Model under training. `softmax` is a LogSoftmax over dim 2, which matches the
# NLLLoss criterion given to the checkpoint below (NLLLoss expects
# log-probabilities).
# NOTE(review): `glove=True` with `freeze=True` presumably keeps pretrained
# GloVe embeddings fixed -- confirm in src/bilstm.py.
model = bilstm.BiLSTM(train_dataset=train_dataset,
                      word_embed_dim=300,
                      tag_embed_dim=25,
                      hidden_dim=200,
                      num_layers=4,
                      bias=True,
                      lstm_activation=None,
                      p_dropout=0.3,
                      attention=utils.MultiplicativeAttention,
                      softmax=nn.LogSoftmax(dim=2),
                      glove=True,
                      freeze=True)

# Checkpoint wrapper around the model. The optimizer and criterion are passed
# as classes (not instances). `seed` reuses the notebook-level seed instead of
# repeating the literal, so the two values cannot drift apart.
checkpoint = ptu.Checkpoint(versions_dir=versions_dir,
                            version=version,
                            model=model,
                            score=token_accuracy,
                            loss_decision_func=utils.loss_decision_func,
                            out_decision_func=chu.test_chu_liu_edmonds,
                            seed=seed,
                            optimizer=torch.optim.AdamW,
                            criterion=nn.NLLLoss,
                            save=save,
                            prints=True)

model version: V2_1.10
Number of parameters 8144200 trainable 3894400


In [5]:
# checkpoint = ptu.load_model(version=version, versions_dir=versions_dir, epoch=40, seed=42)
# display(checkpoint.log)

In [6]:
# Forwarded positionally to train_dataset.dataset(...) when the training split
# is built (presumably the word-dropout strength -- TODO confirm in src/dataset.py).
word_dropout_alpha = 0.25

# Training sessions, run one after another on the same checkpoint. Every entry
# is expanded into keyword arguments of checkpoint.train; only the second
# session sets `lr_decay`.
hyperparam_list = [
    dict(train_epochs=20, batch_size=8,
         optimizer_params=dict(lr=2e-3, weight_decay=5e-7)),
    dict(train_epochs=20, batch_size=8,
         optimizer_params=dict(lr=2e-3, weight_decay=1e-6),
         lr_decay=0.2),
    dict(train_epochs=20, batch_size=8,
         optimizer_params=dict(lr=4e-4, weight_decay=0.0)),  # 1e-6
]

for session in hyperparam_list:
    # Both dataset objects are rebuilt on every iteration; the labeled test
    # file serves as the validation set.
    checkpoint.train(
        device=device,
        train_dataset=train_dataset.dataset(word_dropout_alpha, train=True),
        val_dataset=test_dataset.dataset(train=False),
        prints=True,
        epochs_save=5,
        save=save,
        # early_stop=5,  (currently disabled)
        **session,
    )

epoch   1/ 20 | train_loss 0.44128 | val_loss 0.47693 | train_score 0.86718 | val_score 0.85866 | train_time   1.07 min *
epoch   2/ 20 | train_loss 0.31957 | val_loss 0.41053 | train_score 0.90104 | val_score 0.87630 | train_time   2.91 min *
epoch   3/ 20 | train_loss 0.22960 | val_loss 0.37935 | train_score 0.92700 | val_score 0.88658 | train_time   4.56 min *
epoch   4/ 20 | train_loss 0.19850 | val_loss 0.39567 | train_score 0.93610 | val_score 0.88559 | train_time   6.28 min
epoch   5/ 20 | train_loss 0.15092 | val_loss 0.39126 | train_score 0.94990 | val_score 0.89184 | train_time   7.52 min *
epoch   6/ 20 | train_loss 0.11670 | val_loss 0.41072 | train_score 0.96096 | val_score 0.89340 | train_time   9.53 min *
epoch   7/ 20 | train_loss 0.09383 | val_loss 0.43074 | train_score 0.96850 | val_score 0.89595 | train_time  11.23 min *
epoch   8/ 20 | train_loss 0.08163 | val_loss 0.45031 | train_score 0.97336 | val_score 0.89702 | train_time  12.97 min *
epoch   9/ 20 | train_loss

In [None]:
# version = 'V2_hpo_1.0'

In [None]:
# attentions = {
#     'Additive': utils.AdditiveAttention,
#     'Multiplicative': utils.MultiplicativeAttention,
# }

# softmaxs = {
#     'LogSoftmax': nn.LogSoftmax(dim=2),
# #     'Softmax': nn.Softmax(dim=2),
# }

# activations = dict(sorted(list({
#     'tanh': nn.Tanh(),
#     'hard_tanh': nn.Hardtanh(),
# #     'relu': nn.ReLU(),
# #     'elu': nn.ELU(),
# #     'leaky_relu': nn.LeakyReLU(),
#     'p_relu': nn.PReLU(),
# #     'relu6': nn.ReLU6(),
# #     'gelu': nn.GELU(),
# #     'sigmoid': nn.Sigmoid(),
# }.items()), key=lambda x: x[0]))

In [None]:
# import hyperopt as hpo

In [None]:
# init_space = dict(sorted(list({
# #     'train_epochs': 50,
#     'batch_size': 16, #hpo.hp.quniform('batch_size', low=4, high=5, q=1),  # 16-32-64
#     'optimizer__lr': hpo.hp.uniform('optimizer__lr', low=8e-4, high=2e-3),
#     'optimizer__wd': hpo.hp.uniform('optimizer__wd', low=5e-7, high=5e-6),# 0.0
# #     'early_stop': 5,
    
#     'word_embed_dim': 300,  # 300
#     'tag_embed_dim': 32, #hpo.hp.quniform('tag_embed_dim', low=30, high=50, q=4), #25
#     'hidden_dim': hpo.hp.quniform('hidden_dim', low=200, high=300, q=50), #125,  # 
#     'num_layers': hpo.hp.quniform('num_layers', low=3, high=4, q=1),#2,  # 
#     'bias': True, #hpo.hp.choice('bias', [True, False]),
#     'attention_dim': hpo.hp.quniform('attention_dim', low=200, high=300, q=50),#100,  # 
#     'attention': hpo.hp.choice('attention', list(attentions.keys())),
#     'activation': hpo.hp.choice('activation', list(activations.keys())),
#     'softmax': hpo.hp.choice('softmax', list(softmaxs.keys())),
#     'p_dropout': hpo.hp.uniform('p_dropout', low=0.3, high=0.6),#0.1,  # 
#     'lr_decay': hpo.hp.uniform('lr_decay', low=0.15, high=0.25),#0.1,  # 
#     'freeze': True, #hpo.hp.choice('freeze', [True, False]),
# }.items()), key=lambda x: x[0]))

# def init_objective(space, save=False):
#     display(space)
#     last_score = init_log['test_score'].max() if len(init_log) > 0 else 0.0
#     batch_size = int(2 ** space['batch_size'])
# #     attention = utils.MultiplicativeAttention if space['attention'] == 'Multiplicative' else utils.AdditiveAttention
# #     activation = space['attention'] if space['attention'] != 'Multiplicative' else 'tanh'
# #     activation = activations[activation]
    
#     model = m2.Model2(train_dataset=train_dataset,
#                       word_embed_dim=space['word_embed_dim'],  # 300
#                       tag_embed_dim=space['tag_embed_dim'],  # 32
#                       hidden_dim=int(space['hidden_dim']),  # 125
#                       num_layers=int(space['num_layers']),  # 2
#                       bias=space['bias'],  # True
#                       attention_dim=int(space['attention_dim']),  # 10
#                       activation=activations[space['activation']],
#                       p_dropout=space['p_dropout'],  # 0.5
#                       attention=attentions[space['attention']],
#                       softmax=softmaxs[space['softmax']],
#                       glove=True,
#                       freeze=space['freeze'])

#     init_checkpoint = ptu.Checkpoint(versions_dir=versions_dir,
#                                      version=version,
#                                      model=model,
#                                      score=lambda y_true, y_pred: (np.array(y_true) == np.array(y_pred)).mean(),
#                                      loss_decision_func=utils.loss_decision_func,
#                                      out_decision_func=lambda y_pred, flat_y_pred, mask, padding: flat_y_pred.argmax(axis=1),
#                                      seed=42,
#                                      optimizer=torch.optim.AdamW,
#                                      criterion=nn.NLLLoss,
#                                      save=False,
#                                      prints=True)
    
#     word_dropout_alpha = 0.25
#     hyperparam_list = [
# #         {'train_epochs': 1, 'batch_size': 16, 'optimizer_params': {'lr': space['optimizer__lr'], 'weight_decay': 5e-7}},
#         {'train_epochs': 20, 'batch_size': 16, 'optimizer_params': {'lr': space['optimizer__lr'], 'weight_decay': 5e-7}},
#         {'train_epochs': 20, 'batch_size': 16, 'optimizer_params': {'lr': space['optimizer__lr'], 'weight_decay': space['optimizer__wd']}, 'lr_decay': space['lr_decay']},
# #         {'train_epochs': 20, 'batch_size': 16, 'optimizer_params': {'lr': space['optimizer__lr'], 'weight_decay': space['optimizer__wd']}, 'lr_decay': space['lr_decay']},
#     #     {'train_epochs': 20, 'batch_size': 32, 'optimizer_params': {'lr': 2e-3, 'weight_decay': 1.5e-6}, 'lr_decay': 0.2},
#     #     {'train_epochs': 20, 'batch_size': 64, 'optimizer_params': {'lr': 2e-3, 'weight_decay': 1.5e-6}, 'lr_decay': 0.2},
#     ]

#     for session in hyperparam_list:
#         init_checkpoint.train(device=device,
#                               train_dataset=train_dataset.dataset(word_dropout_alpha, train=True),
#                               val_dataset=test_dataset.dataset(train=False),
#                               prints=True,
#                               epochs_save=5,
#                               save=save,
#         #                       early_stop=5,
#                               **session)    
    
#     train_score = init_checkpoint.get_log(col='train_score', epoch=-1)
#     test_score = init_checkpoint.get_log(col='val_score', epoch=-1)
# #     print('test_score', test_score)
#     ###############################################################
#     if test_score > last_score:
#         init_checkpoint.save(epoch=True)
#     init_log.loc[init_log.index.max() + 1 if len(init_log) > 0 else 0] = [time.strftime('%d-%m-%Y %H:%M:%S'),
# #                                                                           train_score,
#                                                                           test_score,
#                                                                           space] + list(space.values())
    
#     with open(os.path.join(versions_dir, version, 'trials.pth'), 'wb') as f:
#         dill.dump(init_trials, f)
#     init_log.to_csv(os.path.join(versions_dir, version, 'trials_log.csv'), index=False)

#     return -test_score

# # session_space = dict(sorted(list({
# #     'train_epochs': 5,
# #     'batch_size_mult': min(len(X_train), int(2**hpo.hp.quniform('batch_size_mult', low=5, high=9, q=1))),
# #     'optimizer__lr_mult': hpo.hp.uniform('optimizer__lr_mult', low=1e-5, high=1e-3),
# #     'optimizer__weight_decay': hpo.hp.uniform('optimizer__weight_decay', low=1e-5, high=1e-3),
# #     'p_dropout': max(0.0, min(0.9, hpo.hp.normal('p_dropout', mu=0.5, sigma=0.15))),
# # }.items()), key=lambda x: x[0]))

In [None]:
# # init_trials = hpo.Trials()
# # init_log = pd.DataFrame(columns=['timestamp',
# #                                  # 'train_score',
# #                                  'test_score',
# #                                  'space'] + list(init_space.keys()))

# # with open(os.path.join(versions_dir, version, 'trials.pth'), 'wb') as f:
# #     dill.dump(init_trials, f)
# # init_log.to_csv(os.path.join(versions_dir, version, 'trials_log.csv'), index=False)

# with open(os.path.join(versions_dir, version, 'trials.pth'), "rb") as f:
#     init_trials = dill.load(f)
# init_log = pd.read_csv(os.path.join(versions_dir, version, 'trials_log.csv'))
# display(init_log)

In [None]:
# iters = 500

# _ = hpo.fmin(init_objective,
#              init_space,
#              algo=hpo.tpe.suggest,
#              trials=init_trials,
#              max_queue_len=1,
#              max_evals=iters)

In [None]:
# with open(os.path.join(versions_dir, version, 'trials.pth'), 'wb') as f:
#     dill.dump(init_trials, f)
# init_log.to_csv(os.path.join(versions_dir, version, 'trials_log.csv'), index=False)

In [None]:
# checkpoint = ptu.load_model(version=version, versions_dir=versions_dir, epoch='best', seed=42)
# loss, score = checkpoint.predict(test_dataset.dataset,
#                                  batch_size=32,
#                                  device=device,
#                                  results=False,
#                                  decision_func=chu.test_chu_liu_edmonds)
# print(f'chu_liu_edmonds_UAS: {score}')

In [None]:
# %%time
# checkpoint.model = checkpoint.model.to(device)
# checkpoint.model.train()
# batch_size = 32

# loader = torch.utils.data.DataLoader(dataset=train_dataset.dataset, batch_size=batch_size, shuffle=True)
# for batch in loader:
#     loss, flat_y, flat_out, mask, out, y = utils.loss_decision_func(checkpoint, device, batch, prints=True)
#     break
# torch.cuda.empty_cache()