In [1]:
import os
import sys
import time
import math
import warnings

import dill
from tqdm import tqdm
import hyperopt as hpo  # required by the hyper-parameter search cells (hpo.hp, hpo.fmin, hpo.Trials)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from src import utils
from src import model2 as m2
import src.dataset as dset
import src.pytorch_utils as ptu
import src.chu_liu_edmonds as chu

warnings.filterwarnings('ignore')

# Reproducibility: a single seed shared by NumPy and PyTorch.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# Ask cuDNN for deterministic kernels and switch off its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Use the GPU when one is available, otherwise fall back to the CPU.
# (To force CPU execution, set: device = torch.device("cpu"))
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Root directory passed to ptu.Checkpoint as `versions_dir`.
versions_dir = 'models'

cuda


In [2]:
# train_dataset = dset.DataSet('data/train.labeled', tqdm_bar=True, use_glove=True)
# test_dataset = dset.DataSet('data/test.labeled', train_dataset=train_dataset, tqdm_bar=True, use_glove=True)
# comp_dataset = dset.DataSet('data/comp.unlabeled', train_dataset=train_dataset, tagged=False, tqdm_bar=True, use_glove=True)

In [3]:
# with open(os.path.join('data', 'train_dataset.pth'), 'wb') as f:
#     dill.dump(train_dataset, f)
# with open(os.path.join('data', 'test_dataset.pth'), 'wb') as f:
#     dill.dump(test_dataset, f)

In [4]:
# Load the train and test dataset objects from their dill-serialized files.
# NOTE: unpickling with dill can execute arbitrary code, so only load files
# that were produced by this project.
train_path = os.path.join('data', 'train_dataset.pth')
with open(train_path, mode="rb") as train_file:
    train_dataset = dill.load(train_file)

test_path = os.path.join('data', 'test_dataset.pth')
with open(test_path, mode="rb") as test_file:
    test_dataset = dill.load(test_file)

In [5]:
# Registry of activation modules, keyed by name and ordered alphabetically.
# Entries that are commented out are not available for lookup.
activations = dict(sorted(
    [
        ('tanh', nn.Tanh()),
        ('hard_tanh', nn.Hardtanh()),
        # ('relu', nn.ReLU()),
        # ('elu', nn.ELU()),
        # ('leaky_relu', nn.LeakyReLU()),
        # ('p_relu', nn.PReLU()),
        # ('relu6', nn.ReLU6()),
        # ('gelu', nn.GELU()),
        # ('sigmoid', nn.Sigmoid()),
    ],
    key=lambda pair: pair[0],
))

In [6]:
# Name of this training run; passed to ptu.Checkpoint as `version`.
version = 'V2_1.5'
# Passed as `save=` to both ptu.Checkpoint and checkpoint.train.
save = True

In [7]:
# checkpoint = ptu.load_model(version=version, versions_dir=versions_dir, epoch=-1, seed=42)
# display(checkpoint.log)

In [8]:
# Build the network. Trailing "ref:" notes keep the reference values the
# author recorded next to each setting (presumably earlier/default values -
# TODO confirm).
model = m2.Model2(
    train_dataset=train_dataset,
    # input representation
    glove=True,
    positional_encoding=False,
    word_embed_dim=300,  # ref: 300
    tag_embed_dim=32,  # ref: 32
    # layer sizes
    hidden_dim=250,  # ref: 125
    num_layers=4,  # ref: 2
    bias=True,  # ref: True
    activation=activations['tanh'],
    # regularisation
    p_dropout=0.45,  # ref: 0.5
    word_dropout=0.25,  # ref: 0.0
    # attention; alternatives: utils.DotAttention, utils.MultiplicativeAttention
    attention=utils.AdditiveAttention,
    attention_dim=200,  # ref: 10
    # log-probabilities to pair with nn.NLLLoss; alternative: nn.Softmax(dim=2)
    softmax=nn.LogSoftmax(dim=2),
)

# Wrap the model in a checkpoint object that is later used for training.
checkpoint = ptu.Checkpoint(
    model=model,
    version=version,
    versions_dir=versions_dir,
    seed=42,
    optimizer=torch.optim.Adam,
    criterion=nn.NLLLoss,
    loss_decision_func=utils.loss_decision_func,
    # prediction = arg-max along axis 1 of the flattened output
    out_decision_func=lambda y_pred, flat_y_pred, mask, padding: flat_y_pred.argmax(axis=1),
    # score = fraction of positions where prediction equals target
    score=lambda y_true, y_pred: (np.array(y_true) == np.array(y_pred)).mean(),
    save=save,
    prints=True,
)

model version: V2_1.5
Number of parameters 5882137 trainable 5882137


In [None]:
# Optimizer settings shared by the training sessions below.
wd = 1e-6
lr = 1.5e-3
lr_decay = 0.2
betas = (0.95, 0.998)

# Training schedule: one 5-epoch session, followed by three repetitions of
# (15 epochs with `lr_decay`, 5 epochs without an explicit 'lr').
hyperparam_list = [
    {'train_epochs':  5, 'optimizer_params': {'lr': lr, 'weight_decay': wd, 'betas': betas}},
]
for _cycle in range(3):
    # Fresh dicts on every pass so no two sessions share a mutable object.
    hyperparam_list.append(
        {'train_epochs': 15, 'optimizer_params': {'lr': lr, 'weight_decay': wd, 'betas': betas}, 'lr_decay': lr_decay})
    # NOTE(review): 'lr' is omitted here - presumably so the optimizer keeps
    # its current learning rate; confirm in ptu.Checkpoint.train.
    hyperparam_list.append(
        {'train_epochs':  5, 'optimizer_params': {'weight_decay': wd, 'betas': betas}})

# Run the sessions back to back on the same checkpoint.
for session in hyperparam_list:
    checkpoint.train(
        device=device,
        train_dataset=train_dataset.dataset,
        val_dataset=test_dataset.dataset,
        batch_size=16,
        epochs_save=5,
        save=save,
        prints=True,
        # early_stop=5 is left disabled for these sessions
        **session,
    )

epoch   1/  5 | train_loss 1.85642 | val_loss 1.83721 | train_score 0.44313 | val_score 0.44625 | train_time   1.29 min *
epoch   2/  5 | train_loss 0.89174 | val_loss 0.87620 | train_score 0.75149 | val_score 0.76140 | train_time   3.41 min *
epoch   3/  5 | train_loss 0.65940 | val_loss 0.66200 | train_score 0.80939 | val_score 0.81085 | train_time   5.61 min *


In [22]:
# Rebind `version` for the hyper-parameter search: init_objective passes it to
# ptu.Checkpoint and uses it to build the trials.pth / trials_log.csv paths.
version = 'V2_hpo_1.0'

In [31]:
# Search space handed to hpo.fmin, with entries ordered by key name.
# Plain values are fixed settings; hpo.hp.* expressions are sampled per trial.
# Trailing notes keep the reference values/alternatives recorded by the author.
init_space = dict(sorted(
    [
        ('train_epochs', 50),
        # exponent: init_objective trains with batch size 2 ** value  (note: 16-32-64)
        ('batch_size', hpo.hp.quniform('batch_size', low=4, high=5, q=1)),
        ('optimizer__lr', hpo.hp.uniform('optimizer__lr', low=1e-4, high=1e-3)),
        # either no weight decay or a value drawn from [0, 1e-6]  (ref: 0.0)
        ('optimizer__wd', hpo.hp.choice('optimizer__wd_ind', [0, hpo.hp.uniform('optimizer__wd', low=0, high=1e-6)])),
        ('bias', True),  # alternative: hpo.hp.choice('bias', [True, False])
        ('early_stop', 5),

        ('word_embed_dim', 300),  # ref: 300
        ('tag_embed_dim', 32),  # alternative: hpo.hp.quniform('tag_embed_dim', low=30, high=50, q=4); ref: 25
        ('hidden_dim', hpo.hp.quniform('hidden_dim', low=100, high=300, q=50)),  # ref: 125
        ('num_layers', hpo.hp.quniform('num_layers', low=2, high=4, q=1)),  # ref: 2
        ('mlp1_dim', hpo.hp.quniform('mlp1_dim', low=100, high=300, q=50)),  # ref: 100
        # draws one of the names currently registered in `activations`  (ref: nn.Tanh())
        ('activation', hpo.hp.choice('activation', list(activations.keys()))),
        # unbounded normal draws; init_objective clamps both to [0.0, 0.7]
        ('p_dropout', hpo.hp.normal('p_dropout', mu=0.1, sigma=0.1)),  # ref: 0.1
        ('word_dropout', hpo.hp.normal('word_dropout', mu=0.1, sigma=0.1)),  # ref: 0.25
    ],
    key=lambda entry: entry[0],
))

def init_objective(space, save=False):
    """Hyperopt objective: train one Model2 configuration and return its negated score.

    Reads the notebook globals ``train_dataset``, ``test_dataset``,
    ``activations``, ``device``, ``version``, ``versions_dir``, ``init_log``
    and ``init_trials``.

    Side effects: appends one row to ``init_log``, rewrites ``trials.pth`` and
    ``trials_log.csv`` under ``versions_dir/version``, and calls
    ``init_checkpoint.save(epoch=True)`` when the new score is higher than the
    best ``test_score`` logged so far.

    Args:
        space: dict of sampled hyper-parameters (the keys of ``init_space``).
        save: forwarded as ``save=`` to ``init_checkpoint.train``. The
            checkpoint object itself is always created with ``save=False``.

    Returns:
        ``-test_score``; the score is negated because ``hpo.fmin`` minimises.

    Raises:
        ValueError: if the columns of ``init_log`` are not exactly
            ``timestamp``, ``test_score``, ``space`` plus the keys of ``space``.
    """
    display(space)

    # Fail before the expensive training run if the log cannot hold this trial.
    log_columns = list(init_log.columns)
    logged_fields = {'timestamp', 'test_score', 'space'} | set(space)
    if logged_fields != set(log_columns):
        raise ValueError(
            'init_log columns do not match the sampled space: '
            f'missing from log {sorted(logged_fields - set(log_columns))}, '
            f'missing from space {sorted(set(log_columns) - logged_fields)}')

    # Best score logged so far; decides whether this trial's checkpoint is saved.
    last_score = init_log['test_score'].max() if len(init_log) > 0 else 0.0
    # `batch_size` is sampled as an exponent; never exceed the dataset size.
    batch_size = min(len(train_dataset.dataset), int(2 ** space['batch_size']))
    # The dropout rates come from unbounded normal draws, so clamp to [0.0, 0.7].
    p_dropout = max(0.0, min(0.7, space['p_dropout']))
    word_dropout = max(0.0, min(0.7, space['word_dropout']))

    # quniform returns floats, hence the int() casts on the size parameters.
    # NOTE(review): this call passes `mlp1_dim` and no attention/softmax
    # arguments, unlike the Model2 call earlier in the notebook - confirm it
    # matches the current Model2 signature.
    model = m2.Model2(train_dataset=train_dataset,
                      word_embed_dim=300,
                      tag_embed_dim=int(space['tag_embed_dim']),
                      hidden_dim=int(space['hidden_dim']),
                      num_layers=int(space['num_layers']),
                      bias=space['bias'],
                      mlp1_dim=int(space['mlp1_dim']),
                      activation=activations[space['activation']],
                      p_dropout=p_dropout,
                      word_dropout=word_dropout,
                      glove=True,
                      positional_encoding=False)

    init_checkpoint = ptu.Checkpoint(version=version,
                                     model=model,
                                     optimizer=torch.optim.Adam,
                                     criterion=nn.NLLLoss,
                                     score=lambda y_true, y_pred: (np.array(y_true) == np.array(y_pred)).mean(),
                                     versions_dir=versions_dir,
                                     loss_decision_func=utils.loss_decision_func,
                                     out_decision_func=lambda y_pred, flat_y_pred, mask, padding: flat_y_pred.argmax(axis=1),
                                     seed=42,
                                     custom_run_func=None,
                                     save=False,
                                     prints=True)

    init_checkpoint.train(device=device,
                          train_dataset=train_dataset.dataset,
                          val_dataset=test_dataset.dataset,
                          train_epochs=space['train_epochs'],
                          batch_size=batch_size,
                          optimizer_params={
                              'lr': space['optimizer__lr'],
                              'weight_decay': space['optimizer__wd'],
                          },
                          prints=True,
                          epochs_save=0,
                          early_stop=space['early_stop'],
                          save=save)

    # NOTE(review): epoch=-1 presumably selects the last logged epoch, which
    # with early stopping need not be the best one - confirm in ptu.Checkpoint.
    train_score = init_checkpoint.get_log(col='train_score', epoch=-1)  # currently not logged
    test_score = init_checkpoint.get_log(col='val_score', epoch=-1)

    if test_score > last_score:
        init_checkpoint.save(epoch=True)

    # Build the log row by column name so the values cannot be shifted into
    # the wrong columns when `space` iterates in a different order than the log.
    row = dict(space)
    row.update({'timestamp': time.strftime('%d-%m-%Y %H:%M:%S'),
                'test_score': test_score,
                'space': space})
    next_index = init_log.index.max() + 1 if len(init_log) > 0 else 0
    init_log.loc[next_index] = [row[column] for column in log_columns]

    # Persist the search state after every trial so an interrupted run can resume.
    with open(os.path.join(versions_dir, version, 'trials.pth'), 'wb') as f:
        dill.dump(init_trials, f)
    init_log.to_csv(os.path.join(versions_dir, version, 'trials_log.csv'), index=False)

    return -test_score

# session_space = dict(sorted(list({
#     'train_epochs': 5,
#     'batch_size_mult': min(len(X_train), int(2**hpo.hp.quniform('batch_size_mult', low=5, high=9, q=1))),
#     'optimizer__lr_mult': hpo.hp.uniform('optimizer__lr_mult', low=1e-5, high=1e-3),
#     'optimizer__weight_decay': hpo.hp.uniform('optimizer__weight_decay', low=1e-5, high=1e-3),
#     'p_dropout': max(0.0, min(0.9, hpo.hp.normal('p_dropout', mu=0.5, sigma=0.15))),
# }.items()), key=lambda x: x[0]))

In [None]:
# Load the hyper-parameter search state for `version`, creating it on first use.
trials_path = os.path.join(versions_dir, version, 'trials.pth')
trials_log_path = os.path.join(versions_dir, version, 'trials_log.csv')

# Bootstrap only when NEITHER file exists. If just one of them is missing the
# load below still raises, so existing search state is never overwritten.
if not os.path.exists(trials_path) and not os.path.exists(trials_log_path):
    os.makedirs(os.path.join(versions_dir, version), exist_ok=True)
    with open(trials_path, 'wb') as f:
        dill.dump(hpo.Trials(), f)
    pd.DataFrame(columns=['timestamp',
                          # 'train_score',
                          'test_score',
                          'space'] + list(init_space.keys())).to_csv(trials_log_path, index=False)

# NOTE: unpickling with dill can execute arbitrary code, so only load files
# that were produced by this notebook.
with open(trials_path, "rb") as f:
    init_trials = dill.load(f)
init_log = pd.read_csv(trials_log_path)
# display(init_log)

In [32]:
# Trial budget handed to hpo.fmin as `max_evals`. The saved progress bar
# starts at 9/500, so trials already stored in `init_trials` appear to count
# toward this budget.
iters = 500

# The return value is discarded; each trial's result is recorded in
# `init_trials` and `init_log` by init_objective.
_ = hpo.fmin(
    init_objective,
    init_space,
    trials=init_trials,
    algo=hpo.tpe.suggest,
    max_evals=iters,
    max_queue_len=1,
)

  2%|▏         | 9/500 [00:00<?, ?trial/s, best loss=?]

{'activation': 'gelu',
 'batch_size': 4.0,
 'bias': True,
 'early_stop': 5,
 'hidden_dim': 150.0,
 'mlp1_dim': 150.0,
 'num_layers': 4.0,
 'optimizer__lr': 0.0007100446481541015,
 'optimizer__wd': 0,
 'p_dropout': -0.017004121103458253,
 'tag_embed_dim': 32,
 'train_epochs': 50,
 'word_dropout': 0.14995879743817125,
 'word_embed_dim': 300}

model version:                                         
V2_hpo_1.0                                                      
Number of parameters 2299987 trainable 2299987                  
epoch   1/ 50 | train_loss 1.66794 | val_loss 1.65149 | train_score 0.45373 | val_score 0.45751 | train_time   0.92 min *
epoch   2/ 50 | train_loss 0.86116 | val_loss 0.88207 | train_score 0.75105 | val_score 0.74730 | train_time   1.83 min *
epoch   3/ 50 | train_loss 0.64424 | val_loss 0.69090 | train_score 0.81823 | val_score 0.80711 | train_time   2.75 min *
epoch   4/ 50 | train_loss 0.52722 | val_loss 0.60004 | train_score 0.84948 | val_score 0.83215 | train_time   3.67 min *
epoch   5/ 50 | train_loss 0.46754 | val_loss 0.56679 | train_score 0.86385 | val_score 0.83877 | train_time   4.58 min *
epoch   6/ 50 | train_loss 0.40919 | val_loss 0.53510 | train_score 0.87442 | val_score 0.84711 | train_time   5.50 min *
epoch   7/ 50 | train_loss 0.37172 | val_loss 0.52705 | train_score 0.88363 | val_

{'activation': 'relu',
 'batch_size': 4.0,
 'bias': True,
 'early_stop': 5,
 'hidden_dim': 150.0,
 'mlp1_dim': 250.0,
 'num_layers': 3.0,
 'optimizer__lr': 0.00043522596195538234,
 'optimizer__wd': 0,
 'p_dropout': 0.09321689570370757,
 'tag_embed_dim': 32,
 'train_epochs': 50,
 'word_dropout': 0.13314838372299523,
 'word_embed_dim': 300}

model version:                                                                           
V2_hpo_1.0                                                                               
Number of parameters 1817887 trainable 1817887                                           
epoch   1/ 50 | train_loss 1.68437 | val_loss 1.66828 | train_score 0.46500 | val_score 0.47133 | train_time   1.34 min *
epoch   2/ 50 | train_loss 1.00245 | val_loss 1.00494 | train_score 0.72039 | val_score 0.71893 | train_time   2.68 min *
epoch   3/ 50 | train_loss 0.72008 | val_loss 0.74127 | train_score 0.79740 | val_score 0.79424 | train_time   4.02 min *
epoch   4/ 50 | train_loss 0.59624 | val_loss 0.63569 | train_score 0.83160 | val_score 0.82323 | train_time   5.35 min *
epoch   5/ 50 | train_loss 0.51672 | val_loss 0.57571 | train_score 0.85193 | val_score 0.83803 | train_time   6.70 min *
epoch   6/ 50 | train_loss 0.45526 | val_loss 0.53009 | train_score 0.86798 | val_score 0.84888 | train_time   8.04 min 

{'activation': 'p_relu',
 'batch_size': 4.0,
 'bias': True,
 'early_stop': 5,
 'hidden_dim': 250.0,
 'mlp1_dim': 250.0,
 'num_layers': 4.0,
 'optimizer__lr': 0.0008501837132013326,
 'optimizer__wd': 0,
 'p_dropout': 0.011866820932908223,
 'tag_embed_dim': 32,
 'train_epochs': 50,
 'word_dropout': 0.04474779546753615,
 'word_embed_dim': 300}

model version:                                                                           
V2_hpo_1.0                                                                               
Number of parameters 5932288 trainable 5932288                                           
epoch   1/ 50 | train_loss 2.22124 | val_loss 2.22981 | train_score 0.26061 | val_score 0.25998 | train_time   1.82 min *
epoch   2/ 50 | train_loss 0.72889 | val_loss 0.75842 | train_score 0.79298 | val_score 0.78717 | train_time   3.64 min *
epoch   3/ 50 | train_loss 0.52873 | val_loss 0.59533 | train_score 0.84555 | val_score 0.83194 | train_time   5.45 min *
epoch   4/ 50 | train_loss 0.41529 | val_loss 0.52302 | train_score 0.88098 | val_score 0.85538 | train_time   7.27 min *
epoch   5/ 50 | train_loss 0.37700 | val_loss 0.55262 | train_score 0.88705 | val_score 0.85217 | train_time   9.08 min
epoch   6/ 50 | train_loss 0.33340 | val_loss 0.58080 | train_score 0.89823 | val_score 0.84904 | train_time  10.90 min
ep

{'activation': 'sigmoid',
 'batch_size': 4.0,
 'bias': True,
 'early_stop': 5,
 'hidden_dim': 100.0,
 'mlp1_dim': 150.0,
 'num_layers': 4.0,
 'optimizer__lr': 0.0002356003389179894,
 'optimizer__wd': 0,
 'p_dropout': 0.06131302393454804,
 'tag_embed_dim': 32,
 'train_epochs': 50,
 'word_dropout': 0.1032776251369739,
 'word_embed_dim': 300}

model version:                                                                             
V2_hpo_1.0                                                                                 
Number of parameters 1133987 trainable 1133987                                             
  2%|▏         | 12/500 [1:38:11<66:32:58, 490.94s/trial, best loss: -0.8855087358684481]  


KeyboardInterrupt: 

In [33]:
# Write the current search state (trials object and trial log) to the
# directory of the active `version`.
hpo_dir = os.path.join(versions_dir, version)
with open(os.path.join(hpo_dir, 'trials.pth'), mode='wb') as trials_file:
    dill.dump(init_trials, trials_file)
init_log.to_csv(os.path.join(hpo_dir, 'trials_log.csv'), index=False)

In [None]:
# checkpoint = ptu.load_model(version=version, versions_dir=versions_dir, epoch='best', seed=42)
# loss, score = checkpoint.predict(test_dataset.dataset,
#                                  batch_size=32,
#                                  device=device,
#                                  results=False,
#                                  decision_func=chu.test_chu_liu_edmonds)
# print(f'chu_liu_edmonds_UAS: {score}')

In [None]:
# %%time
# checkpoint.model = checkpoint.model.to(device)
# checkpoint.model.train()
# batch_size = 32

# loader = torch.utils.data.DataLoader(dataset=train_dataset.dataset, batch_size=batch_size, shuffle=True)
# for batch in loader:
#     loss, flat_y, flat_out, mask, out, y = utils.loss_decision_func(checkpoint, device, batch, prints=True)
#     break
# torch.cuda.empty_cache()