In [1]:
import os
import sys
import time
import math

import dill
from tqdm import tqdm
import hyperopt as hpo

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from src import utils
from src import bilstm as m1
import src.dataset as dset
import src.pytorch_utils as ptu
import src.chu_liu_edmonds as chu

import warnings
warnings.filterwarnings('ignore')

# --- Reproducibility ---------------------------------------------------------
# One seed is shared by NumPy and PyTorch so repeated runs start from the same
# random state.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# Ask cuDNN for deterministic behaviour and switch off its benchmark mode.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# --- Runtime configuration ---------------------------------------------------
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Root directory for per-version artifacts (joined with `version` further down).
versions_dir = 'models'

cuda


In [2]:
# The training set is built first; the evaluation set receives it via
# `train_dataset=` (presumably to share vocabularies/index maps — confirm in src/dataset.py).
train_dataset = dset.DataSet(
    'data/train.labeled',
    tqdm_bar=True,
)
test_dataset = dset.DataSet(
    'data/test.labeled',
    train_dataset=train_dataset,
    tqdm_bar=True,
)
# Unlabeled competition data, currently disabled:
# comp_dataset = dset.DataSet('data/comp.unlabeled', train_dataset=train_dataset, tagged=False, tqdm_bar=True)

100%|██████████| 125430/125430 [00:16<00:00, 7582.27it/s]
100%|██████████| 25325/25325 [00:03<00:00, 7921.30it/s]


In [3]:
# Identifier of this experiment; it is also joined with `versions_dir` later in
# the notebook to locate the trials files.
version = 'V1_final'

# Forwarded to Checkpoint(...) and Checkpoint.train(...); presumably toggles
# writing checkpoints to disk — confirm in src/pytorch_utils.py.
save = False
# save = True

# Architecture hyper-parameters of the BiLSTM model.
bilstm_params = {
    'word_embed_dim': 100,
    'tag_embed_dim': 25,
    'hidden_dim': 125,
    'num_layers': 2,
    'bias': True,
    'attention_dim': 100,
    'p_dropout': 0.1,
    'word_dropout': 0.25,
}
model = m1.BiLSTM(train_dataset=train_dataset, **bilstm_params)

checkpoint = ptu.Checkpoint(
    # bookkeeping
    version=version,
    versions_dir=versions_dir,
    save=save,
    prints=True,
    seed=42,
    # model and optimisation classes (passed as classes, not instances)
    model=model,
    optimizer=torch.optim.Adam,
    criterion=nn.NLLLoss,
    # score: mean of element-wise equality between gold and predicted labels
    score=lambda y_true, y_pred: (np.array(y_true) == np.array(y_pred)).mean(),
    # decision hooks: the prediction is the arg-max over axis 1 of the flattened output
    loss_decision_func=utils.loss_decision_func,
    out_decision_func=lambda y_pred, flat_y_pred, mask, padding: flat_y_pred.argmax(axis=1),
    custom_run_func=None,
)

model version: V1_final
Number of parameters 2097001 trainable 2097001


In [4]:
# Consecutive training sessions on the same checkpoint: a short run with a small
# batch and larger learning rate, then a longer run with a larger batch and a
# smaller learning rate. The recorded output continues the epoch counter
# ("6/30") in the second session, so epochs appear to accumulate across calls.
hyperparam_list = [
    dict(train_epochs=5, batch_size=16, optimizer_params=dict(lr=1e-3, weight_decay=1e-6)),
    dict(train_epochs=25, batch_size=64, optimizer_params=dict(lr=4e-4, weight_decay=1e-6)),
]

# Arguments that are identical for every session.
shared_train_kwargs = dict(
    device=device,
    train_dataset=train_dataset.dataset,
    val_dataset=test_dataset.dataset,
    prints=True,
    epochs_save=5,
    save=save,
)

for session in hyperparam_list:
    checkpoint.train(**shared_train_kwargs, **session)

epoch   1/  5 | train_loss 1.07067 | val_loss 1.08101 | train_score 0.68392 | val_score 0.68214 | train_time   0.60 min *
epoch   2/  5 | train_loss 0.70715 | val_loss 0.73509 | train_score 0.78766 | val_score 0.78125 | train_time   1.20 min *
epoch   3/  5 | train_loss 0.55907 | val_loss 0.61712 | train_score 0.83276 | val_score 0.81566 | train_time   1.80 min *
epoch   4/  5 | train_loss 0.47470 | val_loss 0.56324 | train_score 0.85721 | val_score 0.83145 | train_time   2.39 min *
epoch   5/  5 | train_loss 0.42634 | val_loss 0.54023 | train_score 0.86868 | val_score 0.83498 | train_time   2.98 min *
epoch   6/ 30 | train_loss 0.39546 | val_loss 0.52743 | train_score 0.88059 | val_score 0.84432 | train_time   3.53 min *
epoch   7/ 30 | train_loss 0.38487 | val_loss 0.52613 | train_score 0.88346 | val_score 0.84559 | train_time   4.09 min *
epoch   8/ 30 | train_loss 0.37710 | val_loss 0.52802 | train_score 0.88586 | val_score 0.84448 | train_time   4.64 min
epoch   9/ 30 | train_loss

KeyboardInterrupt: 

In [37]:
# To start a brand-new search instead of resuming, use these two lines instead
# (they need `init_space`, which is defined in the next cell):
# init_trials = hpo.Trials()
# init_log = pd.DataFrame(columns=['timestamp', 'test_score', 'space'] + list(init_space.keys()))

# Resume a previous hyper-parameter search from the files stored under
# <versions_dir>/<version>/.
# NOTE(security): dill.load, like pickle, can execute arbitrary code contained in
# the file — only load trials files that were produced by this notebook.
with open(os.path.join(versions_dir, version, 'trials.pth'), "rb") as f:
    init_trials = dill.load(f)

# The log is written by DataFrame.to_csv with its index as the first column.
# Reading it back without index_col would add a spurious "Unnamed: 0" column, and
# appending a row in init_objective would then fail on a column-count mismatch.
init_log = pd.read_csv(os.path.join(versions_dir, version, 'trials_log.csv'), index_col=0)

In [38]:
# Hyper-parameter search space for hyperopt. Plain values are held fixed; the
# trailing comments keep alternative distributions that were tried or considered.
_init_space_unsorted = {
    'train_epochs': 5,
    # Sampled as an exponent; init_objective uses 2 ** batch_size, i.e. 8, 16 or 32.
    'batch_size': hpo.hp.quniform('batch_size', low=3, high=5, q=1),  # 8-16-32
    'optimizer__lr': hpo.hp.uniform('optimizer__lr', low=1e-4, high=1e-3),
    'optimizer__wd': 0.0, # hpo.hp.choice('optimizer__wd_ind', [0, hpo.hp.uniform('optimizer__wd', low=0, high=1e-5)]),
    'bias': hpo.hp.choice('bias', [True, False]),

    'word_embed_dim': 100,  # 300
    'tag_embed_dim': 25,  # hpo.hp.quniform('tag_embed_dim', low=30, high=50, q=4)
    'hidden_dim': 125,  # hpo.hp.quniform('hidden_dim', low=100, high=400, q=50)
    'num_layers': 2,  # hpo.hp.quniform('num_layers', low=2, high=4, q=1)
    'mlp1_dim': 100,  # hpo.hp.quniform('mlp1_dim', low=100, high=400, q=50)
    'p_dropout': 0.1,  # hpo.hp.normal('p_dropout', mu=0.2, sigma=0.1)
    'word_dropout': 0.25,  # hpo.hp.normal('word_dropout', mu=0.3, sigma=0.1)
}

# Keys are stored alphabetically so that list(space.values()) lines up with the
# log columns created from list(init_space.keys()).
init_space = {name: _init_space_unsorted[name] for name in sorted(_init_space_unsorted)}

def init_objective(space, save=False):
    """Hyperopt objective: train one BiLSTM configuration and return its negated score.

    Args:
        space: dict of sampled hyper-parameters (the keys of ``init_space``).
            ``batch_size`` is an exponent: the batch size used is
            ``2 ** batch_size``, capped at ``len(train_dataset.dataset)``.
        save: forwarded unchanged to ``ptu.Checkpoint`` and ``Checkpoint.train``.

    Returns:
        The negated ``val_score`` obtained from ``get_log(col='val_score', epoch=-1)``
        (negated because ``hpo.fmin`` minimises its objective).

    Side effects:
        Displays ``space``, appends one row to the global ``init_log`` and calls
        ``init_checkpoint.save(epoch=True)`` when the new score is higher than the
        best ``test_score`` already present in ``init_log``.
    """
    display(space)

    # Best score logged so far; 0.0 when the log is still empty.
    last_score = init_log['test_score'].max() if len(init_log) > 0 else 0.0

    batch_size = min(len(train_dataset.dataset), int(2 ** space['batch_size']))
    # Clamp dropout rates to [0, 0.7]; the alternative search space samples them
    # from a normal distribution, which can produce out-of-range values.
    p_dropout = max(0.0, min(0.7, space['p_dropout']))
    word_dropout = max(0.0, min(0.7, space['word_dropout']))

    # hp.quniform yields floats (e.g. 3.0), so every size-like parameter is cast
    # to int before it reaches the model.
    # NOTE(review): the model cell earlier in this notebook passes
    # `attention_dim=` where this call passes `mlp1_dim=` — confirm which keyword
    # m1.BiLSTM actually accepts.
    model = m1.BiLSTM(train_dataset=train_dataset,
                      # previously hard-coded to 100; now follows the search space
                      word_embed_dim=int(space.get('word_embed_dim', 100)),
                      tag_embed_dim=int(space['tag_embed_dim']),
                      hidden_dim=int(space['hidden_dim']),
                      # fix: this used to read space['mlp1_dim'], i.e. the MLP width
                      num_layers=int(space['num_layers']),
                      bias=space['bias'],
                      mlp1_dim=int(space['mlp1_dim']),
                      p_dropout=p_dropout,
                      word_dropout=word_dropout)

    init_checkpoint = ptu.Checkpoint(version=version,
                                     model=model,
                                     optimizer=torch.optim.Adam,
                                     criterion=nn.NLLLoss,
                                     score=lambda y_true, y_pred: (np.array(y_true) == np.array(y_pred)).mean(),
                                     versions_dir=versions_dir,
                                     loss_decision_func=utils.loss_decision_func,
                                     out_decision_func=lambda y_pred, flat_y_pred, mask, padding: flat_y_pred.argmax(axis=1),
                                     seed=42,
                                     custom_run_func=None,
                                     save=save,
                                     prints=False)

    init_checkpoint.train(device=device,
                          train_dataset=train_dataset.dataset,
                          val_dataset=test_dataset.dataset,
                          train_epochs=space['train_epochs'],
                          batch_size=batch_size,
                          optimizer_params={
                              'lr': space['optimizer__lr'],
                              'weight_decay': space['optimizer__wd'],
                          },
                          prints=True,
                          epochs_save=0,
                          save=save)

    # epoch=-1 presumably selects the last logged epoch — confirm in src/pytorch_utils.py.
    test_score = init_checkpoint.get_log(col='val_score', epoch=-1)

    # Persist the checkpoint only when it beats everything logged so far.
    if test_score > last_score:
        init_checkpoint.save(epoch=True)

    # Append a log row: timestamp, score, the raw space dict, then one value per
    # hyper-parameter (relies on `space` iterating in the same order as the log columns).
    next_row = init_log.index.max() + 1 if len(init_log) > 0 else 0
    init_log.loc[next_row] = [time.strftime('%d-%m-%Y %H:%M:%S'), test_score, space] + list(space.values())
    return -test_score

# session_space = dict(sorted(list({
#     'train_epochs': 5,
#     'batch_size_mult': min(len(X_train), int(2**hpo.hp.quniform('batch_size_mult', low=5, high=9, q=1))),
#     'optimizer__lr_mult': hpo.hp.uniform('optimizer__lr_mult', low=1e-5, high=1e-3),
#     'optimizer__weight_decay': hpo.hp.uniform('optimizer__weight_decay', low=1e-5, high=1e-3),
#     'p_dropout': max(0.0, min(0.9, hpo.hp.normal('p_dropout', mu=0.5, sigma=0.15))),
# }.items()), key=lambda x: x[0]))

In [39]:
# Evaluation budget handed to hyperopt as `max_evals`. The recorded progress bar
# starts at 18/500, so trials already stored in `init_trials` appear to count
# toward this budget.
iters = 500

# Run the TPE search; each evaluation is recorded in `init_trials`.
# The return value is discarded.
_ = hpo.fmin(
    init_objective,
    init_space,
    trials=init_trials,
    algo=hpo.tpe.suggest,
    max_evals=iters,
    max_queue_len=1,
)

  4%|▎         | 18/500 [00:00<?, ?trial/s, best loss=?]

{'batch_size': 3.0,
 'bias': True,
 'hidden_dim': 125,
 'mlp1_dim': 100,
 'num_layers': 2,
 'optimizer__lr': 0.0007752413054159258,
 'optimizer__wd': 0.0,
 'p_dropout': 0.1,
 'tag_embed_dim': 25,
 'train_epochs': 5,
 'word_dropout': 0.25,
 'word_embed_dim': 100}

  4%|▎         | 18/500 [00:06<03:03,  2.63trial/s, best loss=?]


KeyboardInterrupt: 

In [40]:
# Persist the search state next to the model version: the hyperopt trials object
# (serialised with dill) and the tabular trial log (CSV).
trials_dir = os.path.join(versions_dir, version)

with open(os.path.join(trials_dir, 'trials.pth'), 'wb') as f:
    dill.dump(init_trials, f)

# to_csv keeps its default of writing the DataFrame index as the first column.
init_log.to_csv(os.path.join(trials_dir, 'trials_log.csv'))

In [13]:
# Scratch calculation: learning rate after each of 15 successive decay steps,
# where every step multiplies the rate by `decay`.
init_lr = 2e-3
decay = 1 - 0.2
for i in range(15):
    decayed_lr = init_lr * (decay ** (i + 1))
    print(decayed_lr)

0.0016
0.0012800000000000003
0.0010240000000000002
0.0008192000000000002
0.0006553600000000002
0.0005242880000000002
0.0004194304000000002
0.0003355443200000002
0.00026843545600000016
0.00021474836480000011
0.0001717986918400001
0.0001374389534720001
0.00010995116277760008
8.796093022208007e-05
7.036874417766406e-05
