In [1]:
import pandas as pd

from src.dataset import DepressionDataset
from src.train import train
from src.model import GCN


* 'schema_extra' has been renamed to 'json_schema_extra'


In [2]:
raw_train = pd.read_table("./data/bronze/train.tsv", sep="\t").groupby("label").head(2000).reset_index(drop=True)
raw_dev = pd.read_table("./data/bronze/dev.tsv", sep="\t")

In [3]:
dataset_path = './data/gold'
dataset_root = dataset_path + "/w2v_window_11"

train_dataset = DepressionDataset(
    root=dataset_root, filename="",
    prefix="train",
    raw_data=raw_train
)

dev_dataset = DepressionDataset(
    root=dataset_root, filename="",
    prefix="dev",
    raw_data=raw_dev
)

In [4]:
HYPERPARAMETERS = {
    "n_epochs": [300],
    "batch_size": [32, 128, 64],
    "learning_rate": [0.1, 0.05, 0.01, 0.001],
    "weight_decay": [0.0001, 0.00001, 0.001],
    "sgd_momentum": [0.9, 0.8, 0.5],
    "scheduler_gamma": [0.995, 0.9, 0.8, 0.5, 1],
    "model_embedding_size": [8, 16, 32, 64, 128],
    "model_dense_neurons": [16, 128, 64, 256, 32],
    "model_dropout": [0.1, 0.2, 0.3, 0.4, 0.5],
    "model_layers": [3],
    "model_num_classes": [3],
}

In [6]:
import mlflow
mlflow.set_experiment("with dropout")

2023/11/30 19:44:19 INFO mlflow.tracking.fluent: Experiment with name 'with dropout' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///Users/nomomon/Desktop/research%20paper/project/mlruns/454631613486971684', creation_time=1701369859522, experiment_id='454631613486971684', last_update_time=1701369859522, lifecycle_stage='active', name='with dropout', tags={}>

In [7]:
# fetch from mlflow the hyperparameters that have already been tested
# and their corresponding best_valid_loss

mlflow_client = mlflow.tracking.MlflowClient()
experiment = mlflow_client.get_experiment_by_name("with dropout")

if experiment is not None:
    runs = mlflow.search_runs(experiment.experiment_id)
    
    keys = list(HYPERPARAMETERS.keys())
    keys = ['params.' + key for key in keys]

    hyperparameters = runs[keys + ['metrics.valid_loss']]
    valid_loss = hyperparameters['metrics.valid_loss']
else:
    print("Experiment not found.")

initial_custom = list(zip(hyperparameters.T.to_dict().values(), valid_loss.values))
len(initial_custom)

KeyError: "None of [Index(['params.n_epochs', 'params.batch_size', 'params.learning_rate',\n       'params.weight_decay', 'params.sgd_momentum', 'params.scheduler_gamma',\n       'params.model_embedding_size', 'params.model_dense_neurons',\n       'params.model_dropout', 'params.model_layers',\n       'params.model_num_classes', 'metrics.valid_loss'],\n      dtype='object')] are in the [columns]"

In [8]:
from mango import Tuner, scheduler

config = dict()
config["optimizer"] = "Bayesian"
config["num_iteration"] = 100
config["initial_custom"] = initial_custom


@scheduler.serial
def obj_func(**params):
    return train(params, train_dataset, dev_dataset, GCN)


tuner = Tuner(HYPERPARAMETERS, 
              objective=obj_func,
              conf_dict=config)
results = tuner.minimize()

  0%|          | 0/300 [00:00<?, ?it/s]

In [None]:
!mlflow server