add progress bar
pplonski committed Apr 20, 2019
1 parent 70fc706 commit b41ec89
Showing 16 changed files with 94 additions and 101 deletions.
1 change: 1 addition & 0 deletions requirements.txt
@@ -8,3 +8,4 @@ catboost==0.13.1
h5py==2.9.0
tensorflow==1.13.1
Keras==2.2.4
tqdm==4.31.1
90 changes: 44 additions & 46 deletions supervised/automl.py
@@ -3,6 +3,7 @@
import time
import numpy as np
import pandas as pd
from tqdm import tqdm

from supervised.models.learner_xgboost import XgbLearner
from supervised.iterative_learner_framework import IterativeLearner
@@ -33,7 +34,7 @@ def __init__(
top_models_to_improve=5,
train_ensemble=True,
verbose=True,
seed = 1
seed=1,
):
self._total_time_limit = total_time_limit
self._time_limit = (
@@ -53,19 +54,20 @@ def __init__(
self._algorithms = algorithms
self._verbose = verbose

# single models including models in the folds
estimated_models_to_check = (
len(self._algorithms) * self._start_random_models
+ self._top_models_to_improve * self._hill_climbing_steps * 2
) * 5

if self._total_time_limit is not None:
estimated_models_to_check = (
len(self._algorithms)
* (
self._start_random_models
+ self._top_models_to_improve * self._hill_climbing_steps * 2
)
* 5
)
# set time limit for single model training
# the 0.85 is a safe scale factor, to not exceed the time limit
self._time_limit = self._total_time_limit * 0.85 / estimated_models_to_check

self._progress_bar = tqdm(
total=int(estimated_models_to_check/5), desc="MLJAR AutoML", unit="model"
)

if len(self._algorithms) == 0:
self._algorithms = list(
ModelsRegistry.registry[BINARY_CLASSIFICATION].keys()
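
A rough worked example of the model-budget arithmetic in the hunk above, using assumed constructor values (the numbers are illustrative, not defaults from the commit): with 3 algorithms, start_random_models=5, top_models_to_improve=5 and hill_climbing_steps=3, the estimate is 3 * (5 + 5*3*2) * 5 = 525 fold-level learners, where the trailing factor of 5 presumably accounts for the validation folds. A one-hour total_time_limit then gives each single model about 3600 * 0.85 / 525 ≈ 5.8 seconds, and the tqdm bar counts 525 / 5 = 105 models because the per-fold factor is dropped for display.

    # Illustrative sketch of the budget above; all values are assumptions.
    from tqdm import tqdm

    algorithms = ["Xgboost", "LightGBM", "CatBoost"]
    start_random_models = 5
    top_models_to_improve = 5
    hill_climbing_steps = 3
    folds = 5
    total_time_limit = 3600  # seconds

    estimated_models_to_check = (
        len(algorithms)
        * (start_random_models + top_models_to_improve * hill_climbing_steps * 2)
        * folds
    )  # 525 fold-level learners
    time_limit = total_time_limit * 0.85 / estimated_models_to_check  # ~5.8 s per model
    progress_bar = tqdm(
        total=estimated_models_to_check // folds, desc="MLJAR AutoML", unit="model"
    )  # bar total: 105 models
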
@@ -94,7 +96,9 @@ def get_additional_metrics(self):

def _get_model_params(self, model_type, X, y):
model_info = ModelsRegistry.registry[BINARY_CLASSIFICATION][model_type]
model_params = RandomParameters.get(model_info["params"], len(self._models) + self._seed)
model_params = RandomParameters.get(
model_info["params"], len(self._models) + self._seed
)
required_preprocessing = model_info["required_preprocessing"]
model_additional = model_info["additional"]
preprocessing_params = PreprocessingTuner.get(
@@ -110,6 +114,21 @@ def _get_model_params(self, model_type, X, y):
},
}

def keep_model(self, model):
if model is None:
return
self._models += [model]
self.verbose_print(
"Learner {} final loss {} time {} seconds".format(
model.get_name(),
model.get_final_loss(),
np.round(model.get_train_time(),2),
)
)
self.log_train_time(
model.get_name(), model.get_train_time()
)

def train_model(self, params, X, y):
metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})
early_stop = EarlyStopping({"metric": {"name": "logloss"}})
@@ -119,16 +138,19 @@ def train_model(self, params, X, y):
)
il_key = il.get_params_key()
if il_key in self._models_params_keys:
self._progress_bar.update(1)
return None
self._models_params_keys += [il_key]
if self.should_train_next(il.get_name()):
il.train({"train": {"X": X, "y": y}})
self._progress_bar.update(1)
return il
self._progress_bar.update(1)
return None

def verbose_print(self, msg):
if self._verbose:
print(msg)
self._progress_bar.write(msg)

def log_train_time(self, model_type, train_time):
if model_type in self._models_train_time:
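
verbose_print now routes its messages through the progress bar instead of print. A minimal standalone sketch of why (assuming standard tqdm behaviour): tqdm.write clears the bar, prints the line, and redraws the bar, so log lines are not mangled by the bar's carriage-return updates.

    # Standalone sketch of tqdm.write vs print; not code from this repository.
    import time
    from tqdm import tqdm

    bar = tqdm(total=3, desc="demo", unit="model")
    for i in range(3):
        time.sleep(0.1)
        bar.write("finished model {}".format(i))  # printed above the bar, bar stays intact
        bar.update(1)                             # advance by one candidate
    bar.close()
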
@@ -164,14 +186,7 @@ def not_so_random_step(self, X, y):
for i in range(self._start_random_models):
params = self._get_model_params(model_type, X, y)
m = self.train_model(params, X, y)
if m is not None:
self._models += [m]
self.verbose_print(
"Learner {} final loss {} time {}".format(
m.get_name(), m.get_final_loss(), m.get_train_time()
)
)
self.log_train_time(m.get_name(), m.get_train_time())
self.keep_model(m)

def hill_climbing_step(self, X, y):
for hill_climbing in range(self._hill_climbing_steps):
@@ -182,40 +197,24 @@ def hill_climbing_step(self, X, y):
models = sorted(models, key=lambda x: x[0])
for i in range(min(self._top_models_to_improve, len(models))):
m = models[i][1]
for p in HillClimbing.get(m.params.get("learner"), len(self._models) + self._seed):
for p in HillClimbing.get(
m.params.get("learner"), len(self._models) + self._seed
):
if p is not None:
all_params = copy.deepcopy(m.params)
all_params["learner"] = p
new_model = self.train_model(all_params, X, y)
if new_model is not None:
self._models += [new_model]
self.verbose_print(
"Learner {} final loss {} time {}".format(
new_model.get_name(),
new_model.get_final_loss(),
new_model.get_train_time(),
)
)
self.log_train_time(
new_model.get_name(), new_model.get_train_time()
)
self.keep_model(new_model)
else:
self._progress_bar.update(1)

def ensemble_step(self, y):
if self._train_ensemble:
self.ensemble = Ensemble()
X_oof = self.ensemble.get_oof_matrix(self._models)
self.ensemble.fit(X_oof, y)
self._models += [self.ensemble]
self.verbose_print(
"Learner {} final loss {} time {}".format(
self.ensemble.get_name(),
self.ensemble.get_final_loss(),
self.ensemble.get_train_time(),
)
)
self.log_train_time(
self.ensemble.get_name(), self.ensemble.get_train_time()
)
self.keep_model(self.ensemble)
self._progress_bar.update(1)

def fit(self, X, y):
start_time = time.time()
@@ -231,10 +230,8 @@ def fit(self, X, y):

# start with not-so-random models
self.not_so_random_step(X, y)

# perform hill climbing steps on best models
self.hill_climbing_step(X, y)

# train ensemble
self.ensemble_step(y)

@@ -246,6 +243,7 @@

self.get_additional_metrics()
self._fit_time = time.time() - start_time
self._progress_bar.close()

def predict(self, X):
if self._best_model is not None:
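Taken together, the automl.py changes consolidate the repeated "append model, print final loss, log train time" blocks into keep_model, make every candidate (trained, rejected as a duplicate parameter set, skipped by the time budget, or dropped during hill climbing) advance the progress bar by exactly one, and close the bar at the end of fit. A hedged usage sketch of the user-visible effect (random data and constructor values are illustrative; only the import path and API are taken from this diff):

    # Sketch: one tqdm bar over all candidate models during fit().
    import numpy as np
    import pandas as pd
    from supervised.automl import AutoML

    X = pd.DataFrame(np.random.rand(200, 5), columns=["f0", "f1", "f2", "f3", "f4"])
    y = np.random.randint(0, 2, 200)  # binary target, as in the tests

    automl = AutoML(
        total_time_limit=60,
        algorithms=["Xgboost"],
        start_random_models=2,
        hill_climbing_steps=1,
        train_ensemble=True,
        verbose=True,
        seed=12,
    )
    automl.fit(X, y)           # "MLJAR AutoML" bar advances once per candidate model
    pred = automl.predict(X)   # DataFrame with probability columns such as "p_1"
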
8 changes: 3 additions & 5 deletions supervised/callbacks/max_iters_constraint.py
@@ -14,12 +14,10 @@ def __init__(self, params):
self.name = params.get("name", "max_iters_constraint")
self.max_iters = params.get("max_iters", 10)


def add_and_set_learner(self, learner):
self.learner = learner

def on_iteration_end(self, logs, predictions):
# iters are computed starting from 0
if logs.get("iter_cnt")+1 >= self.max_iters:
def on_iteration_end(self, logs, predictions):
# iters are computed starting from 0
if logs.get("iter_cnt") + 1 >= self.max_iters:
self.learner.stop_training = True

2 changes: 1 addition & 1 deletion supervised/models/learner_catboost.py
@@ -40,7 +40,7 @@ def __init__(self, params):
"random_strength": self.params.get("random_strength", 1),
"bagging_temperature": self.params.get("bagging_temperature", 1),
"l2_leaf_reg": self.params.get("l2_leaf_reg", 3),
"random_seed": self.params.get("seed", 1)
"random_seed": self.params.get("seed", 1),
}

log.debug("CatBoostLearner __init__")
2 changes: 1 addition & 1 deletion supervised/models/learner_lightgbm.py
@@ -40,7 +40,7 @@ def __init__(self, params):
"bagging_fraction": self.params.get("bagging_fraction", 0.7),
"bagging_freq": self.params.get("bagging_freq", 1),
"verbose": -1,
"seed": self.params.get("seed", 1)
"seed": self.params.get("seed", 1),
}

log.debug("LightgbmLearner __init__")
2 changes: 2 additions & 0 deletions supervised/models/learner_nn.py
@@ -17,9 +17,11 @@
import numpy as np
import tensorflow as tf
import random as rn

np.random.seed(42)
rn.seed(12345)
from keras import backend as K

tf.set_random_seed(1234)
tf.logging.set_verbosity(tf.logging.ERROR)
################################################################################
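The learner_nn.py change adds only two blank lines around the seeding block. The surrounding pre-existing code pins the numpy, Python random, and TensorFlow seeds around the Keras backend import, a common pattern for reproducible Keras runs on the TF 1.x API; condensed, the ordering in that file is:

    # Condensed restatement of the pre-existing seeding block in learner_nn.py (TF 1.x API).
    import numpy as np
    import tensorflow as tf
    import random as rn

    np.random.seed(42)    # numpy RNG
    rn.seed(12345)        # Python stdlib RNG
    from keras import backend as K
    tf.set_random_seed(1234)                    # TF 1.x graph-level seed
    tf.logging.set_verbosity(tf.logging.ERROR)  # silence TF 1.x logging
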
3 changes: 1 addition & 2 deletions supervised/models/learner_xgboost.py
@@ -54,10 +54,9 @@ def __init__(self, params):
"colsample_bytree": self.params.get("colsample_bytree", 0.8),
"silent": self.params.get("silent", 1),
"seed": self.params.get("seed", 1),

}
log.debug("XgbLearner __init__")

def update(self, update_params):
# Don't need to update boosting rounds, they are added incrementally
pass
8 changes: 3 additions & 5 deletions tests/test_automl.py
@@ -27,20 +27,18 @@ def setUpClass(cls):
cls.X = pd.DataFrame(cls.X, columns=["f0", "f1", "f2", "f3", "f4"])
# cls.y = pd.DataFrame(cls.y)



def test_reproduce_fit(self):
metric = Metric({"name": "logloss"})
losses = []
for i in range(2):
automl = AutoML(
total_time_limit=10000, # the time limit should be big enough to not interrupt the training
total_time_limit=10000,  # the time limit should be big enough to not interrupt the training
algorithms=["Xgboost"],
start_random_models=2,
hill_climbing_steps=1,
train_ensemble=True,
verbose=True,
seed = 12
seed=12,
)
automl.fit(self.X, self.y)
y_predicted = automl.predict(self.X)["p_1"]
@@ -93,6 +91,6 @@ def test_predict_labels(self):
self.assertTrue("A" in np.unique(y_predicted["label"]))
self.assertTrue("B" in np.unique(y_predicted["label"]))


if __name__ == "__main__":
unittest.main()
8 changes: 4 additions & 4 deletions tests/test_automl_with_data.py
@@ -17,7 +17,7 @@ class AutoMLTestWithData(unittest.TestCase):
def test_fit_and_predict(self):
seed = 1706 + 1
for dataset_id in [31]: # 720 # 31,44,737
df = pd.read_csv("./tests/data/data/{0}.csv".format(dataset_id))
df = pd.read_csv("./tests/data/{0}.csv".format(dataset_id))
x_cols = [c for c in df.columns if c != "target"]
X = df[x_cols]
y = df["target"]
@@ -26,17 +26,17 @@ def test_fit_and_predict(self):
X, y, test_size=0.3, random_state=seed
)
automl = AutoML(
total_time_limit=60 * 6000,
total_time_limit=60 * 1,
algorithms=["LightGBM", "RF", "NN", "CatBoost", "Xgboost"],
start_random_models=10,
start_random_models=5,
hill_climbing_steps=3,
top_models_to_improve=3,
train_ensemble=True,
verbose=True,
)
automl.fit(X_train, y_train)

response = automl.predict(X_test)
response = automl.predict(X_test)["p_1"]
# Compute the logloss on test dataset
ll = log_loss(y_test, response)
print("(*) Dataset id {} logloss {}".format(dataset_id, ll))
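Besides shrinking the test budget (total_time_limit drops from 60 * 6000 seconds, i.e. 100 hours, to 60 seconds, and start_random_models from 10 to 5) and fixing the data path, the test now selects the "p_1" probability column before scoring, since sklearn's log_loss expects class probabilities rather than the full prediction frame. A minimal sketch of that scoring step, assuming a fitted automl and a held-out X_test/y_test as in the test:

    # Sketch: scoring AutoML predictions with log loss (binary case).
    from sklearn.metrics import log_loss

    pred = automl.predict(X_test)       # DataFrame with probability columns
    ll = log_loss(y_test, pred["p_1"])  # pass only the positive-class probability
    print("(*) logloss", ll)
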
4 changes: 1 addition & 3 deletions tests/test_iterative_learner_framework.py
@@ -52,11 +52,10 @@ setUpClass(cls):
"max_iters": 3,
"silent": 1,
"max_depth": 1,
"seed": 1
"seed": 1,
},
}


def test_reproduce_fit(self):
losses = []
for i in range(2):
@@ -67,7 +66,6 @@ def test_reproduce_fit(self):
losses += [metric(self.y, y_predicted)]
assert_almost_equal(losses[0], losses[1])


def test_fit_and_predict(self):
il = IterativeLearner(self.train_params, callbacks=[])
il.train(self.data)
17 changes: 8 additions & 9 deletions tests/tests_callbacks/test_max_iters_constraint.py
@@ -41,19 +41,15 @@ def setUpClass(cls):
cls.kfolds = 3
cls.train_params = {
"preprocessing": {},
"validation": {
"validation_type": "kfold",
"kfold": cls.kfolds,

},
"validation": {"validation_type": "kfold", "kfold": cls.kfolds},
"learner": {
"learner_type": "Xgboost",
"objective": "binary:logistic",
"eval_metric": "logloss",
"eta": 0.01,
"silent": 1,
"max_depth": 1,
"seed": 1
"seed": 1,
},
}

@@ -67,9 +63,12 @@ def test_fit_and_predict(self):
il.train(self.data)
metric_logs = il.get_metric_logs()
for k in range(self.kfolds):
self.assertEqual(len(metric_logs[il.learners[k].uid]["train"]["logloss"]), iters_cnt)
self.assertNotEqual(len(metric_logs[il.learners[k].uid]["train"]["logloss"]), MAX_STEPS)

self.assertEqual(
len(metric_logs[il.learners[k].uid]["train"]["logloss"]), iters_cnt
)
self.assertNotEqual(
len(metric_logs[il.learners[k].uid]["train"]["logloss"]), MAX_STEPS
)


if __name__ == "__main__":
17 changes: 12 additions & 5 deletions tests/tests_callbacks/test_metric_logger.py
@@ -51,7 +51,7 @@ def setUpClass(cls):
"eta": 0.01,
"silent": 1,
"max_depth": 1,
"seed": 1
"seed": 1,
},
}

@@ -62,10 +62,17 @@ def test_fit_and_predict(self):
il = IterativeLearner(self.train_params, callbacks=[metric_logger])
il.train(self.data)
metric_logs = il.get_metric_logs()
self.assertEqual(len(metric_logs[il.learners[0].uid]["train"]["logloss"]), len(metric_logs[il.learners[0].uid]["train"]["auc"]))
self.assertEqual(len(metric_logs[il.learners[0].uid]["train"]["logloss"]), len(metric_logs[il.learners[0].uid]["iters"]))
self.assertEqual(len(metric_logs[il.learners[0].uid]["train"]["logloss"]), MAX_STEPS)

self.assertEqual(
len(metric_logs[il.learners[0].uid]["train"]["logloss"]),
len(metric_logs[il.learners[0].uid]["train"]["auc"]),
)
self.assertEqual(
len(metric_logs[il.learners[0].uid]["train"]["logloss"]),
len(metric_logs[il.learners[0].uid]["iters"]),
)
self.assertEqual(
len(metric_logs[il.learners[0].uid]["train"]["logloss"]), MAX_STEPS
)


if __name__ == "__main__":
