Skip to content

Commit

Permalink
working
Browse files Browse the repository at this point in the history
  • Loading branch information
pplonski committed Apr 9, 2019
1 parent 213b89b commit 6416529
Show file tree
Hide file tree
Showing 13 changed files with 100 additions and 86 deletions.
37 changes: 17 additions & 20 deletions supervised/automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
class AutoML:
def __init__(
self,
total_time_limit=None,
total_time_limit=60*60,
learner_time_limit=120,
algorithms=["CatBoost", "Xgboost", "RF", "LightGBM", "NN"],
start_random_models=10,
Expand All @@ -35,9 +35,9 @@ def __init__(
learner_time_limit
) # time limit in seconds for single learner
self._train_ensemble = train_ensemble
self._models = []
self._models = [] # instances of iterative learner framework or ensemble
self._models_params_keys = []
self._best_model = None
self._best_model = None # it is instance of iterative learner framework or ensemble
self._validation = {"validation_type": "kfold", "k_folds": 5, "shuffle": True}

self._start_random_models = start_random_models
Expand All @@ -47,7 +47,6 @@ def __init__(
self._verbose = verbose

if self._total_time_limit is not None:

estimated_models_to_check = (
len(self._algorithms)
* (
Expand All @@ -59,7 +58,6 @@ def __init__(
# set the time limit for training a single model
# 0.85 is a safety factor, so that the total time limit is not exceeded
self._time_limit = self._total_time_limit * 0.85 / estimated_models_to_check
print("single time limit->", self._time_limit)

if len(self._algorithms) == 0:
self._algorithms = list(
Expand Down Expand Up @@ -113,7 +111,7 @@ def should_train_next(self, model_type):
# no time limit, just train, don't ask
if self._total_time_limit is None:
return True
print(self._models_train_time)

total_time_already_spend = (
0
if model_type not in self._models_train_time
Expand All @@ -125,10 +123,6 @@ def should_train_next(self, model_type):
else np.mean(self._models_train_time[model_type])
)

print("Total time already spend", total_time_already_spend)
print("Mean time already spend", mean_time_already_spend)
print(">", 0.85 * self._total_time_limit / float(len(self._algorithms)))

if (
total_time_already_spend + mean_time_already_spend
< 0.85 * self._total_time_limit / float(len(self._algorithms))
Expand Down Expand Up @@ -197,6 +191,8 @@ def ensemble_step(self, y):
def fit(self, X, y):
start_time = time.time()
X.reset_index(drop=True, inplace=True)
if not isinstance(y, pd.DataFrame):
y = pd.DataFrame(y)
y.reset_index(drop=True, inplace=True)

# start with not-so-random models
Expand All @@ -216,17 +212,18 @@ def fit(self, X, y):
self._fit_time = time.time() - start_time

def predict(self, X):
return self._best_model.predict(X)
if self._best_model is not None:
return self._best_model.predict(X)
return None

def to_json(self):
save_details = []
for il in self._models:
save_details += [il.save()]
return save_details
return self._best_model.to_json() if self._best_model is not None else None

def from_json(self, json_data):
self._models = []
for save_detail in json_data:
il = IterativeLearner()
il.load(save_detail)
self._models += [il]
# pretty sure that this can be easily refactored
if json_data["algorithm_short_name"] == "Ensemble":
self._best_model = Ensemble()
self._best_model.from_json(json_data)
else:
self._best_model = IterativeLearner(json_data.get("params"))
self._best_model.from_json(json_data)
6 changes: 4 additions & 2 deletions supervised/iterative_learner_framework.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def predict(self, X):
y_predicted += learner.predict(validation_data.get("X"))
return y_predicted / float(len(self.learners))

def save(self):
def to_json(self):
preprocessing = []
for p in self.preprocessings:
preprocessing += [p.to_json()]
Expand All @@ -118,14 +118,16 @@ def save(self):
zf.close()
desc = {
"uid": self.uid,
"algorithm_short_name": self.get_name(),
"framework_file": self.framework_file,
"framework_file_path": self.framework_file_path,
"preprocessing": preprocessing,
"learners": learners_desc,
"params": self.params # this is needed while constructing new Iterative Learner Framework
}
return desc

def load(self, json_desc):
def from_json(self, json_desc):
self.uid = json_desc.get("uid", self.uid)
self.framework_file = json_desc.get("framework_file", self.framework_file)
self.framework_file_path = json_desc.get(
Expand Down
4 changes: 2 additions & 2 deletions supervised/learner_framework.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@ def train(self, data):
def predict(self, X):
pass

def save(self):
def to_json(self):
pass

def load(self, json_desc):
def from_json(self, json_desc):
pass
4 changes: 2 additions & 2 deletions supervised/models/ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def predict(self, X):
)
return y_predicted / total_repeat

def save(self):
def to_json(self):
models_json = []
for selected in self.selected_models:
model = selected["model"]
Expand All @@ -128,7 +128,7 @@ def save(self):
}
return json_desc

def load(self, json_desc):
def from_json(self, json_desc):

self.library_version = json_desc.get("library_version", self.library_version)
self.algorithm_name = json_desc.get("algorithm_name", self.algorithm_name)
Expand Down
2 changes: 1 addition & 1 deletion supervised/models/learner_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def __init__(self, params):
super(SklearnTreesClassifierLearner, self).__init__(params)

def fit(self, X, y):
self.model.fit(X, y)
self.model.fit(X, np.ravel(y))
self.model.n_estimators += self.trees_in_step

def predict(self, X):
Expand Down
2 changes: 1 addition & 1 deletion supervised/preprocessing/preprocessing_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def run(self, train_data=None, validation_data=None):
columns_preprocessing,
)
)
print("Remove columns", cols_to_remove)

if X_train is not None:
X_train.drop(cols_to_remove, axis=1, inplace=True)
if X_validation is not None:
Expand Down
2 changes: 1 addition & 1 deletion supervised/tuner/preprocessing_tuner.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def get(required_preprocessing, data, machinelearning_task):

# remove empty columns and constant columns (columns with only one unique value)
empty_column = np.sum(pd.isnull(X[col]) == True) == X.shape[0]
constant_column = len(np.unique(X[col])) == 1
constant_column = len(np.unique(X.loc[~pd.isnull(X[col]), col])) == 1
if empty_column or constant_column:
preprocessing_to_apply += ["remove_column"]
columns_preprocessing[col] = preprocessing_to_apply
Expand Down
27 changes: 19 additions & 8 deletions tests/test_automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,29 @@ def setUpClass(cls):
random_state=0,
)
cls.X = pd.DataFrame(cls.X, columns=["f0", "f1", "f2", "f3", "f4"])
cls.y = pd.DataFrame(cls.y)
#cls.y = pd.DataFrame(cls.y)

def test_fit_and_predict(self):
automl = AutoML(total_time_limit=1, algorithms=["Xgboost"])
metric = Metric({"name": "logloss"})

automl = AutoML(total_time_limit=10, algorithms=["Xgboost"],
start_random_models=5,
hill_climbing_steps=0)
automl.fit(self.X, self.y)

# y_predicted = automl.predict(self.X)
# print(y_predicted)
# metric = Metric({"name": "logloss"})
# loss = metric(self.y, y_predicted)
# print("Loss", loss)
# self.assertTrue(y_predicted is not None)
y_predicted = automl.predict(self.X)
self.assertTrue(y_predicted is not None)
loss = metric(self.y, y_predicted)
self.assertTrue(loss < 0.5)

params = automl.to_json()
automl2 = AutoML()
automl2.from_json(params)

y_predicted2 = automl2.predict(self.X)
self.assertTrue(y_predicted2 is not None)
loss2 = metric(self.y, y_predicted2)
self.assertTrue(loss2 < 0.5)


if __name__ == "__main__":
Expand Down
57 changes: 29 additions & 28 deletions tests/test_automl_performance.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,35 +15,36 @@

class AutoMLTestWithData(unittest.TestCase):
def test_fit_and_predict(self):
with open("./result.txt", "a") as f_result:
for dataset_id in [38]: # 3, 24, 31, 38, 44, 179, 737, 720
df = pd.read_csv("./tests/data/data/{0}.csv".format(dataset_id))
x_cols = [c for c in df.columns if c != "target"]
X = df[x_cols]
y = df["target"]

for repeat in range(10):

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
X, y, test_size=0.3, random_state=1706 + repeat
)
automl = AutoML(
total_time_limit=60 * 1, # 1h limit
algorithms=[ "Xgboost"], # ["LightGBM", "CatBoost", "Xgboost", "RF", "NN"],
start_random_models=3,
hill_climbing_steps=0,
top_models_to_improve=1,
train_ensemble=True,
verbose=True,
)
automl.fit(X_train, y_train)

response = automl.predict(X_test)
# Compute the logloss on test dataset
ll = log_loss(y_test, response)
print(
"{} {} {} {}".format(repeat, dataset_id, ll, automl._fit_time)
)
for dataset_id in [3, 24, 31, 38, 44, 179, 737, 720]:
df = pd.read_csv("./tests/data/data/{0}.csv".format(dataset_id))
x_cols = [c for c in df.columns if c != "target"]
X = df[x_cols]
y = df["target"]

for repeat in range(1):

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
X, y, test_size=0.3, random_state=1706 + repeat
)
automl = AutoML(
total_time_limit=60 * 1, # 1h limit
algorithms=[ "Xgboost"], # ["LightGBM", "CatBoost", "Xgboost", "RF", "NN"],
start_random_models=3,
hill_climbing_steps=1,
top_models_to_improve=1,
train_ensemble=True,
verbose=True,
)
automl.fit(X_train, y_train)

response = automl.predict(X_test)
# Compute the logloss on test dataset
ll = log_loss(y_test, response)
print(
"{} {} {} {}".format(repeat, dataset_id, ll, automl._fit_time)
)
with open("./result.txt", "a") as f_result:
f_result.write(
"{} {} {} {}\n".format(repeat, dataset_id, ll, automl._fit_time)
)
Expand Down
7 changes: 3 additions & 4 deletions tests/test_iterative_learner_framework.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,11 @@ def test_save_and_load(self):
metric = Metric({"name": "logloss"})
loss = metric(self.y, il.predict(self.X))

json_desc = il.save()
print(json_desc)
json_desc = il.to_json()

il2 = IterativeLearner(self.train_params, callbacks=[])
il2 = IterativeLearner(json_desc.get("params"), callbacks=[])
self.assertTrue(il.uid != il2.uid)
il2.load(json_desc)
il2.from_json(json_desc)
self.assertTrue(il.uid == il2.uid)
loss2 = metric(self.y, il2.predict(self.X))
assert_almost_equal(loss, loss2)
Expand Down
4 changes: 2 additions & 2 deletions tests/test_iterative_learner_framework_with_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,11 +108,11 @@ def test_save_and_load(self):
metric = Metric({"name": "logloss"})
loss_1 = metric(self.data["train"]["y"], y_predicted)

json_desc = il.save()
json_desc = il.to_json()

il2 = IterativeLearner(self.train_params, callbacks=[])
self.assertTrue(il.uid != il2.uid)
il2.load(json_desc)
il2.from_json(json_desc)
self.assertTrue(il.uid == il2.uid)
y_predicted_2 = il2.predict(self.data["train"]["X"])
loss_2 = metric(self.data["train"]["y"], y_predicted_2)
Expand Down
4 changes: 2 additions & 2 deletions tests/tests_models/test_ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,9 @@ def test_save_load(self):
ensemble.fit(self.X, self.y)
y = ensemble.predict(self.X)
assert_almost_equal(y[0], 0.1)
ensemble_json = ensemble.save()
ensemble_json = ensemble.to_json()
ensemble2 = Ensemble()
ensemble2.load(ensemble_json)
ensemble2.from_json(ensemble_json)
y2 = ensemble2.predict(self.X)
assert_almost_equal(y2[0], 0.1)

Expand Down
30 changes: 17 additions & 13 deletions tests/tests_preprocessing/test_preprocessing_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ def test_run_fill_median_convert_integer(self):
self.assertEqual(X_train["col4"][3], 2)

params_json = ps.to_json()
print(params_json)

self.assertTrue("missing_values" in params_json)
self.assertTrue("categorical" in params_json)
self.assertTrue("categorical_y" not in params_json)
Expand Down Expand Up @@ -357,7 +357,6 @@ def test_to_and_from_json_run_fill_median_convert_integer(self):


def test_empty_column(self):
print("--------------------empty")
# training data
d = {
"col1": [np.nan, np.nan, np.nan, np.nan],
Expand All @@ -379,22 +378,27 @@ def test_empty_column(self):
}

ps = PreprocessingStep(preprocessing_params)

train_data, _ = ps.run(train_data={"X": X_train, "y": y_train})
X_train1 = train_data.get("X")

print("columns", X_train1.columns)

self.assertTrue("col1" not in X_train1.columns)
self.assertEqual(3, len(X_train1.columns))
train_data2 = ps.transform(validation_data={"X": X_train, "y": y_train})
X_train2 = train_data2.get("X")
self.assertTrue("col1" not in X_train2.columns)
self.assertEqual(3, len(X_train2.columns))
for col in ["col2", "col3", "col4"]:
self.assertTrue(col in X_train2.columns)

print("columns", X_train2.columns)

#for col in ["col1", "col2", "col3", "col4"]:
# self.assertTrue(col in X_train.columns)

#params_json = ps.to_json()
#self.assertFalse(params_json) # should be empty
params_json = ps.to_json()
ps2 = PreprocessingStep()
ps2.from_json(params_json)

train_data3 = ps2.transform(validation_data={"X": X_train, "y": y_train})
X_train3 = train_data3.get("X")
self.assertTrue("col1" not in X_train3.columns)
self.assertEqual(3, len(X_train3.columns))
for col in ["col2", "col3", "col4"]:
self.assertTrue(col in X_train3.columns)



Expand Down

0 comments on commit 6416529

Please sign in to comment.