working

mljar · Apr 9, 2019 · 6416529 · 6416529
1 parent 213b89b
commit 6416529
Show file tree

Hide file tree

Showing 13 changed files with 100 additions and 86 deletions.
diff --git a/supervised/automl.py b/supervised/automl.py
@@ -21,7 +21,7 @@
 class AutoML:
     def __init__(
         self,
-        total_time_limit=None,
+        total_time_limit=60*60,
         learner_time_limit=120,
         algorithms=["CatBoost", "Xgboost", "RF", "LightGBM", "NN"],
         start_random_models=10,
@@ -35,9 +35,9 @@ def __init__(
             learner_time_limit
         )  # time limit in seconds for single learner
         self._train_ensemble = train_ensemble
-        self._models = []
+        self._models = [] # instances of iterative learner framework or ensemble
         self._models_params_keys = []
-        self._best_model = None
+        self._best_model = None # an instance of the iterative learner framework or an ensemble
         self._validation = {"validation_type": "kfold", "k_folds": 5, "shuffle": True}
 
         self._start_random_models = start_random_models
@@ -47,7 +47,6 @@ def __init__(
         self._verbose = verbose
 
         if self._total_time_limit is not None:
-
             estimated_models_to_check = (
                 len(self._algorithms)
                 * (
@@ -59,7 +58,6 @@ def __init__(
             # set time limit for single model training
             # the 0.85 is safe scale factor, to not exceed time limit
             self._time_limit = self._total_time_limit * 0.85 / estimated_models_to_check
-            print("single time limit->", self._time_limit)
 
         if len(self._algorithms) == 0:
             self._algorithms = list(
@@ -113,7 +111,7 @@ def should_train_next(self, model_type):
         # no time limit, just train, dont ask
         if self._total_time_limit is None:
             return True
-        print(self._models_train_time)
+
         total_time_already_spend = (
             0
             if model_type not in self._models_train_time
@@ -125,10 +123,6 @@ def should_train_next(self, model_type):
             else np.mean(self._models_train_time[model_type])
         )
 
-        print("Total time already spend", total_time_already_spend)
-        print("Mean time already spend", mean_time_already_spend)
-        print(">", 0.85 * self._total_time_limit / float(len(self._algorithms)))
-
         if (
             total_time_already_spend + mean_time_already_spend
             < 0.85 * self._total_time_limit / float(len(self._algorithms))
@@ -197,6 +191,8 @@ def ensemble_step(self, y):
     def fit(self, X, y):
         start_time = time.time()
         X.reset_index(drop=True, inplace=True)
+        if not isinstance(y, pd.DataFrame):
+            y = pd.DataFrame(y)
         y.reset_index(drop=True, inplace=True)
 
         # start with not-so-random models
@@ -216,17 +212,18 @@ def fit(self, X, y):
         self._fit_time = time.time() - start_time
 
     def predict(self, X):
-        return self._best_model.predict(X)
+        if self._best_model is not None:
+            return self._best_model.predict(X)
+        return None
 
     def to_json(self):
-        save_details = []
-        for il in self._models:
-            save_details += [il.save()]
-        return save_details
+        return self._best_model.to_json() if self._best_model is not None else None
 
     def from_json(self, json_data):
-        self._models = []
-        for save_detail in json_data:
-            il = IterativeLearner()
-            il.load(save_detail)
-            self._models += [il]
+        # pretty sure that this can be easily refactored
+        if json_data["algorithm_short_name"] == "Ensemble":
+            self._best_model = Ensemble()
+            self._best_model.from_json(json_data)
+        else:
+            self._best_model = IterativeLearner(json_data.get("params"))
+            self._best_model.from_json(json_data)
diff --git a/supervised/iterative_learner_framework.py b/supervised/iterative_learner_framework.py
@@ -101,7 +101,7 @@ def predict(self, X):
             y_predicted += learner.predict(validation_data.get("X"))
         return y_predicted / float(len(self.learners))
 
-    def save(self):
+    def to_json(self):
         preprocessing = []
         for p in self.preprocessings:
             preprocessing += [p.to_json()]
@@ -118,14 +118,16 @@ def save(self):
             zf.close()
         desc = {
             "uid": self.uid,
+            "algorithm_short_name": self.get_name(),
             "framework_file": self.framework_file,
             "framework_file_path": self.framework_file_path,
             "preprocessing": preprocessing,
             "learners": learners_desc,
+            "params": self.params # this is needed when constructing a new Iterative Learner Framework
         }
         return desc
 
-    def load(self, json_desc):
+    def from_json(self, json_desc):
         self.uid = json_desc.get("uid", self.uid)
         self.framework_file = json_desc.get("framework_file", self.framework_file)
         self.framework_file_path = json_desc.get(

diff --git a/supervised/learner_framework.py b/supervised/learner_framework.py
@@ -60,8 +60,8 @@ def train(self, data):
     def predict(self, X):
         pass
 
-    def save(self):
+    def to_json(self):
         pass
 
-    def load(self, json_desc):
+    def from_json(self, json_desc):
         pass
diff --git a/supervised/models/ensemble.py b/supervised/models/ensemble.py
@@ -112,7 +112,7 @@ def predict(self, X):
             )
         return y_predicted / total_repeat
 
-    def save(self):
+    def to_json(self):
         models_json = []
         for selected in self.selected_models:
             model = selected["model"]
@@ -128,7 +128,7 @@ def save(self):
         }
         return json_desc
 
-    def load(self, json_desc):
+    def from_json(self, json_desc):
 
         self.library_version = json_desc.get("library_version", self.library_version)
         self.algorithm_name = json_desc.get("algorithm_name", self.algorithm_name)

diff --git a/supervised/models/learner_sklearn.py b/supervised/models/learner_sklearn.py
@@ -56,7 +56,7 @@ def __init__(self, params):
         super(SklearnTreesClassifierLearner, self).__init__(params)
 
     def fit(self, X, y):
-        self.model.fit(X, y)
+        self.model.fit(X, np.ravel(y))
         self.model.n_estimators += self.trees_in_step
 
     def predict(self, X):

diff --git a/supervised/preprocessing/preprocessing_step.py b/supervised/preprocessing/preprocessing_step.py
@@ -108,7 +108,7 @@ def run(self, train_data=None, validation_data=None):
                 columns_preprocessing,
             )
         )
-        print("Remove columns", cols_to_remove)
+
         if X_train is not None:
             X_train.drop(cols_to_remove, axis=1, inplace=True)
         if X_validation is not None:

diff --git a/supervised/tuner/preprocessing_tuner.py b/supervised/tuner/preprocessing_tuner.py
@@ -30,7 +30,7 @@ def get(required_preprocessing, data, machinelearning_task):
 
             # remove empty columns and columns with only one variable
             empty_column = np.sum(pd.isnull(X[col]) == True) == X.shape[0]
-            constant_column = len(np.unique(X[col])) == 1
+            constant_column = len(np.unique(X.loc[~pd.isnull(X[col]), col])) == 1
             if empty_column or constant_column:
                 preprocessing_to_apply += ["remove_column"]
                 columns_preprocessing[col] = preprocessing_to_apply

diff --git a/tests/test_automl.py b/tests/test_automl.py
@@ -25,18 +25,29 @@ def setUpClass(cls):
             random_state=0,
         )
         cls.X = pd.DataFrame(cls.X, columns=["f0", "f1", "f2", "f3", "f4"])
-        cls.y = pd.DataFrame(cls.y)
+        #cls.y = pd.DataFrame(cls.y)
 
     def test_fit_and_predict(self):
-        automl = AutoML(total_time_limit=1, algorithms=["Xgboost"])
+        metric = Metric({"name": "logloss"})
+
+        automl = AutoML(total_time_limit=10, algorithms=["Xgboost"],
+                        start_random_models=5,
+                        hill_climbing_steps=0)
         automl.fit(self.X, self.y)
 
-        # y_predicted = automl.predict(self.X)
-        # print(y_predicted)
-        # metric = Metric({"name": "logloss"})
-        # loss = metric(self.y, y_predicted)
-        # print("Loss", loss)
-        # self.assertTrue(y_predicted is not None)
+        y_predicted = automl.predict(self.X)
+        self.assertTrue(y_predicted is not None)
+        loss = metric(self.y, y_predicted)
+        self.assertTrue(loss < 0.5)
+
+        params = automl.to_json()
+        automl2 = AutoML()
+        automl2.from_json(params)
+
+        y_predicted2 = automl2.predict(self.X)
+        self.assertTrue(y_predicted2 is not None)
+        loss2 = metric(self.y, y_predicted2)
+        self.assertTrue(loss2 < 0.5)
 
 
 if __name__ == "__main__":

diff --git a/tests/test_automl_performance.py b/tests/test_automl_performance.py
@@ -15,35 +15,36 @@
 
 class AutoMLTestWithData(unittest.TestCase):
     def test_fit_and_predict(self):
-        with open("./result.txt", "a") as f_result:
-            for dataset_id in [38]:  # 3, 24, 31, 38, 44, 179, 737, 720
-                df = pd.read_csv("./tests/data/data/{0}.csv".format(dataset_id))
-                x_cols = [c for c in df.columns if c != "target"]
-                X = df[x_cols]
-                y = df["target"]
 
-                for repeat in range(10):
-
-                    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
-                        X, y, test_size=0.3, random_state=1706 + repeat
-                    )
-                    automl = AutoML(
-                        total_time_limit=60 * 1,  # 1h limit
-                        algorithms=[ "Xgboost"], # ["LightGBM", "CatBoost", "Xgboost", "RF", "NN"],
-                        start_random_models=3,
-                        hill_climbing_steps=0,
-                        top_models_to_improve=1,
-                        train_ensemble=True,
-                        verbose=True,
-                    )
-                    automl.fit(X_train, y_train)
-
-                    response = automl.predict(X_test)
-                    # Compute the logloss on test dataset
-                    ll = log_loss(y_test, response)
-                    print(
-                        "{} {} {} {}".format(repeat, dataset_id, ll, automl._fit_time)
-                    )
+        for dataset_id in [3, 24, 31, 38, 44, 179, 737, 720]:
+            df = pd.read_csv("./tests/data/data/{0}.csv".format(dataset_id))
+            x_cols = [c for c in df.columns if c != "target"]
+            X = df[x_cols]
+            y = df["target"]
+
+            for repeat in range(1):
+
+                X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+                    X, y, test_size=0.3, random_state=1706 + repeat
+                )
+                automl = AutoML(
+                    total_time_limit=60 * 1,  # 1 minute limit (value is in seconds)
+                    algorithms=[ "Xgboost"], # ["LightGBM", "CatBoost", "Xgboost", "RF", "NN"],
+                    start_random_models=3,
+                    hill_climbing_steps=1,
+                    top_models_to_improve=1,
+                    train_ensemble=True,
+                    verbose=True,
+                )
+                automl.fit(X_train, y_train)
+
+                response = automl.predict(X_test)
+                # Compute the logloss on test dataset
+                ll = log_loss(y_test, response)
+                print(
+                    "{} {} {} {}".format(repeat, dataset_id, ll, automl._fit_time)
+                )
+                with open("./result.txt", "a") as f_result:
                     f_result.write(
                         "{} {} {} {}\n".format(repeat, dataset_id, ll, automl._fit_time)
                     )

diff --git a/tests/test_iterative_learner_framework.py b/tests/test_iterative_learner_framework.py
@@ -70,12 +70,11 @@ def test_save_and_load(self):
         metric = Metric({"name": "logloss"})
         loss = metric(self.y, il.predict(self.X))
 
-        json_desc = il.save()
-        print(json_desc)
+        json_desc = il.to_json()
 
-        il2 = IterativeLearner(self.train_params, callbacks=[])
+        il2 = IterativeLearner(json_desc.get("params"), callbacks=[])
         self.assertTrue(il.uid != il2.uid)
-        il2.load(json_desc)
+        il2.from_json(json_desc)
         self.assertTrue(il.uid == il2.uid)
         loss2 = metric(self.y, il2.predict(self.X))
         assert_almost_equal(loss, loss2)

diff --git a/tests/test_iterative_learner_framework_with_preprocessing.py b/tests/test_iterative_learner_framework_with_preprocessing.py
@@ -108,11 +108,11 @@ def test_save_and_load(self):
         metric = Metric({"name": "logloss"})
         loss_1 = metric(self.data["train"]["y"], y_predicted)
 
-        json_desc = il.save()
+        json_desc = il.to_json()
 
         il2 = IterativeLearner(self.train_params, callbacks=[])
         self.assertTrue(il.uid != il2.uid)
-        il2.load(json_desc)
+        il2.from_json(json_desc)
         self.assertTrue(il.uid == il2.uid)
         y_predicted_2 = il2.predict(self.data["train"]["X"])
         loss_2 = metric(self.data["train"]["y"], y_predicted_2)

diff --git a/tests/tests_models/test_ensemble.py b/tests/tests_models/test_ensemble.py
@@ -63,9 +63,9 @@ def test_save_load(self):
         ensemble.fit(self.X, self.y)
         y = ensemble.predict(self.X)
         assert_almost_equal(y[0], 0.1)
-        ensemble_json = ensemble.save()
+        ensemble_json = ensemble.to_json()
         ensemble2 = Ensemble()
-        ensemble2.load(ensemble_json)
+        ensemble2.from_json(ensemble_json)
         y2 = ensemble2.predict(self.X)
         assert_almost_equal(y2[0], 0.1)
 

diff --git a/tests/tests_preprocessing/test_preprocessing_step.py b/tests/tests_preprocessing/test_preprocessing_step.py
@@ -171,7 +171,7 @@ def test_run_fill_median_convert_integer(self):
         self.assertEqual(X_train["col4"][3], 2)
 
         params_json = ps.to_json()
-        print(params_json)
+
         self.assertTrue("missing_values" in params_json)
         self.assertTrue("categorical" in params_json)
         self.assertTrue("categorical_y" not in params_json)
@@ -357,7 +357,6 @@ def test_to_and_from_json_run_fill_median_convert_integer(self):
 
 
     def test_empty_column(self):
-        print("--------------------empty")
         # training data
         d = {
             "col1": [np.nan, np.nan, np.nan, np.nan],
@@ -379,22 +378,27 @@ def test_empty_column(self):
         }
 
         ps = PreprocessingStep(preprocessing_params)
-
         train_data, _ = ps.run(train_data={"X": X_train, "y": y_train})
         X_train1 = train_data.get("X")
-
-        print("columns", X_train1.columns)
-
+        self.assertTrue("col1" not in X_train1.columns)
+        self.assertEqual(3, len(X_train1.columns))
         train_data2 = ps.transform(validation_data={"X": X_train, "y": y_train})
         X_train2 = train_data2.get("X")
+        self.assertTrue("col1" not in X_train2.columns)
+        self.assertEqual(3, len(X_train2.columns))
+        for col in ["col2", "col3", "col4"]:
+            self.assertTrue(col in X_train2.columns)
 
-        print("columns", X_train2.columns)
-
-        #for col in ["col1", "col2", "col3", "col4"]:
-        #    self.assertTrue(col in X_train.columns)
-
-        #params_json = ps.to_json()
-        #self.assertFalse(params_json)  # should be empty
+        params_json = ps.to_json()
+        ps2 = PreprocessingStep()
+        ps2.from_json(params_json)
+
+        train_data3 = ps2.transform(validation_data={"X": X_train, "y": y_train})
+        X_train3 = train_data3.get("X")
+        self.assertTrue("col1" not in X_train3.columns)
+        self.assertEqual(3, len(X_train3.columns))
+        for col in ["col2", "col3", "col4"]:
+            self.assertTrue(col in X_train3.columns)