Additional fixes to PR 777 (#967)
* Initial changes

* Deleting TODO that will be addressed by #968

* [skip ci] removing redundant imports

* [skip ci] Simplifying flow to generate prediction probabilities

* Triggering unit tests

* Fixing mypy and flake issues

* [skip ci] Replacing HistGradientBoostingClassifier

* Simplifying examples

* Minor typo fix
Neeratyoy committed Oct 29, 2020
1 parent 07e87ad commit 4923e5b
Showing 3 changed files with 36 additions and 73 deletions.
19 changes: 6 additions & 13 deletions examples/30_extended/run_setup_tutorial.py
@@ -34,14 +34,12 @@

import numpy as np
import openml
import sklearn.ensemble
import sklearn.impute
import sklearn.preprocessing
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD


openml.config.start_using_configuration_for_example()
@@ -58,9 +56,6 @@
# many potential hyperparameters. Of course, the model can be as complex or as
# simple as you want it to be.

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.decomposition import TruncatedSVD


# Helper functions to return required columns for ColumnTransformer
def cont(X):
@@ -77,18 +72,16 @@ def cat(X):
TruncatedSVD(),
)
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", "passthrough", cont)])
model_original = sklearn.pipeline.Pipeline(
steps=[("transform", ct), ("estimator", HistGradientBoostingClassifier()),]
)
model_original = Pipeline(steps=[("transform", ct), ("estimator", RandomForestClassifier()),])

# Let's change some hyperparameters. Of course, in any good application we
# would tune them using, e.g., Random Search or Bayesian Optimization, but for
# the purpose of this tutorial we set them to some specific values that might
# or might not be optimal
hyperparameters_original = {
"estimator__loss": "auto",
"estimator__learning_rate": 0.15,
"estimator__max_iter": 50,
"estimator__criterion": "gini",
"estimator__n_estimators": 50,
"estimator__max_depth": 10,
"estimator__min_samples_leaf": 1,
}
model_original.set_params(**hyperparameters_original)
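The tutorial pins these values by hand. As a rough sketch of the tuning the comment alludes to (illustrative only, not part of the example), the same nested parameter names could be searched with scikit-learn's RandomizedSearchCV, reusing the model_original pipeline defined above:

    from sklearn.model_selection import RandomizedSearchCV

    param_distributions = {
        "estimator__n_estimators": [10, 50, 100, 200],
        "estimator__max_depth": [5, 10, 20, None],
        "estimator__min_samples_leaf": [1, 2, 4],
    }
    search = RandomizedSearchCV(
        model_original, param_distributions, n_iter=10, cv=3, random_state=0
    )
    # search.fit(X, y) would evaluate 10 sampled configurations and keep the best one.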
44 changes: 9 additions & 35 deletions examples/30_extended/study_tutorial.py
@@ -15,13 +15,7 @@

import uuid

import numpy as np
import sklearn.tree
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.ensemble import RandomForestClassifier

import openml

@@ -71,45 +65,25 @@
)
print(evaluations.head())

from openml.testing import cat, cont
############################################################################
# Uploading studies
# =================
#
# Creating a study is as simple as creating any other kind of OpenML entity.
# In this example, we'll create a few runs for the OpenML-100 benchmark
# suite which is available on the OpenML test server.

openml.config.start_using_configuration_for_example()

# Model that can handle missing values
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier


# Helper functions to return required columns for ColumnTransformer
def cont(X):
return X.dtypes != "category"


def cat(X):
return X.dtypes == "category"
# Model to be used
clf = RandomForestClassifier()

# We'll create a study with one run on 3 datasets present in the suite
tasks = [115, 259, 307]

cat_imp = make_pipeline(
SimpleImputer(strategy="most_frequent"),
OneHotEncoder(handle_unknown="ignore", sparse=False),
TruncatedSVD(),
)
ct = ColumnTransformer(
[("cat", cat_imp, cat), ("cont", FunctionTransformer(lambda x: x, validate=False), cont)]
)
clf = sklearn.pipeline.Pipeline(
steps=[("transform", ct), ("estimator", HistGradientBoostingClassifier()),]
)

# To verify
suite = openml.study.get_suite(1)
# We'll create a study with one run on each of three randomly chosen datasets from the suite
tasks = np.random.choice(suite.tasks, size=3, replace=False)
print(all([t_id in suite.tasks for t_id in tasks]))

run_ids = []
for task_id in tasks:
task = openml.tasks.get_task(task_id)
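The diff is truncated at this point. As a minimal sketch (standard openml-python calls; the exact tutorial code may differ), the loop body presumably continues by running the classifier on each task, publishing the run, and collecting its id for the study:

    run = openml.runs.run_model_on_task(clf, task)
    run.publish()                 # upload the run to the test server
    run_ids.append(run.run_id)    # the collected ids later go into the new study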
46 changes: 21 additions & 25 deletions openml/extensions/sklearn/extension.py
@@ -1546,7 +1546,9 @@ def _run_model_on_fold(
fold_no: int,
y_train: Optional[np.ndarray] = None,
X_test: Optional[Union[np.ndarray, scipy.sparse.spmatrix, pd.DataFrame]] = None,
) -> Tuple[np.ndarray, pd.DataFrame, "OrderedDict[str, float]", Optional[OpenMLRunTrace]]:
) -> Tuple[
np.ndarray, Optional[pd.DataFrame], "OrderedDict[str, float]", Optional[OpenMLRunTrace]
]:
"""Run a model on a repeat,fold,subsample triplet of the task and return prediction
information.
@@ -1581,19 +1583,21 @@ def _run_model_on_fold(
-------
pred_y : np.ndarray
Predictions on the training/test set, depending on the task type.
For supervised tasks, predicitons are on the test set.
For unsupervised tasks, predicitons are on the training set.
proba_y : pd.DataFrame
For supervised tasks, predictions are on the test set.
For unsupervised tasks, predictions are on the training set.
proba_y : pd.DataFrame, optional
Predicted probabilities for the test set.
None, if task is not Classification or Learning Curve prediction.
user_defined_measures : OrderedDict[str, float]
User defined measures that were generated on this fold
trace : Optional[OpenMLRunTrace]]
trace : OpenMLRunTrace, optional
arff trace object from a fitted model and the trace content obtained by
repeatedly calling ``run_model_on_task``
"""

def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd.DataFrame:
def _prediction_to_probabilities(
y: np.ndarray, model_classes: List[Any], class_labels: Optional[List[str]]
) -> pd.DataFrame:
"""Transforms predicted probabilities to match with OpenML class indices.
Parameters
@@ -1603,28 +1607,26 @@ def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd.
training data).
model_classes : list
List of classes known_predicted by the model, ordered by their index.
class_labels : list
List of classes as stored in the task object fetched from server.
Returns
-------
pd.DataFrame
"""
if class_labels is None:
raise ValueError("The task has no class labels")

if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
if task.class_labels is not None:
if isinstance(y_train, np.ndarray) and isinstance(task.class_labels[0], str):
# mapping (decoding) the predictions to the categories
# creating a separate copy to not change the expected pred_y type
y = [task.class_labels[pred] for pred in y]
else:
raise ValueError("The task has no class labels")
else:
return None
if isinstance(y_train, np.ndarray) and isinstance(class_labels[0], str):
# mapping (decoding) the predictions to the categories
# creating a separate copy to not change the expected pred_y type
y = [class_labels[pred] for pred in y] # list or numpy array of predictions

# y: list or numpy array of predictions
# model_classes: sklearn classifier mapping from original array id to
# prediction index id
if not isinstance(model_classes, list):
raise ValueError("please convert model classes to list prior to calling this fn")

# DataFrame allows more accurate mapping of classes as column names
result = pd.DataFrame(
0, index=np.arange(len(y)), columns=model_classes, dtype=np.float32
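The hunk is cut off above, but the idea is visible. As a hedged, self-contained illustration of what this helper does (not the library code itself), hard class predictions become a one-hot DataFrame whose columns are the classes known to the model:

    import numpy as np
    import pandas as pd

    model_classes = ["no", "yes"]
    pred = ["yes", "no", "yes"]
    proba = pd.DataFrame(0.0, index=np.arange(len(pred)), columns=model_classes)
    for i, label in enumerate(pred):
        proba.loc[i, label] = 1.0
    # each row now holds probability 1.0 for the predicted class and 0.0 elsewhere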
@@ -1639,10 +1641,6 @@ def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd.
if X_test is None:
raise TypeError("argument X_test must not be of type None")

# TODO: if possible, give a warning if model is already fitted (acceptable
# in case of custom experimentation,
# but not desirable if we want to upload to OpenML).

model_copy = sklearn.base.clone(model, safe=True)
# sanity check: prohibit users from optimizing n_jobs
self._prevent_optimize_n_jobs(model_copy)
@@ -1732,10 +1730,7 @@ def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd.
proba_y = model_copy.predict_proba(X_test)
proba_y = pd.DataFrame(proba_y, columns=model_classes) # handles X_test as numpy
except AttributeError: # predict_proba is not available when probability=False
if task.class_labels is not None:
proba_y = _prediction_to_probabilities(pred_y, model_classes)
else:
raise ValueError("The task has no class labels")
proba_y = _prediction_to_probabilities(pred_y, model_classes, task.class_labels)
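For context, a small self-contained illustration (not from the diff) of when that except branch fires: scikit-learn's SVC only exposes predict_proba when built with probability=True, so with the default probability=False the hard predictions have to be converted instead:

    from sklearn.svm import SVC

    clf = SVC(probability=False).fit([[0.0], [1.0]], ["no", "yes"])
    hasattr(clf, "predict_proba")  # False, so calling it raises AttributeError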

if task.class_labels is not None:
if proba_y.shape[1] != len(task.class_labels):
@@ -1759,6 +1754,7 @@ def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd.
# adding missing columns with 0 probability
if col not in model_classes:
proba_y[col] = 0
# We re-order the columns to move possibly added missing columns into place.
proba_y = proba_y[task.class_labels]
else:
raise ValueError("The task has no class labels")
