Additional fixes to PR 777 (#967)
* Initial changes

* Deleting TODO that will be addressed by #968

* [skip ci] removing redundant imports

* [skip ci] Simplifying flow to generate prediction probabilities

* Triggering unit tests

* Fixing mypy and flake issues

* [skip ci] Replacing HistGradientBoostingClassifier

* Simplifying examples

* Minor typo fix
Neeratyoy committed Oct 29, 2020
1 parent 07e87ad commit 4923e5b
Showing 3 changed files with 36 additions and 73 deletions.
19 changes: 6 additions & 13 deletions examples/30_extended/run_setup_tutorial.py
@@ -34,14 +34,12 @@

import numpy as np
import openml
import sklearn.ensemble
import sklearn.impute
import sklearn.preprocessing
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD


openml.config.start_using_configuration_for_example()
@@ -58,9 +56,6 @@
# many potential hyperparameters. Of course, the model can be as complex or as
# simple as you want it to be.

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.decomposition import TruncatedSVD


# Helper functions to return required columns for ColumnTransformer
def cont(X):
@@ -77,18 +72,16 @@ def cat(X):
TruncatedSVD(),
)
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", "passthrough", cont)])
model_original = sklearn.pipeline.Pipeline(
steps=[("transform", ct), ("estimator", HistGradientBoostingClassifier()),]
)
model_original = Pipeline(steps=[("transform", ct), ("estimator", RandomForestClassifier()),])

# Let's change some hyperparameters. Of course, in any good application we
# would tune them using, e.g., Random Search or Bayesian Optimization, but for
# the purpose of this tutorial we set them to some specific values that might
# or might not be optimal
hyperparameters_original = {
"estimator__loss": "auto",
"estimator__learning_rate": 0.15,
"estimator__max_iter": 50,
"estimator__criterion": "gini",
"estimator__n_estimators": 50,
"estimator__max_depth": 10,
"estimator__min_samples_leaf": 1,
}
model_original.set_params(**hyperparameters_original)
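The tutorial pins these values by hand. As a rough sketch of the tuning the comment alludes to (illustrative only, not part of the example), the same nested parameter names could be searched with scikit-learn's RandomizedSearchCV, reusing the model_original pipeline defined above:

    from sklearn.model_selection import RandomizedSearchCV

    param_distributions = {
        "estimator__n_estimators": [10, 50, 100, 200],
        "estimator__max_depth": [5, 10, 20, None],
        "estimator__min_samples_leaf": [1, 2, 4],
    }
    search = RandomizedSearchCV(
        model_original, param_distributions, n_iter=10, cv=3, random_state=0
    )
    # search.fit(X, y) would evaluate 10 sampled configurations and keep the best one.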
44 changes: 9 additions & 35 deletions examples/30_extended/study_tutorial.py
@@ -15,13 +15,7 @@

import uuid

import numpy as np
import sklearn.tree
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.ensemble import RandomForestClassifier

import openml

@@ -71,45 +65,25 @@
)
print(evaluations.head())

from openml.testing import cat, cont
############################################################################
# Uploading studies
# =================
#
# Creating a study is as simple as creating any other kind of OpenML entity.
# In this example, we'll create a few runs for the OpenML-100 benchmark
# suite which is available on the OpenML test server.

openml.config.start_using_configuration_for_example()

# Model that can handle missing values
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier


# Helper functions to return required columns for ColumnTransformer
def cont(X):
return X.dtypes != "category"


def cat(X):
return X.dtypes == "category"
# Model to be used
clf = RandomForestClassifier()

# We'll create a study with one run on 3 datasets present in the suite
tasks = [115, 259, 307]

cat_imp = make_pipeline(
SimpleImputer(strategy="most_frequent"),
OneHotEncoder(handle_unknown="ignore", sparse=False),
TruncatedSVD(),
)
ct = ColumnTransformer(
[("cat", cat_imp, cat), ("cont", FunctionTransformer(lambda x: x, validate=False), cont)]
)
clf = sklearn.pipeline.Pipeline(
steps=[("transform", ct), ("estimator", HistGradientBoostingClassifier()),]
)

# To verify
suite = openml.study.get_suite(1)
# We'll create a study with one run on each of three randomly chosen datasets from the suite
tasks = np.random.choice(suite.tasks, size=3, replace=False)
print(all([t_id in suite.tasks for t_id in tasks]))

run_ids = []
for task_id in tasks:
task = openml.tasks.get_task(task_id)
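The diff is truncated at this point. As a minimal sketch (standard openml-python calls; the exact tutorial code may differ), the loop body presumably continues by running the classifier on each task, publishing the run, and collecting its id for the study:

    run = openml.runs.run_model_on_task(clf, task)
    run.publish()                 # upload the run to the test server
    run_ids.append(run.run_id)    # the collected ids later go into the new study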
46 changes: 21 additions & 25 deletions openml/extensions/sklearn/extension.py
@@ -1546,7 +1546,9 @@ def _run_model_on_fold(
fold_no: int,
y_train: Optional[np.ndarray] = None,
X_test: Optional[Union[np.ndarray, scipy.sparse.spmatrix, pd.DataFrame]] = None,
) -> Tuple[np.ndarray, pd.DataFrame, "OrderedDict[str, float]", Optional[OpenMLRunTrace]]:
) -> Tuple[
np.ndarray, Optional[pd.DataFrame], "OrderedDict[str, float]", Optional[OpenMLRunTrace]
]:
"""Run a model on a repeat,fold,subsample triplet of the task and return prediction
information.
@@ -1581,19 +1583,21 @@ def _run_model_on_fold(
-------
pred_y : np.ndarray
Predictions on the training/test set, depending on the task type.
For supervised tasks, predicitons are on the test set.
For unsupervised tasks, predicitons are on the training set.
proba_y : pd.DataFrame
For supervised tasks, predictions are on the test set.
For unsupervised tasks, predictions are on the training set.
proba_y : pd.DataFrame, optional
Predicted probabilities for the test set.
None, if task is not Classification or Learning Curve prediction.
user_defined_measures : OrderedDict[str, float]
User defined measures that were generated on this fold
trace : Optional[OpenMLRunTrace]]
trace : OpenMLRunTrace, optional
arff trace object from a fitted model and the trace content obtained by
repeatedly calling ``run_model_on_task``
"""

def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd.DataFrame:
def _prediction_to_probabilities(
y: np.ndarray, model_classes: List[Any], class_labels: Optional[List[str]]
) -> pd.DataFrame:
"""Transforms predicted probabilities to match with OpenML class indices.
Parameters
@@ -1603,28 +1607,26 @@ def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd.
training data).
model_classes : list
List of classes known_predicted by the model, ordered by their index.
class_labels : list
List of classes as stored in the task object fetched from server.
Returns
-------
pd.DataFrame
"""
if class_labels is None:
raise ValueError("The task has no class labels")

if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
if task.class_labels is not None:
if isinstance(y_train, np.ndarray) and isinstance(task.class_labels[0], str):
# mapping (decoding) the predictions to the categories
# creating a separate copy to not change the expected pred_y type
y = [task.class_labels[pred] for pred in y]
else:
raise ValueError("The task has no class labels")
else:
return None
if isinstance(y_train, np.ndarray) and isinstance(class_labels[0], str):
# mapping (decoding) the predictions to the categories
# creating a separate copy to not change the expected pred_y type
y = [class_labels[pred] for pred in y] # list or numpy array of predictions

# y: list or numpy array of predictions
# model_classes: sklearn classifier mapping from original array id to
# prediction index id
if not isinstance(model_classes, list):
raise ValueError("please convert model classes to list prior to calling this fn")

# DataFrame allows more accurate mapping of classes as column names
result = pd.DataFrame(
0, index=np.arange(len(y)), columns=model_classes, dtype=np.float32
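The hunk is cut off above, but the idea is visible. As a hedged, self-contained illustration of what this helper does (not the library code itself), hard class predictions become a one-hot DataFrame whose columns are the classes known to the model:

    import numpy as np
    import pandas as pd

    model_classes = ["no", "yes"]
    pred = ["yes", "no", "yes"]
    proba = pd.DataFrame(0.0, index=np.arange(len(pred)), columns=model_classes)
    for i, label in enumerate(pred):
        proba.loc[i, label] = 1.0
    # each row now holds probability 1.0 for the predicted class and 0.0 elsewhere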
@@ -1639,10 +1641,6 @@ def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd.
if X_test is None:
raise TypeError("argument X_test must not be of type None")

# TODO: if possible, give a warning if model is already fitted (acceptable
# in case of custom experimentation,
# but not desirable if we want to upload to OpenML).

model_copy = sklearn.base.clone(model, safe=True)
# sanity check: prohibit users from optimizing n_jobs
self._prevent_optimize_n_jobs(model_copy)
@@ -1732,10 +1730,7 @@ def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd.
proba_y = model_copy.predict_proba(X_test)
proba_y = pd.DataFrame(proba_y, columns=model_classes) # handles X_test as numpy
except AttributeError: # predict_proba is not available when probability=False
if task.class_labels is not None:
proba_y = _prediction_to_probabilities(pred_y, model_classes)
else:
raise ValueError("The task has no class labels")
proba_y = _prediction_to_probabilities(pred_y, model_classes, task.class_labels)
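For context, a small self-contained illustration (not from the diff) of when that except branch fires: scikit-learn's SVC only exposes predict_proba when built with probability=True, so with the default probability=False the hard predictions have to be converted instead:

    from sklearn.svm import SVC

    clf = SVC(probability=False).fit([[0.0], [1.0]], ["no", "yes"])
    hasattr(clf, "predict_proba")  # False, so calling it raises AttributeError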

if task.class_labels is not None:
if proba_y.shape[1] != len(task.class_labels):
@@ -1759,6 +1754,7 @@ def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd.
# adding missing columns with 0 probability
if col not in model_classes:
proba_y[col] = 0
# We re-order the columns to move possibly added missing columns into place.
proba_y = proba_y[task.class_labels]
else:
raise ValueError("The task has no class labels")
