FeatureUnion, PermutationImportance, documentation and code cleanup
mghasemi committed Mar 5, 2019
1 parent fd12571 commit 3db7571
Showing 3 changed files with 151 additions and 166 deletions.
262 changes: 101 additions & 161 deletions SKSurrogate/aml.py
@@ -491,86 +491,11 @@ def _cast(self, n, X, y):
for seq in Pop:
if not self._validate_sequence(seq):
continue
idx = 0
ent_idx = 0
steps = []
config = {}
task_name = self.check_point + '_'.join(seq)
# for est in seq:
while ent_idx < n:
est = seq[ent_idx]
clss = self._get_class(est)
# mdl = clss()
pre = 'stp_%d' % idx
if self.config_types[est] in ['regressor', 'classifier'] and ent_idx < n - 1:
mdl = clss()
steps.append((pre, StackingEstimator(mdl, res=self.stack_res,
probs=self.stack_probs,
decision=self.stack_decision)))
ent_idx += 1
elif est == 'sklearn.pipeline.FeatureUnion':
self.config[est] = dict()
int_idx = 1
int_steps = []
next_est = seq[ent_idx + int_idx]
while ((self.config_types[next_est] in ['regressor', 'classifier']) or (
next_est in self.known_feature_selectors)) and (ent_idx + int_idx < n - 1):
int_pre = "int_%d" % int_idx
if next_est in self.known_feature_selectors:
int_mdl = self._get_class(next_est)()
# set the parameter's dictionary
for kw in self.config[next_est]:
self.config[est][int_pre + '__' + kw] = self.config[next_est][kw]
else:
from eli5.sklearn import PermutationImportance
from sklearn.feature_selection import SelectFromModel
from numpy import inf
int_est = self._get_class(next_est)()
int_mdl = SelectFromModel(PermutationImportance(int_est, cv=3),
threshold=-inf)
self.config[est][int_pre + '__' + 'max_features'] = Integer(1, self.num_features)
for kw in self.config[next_est]:
self.config[est][int_pre + '__' + 'estimator__estimator__' + kw] = \
self.config[next_est][kw]
int_steps.append((int_pre, int_mdl))
int_idx += 1
next_est = seq[ent_idx + int_idx]
if int_steps != []:
mdl = clss(int_steps)
steps.append((pre, mdl))
ent_idx += int_idx
else:
mdl = clss()
steps.append((pre, mdl))
ent_idx += 1
for kw in self.config[est]:
config[pre + '__' + kw] = self.config[est][kw]
idx += 1
ppln = Pipeline(steps)
best_mdl, best_scr = self.optimize_pipeline(seq, X, y)
self.models[seq] = (best_mdl, best_scr)
if self.verbose > 0:
print("=" * 90)
print(seq)
print("-" * 90)
OPTIM = None
for srgt in self.surrogates:
OPTIM = SurrogateRandomCV(ppln,
params=config,
max_iter=srgt[1],
min_evals=self.min_random_evals,
scoring=self.scoring,
cv=self.cv,
verbose=max(self.verbose - 1, 0),
sampling=srgt[2],
regressor=srgt[0],
scipy_solver=srgt[3],
task_name=task_name,
Continue=True,
warm_start=True)
OPTIM.fit(X, y)
self.models[seq] = (OPTIM.best_estimator_, OPTIM.best_estimator_score)
if self.verbose > 0:
print("score:%f" % OPTIM.best_estimator_score)
print(OPTIM.best_estimator_)
print("score:%f" % best_scr)
print(best_mdl)

def fit(self, X, y):
"""
@@ -663,90 +588,13 @@ def _eval(ppl):
from collections import OrderedDict
fitted = OrderedDict([])
for seq in ppl:
if not self._validate_sequence(seq):
continue
n = len(seq)
idx = 0
ent_idx = 0
steps = []
config = {}
task_name = self.check_point + '_'.join(seq)
while ent_idx < n:
est = seq[ent_idx]
clss = self._get_class(est)
# mdl = clss()
pre = 'stp_%d' % idx
if self.config_types[est] in ['regressor', 'classifier'] and ent_idx < n - 1:
mdl = clss()
steps.append((pre, StackingEstimator(mdl, res=self.stack_res,
probs=self.stack_probs,
decision=self.stack_decision)))
ent_idx += 1
elif est == 'sklearn.pipeline.FeatureUnion':
self.config[est] = dict()
int_idx = 1
int_steps = []
next_est = seq[ent_idx + int_idx]
while ((self.config_types[next_est] in ['regressor', 'classifier']) or (
next_est in self.known_feature_selectors)) and (ent_idx + int_idx < n - 1):
int_pre = "int_%d" % int_idx
if next_est in self.known_feature_selectors:
int_mdl = self._get_class(next_est)()
# set the parameter's dictionary
for kw in self.config[next_est]:
self.config[est][int_pre + '__' + kw] = self.config[next_est][kw]
else:
from eli5.sklearn import PermutationImportance
from sklearn.feature_selection import SelectFromModel
from numpy import inf
int_est = self._get_class(next_est)()
int_mdl = SelectFromModel(PermutationImportance(int_est, cv=3),
threshold=-inf)
self.config[est][int_pre + '__' + 'max_features'] = Integer(1, self.num_features)
for kw in self.config[next_est]:
self.config[est][int_pre + '__' + 'estimator__estimator__' + kw] = \
self.config[next_est][kw]
int_steps.append((int_pre, int_mdl))
int_idx += 1
next_est = seq[ent_idx + int_idx]
if int_steps != []:
mdl = clss(int_steps)
steps.append((pre, mdl))
ent_idx += int_idx
else:
mdl = clss()
steps.append((pre, mdl))
ent_idx += 1
for kw in self.config[est]:
config[pre + '__' + kw] = self.config[est][kw]
idx += 1
ppln = Pipeline(steps)
if self.verbose > 0:
print("=" * 90)
print(seq)
print("-" * 90)
OPTIM = None
for srgt in self.surrogates:
OPTIM = SurrogateRandomCV(ppln,
params=config,
max_iter=srgt[1],
min_evals=self.min_random_evals,
scoring=self.scoring,
cv=self.cv,
verbose=max(self.verbose - 1, 0),
sampling=srgt[2],
regressor=srgt[0],
scipy_solver=srgt[3],
task_name=task_name,
Continue=True,
warm_start=True)
OPTIM.fit(X_, y_)
best_mdl, best_scr = self.optimize_pipeline(seq, X_, y_)
if seq not in self.models:
self.models[seq] = (OPTIM.best_estimator_, OPTIM.best_estimator_score)
self.models[seq] = (best_mdl, best_scr)
if self.verbose > 0:
print("score:%f" % OPTIM.best_estimator_score)
print(OPTIM.best_estimator_)
fitted[seq] = -OPTIM.best_estimator_score
print("score:%f" % best_scr)
print(best_mdl)
fitted[seq] = -best_scr
return fitted

num_parents = kwargs.pop('num_parents', 30)
@@ -766,3 +614,95 @@ def get_top(self, num=5):
"""
from collections import OrderedDict
return OrderedDict(sorted(self.models.items(), key=lambda x: x[1][1])[:num])

def optimize_pipeline(self, seq, X, y):
"""
Constructs and optimizes a pipeline according to the steps passed through `seq`, which is a
tuple of estimator and transformer names.
:param seq: the tuple of steps of the pipeline to be optimized
:param X: numpy array of training features
:param y: numpy array of training values
:return: the optimized pipeline and its score
"""
from .structsearch import SurrogateRandomCV
if self.couldBfirst == []:
from sklearn.pipeline import Pipeline
else:
from imblearn.pipeline import Pipeline
OPTIM = None
n = len(seq)
idx = 0
ent_idx = 0
steps = []
config = {}
task_name = self.check_point + '_'.join(seq)
while ent_idx < n:
est = seq[ent_idx]
clss = self._get_class(est)
pre = 'stp_%d' % idx
if self.config_types[est] in ['regressor', 'classifier'] and ent_idx < n - 1:
mdl = clss()
steps.append((pre, StackingEstimator(mdl, res=self.stack_res,
probs=self.stack_probs,
decision=self.stack_decision)))
ent_idx += 1
elif est == 'sklearn.pipeline.FeatureUnion':
self.config[est] = dict()
int_idx = 1
int_steps = []
next_est = seq[ent_idx + int_idx]
while ((self.config_types[next_est] in ['regressor', 'classifier']) or (
next_est in self.known_feature_selectors)) and (ent_idx + int_idx < n - 1):
int_pre = "int_%d" % int_idx
if next_est in self.known_feature_selectors:
int_mdl = self._get_class(next_est)()
# set the parameter's dictionary
for kw in self.config[next_est]:
self.config[est][int_pre + '__' + kw] = self.config[next_est][kw]
else:
from eli5.sklearn import PermutationImportance
from sklearn.feature_selection import SelectFromModel
from numpy import inf
int_est = self._get_class(next_est)()
int_mdl = SelectFromModel(PermutationImportance(int_est, cv=3),
threshold=-inf)
self.config[est][int_pre + '__' + 'max_features'] = Integer(1, self.num_features)
for kw in self.config[next_est]:
self.config[est][int_pre + '__' + 'estimator__estimator__' + kw] = \
self.config[next_est][kw]
int_steps.append((int_pre, int_mdl))
int_idx += 1
next_est = seq[ent_idx + int_idx]
if int_steps != []:
mdl = clss(int_steps)
steps.append((pre, mdl))
ent_idx += int_idx
else:
mdl = clss()
steps.append((pre, mdl))
ent_idx += 1
for kw in self.config[est]:
config[pre + '__' + kw] = self.config[est][kw]
idx += 1
ppln = Pipeline(steps)
if self.verbose > 0:
print("=" * 90)
print(seq)
print("-" * 90)
for srgt in self.surrogates:
OPTIM = SurrogateRandomCV(ppln,
params=config,
max_iter=srgt[1],
min_evals=self.min_random_evals,
scoring=self.scoring,
cv=self.cv,
verbose=max(self.verbose - 1, 0),
sampling=srgt[2],
regressor=srgt[0],
scipy_solver=srgt[3],
task_name=task_name,
Continue=True,
warm_start=True)
OPTIM.fit(X, y)
return OPTIM.best_estimator_, OPTIM.best_estimator_score
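
With this change, the former inline blocks in `_cast` and `_eval` collapse to the same two-line
pattern; a sketch (inside the class, with `seq`, `X`, `y` as in `_cast`):

    best_mdl, best_scr = self.optimize_pipeline(seq, X, y)  # build, tune, and score the pipeline
    self.models[seq] = (best_mdl, best_scr)                 # cache the result for get_top()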
4 changes: 2 additions & 2 deletions SKSurrogate/sensapprx.py
@@ -31,8 +31,8 @@ class SensAprx(BaseEstimator, TransformerMixin):
:param probs: pre-calculated values associated to `domain` points
"""

def __init__(self, n_features_to_select=10, regressor=None, method='sobol', margin=.2, num_smpl=600, num_levels=6,
grid_jump=1, num_resmpl=10, reduce=False, domain=None, probs=None):
def __init__(self, n_features_to_select=10, regressor=None, method='sobol', margin=.2, num_smpl=500, num_levels=5,
grid_jump=1, num_resmpl=8, reduce=False, domain=None, probs=None):
self.n_features_to_select = n_features_to_select
self.regressor = regressor
self.method = method
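
The only change here is lighter default sampling (``num_smpl`` 600 → 500, ``num_levels`` 6 → 5,
``num_resmpl`` 10 → 8); a minimal instantiation sketch relying on the new defaults:

    from SKSurrogate.sensapprx import SensAprx

    # num_smpl=500, num_levels=5 and num_resmpl=8 are now the defaults
    sens = SensAprx(n_features_to_select=10, method='sobol')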
51 changes: 48 additions & 3 deletions docs/opd.rst
@@ -131,19 +131,64 @@ The case of ``imblearn.base.SamplerMixin``, ``BaseSampler`` can only occur at th
pipeline. The rest could be ``RegressorMixin``, ``ClassifierMixin`` or ``TransformerMixin``.

Stacking
---------------------
--------------------------
If a step that is not a ``TransformerMixin`` occurs in the middle of a pipeline, it is wrapped in a
``StackingEstimator``, which transforms the data by appending columns derived from the predictions
of the wrapped ``RegressorMixin`` or ``ClassifierMixin``.
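
A minimal sketch, assuming ``StackingEstimator`` is importable from ``SKSurrogate.aml`` and
accepts the ``res``/``probs``/``decision`` flags used in this commit:

.. code-block:: python

    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from SKSurrogate.aml import StackingEstimator  # import path assumed

    ppl = Pipeline([
        # mid-pipeline classifier: its predictions are appended as extra columns
        ('stp_0', StackingEstimator(LogisticRegression(), res=True, probs=True)),
        # the final estimator sees the original features plus the stacked columns
        ('stp_1', RandomForestClassifier()),
    ])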

Permutation Importance
--------------------------
If ``sklearn.pipeline.FeatureUnion`` is included in the config dictionary, two scenarios are
possible within a pipeline:

+ **`FeatureUnion` is followed by a series of transformations:** in this case `FeatureUnion`
does exactly what is expected, i.e., gathers all the feature outputs of transformers;
+ **`FeatureUnion` is followed by a mixture of transformations and estimators:** then
  `SKSurrogate` uses ``eli5.sklearn.PermutationImportance`` to weight the features based on
  the estimator and then selects the top features via ``sklearn.feature_selection.SelectFromModel``
  (see the sketch after this list).
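
A minimal sketch of that wrapper, mirroring the pattern used in this commit (the base estimator
is an arbitrary placeholder):

.. code-block:: python

    from numpy import inf
    from eli5.sklearn import PermutationImportance
    from sklearn.feature_selection import SelectFromModel
    from sklearn.ensemble import RandomForestClassifier

    # rank features by permutation importance under 3-fold CV; threshold=-inf
    # disables threshold filtering so max_features alone decides how many
    # features survive (the commit searches it as Integer(1, num_features))
    selector = SelectFromModel(
        PermutationImportance(RandomForestClassifier(), cv=3),
        threshold=-inf,
        max_features=10,
    )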

Not all transformers select a subset of features (e.g., `Normalizer` or `StandardScaler`). If
`FeatureUnion` is followed by such transformers, it has no effect on their outcome. If the
transformer selects a subset of features (`VarianceThreshold`, `skrebate.ReliefF`), then
`FeatureUnion` collects the outcomes and returns their union. This is also true for
`PermutationImportance`. The `FeatureUnion` affects the subsequent transformers and estimators until
it reaches the last step or a transformer that is not a feature selector. Subclasses of
``sklearn.feature_selection.base.SelectorMixin`` are considered feature selectors, as are the
following transformers (a sketch follows the list):

- `FactorAnalysis`
- `FastICA`
- `IncrementalPCA`
- `KernelPCA`
- `LatentDirichletAllocation`
- `MiniBatchDictionaryLearning`
- `MiniBatchSparsePCA`
- `NMF`
- `PCA`
- `SparsePCA`
- `TruncatedSVD`
- `VarianceThreshold`
- `LocallyLinearEmbedding`
- `Isomap`
- `MDS`
- `SpectralEmbedding`
- `TSNE`
- `sksurrogate.SensAprx`
- `skrebate.ReliefF`
- `skrebate.SURF`
- `skrebate.SURFstar`
- `skrebate.MultiSURF`
- `skrebate.MultiSURFstar`
- `skrebate.TuRF`
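
For instance, a `FeatureUnion` consisting of one known feature selector and one wrapped estimator
would be assembled as in the following sketch (step names mirror the generated ``int_%d``
prefixes; the concrete estimators are placeholders):

.. code-block:: python

    from numpy import inf
    from sklearn.pipeline import FeatureUnion
    from sklearn.feature_selection import SelectFromModel, VarianceThreshold
    from sklearn.linear_model import LogisticRegression
    from eli5.sklearn import PermutationImportance

    union = FeatureUnion([
        # a known feature selector is used as-is
        ('int_1', VarianceThreshold()),
        # an estimator is wrapped for selection; max_features is left to the
        # hyperparameter search, as in this commit
        ('int_2', SelectFromModel(PermutationImportance(LogisticRegression(), cv=3),
                                  threshold=-inf)),
    ])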

imblearn pipelines
---------------------
--------------------------
If an ``imblearn`` sampler is included in the `config` dictionary, then
``imblearn.pipeline.Pipeline`` is used instead of ``sklearn.pipeline.Pipeline``, which enables
the pipeline to include `imblearn <https://imbalanced-learn.readthedocs.io/en/stable/index.html>`_
samplers too.
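
A minimal sketch with a sampler as the first step (the choice of `SMOTE` is an arbitrary example):

.. code-block:: python

    from imblearn.pipeline import Pipeline
    from imblearn.over_sampling import SMOTE
    from sklearn.linear_model import LogisticRegression

    # sampler steps such as SMOTE are only accepted by imblearn's Pipeline
    ppl = Pipeline([
        ('stp_0', SMOTE()),
        ('stp_1', LogisticRegression()),
    ])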

Categorical Variables
---------------------
--------------------------
If there are fields in the data that need to be treated as categorical, one can provide a
list of their indices through `cat_cols`. The data is then transformed via
``category_encoders.one_hot.OneHotEncoder`` before being passed to the pipelines.
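
A minimal sketch, assuming columns 0 and 3 of the training data are categorical:

.. code-block:: python

    from category_encoders.one_hot import OneHotEncoder

    # mirrors passing cat_cols=[0, 3]; X and y are the raw training data
    encoder = OneHotEncoder(cols=[0, 3])
    X_encoded = encoder.fit_transform(X, y)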
