## generate scikit-learn model configurations WIP

The following function generates drop-in model configurations for an mlrun training pipeline.

It locates every class in the scikit-learn package that has a `fit` method and writes that class's `__init__` and `fit` parameters, **with their default values**, to a JSON file (or to a pickle file when a default value is not JSON-serializable) that can be read and passed **as-is** to an mlrun pipeline.

Automatic parameter tuning can then take over and find a better model for eventual deployment.

In [94]:
import json, os, inspect 
from importlib import import_module
import pandas as pd
from cloudpickle import dump, load
from sklearn.utils.testing import all_estimators
from inspect import getfullargspec, FullArgSpec

def _signature_defaults(spec):
    """Map every argument of a signature (except ``self``) to its default.

    Arguments that have no default are mapped to ``None``. Keys are emitted
    in reverse signature order (last argument first): keyword-only
    arguments first, then positional arguments.

    :param spec:  a ``FullArgSpec`` as returned by ``inspect.getfullargspec``
    :returns:     dict of argument name -> default value
    """
    positional = list(spec.args)
    defaults = list(spec.defaults or ())
    # defaults belong to the *last* positional args, so left-pad with None
    padded = [None] * (len(positional) - len(defaults)) + defaults

    kwonly_defaults = spec.kwonlydefaults or {}
    paired = {}
    for name in reversed(spec.kwonlyargs):
        paired[name] = kwonly_defaults.get(name)
    for name, default in zip(reversed(positional), reversed(padded)):
        if name != 'self':
            paired[name] = default
    return paired


def gen_sklearn_model_configs(
    attr: str = 'predict_proba',
    target_path: str = 'models'
):
    """Generate model configs for all models with a specific attribute

    For every estimator returned by ``all_estimators()`` that has the
    attribute `attr`, extracts the class ``__init__`` parameters and the
    ``fit`` method parameters together with their default values
    (``None`` when a parameter has no default) and saves them as
    ``{"CLASS_PARAMS": {...}, "FIT_PARAMS": {...}}``.

    Each config is written to ``<target_path>/<class name>.json``; when the
    defaults are not JSON-serializable (for example a function default) the
    config is written to ``<target_path>/<class name>.pkl`` instead. A
    summary table is written to ``<target_path>/sklearn_class_list.csv``.

    :param attr:         the attribute filter, for example only classes
                         with a `predict_proba` method
    :param target_path:  destination folder for config files.
    :returns:            DataFrame with columns ``classes`` and
                         ``config_format`` ('json' or 'pickle'), sorted by
                         format and then class name
    """
    os.makedirs(target_path, exist_ok=True)

    # inspect every signature up front so a class that cannot be
    # introspected fails before any file is written
    selected = [
        (name, getfullargspec(class_), getfullargspec(class_.fit))
        for name, class_ in all_estimators()
        if hasattr(class_, attr)
    ]

    classes = []
    config_format = []
    for model, init_spec, fit_spec in selected:
        model_json = {
            "CLASS_PARAMS": _signature_defaults(init_spec),
            "FIT_PARAMS": _signature_defaults(fit_spec),
        }

        # HACK: save this model config, most as json, a few with functions
        # as params are saved as pickle for now.
        # Serialize in memory first so a failed attempt never leaves a
        # truncated .json file next to the .pkl fallback.
        try:
            payload = json.dumps(model_json)
        except (TypeError, ValueError):
            with open(f'{target_path}/{model}.pkl', 'wb') as fh:
                dump(model_json, fh)
            fmt = 'pickle'
        else:
            with open(f'{target_path}/{model}.json', 'w') as fh:
                fh.write(payload)
            fmt = 'json'

        classes.append(model)
        config_format.append(fmt)

    df = pd.DataFrame({"classes": classes, "config_format": config_format})
    df.sort_values(["config_format", "classes"], inplace=True)
    df.to_csv(f'{target_path}/sklearn_class_list.csv')

    return df

# attr='fit' selects every estimator that exposes a `fit` attribute;
# the returned table lists each class and the format its config was saved in
models = gen_sklearn_model_configs(attr='fit')
models

Unnamed: 0,classes,config_format
0,ARDRegression,json
1,AdaBoostClassifier,json
2,AdaBoostRegressor,json
3,AdditiveChi2Sampler,json
4,AffinityPropagation,json
...,...,...
165,SelectFpr,pickle
167,SelectFwe,pickle
168,SelectKBest,pickle
169,SelectPercentile,pickle


In [95]:
# show only the classes whose config was saved as pickle instead of json
models[models.config_format=='pickle']

Unnamed: 0,classes,config_format
20,CountVectorizer,pickle
24,DictVectorizer,pickle
38,FeatureAgglomeration,pickle
39,FeatureHasher,pickle
47,GenericUnivariateSelect,pickle
55,HashingVectorizer,pickle
124,OneHotEncoder,pickle
127,OrdinalEncoder,pickle
164,SelectFdr,pickle
165,SelectFpr,pickle


In [96]:
# json example: read a generated config back; the context manager makes
# sure the file handle is closed once the config has been parsed
with open('models/MLPClassifier.json', 'r') as fp:
    mlp_config = json.load(fp)
mlp_config

{'CLASS_PARAMS': {'n_iter_no_change': 10,
  'epsilon': 1e-08,
  'beta_2': 0.999,
  'beta_1': 0.9,
  'validation_fraction': 0.1,
  'early_stopping': False,
  'nesterovs_momentum': True,
  'momentum': 0.9,
  'warm_start': False,
  'verbose': False,
  'tol': 0.0001,
  'random_state': None,
  'shuffle': True,
  'max_iter': 200,
  'power_t': 0.5,
  'learning_rate_init': 0.001,
  'learning_rate': 'constant',
  'batch_size': 'auto',
  'alpha': 0.0001,
  'solver': 'adam',
  'activation': 'relu',
  'hidden_layer_sizes': [100]},
 'FIT_PARAMS': {'y': None, 'X': None}}

In [97]:
# pickle example: read a generated config back; the context manager makes
# sure the file handle is closed after loading.
# NOTE: unpickling executes arbitrary code, only load files you generated.
with open('models/SelectKBest.pkl', 'rb') as fp:
    config = load(fp)
config

{'CLASS_PARAMS': {'k': 10,
  'score_func': <function sklearn.feature_selection.univariate_selection.f_classif(X, y)>},
 'FIT_PARAMS': {'y': None, 'X': None}}

### create and fit a pipeline from a pickled model class-config: SelectKBest

In this sample we take a class model config that has been pickled, load it, define a set of scoring functions we want to apply, assemble them as a sklearn pipeline, fit and pickle the pipeline.

In [124]:
# dotted import path of the estimator class to instantiate
model_str = "sklearn.feature_selection.SelectKBest"

# dotted import paths of the scoring functions to try with that estimator
scoring_functions_list = [
    "sklearn.feature_selection.f_classif",
    "sklearn.feature_selection.mutual_info_classif",
    "sklearn.feature_selection.chi2",
    "sklearn.feature_selection.f_regression",
]

In [110]:
# split the dotted path: everything before the last dot is the module to
# import, the last component is the attribute (the class) to fetch from it
splits = model_str.split(".")
ModelClass = getattr(import_module(".".join(splits[:-1])), splits[-1])

In [125]:
# build one estimator per scoring function, each starting from the
# default class parameters stored in the loaded config
variations = []

for score_func in scoring_functions_list:
    # resolve the dotted path to the actual scoring function object
    splits = score_func.split(".")
    module_path = ".".join(splits[:-1])
    scorer = getattr(import_module(module_path), splits[-1])

    # instantiate with the config defaults, then swap in this scorer
    mc = ModelClass(**config['CLASS_PARAMS'])
    mc.score_func = scorer
    variations.append(mc)

variations

[SelectKBest(k=10, score_func=<function f_classif at 0x7fb1d3b51730>),
 SelectKBest(k=10, score_func=<function mutual_info_classif at 0x7fb1d3a3e9d8>),
 SelectKBest(k=10, score_func=<function chi2 at 0x7fb1d3b51840>),
 SelectKBest(k=10, score_func=<function f_regression at 0x7fb1d3b518c8>)]

In [87]:
from sklearn.datasets import load_digits

In [88]:
# return_X_y=True makes load_digits return a pair, unpacked here as X and y;
# display the shape of X
X, y = load_digits(return_X_y=True)
X.shape

(1797, 64)

In [132]:
# fit and transform with each variation in turn and print the result
# (this loop is sequential; the variations could be run in parallel)
for var in variations:
    print(var.fit_transform(X, y))

  f = msb / msw


[[13.  0. 11. ...  0. 11.  0.]
 [ 0. 16.  6. ... 16.  1. 16.]
 [ 3.  8. 16. ... 15. 16. 16.]
 ...
 [13. 10. 16. ... 16. 16. 14.]
 [14.  3. 16. ... 14.  0.  0.]
 [16.  8. 15. ... 15. 16.  6.]]
[[11. 12.  0. ...  0.  0.  0.]
 [ 6. 15. 16. ... 16.  0. 10.]
 [16.  1. 15. ... 16.  5. 16.]
 ...
 [16.  8. 16. ... 14.  0.  6.]
 [16. 16.  7. ...  0.  2. 12.]
 [15.  5. 16. ...  6.  8. 12.]]
[[ 0. 11. 12. ...  0.  0.  0.]
 [16.  6. 15. ... 16.  0.  0.]
 [ 8. 16.  1. ... 16.  5.  9.]
 ...
 [10. 16.  8. ... 14.  0.  0.]
 [ 3. 16. 16. ...  0.  2.  0.]
 [ 8. 15.  5. ...  6.  8.  1.]]
[[ 0. 10.  5. ...  0. 10. 10.]
 [ 0. 16.  0. ... 16. 16. 16.]
 [ 0. 15.  0. ... 13. 16. 11.]
 ...
 [ 0.  8.  1. ... 16. 15. 13.]
 [ 0. 16.  1. ... 14.  7. 16.]
 [ 0.  6.  0. ... 15.  8. 14.]]


  corr /= X_norms
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)
