In [1]:
import sys, os 
# Make the parent of the current working directory importable
# (presumably where the `ml_workflow` package lives -- confirm for your checkout).
current_dir = os.getcwd()
path = os.path.dirname(current_dir)
# Guard the append so that re-running this cell does not keep
# stacking duplicate entries onto sys.path.
if path not in sys.path:
    sys.path.append(path)

from ml_workflow.tuned_estimator import TunedEstimator

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import average_precision_score 

import numpy as np
import pandas as pd 

In [2]:
# Synthetic binary-classification dataset; the fixed random_state keeps
# the generated samples reproducible across runs.
X, y = make_classification(
    n_samples=10000,
    class_sep=0.7,
    random_state=42,
)
# Wrap the feature array in a DataFrame (columns get default integer labels).
X = pd.DataFrame(X)

In [5]:
# Base model to be tuned.
# Can be any scikit-learn model or scikit-learn-style model (e.g., XGBoost).
estimator = RandomForestClassifier(
    criterion='entropy',
    n_jobs=-1,
    random_state=30,
)

# Pre-processing pipeline configuration.
# The pipeline is a wrapper around the ML model itself that performs the
# pre-processing procedures. These methods are stored as attributes of the
# Pipeline class and are saved directly with it.
#
# If pipeline_kwargs is None, no pipeline is created and what is
# returned is the ML model by itself.
pipeline_kwargs = {
    # Method used to handle missing data: None, 'simple', or 'iterative'.
    'imputer': 'simple',
    # Whether to scale the input data, and with which method:
    # None, 'minmax', 'standard', or 'robust'.
    'scaler': 'standard',
    # Whether to apply a principal component transform: True, or None/False.
    'pca': None,
    # NOTE(review): the original notes here also describe an option to
    # resample the dataset to artificially increase the base rate (useful
    # for highly imbalanced datasets; CAUTION: resampling requires
    # calibration afterwards), but no such key is set in this dict.
    # Confirm the keyword name before enabling it.

    # Which features are numerical and which are categorical. Certain
    # scaling methods should not be applied to categorical features.
    # The default is None, in which case all features are treated as numerical.
    'numeric_features': None,
    'categorical_features': None,
}

# Define the hyperparameter optimizer 

def scorer(estimator, X, y):
    """Score an estimator as ``1 - average precision`` (lower is better).

    Parameters
    ----------
    estimator : object
        Fitted classifier exposing ``predict_proba``.
    X : array-like
        Features, passed unchanged to ``estimator.predict_proba``.
    y : array-like
        True labels, passed to ``average_precision_score``.

    Returns
    -------
    float
        ``1.0 - average_precision_score(y, pred)``, where ``pred`` is the
        second column of the predicted probabilities.
    """
    pred = estimator.predict_proba(X)[:, 1]
    return 1.0 - average_precision_score(y, pred)


hyperopt_kwargs = dict(
    # Hyperparameter search grid: candidate values for each parameter.
    search_space = {
        'n_estimators' : [100, 150, 300, 400, 500],
        'max_depth' : [6, 8, 10, 15, 20],
        'max_features' : [5, 6, 8, 10],
        'min_samples_split' : [4, 5, 8, 10, 15, 20, 25, 50],
        'min_samples_leaf' : [4, 5, 8, 10, 15, 20, 25, 50],
    },
    # Optimizer method; leave it as "tpe".
    optimizer = "tpe",
    # Number of settings to explore.
    max_evals = 5,
    # Number of steps before exiting if a 1%
    # improvement in the score does not occur.
    # NOTE(review): patience exceeds max_evals here, so early stopping
    # presumably never triggers with these demo values.
    patience = 10,
    # Callable of the form scorer(estimator, X, y)
    # that returns a single score.
    scorer = scorer,
    # Number of jobs to run in parallel.
    # CAUTION: set to 1 if the estimator itself already runs in parallel
    # (e.g., a random forest), as it can cause conflicts.
    n_jobs = 1,
    # The CV method to use.
    cv = None,

    # Output filename of the hyperparameter results, so they can be
    # viewed later. Set explicitly to the file that this notebook reads
    # back afterwards, rather than relying on how None is handled.
    # NOTE(review): confirm how TunedEstimator treats output_fname=None.
    output_fname = 'hyperopt_results.pkl',
)

# Calibration settings.
# NOTE: only define these when performing classification;
# no method exists for regression calibration at the moment.
calibration_cv_kwargs = None

# Bundle the base model with its pipeline, hyperparameter-search and
# calibration settings. Arguments are passed positionally, in that order.
tuned_estimator = TunedEstimator(
    estimator,
    pipeline_kwargs,
    hyperopt_kwargs,
    calibration_cv_kwargs,
)


In [None]:
# Fit the configured estimator on the full feature frame and labels.
tuned_estimator.fit(X, y)

In [None]:
# Load the saved hyperparameter-search results into a DataFrame.
# NOTE(review): assumes the preceding fit wrote this file to the working
# directory -- confirm against the output_fname setting.
# Only unpickle files you produced yourself; pickle can execute arbitrary code.
df = pd.read_pickle("hyperopt_results.pkl")

In [None]:
# Display the results ordered by ascending loss (lowest loss first).
df.sort_values(by='loss')