# Proof of concept: using a scikit-learn Pipeline with RandomizedSearchCV

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import sklearn
from joblib import dump, load
import xgboost
from xgboost import XGBClassifier
import tempfile

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Report the versions of the main libraries this notebook was executed with.
for library_label, library_module in (
    ("NumPy", np),
    ("Pandas", pd),
    ("Scikit-Learn", sklearn),
    ("XGBoost", xgboost),
):
    print(library_label, library_module.__version__)

NumPy 1.16.4
Pandas 0.24.2
Scikit-Learn 0.21.2
XGBoost 0.90


In [3]:
# Number of synthetic observations to generate.
obs = 10_000

# Seed NumPy's global generator so the synthetic data is the same on every run.
np.random.seed(432)

In [4]:
def _target_plus_noise(frame):
    """Return target_class plus one standard-normal draw per row (a weakly predictive feature)."""
    return frame.target_class.apply(lambda flag: flag + np.random.standard_normal())


# Three columns of uniform noise that carry no information about the target.
noise_df = pd.DataFrame(np.random.rand(obs, 3), columns=[f"f{i}" for i in range(3)])

# Boolean target: True when a fresh uniform draw exceeds 0.5.
labelled_df = noise_df.assign(target_class=np.random.rand(obs, 1) > 0.5)

# f3 is the only numeric column related to the target; c1/c2 are repeating categorical labels.
# NOTE(review): c2 repeats a single value ('cat_type_2'), so it is constant -- confirm this is intended.
population_df = labelled_df.assign(f3=_target_plus_noise).assign(
    c1=['category1', 'category2', 'category3', 'category4'] * (obs // 4),
    c2=['cat_type_2', 'cat_type_2'] * (obs // 2),
)
print(population_df.head(2))

         f0        f1        f2  target_class        f3         c1          c2
0  0.903661  0.896010  0.589950         False -0.547680  category1  cat_type_2
1  0.987273  0.851603  0.208817          True  0.622484  category2  cat_type_2


In [5]:
# Column groups used when wiring up the preprocessing steps.
numeric_features = [f"f{idx}" for idx in range(4)]
categorical_features = [f"c{idx}" for idx in (1, 2)]

In [6]:
# Separate the predictors from the label, then hold out 10% of the rows for testing.
feature_frame = population_df.drop(columns=['target_class'])
target_series = population_df.target_class

X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    target_series,
    test_size=0.1,
    random_state=43223,
)

In [7]:
class MultiplierTransformer(BaseEstimator, TransformerMixin):
    """Multiply the numeric cols by a mean of the columns.  Just a test

    Parameters
    ----------
    numeric_cols : list of str
        Names of the DataFrame columns to rescale.

    Attributes
    ----------
    multiple : float
        Mean of the per-column means of ``numeric_cols``, computed from the
        frame passed to ``fit``.
    """

    def __init__(self, numeric_cols):
        self.numeric_cols = numeric_cols

    def fit(self, X_df, y=None):
        """Learn the multiplier from ``X_df``; ``y`` is accepted for API compatibility and ignored."""
        # Use only the data handed to fit (e.g. the current CV training fold) and the
        # configured columns.  Reading a notebook-level frame here would leak held-out
        # rows into the fitted statistic and tie the class to hidden global state.
        self.multiple = X_df[self.numeric_cols].mean().mean()
        return self

    def transform(self, X_df):
        """Return a copy of ``X_df`` with ``numeric_cols`` multiplied by the fitted ``multiple``."""
        # Copy first: writing into the caller's frame rescales it again on every
        # fit/predict call and raises SettingWithCopyWarning when it is a slice.
        X_out = X_df.copy()
        X_out[self.numeric_cols] = X_out[self.numeric_cols] * self.multiple
        return X_out

# RandomForest

In [8]:
# Setup the pipeline and the RandomizedSearch
# Steps: rescale f2/f3, then mean-impute f0/f1 and one-hot encode the categoricals
# (all other columns pass through unchanged), then fit a random forest.
pipeline_test_1 = make_pipeline(
    MultiplierTransformer(numeric_features[2:]),
    ColumnTransformer(
        remainder='passthrough',
        transformers=[
            ('impute', Pipeline(steps=[('input', SimpleImputer(strategy='mean'))]), numeric_features[:2]),
            ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), categorical_features),
    ]),
    RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=4431)
)

# Search space; keys use the "<step name>__<parameter>" form generated by make_pipeline.
param_dist = {
    "randomforestclassifier__max_depth": sp_randint(1, 21),
    "randomforestclassifier__max_features": sp_randint(1, X_train.shape[1] + 1),
    # An integer min_samples_split must be at least 2; a lower bound of 1 can
    # sample an invalid value and abort the search with a ValueError.
    "randomforestclassifier__min_samples_split": sp_randint(2, 1000),
    "randomforestclassifier__bootstrap": [True, False]
}

pipeline_test_1_search_model = RandomizedSearchCV(
    pipeline_test_1,
    param_distributions=param_dist,
    n_iter=3,
    cv=3,
    iid=False,
    # Fix the sampler so the same candidates are drawn regardless of how much
    # of NumPy's global random stream earlier cells consumed.
    random_state=4431
)

In [9]:
%%time
%%capture
# Not sure where the SettingWithCopyWarning is coming from so use capture

out_file = tempfile.NamedTemporaryFile()

print("Fitting Model")
pipeline_test_1_search_model.fit(X_train, y_train)

print(f"Saving model to file {out_file.name}")
dump(pipeline_test_1_search_model, out_file.name)

print(f"Pulling model from file {out_file.name}")
pipeline_test_1_search_model = load(out_file.name)

predictions = pipeline_test_1_search_model.predict(X_test)

CPU times: user 3.09 s, sys: 273 ms, total: 3.37 s
Wall time: 6.13 s


# XGBoost

In [10]:
# Setup the pipeline and the RandomizedSearch
# Same preprocessing as the random-forest pipeline, with an XGBoost classifier as the final step.
pipeline_test_1 = make_pipeline(
    MultiplierTransformer(numeric_features[2:]),
    ColumnTransformer(
        remainder='passthrough',
        transformers=[
            ('impute', Pipeline(steps=[('input', SimpleImputer(strategy='mean'))]), numeric_features[:2]),
            ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), categorical_features),
    ]),
    XGBClassifier(n_estimators=10, random_state=432, n_jobs=-1)
)

# Search space; keys use the "<step name>__<parameter>" form generated by make_pipeline.
# 'num_feature' is intentionally not searched: it is the booster's input
# dimension, which XGBoost sets from the data, not a tunable hyperparameter.
param_dist = {
    'xgbclassifier__min_child_weight': sp_randint(1, 10),
    'xgbclassifier__gamma': [0.5, 1, 1.5, 2, 5],
    'xgbclassifier__subsample': [0.6, 0.8, 1.0],
    'xgbclassifier__colsample_bytree': [0.6, 0.8, 1.0],
    'xgbclassifier__max_depth': sp_randint(1, 21)
}

print("Configuring RandomSearch")
pipeline_test_1_search_model = RandomizedSearchCV(
    pipeline_test_1,
    param_distributions=param_dist,
    n_iter=3,
    cv=3,
    iid=False,
    # Fix the sampler so the same candidates are drawn regardless of how much
    # of NumPy's global random stream earlier cells consumed.
    random_state=432
)

Configuring RandomSearch


In [11]:
%%time
%%capture
# Not sure where the SettingWithCopyWarning is coming from so use capture

out_file = tempfile.NamedTemporaryFile()

print("Fitting Model")
pipeline_test_1_search_model.fit(X_train, y_train)

print(f"Saving model to file {out_file.name}")
dump(pipeline_test_1_search_model, out_file.name)

print(f"Pulling model from file {out_file.name}")
pipeline_test_1_search_model = load(out_file.name)


predictions = pipeline_test_1_search_model.predict(X_test)

CPU times: user 2.04 s, sys: 13 ms, total: 2.05 s
Wall time: 2.05 s
