In [1]:
from weight_of_evidence.tree_binner import TreeBinner

In [2]:
import numpy as np
import pandas as pd
import yaml
from category_encoders.woe import WOEEncoder
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier

In [3]:
def prepare_data(config):
    """Load the dataset described by ``config`` and split it into features and target.

    Parameters
    ----------
    config : mapping
        Must provide ``'data_path'`` (the file handed to ``pd.read_csv``) and
        ``'feature_names'`` (the column names applied to that space-separated
        file; the list has to contain ``'response'``).

    Returns
    -------
    tuple
        ``(features, target)``: the frame without the ``'response'`` column,
        and the ``'response'`` column with 1 subtracted from every value.
    """
    frame = pd.read_csv(
        config['data_path'],
        sep=" ",
        names=config['feature_names'],
    )

    # Shift the response down by one
    # (presumably recoding 1/2 labels to 0/1 -- TODO confirm against the data).
    target = frame["response"] - 1
    features = frame.drop(columns=["response"])

    return features, target


In [4]:
# Load the notebook configuration. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it open.
with open('config.yaml', 'r') as config_file:
    CONFIG = yaml.safe_load(config_file)

In [5]:
# Read the dataset named in CONFIG and unpack it into the feature frame X and the target y.
X, y = prepare_data(CONFIG)

In [6]:
# Four-step pipeline: TreeBinner -> WOEEncoder -> StandardScaler -> LogisticRegression.
AUTO_BIN_PIPELINE = Pipeline(steps=[
    (
        'tree_binner',
        TreeBinner(
            max_depth=5,
            min_samples_split=4,
            min_samples_leaf=4,
            category_type='str',
        ),
    ),
    ('woe_encoder', WOEEncoder(regularization=1)),
    ('standard_scaler', StandardScaler()),
    (
        'logistic_regression',
        LogisticRegression(C=0.01, max_iter=10_000),
    ),
])

In [7]:
# Positional indices of the object-dtype columns of X, and of all remaining columns.
text_columns_indices = [
    position
    for position, column_dtype in enumerate(X.dtypes)
    if column_dtype == 'object'
]
numerical_columns_indices = [
    position
    for position, column_dtype in enumerate(X.dtypes)
    if column_dtype != 'object'
]

# One-hot encode the object columns (categories unseen at fit time are ignored)
# and scale the remaining columns without centring them.
preprocessor = ColumnTransformer(transformers=[
    ('text', OneHotEncoder(handle_unknown='ignore'), text_columns_indices),
    ('num', StandardScaler(with_mean=False), numerical_columns_indices),
])

# Plain logistic regression on top of the shared preprocessing step.
LINEAR_PIPELINE = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=10_000)),
])

In [8]:
# XGBoost classifier behind the same `preprocessor` object that LINEAR_PIPELINE holds.
XGB_Pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(random_state=42)),
])

In [9]:
# Registry of candidate pipelines, keyed by the label used when reporting scores.
PIPELINES = {
    'xgb': XGB_Pipeline,
    'linear': LINEAR_PIPELINE,
    'auto_bin': AUTO_BIN_PIPELINE,
}

In [10]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
def find_best_model_params(model, params_grid, X, y, cv=5, n=50, scoring='roc_auc'):
    """Run a randomized hyper-parameter search and return the winning settings.

    Parameters
    ----------
    model : estimator
        Estimator (or pipeline) handed to ``RandomizedSearchCV``.
    params_grid : dict
        Forwarded as ``param_distributions``.
    X, y
        Data the search is fitted on.
    cv : int, default 5
        Forwarded as ``cv``.
    n : int, default 50
        Forwarded as ``n_iter``.
    scoring : str, default 'roc_auc'
        Forwarded as ``scoring``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search. The best score and parameters
        are also printed.
    """
    search_settings = dict(
        param_distributions=params_grid,
        n_iter=n,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        verbose=1,
        random_state=42,  # fixed seed for the candidate sampling
    )
    search = RandomizedSearchCV(model, **search_settings)
    search.fit(X, y)

    best_score, best_params = search.best_score_, search.best_params_
    print(f'Best score: {best_score:.3f}')
    print(f'Best parameters: {best_params}')
    return best_params

In [19]:
# Search space for the auto-binning pipeline; each key is addressed as <step name>__<parameter>.
TREE_BIN_PARAMS_GRID = dict(
    tree_binner__max_depth=sp_randint(2, 6),
    tree_binner__min_samples_leaf=sp_randint(1, 5),
    tree_binner__min_impurity_decrease=sp_uniform(0, 1),
    woe_encoder__regularization=sp_uniform(0, 1),
    # Six log-spaced candidates from 1e-3 to 1e2.
    logistic_regression__C=np.logspace(-3, 2, 6),
)

In [20]:
# Randomized search over 200 candidates of the auto-binning pipeline, on the training split.
BEST_PARAMS_WOE = find_best_model_params(
    AUTO_BIN_PIPELINE,
    TREE_BIN_PARAMS_GRID,
    X_train,
    y_train,
    n=200,
)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Best score: 0.762
Best parameters: {'logistic_regression__C': 0.001, 'tree_binner__max_depth': 4, 'tree_binner__min_impurity_decrease': 0.42754101835854963, 'tree_binner__min_samples_leaf': 2, 'woe_encoder__regularization': 0.4393365018657701}


In [21]:
# set_params() mutates the estimator it is called on and returns that same
# object, so the tuned parameters are applied to a clone. The untuned
# 'auto_bin' baseline stays intact and the tuned variant is registered under
# its own 'auto_bin_tuned' key for the comparison.
PIPELINES['auto_bin_tuned'] = clone(AUTO_BIN_PIPELINE).set_params(**BEST_PARAMS_WOE)

In [22]:
# Report the mean 5-fold cross-validated ROC AUC of every registered pipeline on the training split.
for name, pipeline in PIPELINES.items():
    print(f'Pipeline: {name}')
    fold_scores = cross_val_score(
        pipeline,
        X_train,
        y_train,
        scoring="roc_auc",
        cv=5,
    )
    print(f'CV score: {fold_scores.mean():.3f}')

Pipeline: xgb
CV score: 0.756
Pipeline: linear
CV score: 0.770
Pipeline: auto_bin
CV score: 0.762
Pipeline: auto_bin_tuned
CV score: 0.762


In [None]:


#BEST_PARAMS_XGB = find_best_model_params(AUTO_BIN_PIPELINE, CONFIG['params_grid'], X_train, y_train)

In [None]:
# Mean 5-fold cross-validated ROC AUC of AUTO_BIN_PIPELINE (with whatever
# parameters it currently holds) on the training split.
cross_val_score(AUTO_BIN_PIPELINE, X_train, y_train, scoring='roc_auc', cv=5).mean()

In [None]:
# Number of distinct values in each object-dtype column of X.
X.select_dtypes('object').nunique()

In [None]:
# Mean 5-fold cross-validated ROC AUC on the full X, y (not only the training split).
# NOTE(review): BASELINE_PIPELINE is not defined in any cell visible in this
# notebook, so this would raise NameError on a fresh kernel unless it is
# defined elsewhere -- confirm which pipeline was meant.
cross_val_score(BASELINE_PIPELINE, X, y, scoring='roc_auc', cv=5).mean()