In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, Normalizer, MinMaxScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
import sklearn
import optuna
from optuna.samplers import TPESampler
import pickle
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import RobustScaler, Normalizer

from optuna import Trial
from category_encoders import WOEEncoder

import lightgbm as lgb

In [2]:
class TargetEncoder(BaseEstimator, TransformerMixin):
    """Thin wrapper that applies a ``LabelEncoder`` to a target vector.

    The wrapped ``LabelEncoder`` lives on ``self.encoder``; ``fit``,
    ``transform`` and ``inverse_transform`` each delegate to it.
    """

    def __init__(self):
        # One fresh, unfitted LabelEncoder per wrapper instance.
        self.encoder = LabelEncoder()

    def fit(self, y):
        """Fit the wrapped encoder on ``y`` and return this instance."""
        self.encoder.fit(y)
        return self

    def transform(self, y):
        """Return ``y`` encoded by the wrapped encoder."""
        encoded = self.encoder.transform(y)
        return encoded

    def inverse_transform(self, y):
        """Map encoded values in ``y`` back through the wrapped encoder."""
        decoded = self.encoder.inverse_transform(y)
        return decoded

In [3]:
# Load the competition train/test files and the original churn dataset.
train_b = pd.read_csv('/kaggle/input/playground-series-s4e1/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e1/test.csv')
train_org = pd.read_csv('/kaggle/input/bank-customer-churn-prediction/Churn_Modelling.csv')

# Remove the per-file row identifier columns.
train_b = train_b.drop(columns=['id'])
test = test.drop(columns=['id'])
train_org = train_org.drop(columns=['RowNumber'])

# Training data = competition rows followed by the original dataset rows.
# (To train on the competition rows only, use `train = train_b` instead.)
train = pd.concat([train_b, train_org], ignore_index=True)

# Features exclude the target and the 'Surname' column.
X = train.drop(columns=['Exited', 'Surname'])
test = test.drop(columns=['Surname'])

# Encode the target with the LabelEncoder-backed wrapper.
encoder = TargetEncoder()
y = encoder.fit_transform(train['Exited'])

# Hold out 10% of the rows for a final check.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)

In [4]:
from sklearn.impute import SimpleImputer
from optuna import Trial

def instantiate_numerical_simple_imputer(trial : Trial, fill_value : int=-1) -> SimpleImputer:
  """Build a ``SimpleImputer`` for the numerical columns.

  The strategy is chosen by ``trial`` under the parameter name
  ``'numerical_strategy'``; ``fill_value`` is forwarded to the imputer
  unchanged.
  """
  candidate_strategies = ['mean', 'median', 'most_frequent', 'constant']
  chosen = trial.suggest_categorical('numerical_strategy', candidate_strategies)
  return SimpleImputer(strategy=chosen, fill_value=fill_value)

def instantiate_categorical_simple_imputer(trial : Trial, fill_value : str='missing') -> SimpleImputer:
  """Build a ``SimpleImputer`` for the categorical columns.

  The strategy is chosen by ``trial`` under the parameter name
  ``'categorical_strategy'``; ``fill_value`` is forwarded to the imputer
  unchanged.
  """
  candidate_strategies = ['most_frequent', 'constant']
  chosen = trial.suggest_categorical('categorical_strategy', candidate_strategies)
  return SimpleImputer(strategy=chosen, fill_value=fill_value)

In [5]:
from category_encoders import WOEEncoder

def instantiate_woe_encoder(trial : Trial) -> WOEEncoder:
  """Build a ``WOEEncoder`` whose settings are suggested by ``trial``.

  Parameter names registered on the trial: ``'sigma'``,
  ``'regularization'`` and ``'randomized'``.
  """
  # Suggestions are requested in a fixed order: sigma, regularization, randomized.
  sigma = trial.suggest_float('sigma', 0.001, 5)
  regularization = trial.suggest_float('regularization', 0, 5)
  randomized = trial.suggest_categorical('randomized', [True, False])
  return WOEEncoder(sigma=sigma, regularization=regularization, randomized=randomized)

In [6]:
from sklearn.preprocessing import RobustScaler

def instantiate_robust_scaler(trial : Trial) -> RobustScaler:
    """Build a ``RobustScaler`` whose two switches are suggested by ``trial``.

    Parameter names registered on the trial: ``'with_centering'`` and
    ``'with_scaling'``, each chosen from ``[True, False]``.
    """
    on_off = [True, False]
    centering = trial.suggest_categorical('with_centering', on_off)
    scaling = trial.suggest_categorical('with_scaling', on_off)
    return RobustScaler(with_centering=centering, with_scaling=scaling)

In [7]:
import xgboost as xgb

def instantiate_xgb(trial : Trial) -> xgb.XGBClassifier:
    """Build an ``XGBClassifier`` with hyperparameters suggested by ``trial``.

    All trial parameter names are prefixed with ``xgb_``. ``eval_metric`` is
    fixed to ``'auc'`` and ``tree_method`` to ``'hist'``.

    NOTE(review): both ``eta`` and ``learning_rate`` are set, from different
    ranges. XGBoost documents ``eta`` as an alias of ``learning_rate`` --
    confirm which of the two is intended to take effect.
    """
    # Suggestions are requested in the same order as the keyword arguments below.
    lambda_ = trial.suggest_float('xgb_lambda', 1e-06, 1e-03)
    alpha = trial.suggest_float('xgb_alpha', 0.00001, 0.01)
    max_depth = trial.suggest_int('xgb_max_depth', 1, 40)
    eta = trial.suggest_float('xgb_eta', 0.00001, 0.01)
    gamma = trial.suggest_float('xgb_gamma', 1e-08, 1e-05)
    n_estimators = trial.suggest_int('xgb_n_estimators', 50, 1000)
    learning_rate = trial.suggest_float('xgb_learning_rate', 0.001, 1)

    # 'lambda' is a Python keyword, so the arguments go through a dict.
    classifier_kwargs = {
        'eval_metric': 'auc',
        'lambda': lambda_,
        'alpha': alpha,
        'max_depth': max_depth,
        'eta': eta,
        'gamma': gamma,
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'tree_method': 'hist',
        # 'device': 'cuda',  # optional GPU setting, left disabled
    }
    return xgb.XGBClassifier(**classifier_kwargs)

In [8]:
from catboost import CatBoostClassifier

def instantiate_catb(trial : Trial) -> CatBoostClassifier:
    """Build a ``CatBoostClassifier`` with hyperparameters suggested by ``trial``.

    All trial parameter names are prefixed with ``catb_``. Logging level,
    random seed, eval metric, grow policy and bootstrap type are fixed.
    """
    # Suggestions are requested in a fixed order.
    iterations = trial.suggest_int('catb_iterations', 50, 1000)
    depth = trial.suggest_int('catb_depth', 10, 200)
    min_data_in_leaf = trial.suggest_int('catb_min_data_in_leaf', 5, 200)
    learning_rate = trial.suggest_float('catb_learning_rate', 0.001, 1)
    subsample = trial.suggest_float('catb_subsample', 0.01, 1)
    random_strength = trial.suggest_float('catb_random_strength', 0.01, 1)

    return CatBoostClassifier(
        logging_level='Silent',
        random_seed=0,
        iterations=iterations,
        depth=depth,
        min_data_in_leaf=min_data_in_leaf,
        learning_rate=learning_rate,
        subsample=subsample,
        random_strength=random_strength,
        eval_metric='AUC',
        grow_policy='Lossguide',
        bootstrap_type='Bernoulli',
        # task_type="GPU",  # optional GPU setting, left disabled
    )

In [9]:
from lightgbm import LGBMClassifier

def instantiate_lgbm(trial : Trial) -> LGBMClassifier:
    """Build an ``LGBMClassifier`` with hyperparameters suggested by ``trial``.

    All trial parameter names are prefixed with ``lgbm_``. ``metric``,
    ``random_state`` and ``verbose`` are fixed.
    """
    # Suggestions are requested in a fixed order.
    max_depth = trial.suggest_int('lgbm_max_depth', 1, 50)
    min_child_samples = trial.suggest_int('lgbm_min_child_samples', 1, 20)
    learning_rate = trial.suggest_float('lgbm_learning_rate', 0.001, 1)
    n_estimators = trial.suggest_int('lgbm_n_estimators', 50, 1000)
    min_child_weight = trial.suggest_int('lgbm_min_child_weight', 1, 20)
    subsample = trial.suggest_float('lgbm_subsample', 0.01, 1)
    colsample_bytree = trial.suggest_float('lgbm_colsample_bytree', 0.01, 1)
    reg_alpha = trial.suggest_float('lgbm_reg_alpha', 0.01, 1)
    reg_lambda = trial.suggest_float('lgbm_reg_lambda', 0.01, 1)

    return LGBMClassifier(
        metric='auc',
        max_depth=max_depth,
        min_child_samples=min_child_samples,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        min_child_weight=min_child_weight,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        random_state=42,
        verbose=-1,
        # device="gpu",  # optional GPU setting, left disabled
    )

In [10]:
def instantiate_lr(trial : Trial) -> LogisticRegression:
    """Build a ``LogisticRegression`` with hyperparameters suggested by ``trial``.

    Trial parameter names: ``'lr_max_iter'``, ``'lr_C'`` and ``'lr_tol'``,
    all sampled on a log scale. The solver is fixed to ``'lbfgs'``.
    """
    max_iter = trial.suggest_int('lr_max_iter', 5000, 10000, log=True)
    inverse_reg = trial.suggest_float('lr_C', 0.001, 100, log=True)
    tolerance = trial.suggest_float('lr_tol', 0.0001, 1, log=True)
    return LogisticRegression(
        max_iter=max_iter,
        C=inverse_reg,
        tol=tolerance,
        solver='lbfgs',
    )

In [11]:
from sklearn.pipeline import Pipeline

def instantiate_numerical_pipeline(trial : Trial) -> Pipeline:
  """Return the numerical preprocessing pipeline: imputer, then robust scaler.

  Both steps are configured from ``trial``; the imputer is built first so
  its suggestion is requested before the scaler's.
  """
  imputer_step = ('imputer', instantiate_numerical_simple_imputer(trial))
  scaler_step = ('scaler', instantiate_robust_scaler(trial))
  return Pipeline([imputer_step, scaler_step])

def instantiate_categorical_pipeline(trial : Trial) -> Pipeline:
  """Return the categorical preprocessing pipeline: imputer, then WOE encoder.

  Both steps are configured from ``trial``; the imputer is built first so
  its suggestion is requested before the encoder's.
  """
  imputer_step = ('imputer', instantiate_categorical_simple_imputer(trial))
  encoder_step = ('encoder', instantiate_woe_encoder(trial))
  return Pipeline([imputer_step, encoder_step])

In [12]:
from sklearn.compose import ColumnTransformer

def instantiate_processor(trial : Trial, numerical_columns : list[str], categorical_columns : list[str]) -> ColumnTransformer:
  """Build the ``ColumnTransformer`` that preprocesses both column groups.

  ``numerical_columns`` are routed through the numerical pipeline and
  ``categorical_columns`` through the categorical pipeline; both pipelines
  are configured from ``trial`` (numerical first).
  """
  num_branch = instantiate_numerical_pipeline(trial)
  cat_branch = instantiate_categorical_pipeline(trial)

  transformers = [
    ('numerical_pipeline', num_branch, numerical_columns),
    ('categorical_pipeline', cat_branch, categorical_columns),
  ]
  return ColumnTransformer(transformers)

def instantiate_model(trial : Trial, numerical_columns : list[str], categorical_columns : list[str], classifier_name : str) -> Pipeline:
    """Build a preprocessing + classifier pipeline configured from ``trial``.

    Parameters
    ----------
    trial
        Source of hyperparameter suggestions for the preprocessor and the
        classifier.
    numerical_columns, categorical_columns
        Column names routed to the numerical / categorical preprocessing
        branches.
    classifier_name
        One of ``"XGB"``, ``"CatB"``, ``"LGBM"`` or ``"LogR"``. It selects the
        classifier factory and is also used as the name of the final
        pipeline step.

    Returns
    -------
    Pipeline
        Steps ``('processor', <ColumnTransformer>)`` followed by
        ``(classifier_name, <classifier>)``.

    Raises
    ------
    ValueError
        If ``classifier_name`` is not one of the supported names. Previously
        an unknown name silently produced a pipeline with no estimator.
    """
    classifier_factories = {
        "XGB": instantiate_xgb,
        "CatB": instantiate_catb,
        "LGBM": instantiate_lgbm,
        "LogR": instantiate_lr,
    }
    # Validate before touching the trial so a bad name registers no parameters.
    if classifier_name not in classifier_factories:
        raise ValueError(
            f"Unknown classifier_name {classifier_name!r}; "
            f"expected one of {sorted(classifier_factories)}"
        )

    # Preprocessor suggestions are requested before the classifier's.
    processor = instantiate_processor(trial, numerical_columns, categorical_columns)
    classifier = classifier_factories[classifier_name](trial)

    return Pipeline([
        ('processor', processor),
        (classifier_name, classifier),
    ])



In [13]:
from typing import Optional
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import roc_auc_score, make_scorer
from pandas import DataFrame, Series
import numpy as np

def objective_single(trial : Trial, X : DataFrame, y : np.ndarray | Series, numerical_columns : Optional[list[str]]=None, categorical_columns : Optional[list[str]]=None, random_state : int=42) -> float:
    """Optuna objective for a single (logistic-regression) pipeline.

    Parameters
    ----------
    trial
        Source of hyperparameter suggestions.
    X, y
        Feature frame and target used for cross-validation.
    numerical_columns, categorical_columns
        Column names for each preprocessing branch. When ``None`` they are
        inferred from ``X``: ``object``/``category`` dtypes are categorical,
        everything else numerical.
    random_state
        Seed for the shuffled 5-fold split.

    Returns
    -------
    float
        The smaller of the mean and the median of the per-fold ROC AUC scores.
    """
    categorical_dtypes = ['object', 'category']
    if numerical_columns is None:
        numerical_columns = list(X.select_dtypes(exclude=categorical_dtypes).columns)
    if categorical_columns is None:
        categorical_columns = list(X.select_dtypes(include=categorical_dtypes).columns)

    model = instantiate_model(trial, numerical_columns, categorical_columns, "LogR")

    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    # The built-in 'roc_auc' scorer replaces make_scorer(..., needs_proba=True):
    # `needs_proba` is deprecated in recent scikit-learn releases, while the
    # string scorer works across versions. ROC AUC depends only on the ranking
    # of the scores, so using decision_function instead of predict_proba for
    # logistic regression does not change the metric.
    scores = cross_val_score(model, X, y, scoring='roc_auc', cv=kf)
    return float(min(np.mean(scores), np.median(scores)))



In [14]:
# Stored hyperparameter sets, one dict per base model.
# Keys carry a per-model prefix (e.g. 'xgb_numerical_strategy'), whereas the
# preprocessing factories in this notebook register unprefixed names such as
# 'numerical_strategy'.
# NOTE(review): this dict does not appear to be read by the cells visible
# here -- confirm whether it is still needed.
models = {
    'XGB': {
        'xgb_numerical_strategy': 'most_frequent',
        'xgb_with_centering': False,
        'xgb_with_scaling': False,
        'xgb_categorical_strategy': 'most_frequent',
        'xgb_sigma': 3.729290980204139,
        'xgb_regularization': 4.525593729070233,
        'xgb_randomized': False,
        'xgb_lambda': 9.80216226429736e-05,
        'xgb_alpha': 0.00247707065685737,
        'xgb_max_depth': 7,
        'xgb_eta': 0.000949921634938113,
        'xgb_gamma': 8.432433257204815e-07,
        'xgb_n_estimators': 597,
        'xgb_learning_rate': 0.0704490770938814,
    },
    'CatB': {
        'catb_numerical_strategy': 'most_frequent',
        'catb_with_centering': True,
        'catb_with_scaling': False,
        'catb_categorical_strategy': 'most_frequent',
        'catb_sigma': 1.0102999809269857,
        'catb_regularization': 0.005134854246679088,
        'catb_randomized': False,
        'catb_iterations': 512,
        'catb_depth': 151,
        'catb_min_data_in_leaf': 200,
        'catb_learning_rate': 0.026405733602736736,
        'catb_subsample': 0.8789512739723611,
        'catb_random_strength': 0.268413400762971,
    },
    'LGBM': {
        'lgbm_numerical_strategy': 'most_frequent',
        'lgbm_with_centering': False,
        'lgbm_with_scaling': False,
        'lgbm_categorical_strategy': 'most_frequent',
        'lgbm_sigma': 0.24133299951881426,
        'lgbm_regularization': 1.9467558269802216,
        'lgbm_randomized': False,
        'lgbm_max_depth': 12,
        'lgbm_learning_rate': 0.018029136438413307,
        'lgbm_n_estimators': 888,
        'lgbm_subsample': 0.16889238269740878,
        'lgbm_colsample_bytree': 0.5758061565891749,
        'lgbm_reg_alpha': 0.877129013958167,
        'lgbm_reg_lambda': 0.474890927333532,
    },
    'LogR': {
        'lr_max_iter': 3673,
        'lr_C': 0.0016492543471898426,
        'lr_tol': 0.03959339746440095,
    },
}

In [15]:
from sklearn.ensemble import StackingClassifier, ExtraTreesClassifier

def instantiate_stacker(trial : Trial, numerical_columns : list[str], categorical_columns : list[str] ) -> StackingClassifier:
    """Build a stacking ensemble of the four base pipelines.

    Level 0 holds the ``"XGB"``, ``"CatB"``, ``"LGBM"`` and ``"LogR"``
    pipelines (named ``'model1'`` .. ``'model4'``); level 1 is an
    ``ExtraTreesClassifier``. Everything is configured from ``trial``: the
    final-estimator parameters are suggested first, then the base models in
    the order listed. Stacking uses ``cv=5``.
    """
    # Final-estimator hyperparameters (suggested before any base model).
    n_estimators = trial.suggest_int('n_estimators', 50, 1000)
    max_depth = trial.suggest_int('max_depth', 1, 20)
    # NOTE(review): the range includes 0, which scikit-learn may reject as a
    # float max_features -- confirm the intended lower bound.
    max_features = trial.suggest_float('max_features', 0, 1)
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])

    base_model_names = ("XGB", "CatB", "LGBM", "LogR")
    level0 = [
        (f'model{position}', instantiate_model(trial, numerical_columns, categorical_columns, name))
        for position, name in enumerate(base_model_names, start=1)
    ]

    level1 = ExtraTreesClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        bootstrap=bootstrap,
        n_jobs=-1,
        random_state=42,
    )
    return StackingClassifier(estimators=level0, final_estimator=level1, cv=5)

In [16]:
def objective(trial : Trial, X : DataFrame, y : np.ndarray | Series, numerical_columns : Optional[list[str]]=None, categorical_columns : Optional[list[str]]=None, random_state : int=42) -> float:
    """Optuna objective for the stacking ensemble.

    Parameters
    ----------
    trial
        Source of hyperparameter suggestions.
    X, y
        Feature frame and target used for cross-validation.
    numerical_columns, categorical_columns
        Column names for each preprocessing branch. When ``None`` they are
        inferred from ``X``: ``object``/``category`` dtypes are categorical,
        everything else numerical.
    random_state
        Seed for the shuffled 2-fold split.

    Returns
    -------
    float
        The smaller of the mean and the median of the per-fold ROC AUC scores.
    """
    categorical_dtypes = ['object', 'category']
    if numerical_columns is None:
        numerical_columns = list(X.select_dtypes(exclude=categorical_dtypes).columns)
    if categorical_columns is None:
        categorical_columns = list(X.select_dtypes(include=categorical_dtypes).columns)

    model = instantiate_stacker(trial, numerical_columns, categorical_columns)

    kf = KFold(n_splits=2, shuffle=True, random_state=random_state)
    # The built-in 'roc_auc' scorer replaces make_scorer(..., needs_proba=True):
    # `needs_proba` is deprecated in recent scikit-learn releases, while the
    # string scorer works across versions. ROC AUC depends only on the ranking
    # of the scores, so the metric itself is unchanged.
    scores = cross_val_score(model, X, y, scoring='roc_auc', cv=kf)
    return float(min(np.mean(scores), np.median(scores)))


In [17]:
from optuna import create_study


# Seed the sampler so that any parameter not fixed by the enqueued trial (and
# any later trial) is sampled reproducibly.
study = create_study(
    study_name='optimization',
    direction='maximize',
    sampler=TPESampler(seed=42),
)

# Known-good configuration evaluated as the first trial.
# 'lgbm_min_child_samples' and 'lgbm_min_child_weight' were missing from the
# original enqueued set and were therefore sampled at random; they are pinned
# here to the values reported by the recorded run (9 and 3).
warm_start_params = {
    # final estimator (ExtraTrees)
    'n_estimators': 276,
    'max_depth': 6,
    'max_features': 0.815431846329508,
    'bootstrap': False,
    # preprocessing
    'numerical_strategy': 'median',
    'with_centering': False,
    'with_scaling': False,
    'categorical_strategy': 'most_frequent',
    'sigma': 0.5061308987241582,
    'regularization': 0.9131708454111955,
    'randomized': False,
    # XGBoost
    'xgb_lambda': 2.8568825005909486e-05,
    'xgb_alpha': 0.005109851856729309,
    'xgb_max_depth': 10,
    'xgb_eta': 0.005801612118752712,
    'xgb_gamma': 9.016383631650964e-07,
    'xgb_n_estimators': 626,
    'xgb_learning_rate': 0.35435152425844973,
    # CatBoost
    'catb_iterations': 760,
    'catb_depth': 53,
    'catb_min_data_in_leaf': 168,
    'catb_learning_rate': 0.024179238250688007,
    'catb_subsample': 0.2813943547300177,
    'catb_random_strength': 0.8318716425051079,
    # LightGBM
    'lgbm_max_depth': 7,
    'lgbm_min_child_samples': 9,
    'lgbm_learning_rate': 0.017852637947762904,
    'lgbm_n_estimators': 656,
    'lgbm_min_child_weight': 3,
    'lgbm_subsample': 0.10350043889710368,
    'lgbm_colsample_bytree': 0.6501457690789654,
    'lgbm_reg_alpha': 0.6031771192732664,
    'lgbm_reg_lambda': 0.8535631974964178,
    # logistic regression
    'lr_max_iter': 5973,
    'lr_C': 1.0827852350663947,
    'lr_tol': 0.0032674314134142203,
}
study.enqueue_trial(warm_start_params)
study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=1)

[I 2024-02-27 15:12:03,447] A new study created in memory with name: optimization
[I 2024-02-27 15:19:54,469] Trial 0 finished with value: 0.8877968192695411 and parameters: {'n_estimators': 276, 'max_depth': 6, 'max_features': 0.815431846329508, 'bootstrap': False, 'numerical_strategy': 'median', 'with_centering': False, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'sigma': 0.5061308987241582, 'regularization': 0.9131708454111955, 'randomized': False, 'xgb_lambda': 2.8568825005909486e-05, 'xgb_alpha': 0.005109851856729309, 'xgb_max_depth': 10, 'xgb_eta': 0.005801612118752712, 'xgb_gamma': 9.016383631650964e-07, 'xgb_n_estimators': 626, 'xgb_learning_rate': 0.35435152425844973, 'catb_iterations': 760, 'catb_depth': 53, 'catb_min_data_in_leaf': 168, 'catb_learning_rate': 0.024179238250688007, 'catb_subsample': 0.2813943547300177, 'catb_random_strength': 0.8318716425051079, 'lgbm_max_depth': 7, 'lgbm_min_child_samples': 9, 'lgbm_learning_rate': 0.017852637947762904, 'l

In [18]:
# Report the best trial found by the study.
print(f"Best parameters: {study.best_params}")
print(f"Best value: {study.best_value}")


Best parameters: {'n_estimators': 276, 'max_depth': 6, 'max_features': 0.815431846329508, 'bootstrap': False, 'numerical_strategy': 'median', 'with_centering': False, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'sigma': 0.5061308987241582, 'regularization': 0.9131708454111955, 'randomized': False, 'xgb_lambda': 2.8568825005909486e-05, 'xgb_alpha': 0.005109851856729309, 'xgb_max_depth': 10, 'xgb_eta': 0.005801612118752712, 'xgb_gamma': 9.016383631650964e-07, 'xgb_n_estimators': 626, 'xgb_learning_rate': 0.35435152425844973, 'catb_iterations': 760, 'catb_depth': 53, 'catb_min_data_in_leaf': 168, 'catb_learning_rate': 0.024179238250688007, 'catb_subsample': 0.2813943547300177, 'catb_random_strength': 0.8318716425051079, 'lgbm_max_depth': 7, 'lgbm_min_child_samples': 9, 'lgbm_learning_rate': 0.017852637947762904, 'lgbm_n_estimators': 656, 'lgbm_min_child_weight': 3, 'lgbm_subsample': 0.10350043889710368, 'lgbm_colsample_bytree': 0.6501457690789654, 'lgbm_reg_alpha': 0.6

In [19]:

best_trial = study.best_trial

# Column groups are inferred from the dtypes of the full feature frame X:
# object/category columns are categorical, all others numerical.
categorical_dtypes = ['object', 'category']
numerical_columns = list(X.select_dtypes(exclude=categorical_dtypes).columns)
categorical_columns = list(X.select_dtypes(include=categorical_dtypes).columns)

# Rebuild the stacker from the best trial's parameters and fit it on the
# training split. (A single pipeline could be built instead with
# instantiate_model(best_trial, numerical_columns, categorical_columns, "LogR").)
model = instantiate_stacker(best_trial, numerical_columns, categorical_columns)
model.fit(X_train, y_train)

In [20]:
# ROC AUC on the held-out split, using the second predict_proba column.
holdout_proba = model.predict_proba(X_test)
probabilities = holdout_proba[:, 1]
score = roc_auc_score(y_test, probabilities)
print(score)

0.8917870424278663


In [21]:
# Class probabilities for the competition test set, one named column per
# predict_proba output column.
test_prediction = pd.DataFrame(
    model.predict_proba(test),
    columns=('Not', 'Exited'),
)

In [22]:
# Fill the sample submission's 'Exited' column with the predicted
# probabilities and write it out.
submission = pd.read_csv('/kaggle/input/playground-series-s4e1/sample_submission.csv')
submission['Exited'] = test_prediction['Exited']
submission.to_csv('submission.csv', index=False)