In [102]:
from datetime import datetime as dt

import pandas as pd
import numpy as np

from preprocessing import Preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score


from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import RandomOverSampler


# Preprocessing

In [103]:
col_drop = ['CustomerId','Surname']
types = {
    'HasCrCard':bool,
    'IsActiveMember':bool
}
train = pd.read_csv('data/train.csv', index_col='id', dtype=types).drop(columns=col_drop)
test = pd.read_csv('data/test.csv', index_col='id', dtype=types).drop(columns=col_drop)

In [104]:
train.duplicated().sum()

123

In [105]:
train.drop_duplicates(inplace=True)

In [106]:
train.shape

(164911, 11)

In [107]:
X = train.drop(columns='Exited')
y = train.Exited

In [108]:
prepro = Preprocessing()

In [109]:
X_scld = prepro.fit_transform(X).astype(float)

In [110]:
ros = RandomOverSampler(random_state=42)
X_resample, y_resample = ros.fit_resample(X_scld, y)

In [111]:
skf = StratifiedKFold(n_splits=5)

In [112]:
WeightTarget = y.value_counts(True).to_dict()

In [113]:
rf = {
    'n_estimators'          : 300,
    'max_depth'             : 16,
    'min_samples_leaf'      : 16,
    'min_samples_split'     : 96,
    'class_weight'          : WeightTarget    
}
light = {
}
xgb  = {
    'booster'         : 'gbtree'
}
cat  = {
    'iterations'            : 300,
    'depth'                 : 6,
    'l2_leaf_reg'           : 5,
    'learning_rate'         : 0.1,
    'verbose'               : False,
    'class_weights'         : WeightTarget  
    
}

mlp = {
    'hidden_layer_sizes'    : (64,32),
    'learning_rate_init'    : 1e-4,
    'learning_rate'         : 'adaptive'
    
}

In [114]:
results = {
    'models' : [
        #('rf', RandomForestClassifier(**rf)),
        #('histgradboost', HistGradientBoostingClassifier()),
        #('gradboost', GradientBoostingClassifier()),
        #('adaboost', AdaBoostClassifier()),
        ('Catboost',CatBoostClassifier(**cat)),
        ('LGBM',LGBMClassifier(**light)),
        ('Xgb_gbtree', XGBClassifier(**xgb)),
        #('mlp',MLPClassifier(**mlp))
        
        ],
    'mean_score' :[],
    'std_dev_score' :[] 
}

In [115]:
for _, model in results.get('models') :
    scores = cross_val_score(model,
                            X_scld,
                            y,
                            scoring = 'roc_auc',
                            cv = skf,
                            n_jobs = -1)
    results['mean_score'].append(scores.mean())
    results['std_dev_score'].append(scores.std())
    
    name = type(model).__name__ 
    print(f'{name} - Roc AUC score: {scores.mean():.4f} ± {scores.std():.4f}')

RandomForestClassifier - Roc AUC score: 0.8882 ± 0.0020
HistGradientBoostingClassifier - Roc AUC score: 0.8892 ± 0.0017
GradientBoostingClassifier - Roc AUC score: 0.8885 ± 0.0017
AdaBoostClassifier - Roc AUC score: 0.8802 ± 0.0017
CatBoostClassifier - Roc AUC score: 0.8896 ± 0.0016
LGBMClassifier - Roc AUC score: 0.8894 ± 0.0016
XGBClassifier - Roc AUC score: 0.8877 ± 0.0015
MLPClassifier - Roc AUC score: 0.8875 ± 0.0019


In [116]:
voting = VotingClassifier(
    estimators=results.get('models'),
    voting='soft',
    #weights=results.get('mean_score'),
    n_jobs=-1
    
)

In [117]:
scores = cross_val_score(
    voting,
    X_resample, y_resample,
    scoring = 'roc_auc',
    cv = skf,
    n_jobs = -1)

print("")
print(f'Roc AUC score: {scores.mean():.4f} ± {scores.std():.4f} min:{scores.min():.4f} | max:{scores.max():.4f}')


Roc AUC score: 0.8980 ± 0.0015 min:0.8959 | max:0.9002


# Submission

In [118]:
voting.fit(X_resample, y_resample)

In [119]:
X_val_scld = prepro.transform(test).astype(float)

In [120]:
submission = pd.read_csv("data/sample_submission.csv", index_col='id')

In [121]:
submission.loc[:,'Exited'] = voting.predict_proba(X_val_scld.values)[:,1]



In [122]:
name = dt.now().strftime("%Y%m%d_%H%M")

In [123]:
submission.to_csv(f"submission/{name}.csv")

In [124]:
submission

Unnamed: 0_level_0,Exited
id,Unnamed: 1_level_1
165034,0.120143
165035,0.865305
165036,0.122544
165037,0.453129
165038,0.577937
...,...
275052,0.149339
275053,0.245797
275054,0.108492
275055,0.350557
