In [1]:
from datetime import datetime as dt

import pandas as pd
import numpy as np

from preprocessing import Preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score


from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier


# Preprocessing

In [2]:
col_drop = ['CustomerId','Surname']
types = {
    'HasCrCard':bool,
    'IsActiveMember':bool
}
train = pd.read_csv('data/train.csv', index_col='id', dtype=types).drop(columns=col_drop)
test = pd.read_csv('data/test.csv', index_col='id', dtype=types).drop(columns=col_drop)

In [3]:
train.duplicated().sum()

123

In [4]:
train.drop_duplicates(inplace=True)

In [5]:
train.shape

(164911, 11)

In [6]:
X = train.drop(columns='Exited')
y = train.Exited

In [7]:
prepro = Preprocessing()

In [8]:
X_scld = prepro.fit_transform(X).astype(float)

In [9]:
X_scld

Unnamed: 0_level_0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.143975,0.0,1.0,-0.512580,0.3,-0.916849,0.333333,1.0,0.0,1.369458
1,-0.367944,0.0,1.0,-0.512580,0.1,-0.916849,0.333333,1.0,1.0,-1.254249
2,0.268833,0.0,1.0,0.353882,1.0,-0.916849,0.333333,1.0,0.0,1.437399
3,-0.942292,0.0,1.0,-0.375423,0.2,1.119911,0.000000,1.0,1.0,-0.557146
4,0.743295,0.0,1.0,-0.512580,0.5,-0.916849,0.333333,1.0,1.0,-1.938970
...,...,...,...,...,...,...,...,...,...,...
165029,0.131489,0.0,0.0,-0.512580,0.2,-0.916849,0.000000,1.0,1.0,0.382878
165030,1.692217,0.0,1.0,-0.243194,0.3,-0.916849,0.000000,0.0,0.0,0.382872
165031,-1.142065,0.0,1.0,-0.803062,0.5,-0.916849,0.000000,1.0,1.0,0.295282
165032,-1.279409,0.0,0.0,-0.957168,0.7,1.130439,0.000000,0.0,1.0,-0.823359


In [10]:
skf = StratifiedKFold(n_splits=5)

In [11]:
WeightTarget = y.value_counts(True).to_dict()

In [12]:
results = {
    'models' : [
        ('rf', RandomForestClassifier(n_estimators=300, max_depth=16, min_samples_leaf=16, min_samples_split=96,class_weight=WeightTarget)),
        ('histgradboost', HistGradientBoostingClassifier()),
        ('gradboost', GradientBoostingClassifier()),
        ('adaboost', AdaBoostClassifier()),
        ('Catboost',CatBoostClassifier(verbose=False, class_weights=WeightTarget,depth=6,iterations=300,l2_leaf_reg=5,learning_rate=0.1)),
        ('LGBM',LGBMClassifier()),
        ('Xgb_gbtree', XGBClassifier(booster='gbtree')),
        
        ],
    'mean_score' :[],
    'std_dev_score' :[] 
}
for _, model in results.get('models') :
    scores = cross_val_score(model,
                            X_scld,
                            y,
                            scoring = 'roc_auc',
                            cv = skf,
                            n_jobs = -1)
    results['mean_score'].append(scores.mean())
    results['std_dev_score'].append(scores.std())
    
    name = type(model).__name__ 
    print(f'{name} - Roc AUC score: {scores.mean():.4f} ± {scores.std():.4f}')

RandomForestClassifier - Roc AUC score: 0.8880 ± 0.0021
HistGradientBoostingClassifier - Roc AUC score: 0.8894 ± 0.0016
GradientBoostingClassifier - Roc AUC score: 0.8885 ± 0.0017
AdaBoostClassifier - Roc AUC score: 0.8802 ± 0.0017
CatBoostClassifier - Roc AUC score: 0.8896 ± 0.0016
LGBMClassifier - Roc AUC score: 0.8894 ± 0.0016
XGBClassifier - Roc AUC score: 0.8867 ± 0.0015


In [13]:
voting = VotingClassifier(
    estimators=results.get('models'),
    voting='soft',
    weights=results.get('mean_score'),
    n_jobs=-1
    
)

In [16]:
scores = cross_val_score(voting,
                        X_scld,
                        y,
                        scoring = 'roc_auc',
                        cv = skf,
                        n_jobs = -1,
                        verbose=4)

print("")
print(f'Roc AUC score: {scores.mean():.4f} ± {scores.std():.4f} min:{scores.min():.4f} | max:{scores.max():.4f}')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


[CV] END ................................ score: (test=0.888) total time=  47.2s
[CV] END ................................ score: (test=0.890) total time=  47.3s


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   47.4s remaining:  1.2min


[CV] END ................................ score: (test=0.889) total time=  47.6s
[CV] END ................................ score: (test=0.889) total time=  47.7s
[CV] END ................................ score: (test=0.893) total time=  47.1s

Roc AUC score: 0.8899 ± 0.0017 min:0.8879 / max:0.8928


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   48.0s finished


# Submission

In [29]:
voting.fit(X_scld,y)

[Voting] ....................... (1 of 7) Processing rf, total=  48.0s
[Voting] ............ (2 of 7) Processing histgradboost, total=   1.2s
[Voting] ................ (3 of 7) Processing gradboost, total=  13.0s
[Voting] ................. (4 of 7) Processing adaboost, total=   2.9s
[Voting] ............... (5 of 7) Processing Xgb_gbtree, total=   0.3s
[Voting] ................. (6 of 7) Processing Catboost, total=   7.6s
[Voting] ..................... (7 of 7) Processing LGBM, total=   0.8s


In [30]:
X_val_scld = prepro.transform(test).astype(float)

In [31]:
submission = pd.read_csv("data/sample_submission.csv", index_col='id')

In [32]:
submission.loc[:,'Exited'] = voting.predict_proba(X_val_scld.values)[:,1]



In [33]:
name = dt.now().strftime("%Y%m%d_%H%M")

In [34]:
submission.to_csv(f"submission/{name}.csv")

In [35]:
submission

Unnamed: 0_level_0,Exited
id,Unnamed: 1_level_1
165034,0.092018
165035,0.789643
165036,0.092393
165037,0.261585
165038,0.363675
...,...
275052,0.105166
275053,0.177527
275054,0.086462
275055,0.202179
