In [13]:
from datetime import datetime as dt

import pandas as pd

from preprocessing import Preprocessing
from sklearn.model_selection import cross_val_score


from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


# Preprocessing

In [14]:
col_drop = ['CustomerId','Surname']
types = {
    'HasCrCard':bool,
    'IsActiveMember':bool
}
train = pd.read_csv('data/train.csv', index_col='id', dtype=types).drop(columns=col_drop)
test = pd.read_csv('data/test.csv', index_col='id', dtype=types).drop(columns=col_drop)

In [15]:
train.shape

(165034, 11)

In [16]:
X = train.drop(columns='Exited')
y = train.Exited

In [20]:
prepro = Preprocessing()

In [21]:
X_scld = prepro.fit_transform(X).astype(float)

# Model

In [40]:
results = {
    'models' : [
        ('rf', RandomForestClassifier()),
        ('histgradboost', HistGradientBoostingClassifier()),
        ('gradboost', GradientBoostingClassifier()),
        ('adaboost', AdaBoostClassifier()),
        ('Xgb_gbtree', XGBClassifier(booster='gbtree')),
        ('Xgb_dart', XGBClassifier(booster='dart')),
        ('Catboost',CatBoostClassifier(verbose=False)),
        
        ],
    'mean_score' :[],
    'std_dev_score' :[] 
}
for _, model in results.get('models') :
    scores = cross_val_score(model,
                            X_scld,
                            y,
                            scoring = 'roc_auc',
                            cv = 4,
                            n_jobs = -1)
    results['mean_score'].append(scores.mean())
    results['std_dev_score'].append(scores.std())
    
    name = type(model).__name__ 
    print(f'{name} - Roc AUC score: {scores.mean():.4f} ± {scores.std():.4f}')

RandomForestClassifier - Roc AUC score: 0.8734 ± 0.0019
HistGradientBoostingClassifier - Roc AUC score: 0.8890 ± 0.0011
GradientBoostingClassifier - Roc AUC score: 0.8882 ± 0.0010
AdaBoostClassifier - Roc AUC score: 0.8801 ± 0.0012
XGBClassifier - Roc AUC score: 0.8864 ± 0.0013
XGBClassifier - Roc AUC score: 0.8864 ± 0.0013
CatBoostClassifier - Roc AUC score: 0.8885 ± 0.0011


In [41]:
voting = VotingClassifier(
    estimators=results.get('models'),
    voting='soft',
    weights=results.get('mean_score')
)

In [42]:
voting.fit(X_scld,y)

# Submission

In [43]:
X_val_scld = prepro.transform(test).astype(float)

In [44]:
submission = pd.read_csv("data/sample_submission.csv", index_col='id')

In [45]:
submission.loc[:,'Exited'] = voting.predict_proba(X_val_scld.values)[:,1]



In [46]:
submission

Unnamed: 0_level_0,Exited
id,Unnamed: 1_level_1
165034,0.087242
165035,0.791981
165036,0.088294
165037,0.255614
165038,0.347767
...,...
275052,0.113875
275053,0.167403
275054,0.082832
275055,0.198871


In [37]:
name = dt.now().strftime("%Y%m%d_%H%M")

In [38]:
submission.to_csv(f"submission/{name}.csv")