In [17]:
from datetime import datetime as dt
from pathlib import Path

import pandas as pd

from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from preprocessing import Preprocessing


# Preprocessing

In [7]:
# Columns removed from both splits right after loading.
col_drop = ['CustomerId', 'Surname']
# Columns that are parsed as booleans while reading the CSVs.
types = dict.fromkeys(['HasCrCard', 'IsActiveMember'], bool)


def _load_split(csv_path):
    """Read one CSV indexed by its `id` column and drop the `col_drop` columns."""
    frame = pd.read_csv(csv_path, index_col='id', dtype=types)
    return frame.drop(columns=col_drop)


train = _load_split('data/train.csv')
test = _load_split('data/test.csv')

In [8]:
# Count rows whose column values repeat an earlier row (the `id` index is not compared).
train.duplicated().sum()

123

In [9]:
# Keep the first occurrence of each duplicated row. Rebinding the name instead
# of using `inplace=True` gives the same frame while keeping the step chainable
# and avoiding in-place mutation of an object displayed by earlier cells.
train = train.drop_duplicates()

In [10]:
# Display (rows, columns) of the training frame after duplicate removal.
train.shape

(164911, 11)

In [11]:
# Separate the target column from the remaining feature columns.
y = train['Exited']
X = train.drop(columns=['Exited'])

In [12]:
# Instantiate the project-local preprocessing step (imported from `preprocessing`).
# NOTE(review): its implementation is not visible in this notebook; later cells
# call `fit_transform` on the training features and `transform` on the test set.
prepro = Preprocessing()

In [13]:
# Fit the preprocessing on the training features and cast the result to float.
X_scld = (
    prepro
    .fit_transform(X)
    .astype(float)
)

# Model

In [18]:
# 5-fold stratified splitter reused for every model comparison below.
# Folds are taken in row order (no shuffling), which is the library default.
skf = StratifiedKFold(n_splits=5, shuffle=False)

In [19]:
# Benchmark every candidate model with the shared stratified CV splitter (`skf`)
# and record the mean / standard deviation of the ROC AUC across folds.
#
# `results` holds three parallel structures that later cells consume:
#   'models'        - (label, estimator) pairs, in benchmark order
#   'mean_score'    - mean ROC AUC per model, same order
#   'std_dev_score' - standard deviation of the ROC AUC per model, same order

# Fixed seed (arbitrary value) so the CV scores, and anything derived from
# them such as ensemble weights, are reproducible between runs.
SEED = 42

results = {
    'models': [
        ('rf', RandomForestClassifier(random_state=SEED)),
        ('histgradboost', HistGradientBoostingClassifier(random_state=SEED)),
        ('gradboost', GradientBoostingClassifier(random_state=SEED)),
        ('adaboost', AdaBoostClassifier(random_state=SEED)),
        ('Xgb_gbtree', XGBClassifier(booster='gbtree', random_state=SEED)),
        ('Catboost', CatBoostClassifier(verbose=False, random_seed=SEED)),
        ('LGBM', LGBMClassifier(random_state=SEED)),
    ],
    'mean_score': [],
    'std_dev_score': [],
}

# `_label` rather than `_`: assigning to `_` would shadow IPython's
# "last output" variable for the rest of the session.
for _label, model in results['models']:
    scores = cross_val_score(
        model,
        X_scld,
        y,
        scoring='roc_auc',
        cv=skf,
        n_jobs=-1,
    )

    # Compute each statistic once and reuse it for storage and reporting.
    mean_auc, std_auc = scores.mean(), scores.std()
    results['mean_score'].append(mean_auc)
    results['std_dev_score'].append(std_auc)

    # Report under the estimator's class name (not the short label).
    model_name = type(model).__name__
    print(f'{model_name} - Roc AUC score: {mean_auc:.4f} ± {std_auc:.4f}')

RandomForestClassifier - Roc AUC score: 0.8728 ± 0.0018
HistGradientBoostingClassifier - Roc AUC score: 0.8893 ± 0.0018
GradientBoostingClassifier - Roc AUC score: 0.8885 ± 0.0017
AdaBoostClassifier - Roc AUC score: 0.8802 ± 0.0017
XGBClassifier - Roc AUC score: 0.8868 ± 0.0018
CatBoostClassifier - Roc AUC score: 0.8889 ± 0.0016
LGBMClassifier - Roc AUC score: 0.8895 ± 0.0018


In [20]:
# Soft-voting ensemble over all benchmarked models: predicted probabilities are
# averaged, with each model weighted by its mean cross-validated ROC AUC.
voting = VotingClassifier(
    estimators=results['models'],
    weights=results['mean_score'],
    voting='soft',
)

In [21]:
# Fit the ensemble on the full preprocessed training set (no hold-out at this stage).
voting.fit(X_scld,y)

# Submission

In [22]:
# Apply the already-fitted preprocessing to the test set and cast to float.
X_val_scld = (
    prepro
    .transform(test)
    .astype(float)
)

In [23]:
# Load the sample submission, indexed by `id`, as the frame to fill with predictions.
submission = pd.read_csv(
    "data/sample_submission.csv",
    index_col='id',
)

In [24]:
# Predict on the preprocessed frame itself rather than on `.values`: stripping
# the column names bypasses scikit-learn's feature-name consistency check and
# triggers "X does not have valid feature names" warnings when the estimators
# were fitted on a DataFrame (assumes `fit_transform` returned one - TODO confirm).
# Column 1 of `predict_proba` is the probability of the second entry of
# `voting.classes_` (presumably Exited == 1).
churn_proba = voting.predict_proba(X_val_scld)[:, 1]

# The assignment is positional: it assumes `submission` lists the same rows, in
# the same order, as the test set - NOTE(review): verify against the data files.
submission.loc[:, 'Exited'] = churn_proba



In [25]:
# Minute-resolution timestamp (YYYYMMDD_HHMM) used as the submission file stem.
generated_at = dt.now()
name = generated_at.strftime("%Y%m%d_%H%M")

In [26]:
# Write the predictions to submission/<timestamp>.csv. The folder is created
# first so the cell also works on a fresh checkout where it does not exist yet.
out_dir = Path("submission")
out_dir.mkdir(parents=True, exist_ok=True)
submission.to_csv(out_dir / f"{name}.csv")

In [27]:
# Display the filled-in submission frame as a final visual check
# (pandas truncates the rendering of long frames).
submission

Unnamed: 0_level_0,Exited
id,Unnamed: 1_level_1
165034,0.087596
165035,0.792854
165036,0.088513
165037,0.266240
165038,0.370765
...,...
275052,0.113458
275053,0.174901
275054,0.083016
275055,0.204561
