In [1]:
from datetime import datetime as dt

import pandas as pd
import numpy as np

from preprocessing import Preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score


from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from sklearn.ensemble import VotingClassifier


# Preprocessing

In [2]:
col_drop = ['CustomerId','Surname']
types = {
    'HasCrCard':bool,
    'IsActiveMember':bool
}
train = pd.read_csv('data/train.csv', index_col='id', dtype=types).drop(columns=col_drop)
test = pd.read_csv('data/test.csv', index_col='id', dtype=types).drop(columns=col_drop)

In [3]:
train.duplicated().sum()

123

In [4]:
train.drop_duplicates(inplace=True)

In [5]:
train.shape

(164911, 11)

In [6]:
X = train.drop(columns='Exited')
y = train.Exited

In [7]:
X.shape

(164911, 10)

In [8]:
prepro = Preprocessing()

In [9]:
X_scld = prepro.fit_transform(X).astype(float)

In [10]:
X_scld.shape

(164911, 91)

In [11]:
skf = StratifiedKFold(n_splits=5, shuffle=True)

In [12]:
WeightTarget = y.value_counts(True).to_dict()

In [13]:
cat  = {
    'iterations'            : 300,
    'depth'                 : 6,
    'l2_leaf_reg'           : 5,
    'learning_rate'         : 0.1,
    'verbose'               : False,
    'class_weights'         : WeightTarget  
    
}

light = {
    
}

rf = {
    'n_estimators'          : 300,
    'max_depth'             : 16,
    'min_samples_leaf'      : 16,
    'min_samples_split'     : 96,
    'class_weight'          : WeightTarget    
}

xgb_dart = {
    'booster'         : 'dart',
    'objective'       : 'binary:logistic',
    'learning_rate'          : [1e-1, 1e-2],
    'n_estimators'           : [50, 100],
    'max_depth'              : [4, 8, 16],
    'subsample'              : [0.8, 0.9, 1.0],
    'eval_metric'            : ['auc'],
    'rate_drop'              : [0.1, 0.2],
    'skip_drop'              : [0.1, 0.2]
    
}

#ok
xgb_tree  = {
    'booster'                : 'gbtree',
    'objective'              : 'binary:logistic',
    'learning_rate'          : 1e-1,
    'n_estimators'           : 100,
    'max_depth'              : 4,
    'subsample'              : 0.9,
    'eval_metric'            : 'auc'
}




models =  [
        ('rf', RandomForestClassifier(**rf)),
        ('Catboost',CatBoostClassifier(**cat)),
        ('LGBM',LGBMClassifier(**light)),
        ('Xgb_gbtree', XGBClassifier(**xgb_tree)),
        ('Xgb_dart', XGBClassifier(**xgb_dart))      
]

In [14]:
mean_scores = []
for name, model in models :
    scores = cross_val_score(
        model,
        X_scld,
        y,
        scoring = 'roc_auc',
        cv=skf,
        n_jobs=-1
    )
    mean_scores.append(scores.mean())
    print(f"{name} = scores : {scores.mean():.4f} ± {scores.std():.4f} ")

rf = scores : 0.8872 ± 0.0025 
Catboost = scores : 0.8889 ± 0.0013 
LGBM = scores : 0.8889 ± 0.0021 
Xgb_gbtree = scores : 0.8892 ± 0.0016 
Xgb_dart = scores : 0.8858 ± 0.0012 


In [15]:
voting = VotingClassifier(
    estimators=models,
    voting='soft',
    weights=[1,2,2,2,1],
    n_jobs=-1
    
)

In [16]:
scores = cross_val_score(
    voting,
    X_scld, 
    y,
    scoring = 'roc_auc',
    cv = skf,
    n_jobs = -1)

print("")
print(f'Roc AUC score: {scores.mean():.4f} ± {scores.std():.4f} min:{scores.min():.4f} | max:{scores.max():.4f}')


Roc AUC score: 0.8896 ± 0.0027 min:0.8845 | max:0.8922


# Submission

In [17]:
voting.fit(X_scld, y)



In [18]:
X_val_scld = prepro.transform(test).astype(float)

In [19]:
submission = pd.read_csv("data/sample_submission.csv", index_col='id')

In [21]:
submission.loc[:,'Exited'] = voting.predict_proba(X_val_scld)[:,1]

In [22]:
name = dt.now().strftime("%Y%m%d_%H%M")

In [23]:
submission.to_csv(f"submission/{name}.csv")

In [24]:
submission

Unnamed: 0_level_0,Exited
id,Unnamed: 1_level_1
165034,0.036148
165035,0.770981
165036,0.037868
165037,0.256961
165038,0.377526
...,...
275052,0.058151
275053,0.133326
275054,0.032504
275055,0.207552
