This notebook is adapted (simplified and modified) from code originally written by Manvendra Raj Singh.



In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.metrics import roc_auc_score               #measures the area under the Receiver Operating Characteristic (ROC) curve (doesn't train the model)
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold     #k-fold cross-validation
from sklearn.feature_extraction.text import TfidfVectorizer #feature extraction module; converts raw documents (text) into a matrix of TF-IDF features
from sklearn.decomposition import TruncatedSVD              #decomposition module; used for dimensionality reduction of sparse data
from catboost import CatBoostClassifier, Pool               #gradient boosting library by Yandex that handles categorical features naturally. Pool is used to hold data and can be passed as an argument for training a CatBoost model
from catboost.utils import eval_metric                  #evaluation metrics for CatBoost models, such as accuracy, AUC (Area Under the Curve)
import warnings
# Suppress every Python warning for the rest of the session to keep cell output short.
# NOTE(review): this is a blanket filter, so deprecation and other library warnings
# are hidden as well; narrow it by category if warnings are needed while debugging.
warnings.filterwarnings("ignore")

In [None]:
# Notebook-wide settings, read by the cross-validation cell further down.
RAND_VAL = 32    # random_state handed to StratifiedKFold for the shuffled split
num_folds = 5    # number of cross-validation folds
n_est = 3000     # boosting iterations requested for each CatBoost model

In [None]:
# Read the training table from the working directory and preview its first rows.
df_train = pd.read_csv('train.csv')
df_train.head(n=5)

In [None]:
# Read the test table from the working directory and preview its first rows.
df_test = pd.read_csv('test.csv')
df_test.head(n=5)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,165034,15773898,Lucchese,586,France,Female,23.0,2,0.0,2,0.0,1.0,160976.75
1,165035,15782418,Nott,683,France,Female,46.0,2,0.0,1,1.0,0.0,72549.27
2,165036,15807120,K?,656,France,Female,34.0,7,0.0,2,1.0,0.0,138882.09
3,165037,15808905,O'Donnell,681,France,Male,36.0,8,0.0,1,1.0,0.0,113931.57
4,165038,15607314,Higgins,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.0


In [None]:
# Preview the first five rows of the training frame.
df_train.head(n=5)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Age_scaled,CreditScore_scaled,Balance_scaled,EstimatedSalary_scaled
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0,0.202703,0.636,0.0,0.907279
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0,0.202703,0.554,0.0,0.247483
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0,0.297297,0.656,0.0,0.924364
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0,0.216216,0.462,0.593398,0.422787
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0,0.202703,0.732,0.0,0.075293


In [None]:
# Every column of the training frame except the target becomes a model feature.
# NOTE(review): the train.csv preview names the target `Exited`; confirm that a
# `label` column exists at this point, otherwise Index.drop raises KeyError.
feat_cols = df_train.columns.drop(labels=['label'])

Number of Features: 64
Index(['CustomerId', 'Surname', 'Geography', 'Gender', 'Tenure',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'Age_scaled',
       'CreditScore_scaled', 'Balance_scaled', 'EstimatedSalary_scaled',
       'IsSenior', 'IsActive_by_CreditCard', 'Products_Per_Tenure', 'AgeCat',
       'Sur_Geo_Gend_Sal', 'Agemin', 'Agemax', 'Agemean', 'Balancemin',
       'Balancemax', 'Balancemean', 'Balancesum', 'NumOfProductsmean',
       'NumOfProductssum', 'IsActiveMembermin', 'IsActiveMembermax',
       'IsActiveMembermean', 'IsActiveMembersum', 'CreditScoremin',
       'CreditScoremax', 'CreditScoremean', 'EstimatedSalarymin',
       'EstimatedSalarymax', 'EstimatedSalarymean', 'EstimatedSalarysum',
       'idcount', 'Balancegrps1min', 'Balancegrps1max', 'Balancegrps1mean',
       'Balancegrps1sum', 'NumOfProductsgrps1mean', 'NumOfProductsgrps1sum',
       'IsActiveMembergrps1min', 'IsActiveMembergrps1max',
       'IsActiveMembergrps1mean', 'IsActiveMembergrps1sum',


Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,...,EstimatedSalarygrps1sum,idgrps1count,Exit_lag1,Exit_lag2,Exit_lag3,Exit_lead1,Exit_lead2,Exit_lead3,Balance_lag_diff1,Balance_lead_diff1
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,...,181449.97,1,-1,-1,-1,-1,-1,-1,0,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,...,49503.5,1,-1,-1,-1,-1,-1,-1,115587,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,...,184866.69,1,-1,-1,-1,-1,-1,-1,0,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,...,84560.88,1,-1,-1,-1,-1,-1,-1,0,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,...,15068.83,1,-1,-1,-1,-1,-1,-1,135759,142084


In [None]:
# Feature matrix and target used by the cross-validation loop.
X = df_train[feat_cols]
y = df_train['label']

# Positional indices of all columns whose dtype is not float64; these are handed to
# CatBoost as categorical features, so integer and object columns alike are
# treated as categories.
cat_features = np.nonzero((X.dtypes != np.float64).to_numpy())[0]

array([ 0,  1,  2,  3,  4,  5, 12, 15, 16, 25, 30, 31, 37, 43, 48, 49, 55,
       56, 57, 58, 59, 60, 61, 62, 63])

In [None]:
# Stratified K-fold cross-validation: train one CatBoost model per fold, score it
# on the held-out part, and blend the test-set predictions of all fold models.
folds = StratifiedKFold(n_splits=num_folds, random_state=RAND_VAL, shuffle=True)

# One validation AUC per fold, kept so the mean can be reported after the loop.
auc_vals = []
# Positive-class probabilities for the test set, accumulated as a running mean so
# that every fold's model contributes instead of only the last one.
y_pred_test = np.zeros(len(df_test))

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, y)):

    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[valid_idx], y.iloc[valid_idx]

    # Pools carry the data together with the categorical column indices.
    train_pool = Pool(X_train, y_train, cat_features=cat_features)
    val_pool = Pool(X_val, y_val, cat_features=cat_features)

    # NOTE(review): task_type='GPU' needs a GPU-enabled CatBoost setup; confirm the
    # runtime provides one before running this cell.
    clf = CatBoostClassifier(
        eval_metric='AUC',
        task_type='GPU',
        learning_rate=0.02,
        iterations=n_est)

    # The validation pool is used as the evaluation set; progress is logged every
    # 300 iterations.
    clf.fit(train_pool, eval_set=val_pool, verbose=300)

    # Column 1 of predict_proba is the probability of the positive class.
    y_pred_val = clf.predict_proba(X_val[feat_cols])[:, 1]
    auc_val = roc_auc_score(y_val, y_pred_val)
    auc_vals.append(auc_val)
    print("AUC for fold ",n_fold,": ",auc_val)

    # Each fold adds 1/num_folds of its prediction, giving the fold average.
    y_pred_test += clf.predict_proba(df_test[feat_cols])[:, 1] / num_folds
    print("----------------")


NameError: name 'StratifiedKFold' is not defined

In [None]:
# Build the submission frame: one row per test record, keyed by `id`, with the
# predicted positive-class probability in `Exited`. Building it from the bare
# array alone would give a single column named 0 and no way to match rows to ids.
# `.to_numpy()` drops the test frame's index so both columns align by position.
prediction = pd.DataFrame({
    'id': df_test['id'].to_numpy(),
    'Exited': y_pred_test,
})

# NOTE(review): the output file name has no ".csv" extension; it is kept as-is so
# anything that already reads "predictions_final" keeps working.
prediction.to_csv("predictions_final", index=False)





            id    Exited
0       165034  0.011324
1       165035  0.829112
2       165036  0.015673
3       165037  0.191764
4       165038  0.433668
...        ...       ...
110018  275052  0.025265
110019  275053  0.176701
110020  275054  0.009736
110021  275055  0.176166
110022  275056  0.117606

[110023 rows x 2 columns]


numpy.ndarray

In [None]:
# Show the average validation AUC across folds as a (label, value) tuple.
# Expects `auc_vals` to hold one AUC per fold from the cross-validation cell;
# TODO confirm it is populated before this cell runs.
("Mean AUC: ", np.mean(auc_vals))

('Mean AUC: ', 0.8998544222397277)