Code taken, simplified and modified from Manvendra Raj Singh.



In [None]:
!pip3 install catboost

In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.metrics import roc_auc_score               #measures the area under the Receiver Operating Characteristic (ROC) curve (doesn't train the model)
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold     #k-fold cross-validation
from sklearn.feature_extraction.text import TfidfVectorizer #feature extraction module; converts raw documents (text) into a matrix of TF-IDF features
from sklearn.decomposition import TruncatedSVD              #decomposition module; used for dimensionality reduction of sparse data
from catboost import CatBoostClassifier, Pool               #gradient boosting library by Yandex that handles categorical features naturally. Pool is used to hold data and can be passed as an argument for training a CatBoost model
from catboost.utils import eval_metric                  #evaluation metrics for CatBoost models, such as accuracy, AUC (Area Under the Curve)
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Experiment settings used by the cells below.
RAND_VAL = 32     # random seed
num_folds = 5     # number of cross-validation folds
n_est = 3000      # number of estimators (boosting iterations)

In [None]:
# Load the training data and preview its first five rows.
df_train = pd.read_csv(filepath_or_buffer='train.csv')
df_train.head(n=5)

In [None]:
# Load the test data and preview its first five rows.
df_test = pd.read_csv(filepath_or_buffer='test.csv')
df_test.head(n=5)

In [None]:
# Display the first rows of the training frame again (same call as in the
# cell that loads train.csv).
df_train.head()

In [19]:
# Name of the target column; every other column of the training frame is
# used as a feature.
label = 'Exited'
feat_cols = df_train.columns.drop(labels=[label])

In [20]:
# Feature matrix and target vector taken from the training frame.
X = df_train[feat_cols]
y = df_train[label]

# Positional indices of every feature column whose dtype is not float64;
# these indices are passed to CatBoost as the categorical features.
cat_features = np.flatnonzero((X.dtypes != np.float64).to_numpy())

In [None]:
# Stratified k-fold cross-validation: for each fold, train a CatBoost
# classifier on the training part, score it (ROC AUC) on the held-out part,
# and predict class-1 probabilities for the test set.
#
# Results left in scope for the following cells:
#   auc_vals    -- list with one validation AUC per fold
#   y_pred_test -- test-set probabilities averaged over all folds
folds = StratifiedKFold(n_splits=num_folds, random_state=RAND_VAL, shuffle=True)

# The test features are the same for every fold, so select them once.
X_test = df_test[feat_cols]

auc_vals = []                            # per-fold validation AUC
test_pred_sum = np.zeros(len(X_test))    # running sum of per-fold test probabilities

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, y)):

    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[valid_idx], y.iloc[valid_idx]

    train_pool = Pool(X_train, y_train, cat_features=cat_features)
    val_pool = Pool(X_val, y_val, cat_features=cat_features)

    clf = CatBoostClassifier(
        eval_metric='AUC',
        # task_type='GPU',  # uncomment to train on a GPU
        learning_rate=0.02,
        iterations=n_est,
    )

    # The held-out fold is used as the evaluation set; progress is logged
    # every 300 iterations.
    clf.fit(train_pool, eval_set=val_pool, verbose=300)

    # X_val already contains exactly the feature columns, so no re-selection
    # is needed before predicting.
    y_pred_val = clf.predict_proba(X_val)[:, 1]
    auc_val = roc_auc_score(y_val, y_pred_val)
    auc_vals.append(auc_val)  # keep the score so the mean AUC can be reported
    print("AUC for fold ",n_fold,": ",auc_val)

    # Accumulate instead of overwriting, so every fold's model contributes
    # to the final test prediction.
    test_pred_sum += clf.predict_proba(X_test)[:, 1]
    print("----------------")

# Average the test probabilities over the folds.
y_pred_test = test_pred_sum / num_folds


In [None]:
# Wrap the test-set probabilities in a single-column DataFrame and write
# them to disk as CSV without the index column.
prediction = pd.DataFrame(data=y_pred_test)
prediction.to_csv(path_or_buf="predictions_final", index=False)





In [None]:
# Display the mean validation AUC; expects `auc_vals` to hold one score per fold.
("Mean AUC: ", np.mean(auc_vals))

('Mean AUC: ', 0.8998544222397277)