In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt

In [None]:
data=pd.read_csv("../data/train.csv")
test=pd.read_csv("../data/test.csv")

In [None]:
data.head(10)

In [None]:
test_final=test["id"]

In [None]:
def clean(data):
    """Return a cleaned copy of a raw train/test frame.

    Drops the identifier columns ("id", "CustomerId", "Surname") and casts
    the float-encoded columns listed in ``floatcols`` to ``int64``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns listed in ``floatcols``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched.

    Notes
    -----
    The ``int64`` cast discards any fractional part (e.g. 1234.56 -> 1234)
    and raises if one of the cast columns contains NaN.
    """
    dropcols=["id","CustomerId","Surname"]
    floatcols=["Age","HasCrCard","IsActiveMember","Balance","EstimatedSalary"]

    # errors="ignore" keeps the function idempotent: re-running the cell that
    # does `data=clean(data)` on an already-cleaned frame no longer raises
    # KeyError because the identifier columns are gone.
    data=data.drop(columns=dropcols,errors="ignore")
    data[floatcols]=data[floatcols].astype('int64')

    return data

In [None]:
data=clean(data)
test=clean(test)

In [None]:
data.head(10)

In [None]:
# Count genuinely missing (NaN) cells per column. The previous `data==0`
# counted zero-valued cells, which is not what the title and axis label
# describe. head(10) keeps the plot in line with its "Top 10" title.
nulls=data.isna().sum().sort_values(ascending=False).head(10)
nulls.plot(kind='bar',color='pink')
plt.title('Top 10 Features by Missing Values')
plt.ylabel('Count of Nulls')
plt.show()

In [None]:
X=data.drop("Exited",axis=1)
y=data["Exited"]

In [None]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Columns that get one-hot encoded by the preprocessing step.
catcols=[
    "Geography",
    "Gender",
]
# Columns that are passed through to the model unchanged (order matters:
# it fixes the feature order seen by the model).
numcols=[
    "Balance",
    "IsActiveMember",
    "HasCrCard",
    "Tenure",
    "CreditScore",
    "Age",
    "NumOfProducts",
    "EstimatedSalary",
]

In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

def make_lgb_pipeline(numcols):
    """Build an unfitted preprocessing + LightGBM pipeline.

    Parameters
    ----------
    numcols : list of str
        Columns forwarded to the model unchanged.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps "preprocess" (one-hot encoding of the module-level ``catcols``,
        passthrough of ``numcols``) and "model" (``LGBMClassifier``).
    """
    preprocess=ColumnTransformer(
        transformers=[
            ("cat",OneHotEncoder(drop="first",handle_unknown="ignore"),catcols),
            ("num","passthrough",numcols),])

    # LightGBM only applies row subsampling when subsample_freq > 0 (its
    # default is 0), so without this any `model__subsample` value set or
    # searched later is silently ignored. With the default subsample=1.0
    # this setting does not change the model.
    model=LGBMClassifier(verbose=-1,subsample_freq=1)

    lgb=Pipeline(steps=[
        ("preprocess",preprocess),
        ("model",model)])

    return lgb

In [None]:
# Baseline LightGBM pipeline, fitted on the training split.
lgb=make_lgb_pipeline(numcols).fit(X_train,y_train)

# Column 1 of predict_proba is used as the score for ROC AUC on the hold-out split.
lgb_predict=lgb.predict_proba(X_test)[:,1]
lgb_auc=roc_auc_score(y_test,lgb_predict)
print("ROC AUC:",lgb_auc)

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated ROC AUC of the baseline pipeline on the full data.
lgbscores=cross_val_score(lgb,X,y,scoring="roc_auc",cv=5)

print("Fold AUCs:", lgbscores)
print("Mean AUC:",lgbscores.mean())
print("Std AUC:", lgbscores.std())

In [None]:
from sklearn.metrics import RocCurveDisplay

RocCurveDisplay.from_predictions(y_test,lgb_predict)
plt.show()

In [None]:
lgb.get_params()

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Search space for the "model" step of the pipeline.
# scipy's uniform(loc, scale) draws from [loc, loc+scale].
lgbparam_dist=dict(
    model__num_leaves=randint(20,200),
    model__max_depth=randint(3,10),
    model__learning_rate=uniform(loc=0.01,scale=0.09),
    model__min_child_samples=randint(10,100),
    model__subsample=uniform(loc=0.6,scale=0.4),
    model__colsample_bytree=uniform(loc=0.6,scale=0.4))

# 30 random candidates, each scored by 5-fold ROC AUC, using all cores.
lgb_search=RandomizedSearchCV(
    lgb,
    lgbparam_dist,
    n_iter=30,
    cv=5,
    scoring="roc_auc",
    random_state=42,
    n_jobs=-1)

lgb_search.fit(X,y)
print(lgb_search.best_score_,lgb_search.best_params_)

In [None]:
# Fixed hyperparameters for the tuned LightGBM model.
tuned_params={
    "model__colsample_bytree":np.float64(0.749816047538945),
    "model__learning_rate":np.float64(0.09556428757689245),
    "model__max_depth":5,
    "model__min_child_samples":81,
    "model__num_leaves":40,
    "model__subsample":np.float64(0.6624074561769746)}

# Fresh pipeline with the tuned parameters applied to its "model" step.
lgb_final=make_lgb_pipeline(numcols).set_params(**tuned_params)

lgb_final.fit(X_train,y_train)
lgb_final_predict=lgb_final.predict_proba(X_test)[:,1]
lgb_final_auc=roc_auc_score(y_test,lgb_final_predict)
print("ROC AUC:",lgb_final_auc)

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

cbm=CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",
    depth=6,
    learning_rate=0.05,
    iterations=1000,
    verbose=False)

# When an eval_set is supplied, CatBoost by default keeps the iteration that
# scores best on it (use_best_model). Passing the hold-out split there would
# tune the model on the very rows used for the reported AUC, so a validation
# slice is carved out of the training data instead.
X_fit,X_val,y_fit,y_val=train_test_split(X_train,y_train,test_size=0.2,random_state=42)

cbm.fit(X_fit,y_fit,cat_features=catcols,eval_set=(X_val,y_val))
cbm_predict=cbm.predict_proba(X_test)[:,1]
cbm_auc=roc_auc_score(y_test,cbm_predict)
print("ROC AUC:",cbm_auc)

In [None]:
from sklearn.base import clone

# `fit_params=` was deprecated in scikit-learn 1.4 and removed in 1.6.
# Declaring the categorical columns on an unfitted copy of the estimator
# works on every version and leaves `cbm` itself untouched.
cbm_cv=clone(cbm)
cbm_cv.set_params(cat_features=catcols)

cbm_scores=cross_val_score(cbm_cv,X,y,cv=5,scoring="roc_auc")

print("ROC:",cbm_scores.mean(),cbm_scores.std())

In [None]:
# Discrete candidate values for each CatBoost hyperparameter.
cbmparam_dist=dict(
    depth=[3,4,5,6,7,8],
    learning_rate=[0.01,0.02,0.03,0.05,0.07,0.1],
    l2_leaf_reg=[1,3,5,7,10,15,20],
    bagging_temperature=[0.0,0.5,1.0,2.0,4.0],
    border_count=[64,128,254])

# 15 random candidates scored by 5-fold ROC AUC; n_jobs=1 runs them one at a time.
cbm_search=RandomizedSearchCV(
    cbm,
    cbmparam_dist,
    n_iter=15,
    cv=5,
    scoring="roc_auc",
    random_state=42,
    n_jobs=1)

# cat_features is forwarded to the estimator's fit call.
cbm_search.fit(X,y,cat_features=catcols)
print(cbm_search.best_score_,cbm_search.best_params_)

In [None]:
from sklearn.base import clone
from sklearn.ensemble import VotingClassifier

# VotingClassifier refits clones of its members with a plain fit(X, y), so
# the categorical columns must be declared on the CatBoost estimator itself;
# otherwise the clone is fitted without cat_features and fails on the
# string columns.
cbm_member=clone(cbm)
cbm_member.set_params(cat_features=catcols)

# `lgb_base` was never defined in this notebook (NameError on a fresh
# kernel); the baseline LightGBM pipeline is named `lgb`.
ensemble=VotingClassifier(
    estimators=[
        ("lgb",lgb),
        ("cat",cbm_member)],
    voting="soft",
    weights=[0.5,0.5])

ensemble.fit(X_train,y_train)
ensemble_prediction=ensemble.predict_proba(X_test)[:,1]
ensemble_auc=roc_auc_score(y_test,ensemble_prediction)
print("Ensemble AUC:",ensemble_auc)