In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from lightgbm import LGBMClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, cross_validate, validation_curve
from sklearn.preprocessing import LabelEncoder



# Silence FutureWarning and ConvergenceWarning messages so cell outputs stay readable.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)


# EDA
Exploratory Data Analysis (EDA) is a crucial initial step in data science projects. It refers to the practice of studying and visualizing a data set to understand its key characteristics, uncover patterns, locate outliers, and identify relationships between variables. EDA is normally carried out as a preliminary step before undertaking more formal statistical analyses or modeling.

In [None]:
# Load the raw data set; the relative path means the CSV must sit in the working directory.
df = pd.read_csv('Bank Customer Churn Prediction.csv')
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of distinct values per column (used to judge which columns are categorical).
df.nunique()

In [None]:
# Missing values per column.
df.isnull().sum()

In [None]:
def grab_col_names(dataframe, cat_th=10, car_th=30):
    """Split the columns of ``dataframe`` into categorical / numerical groups.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified.
    cat_th : int, default 10
        A non-object column with fewer than ``cat_th`` distinct values is
        treated as categorical ("numeric but categorical").
    car_th : int, default 30
        An object column with more than ``car_th`` distinct values is treated
        as cardinal and removed from the categorical list.

    Returns
    -------
    cat_cols : list
        Object columns plus numeric-but-categorical columns, minus cardinal ones.
    num_cols : list
        Columns that are neither object nor ``datetime64[ns]`` and are not
        numeric-but-categorical.
    cat_but_car : list
        Object columns with more than ``car_th`` distinct values.
    binary_cols : list
        Columns with exactly two distinct values whose dtype is neither an
        integer nor a float dtype.

    A short summary of the group sizes is printed as a side effect.
    """
    columns = list(dataframe.columns)
    # nunique() scans the whole column, so compute it once per column.
    n_unique = {col: dataframe[col].nunique() for col in columns}
    is_object = {col: dataframe[col].dtype == "O" for col in columns}

    num_but_cat = [col for col in columns if n_unique[col] < cat_th and not is_object[col]]
    cat_but_car = [col for col in columns if n_unique[col] > car_th and is_object[col]]
    cat_cols = [col for col in columns if is_object[col]] + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    # num_cols
    num_cols = [col for col in columns
                if dataframe[col].dtype not in ["O", 'datetime64[ns]'] and col not in num_but_cat]

    # Comparing a dtype against the builtins ``int``/``float`` only matches the
    # platform-default widths (e.g. int32 or float32 columns slip through), so
    # test the dtype family instead.
    binary_cols = [col for col in columns
                   if n_unique[col] == 2
                   and not pd.api.types.is_integer_dtype(dataframe[col])
                   and not pd.api.types.is_float_dtype(dataframe[col])]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    print(f'binary_cols: {len(binary_cols)}')

    return cat_cols, num_cols, cat_but_car, binary_cols

# Only the first three groups are used below; the fourth (binary_cols) stays in `result`.
result = grab_col_names(df)
cat_cols, num_cols, cat_but_car = result[:3]

In [None]:
def cat_summary(dataframe, col_names):
    """Print the value counts and percentage share of every column in ``col_names``.

    Each column's table is followed by a separator line of ``#`` characters.
    """
    n_rows = len(dataframe)
    for name in col_names:
        counts = dataframe[name].value_counts()
        table = pd.DataFrame({name: counts,
                              "Ratio": 100 * counts / n_rows})
        print(table)
        print("##########################################")
# Frequency table for every categorical column found by grab_col_names.
cat_summary(df, cat_cols)

In [None]:
def target_summary_with_cat(dataframe, target, categorical_col):
    """Print the mean of ``target`` for each level of ``categorical_col``.

    The table has a single ``TARGET_MEAN`` column and is followed by blank lines.
    """
    target_means = dataframe.groupby(categorical_col)[target].mean()
    summary = pd.DataFrame({"TARGET_MEAN": target_means})
    print(summary, end="\n\n\n")

# Mean of the 'churn' column for each level of every categorical column.
for col in cat_cols:
    target_summary_with_cat(df, 'churn', col)

In [None]:
def correlation_matrix(df, cols):
    """Show an annotated heatmap of the pairwise correlations between ``cols``.

    Draws on the current matplotlib figure (resized to 10x8 inches) and then
    calls a blocking ``plt.show``.
    """
    current_fig = plt.gcf()
    current_fig.set_size_inches(10, 8)
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=10)
    corr = df[cols].corr()
    heatmap_style = dict(annot=True, linewidths=0.5, annot_kws={'size': 12},
                         linecolor='w', cmap='RdBu')
    sns.heatmap(corr, **heatmap_style)
    plt.show(block=True)
    
# Correlation heatmap restricted to the numerical columns.
correlation_matrix(df, num_cols)

In [None]:
def rare_analyser(dataframe, target, cat_cols):
    """Print, per categorical column, each level's count, share and target mean.

    For every column in ``cat_cols`` a header line with the number of levels
    is printed, followed by a table with ``COUNT``, ``RATIO`` (count divided by
    the number of rows) and ``TARGET_MEAN`` (mean of ``target`` within the level).
    """
    total_rows = len(dataframe)
    for name in cat_cols:
        level_counts = dataframe[name].value_counts()
        print(name, ":", len(level_counts))
        report = pd.DataFrame({"COUNT": level_counts,
                               "RATIO": level_counts / total_rows,
                               "TARGET_MEAN": dataframe.groupby(name)[target].mean()})
        print(report, end="\n\n\n")

# Level counts, shares and mean 'churn' for every categorical column.
rare_analyser(df, 'churn', cat_cols=cat_cols)

In [None]:
# One box plot per continuous feature to inspect spread and outliers.
boxplot_cols = ['age', 'tenure', 'estimated_salary', 'balance', 'credit_score']

fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
for axis, col in zip(ax.flat, boxplot_cols):
    sns.boxplot(data=df[col], ax=axis)
    axis.set_title(col)

# The 2x3 grid has one more slot than there are features; drop the empty panel.
for axis in ax.flat[len(boxplot_cols):]:
    fig.delaxes(axis)

fig.tight_layout()
plt.show()

In [None]:
def plot_numerical_col(dataframe, num_cols, plot_type='hist'):
    """Plot the distribution of every column in ``num_cols`` on subplot grids.

    Columns are drawn three per row and at most twelve per figure; additional
    columns go to further figures. Each figure is shown with ``plt.show()``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source of the plotted columns.
    num_cols : list
        Names of the columns to plot.
    plot_type : {'hist', 'kde', 'box'}, default 'hist'
        Kind of seaborn plot to draw. Any other value prints a message and
        returns without plotting.

    Returns
    -------
    None
    """
    if plot_type not in ('hist', 'kde', 'box'):
        # Validate before creating any figure so a typo does not leave an
        # empty grid behind. Message (Turkish): "Invalid plot type. Please
        # specify 'hist', 'kde', or 'box'."
        print("Geçersiz grafik türü. Lütfen 'hist', 'kde', veya 'box' olarak belirtin.")
        return
    if len(num_cols) == 0:
        return

    per_row, per_figure = 3, 12
    draw = {'hist': sns.histplot, 'kde': sns.kdeplot, 'box': sns.boxplot}[plot_type]

    for start in range(0, len(num_cols), per_figure):
        group = num_cols[start:start + per_figure]
        # Size the grid for this group only (ceiling division); sizing it from
        # the total column count produces mostly empty figures once there are
        # more than twelve columns.
        n_rows = -(-len(group) // per_row)
        fig, axes = plt.subplots(n_rows, per_row, figsize=(10, 10), squeeze=False)
        axes = axes.flatten()

        for axis, col in zip(axes, group):
            draw(data=dataframe[col], ax=axis)
            axis.set_xlabel(col)

        # Remove the unused panels of the last, partially filled row.
        for axis in axes[len(group):]:
            fig.delaxes(axis)

        plt.tight_layout()
        plt.show()
        
# Histograms (the default plot type) of all numerical columns.
plot_numerical_col(df, num_cols)

In [None]:
def plot_categoric_col(dataframe, cat_cols):
    """Draw a count plot for every column in ``cat_cols``, three per row.

    Bars are ordered by descending frequency. Nothing is drawn when
    ``cat_cols`` is empty.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source of the plotted columns.
    cat_cols : list
        Names of the categorical columns to plot.

    Returns
    -------
    None
    """
    n_plots = len(cat_cols)
    if n_plots == 0:
        # plt.subplots rejects a grid with zero rows; there is nothing to draw.
        return

    per_row = 3
    # Ceiling division: add one extra row when the count is not a multiple of three.
    n_rows = -(-n_plots // per_row)

    fig, axes = plt.subplots(n_rows, per_row, figsize=(10, 10), squeeze=False)
    axes = axes.flatten()

    for axis, col in zip(axes, cat_cols):
        sns.countplot(data=dataframe, x=col, ax=axis, order=dataframe[col].value_counts().index)
        axis.set_xlabel(col)

    # Remove the unused panels of the last row, as plot_numerical_col does.
    for axis in axes[n_plots:]:
        fig.delaxes(axis)

    plt.tight_layout()
    plt.show()

# Count plots of all categorical columns.
plot_categoric_col(df, cat_cols=cat_cols)

In [None]:
# Class balance of the target: rows with churn == 1 versus churn == 0.
sizes = [(df['churn'] == flag).sum() for flag in (1, 0)]
labels = ['Churned', 'Not Churned']
colors = ['red', 'orange']

plt.pie(sizes, labels=labels, colors=colors, autopct='%.2f%%')
plt.legend(loc='upper left')
plt.title("Churned VS Not Churned", size=10)
plt.show()

# Feature Engineering

In [None]:
# Recode products_number 4 as 3, merging the two levels.
# NOTE(review): presumably because level 4 is rare -- confirm against the rare_analyser output.
df.loc[df['products_number'] == 4, 'products_number'] = 3


# Segment the continuous features into labelled bins. pd.cut intervals are
# right-closed by default, e.g. credit scores in (349, 500] get label 'A'.
df['credit_score_seg'] = pd.cut(df['credit_score'], bins=[349, 500, 590, 620, 660, 690, 720, np.inf],
                                labels=['A', 'B', 'C', 'D', 'E', 'F', 'G'])

# The lower edge is -1 so that a balance of exactly 0 falls into the first bin.
df['balance_seg'] = pd.cut(df['balance'], bins=[-1, 50000, 90000, 127000, np.inf],
                           labels=['A', 'B', 'C', 'D'])

# Ages of 17 or below fall outside the first bin and would become NaN.
df['age_seg'] = pd.cut(df['age'], bins=[17, 36, 55, np.inf],
                       labels=['A', 'B', 'C'])

# The lower edge is -1 so that a tenure of exactly 0 falls into the first bin.
df['tenure_seg'] = pd.cut(df['tenure'], bins=[-1, 3, 5, 7, np.inf],
                          labels=['A', 'B', 'C', 'D'])

# Modeling frame without the customer_id column; `df` itself keeps it.
df_final = df.drop('customer_id', axis=1)

In [None]:
def one_hot_encoder(dataframe, categorical_cols, drop_first=True):
    """Return a copy of ``dataframe`` with ``categorical_cols`` one-hot encoded.

    The dummy columns are integer typed; with ``drop_first=True`` the first
    level of every encoded column is dropped.
    """
    return pd.get_dummies(
        dataframe,
        columns=categorical_cols,
        drop_first=drop_first,
        dtype=int,
    )

def label_encoder(dataframe, binary_col, info=False):
    """Replace ``dataframe[binary_col]`` with integer codes from a fresh LabelEncoder.

    The column is overwritten in place and the same frame is returned. With
    ``info=True`` the original values behind codes 0 and 1 are printed.
    """
    encoder = LabelEncoder()
    codes = encoder.fit_transform(dataframe[binary_col])
    dataframe[binary_col] = codes

    if not info:
        return dataframe

    first, second = encoder.inverse_transform([0, 1])
    print(f'{binary_col}\n0:{first}, 1:{second}')
    return dataframe

In [None]:
# One-hot encode the nominal features (first level dropped).
df_final = one_hot_encoder(df_final, ['country', 'gender','age_seg'], drop_first=True)

# label_encoder overwrites the column in place, so the returned frame is not reassigned.
# NOTE(review): the segment labels are alphabetical, so the integer codes should follow bin order -- confirm.
label_encoder(df_final, 'credit_score_seg')
label_encoder(df_final, 'balance_seg')
label_encoder(df_final, 'tenure_seg')

# Modeling

In [None]:
# Feature matrix (everything except the target) and the 'churn' target vector.
X = df_final.drop(columns='churn')
y = df_final['churn']

# No Smote:

In [None]:
# Baseline: LightGBM on the original data, scored with 3-fold cross-validation.
model = LGBMClassifier(verbose=-1)
model.fit(X, y)

cv_results = cross_validate(model, X, y, cv=3, scoring=["accuracy", "f1", "roc_auc"])
f1, auc, accuracy = (cv_results[f'test_{metric}'].mean()
                     for metric in ('f1', 'roc_auc', 'accuracy'))

print(f'f1: {f1:.2f}\nauc: {auc:.2f}\naccuracy: {accuracy:.2f}')

Hyperparameter Optimization

In [None]:
# Small grid over the two most influential boosting parameters.
lightgbm_params = {"learning_rate": [0.01, 0.1],
                   "n_estimators": [300, 500]}

# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score (accuracy for classifiers); consider a metric
# suited to the class balance, e.g. scoring="roc_auc".
gs_best = GridSearchCV(model, lightgbm_params, cv=3, n_jobs=-1, verbose=False).fit(X, y)

# set_params only updates the hyperparameters; without a refit the fitted
# booster (and its feature importances) would still be the one trained with
# the default parameters.
final_model = model.set_params(**gs_best.best_params_).fit(X, y)

cv_results = cross_validate(final_model, X, y, cv=3, scoring=['accuracy', 'f1', 'roc_auc'])
f1 = cv_results['test_f1'].mean()
auc = cv_results['test_roc_auc'].mean()
accuracy = cv_results['test_accuracy'].mean()

print(f'f1: {f1:.2f}')
print(f'auc: {auc:.2f}')
print(f'accuracy: {accuracy:.2f}')

# FEATURE IMPORTANCE

In [None]:
def plot_importance(model, features, num=None, save=False):
    """Plot the feature importances of a fitted model as a horizontal bar chart.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``feature_importances_``.
    features : pandas.DataFrame
        Frame whose columns name the features, in the order the model saw them.
    num : int or None, default None
        Number of top features to show; ``None`` shows all of them.
    save : bool, default False
        When true, the figure is also written to ``importances.png``.

    Returns
    -------
    None
    """
    feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': features.columns})
    top_features = feature_imp.sort_values(by="Value", ascending=False)[0:num]

    plt.figure(figsize=(10, 10))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature", data=top_features)
    plt.title('Features')
    plt.tight_layout()
    # Save before showing: once plt.show() has displayed the figure, a later
    # savefig typically writes an empty image.
    if save:
        plt.savefig('importances.png')
    plt.show()


# Importances of `model`, labelled with the columns of X.
plot_importance(model, X)

# With Smote:

In [None]:
# Fixed seed so the synthetic minority samples are reproducible across runs.
oversample = SMOTE(random_state=42)
X_smote, y_smote = oversample.fit_resample(X, y)

# Model fitted on the fully resampled data.
model = LGBMClassifier(verbose=-1).fit(X_smote, y_smote)

# Score with SMOTE *inside* the cross-validation loop. Resampling before the
# split lets synthetic points interpolated from validation rows end up in the
# training folds, which inflates every metric. In the pipeline each training
# fold is resampled on its own and each validation fold stays untouched.
smote_pipeline = ImbPipeline([("smote", SMOTE(random_state=42)),
                              ("clf", LGBMClassifier(verbose=-1))])
cv_results = cross_validate(smote_pipeline, X, y, cv=3, scoring=["accuracy", "f1", "roc_auc"])
f1 = cv_results['test_f1'].mean()
auc = cv_results['test_roc_auc'].mean()
accuracy = cv_results['test_accuracy'].mean()
print(f'f1: {f1:.2f}')
print(f'auc: {auc:.2f}')
print(f'accuracy: {accuracy:.2f}')


Hyperparameter Optimization

In [None]:
lightgbm_params = {"learning_rate": [0.01, 0.1],
                   "n_estimators": [300, 500]}

# Tune with SMOTE inside the cross-validation loop: searching on data that was
# resampled up front leaks synthetic neighbours of the validation rows into
# the training folds and biases both the selection and the reported scores.
smote_pipeline = ImbPipeline([("smote", SMOTE(random_state=42)),
                              ("clf", LGBMClassifier(verbose=-1))])
# Pipeline steps are addressed as "<step>__<param>" in the grid.
pipeline_grid = {f"clf__{name}": values for name, values in lightgbm_params.items()}
gs_best = GridSearchCV(smote_pipeline, pipeline_grid, cv=3, n_jobs=-1, verbose=False).fit(X, y)

# Strip the step prefix to get plain LGBMClassifier parameters, then refit:
# set_params alone would leave the previously trained booster in place.
best_params = {name.split("__", 1)[1]: value for name, value in gs_best.best_params_.items()}
final_model = model.set_params(**best_params).fit(X_smote, y_smote)

# cross_validate clones the estimator, so the tuned pipeline is refit per fold.
cv_results = cross_validate(gs_best.best_estimator_, X, y, cv=3, scoring=['accuracy', 'f1', 'roc_auc'])
f1 = cv_results['test_f1'].mean()
auc = cv_results['test_roc_auc'].mean()
accuracy = cv_results['test_accuracy'].mean()

print(f'f1: {f1:.2f}')
print(f'auc: {auc:.2f}')
print(f'accuracy: {accuracy:.2f}')

Feature Importance

In [None]:
def plot_importance(model, features, num=None, save=False):
    """Plot the feature importances of a fitted model as a horizontal bar chart.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``feature_importances_``.
    features : pandas.DataFrame
        Frame whose columns name the features, in the order the model saw them.
    num : int or None, default None
        Number of top features to show; ``None`` shows all of them.
    save : bool, default False
        When true, the figure is also written to ``importances.png``.

    Returns
    -------
    None
    """
    feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': features.columns})
    top_features = feature_imp.sort_values(by="Value", ascending=False)[0:num]

    plt.figure(figsize=(10, 10))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature", data=top_features)
    plt.title('Features')
    plt.tight_layout()
    # Save before showing: once plt.show() has displayed the figure, a later
    # savefig typically writes an empty image.
    if save:
        plt.savefig('importances.png')
    plt.show()


# Importances of `model`, labelled with the columns of X_smote.
plot_importance(model, X_smote)