# Predicting customers who have churned (stopped using their credit cards)

In [1]:
# EDA Libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.subplots as sp
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("whitegrid")

In [2]:
df = pd.read_csv('../input/credit-card-customers/BankChurners.csv')

In [3]:
# Dropping the last two columns
df = df.iloc[:, :-2]

# Checking for the number of null values present in each feature
df.isnull().sum()

CLIENTNUM                   0
Attrition_Flag              0
Customer_Age                0
Gender                      0
Dependent_count             0
Education_Level             0
Marital_Status              0
Income_Category             0
Card_Category               0
Months_on_book              0
Total_Relationship_Count    0
Months_Inactive_12_mon      0
Contacts_Count_12_mon       0
Credit_Limit                0
Total_Revolving_Bal         0
Avg_Open_To_Buy             0
Total_Amt_Chng_Q4_Q1        0
Total_Trans_Amt             0
Total_Trans_Ct              0
Total_Ct_Chng_Q4_Q1         0
Avg_Utilization_Ratio       0
dtype: int64

In [4]:
# checking for duplicated values
df.duplicated().sum()

0

In [5]:
# Changing the Attrition_Flag to 0 and 1 (Hidden Input/Output)
df["Attrition_Flag"] = df["Attrition_Flag"].map({"Existing Customer":0, "Attrited Customer":1})

# Data Explorarion

> - **`CLIENTNUM`** : Unique client identifier.
> - **`Attrition_Flag`** : Indicates whether the customer's account is active or has churned.
> - **`Customer_Age`** : Age of the customer.
> - **`Gender`** : Gender of the customer.
> - **`Dependent_count`** : Number of dependents of the customer.
> - **`Education_Level`** : Educational level of the customer.
> - **`Marital_Status`** : Marital status of the customer.
> - **`Income_Category`** : Income category of the customer.
> - **`Card_Category`** : Category of the credit card held by the customer.
> - **`Months_on_book`** : Number of months the customer has been a bank client.
> - **`Total_Relationship_Count`** : Total number of bank products held by the customer.
> - **`Months_Inactive_12_mon`** : Number of months with inactivity in the last 12 months.
> - **`Contacts_Count_12_mon`** : Number of contacts with the bank in the last 12 months.
> - **`Credit_Limit`** : Credit limit on the credit card.
> - **`Total_Revolving_Bal`** : Total revolving balance on the credit card.
> - **`Avg_Open_To_Buy`** : Average open to buy credit line on the credit card.
> - **`Total_Amt_Chng_Q4_Q1`** : Change in transaction amount over the last four quarters.
> - **`Total_Trans_Amt`** : Total transaction amount in the last 12 months.
> - **`Total_Trans_Ct`** : Total transaction count in the last 12 months.
> - **`Total_Ct_Chng_Q4_Q1`** : Change in transaction count over the last four quarters.
> - **`Avg_Utilization_Ratio`** : Average utilization ratio of the credit card.

In [6]:
fig = px.pie(df, names='Attrition_Flag', 
             title='Attrition Flag Distribution',
             color_discrete_sequence=['#ff7f0e', '#3498db'],# Setting custom color
            )

# format the layout
fig.update_layout(
    xaxis=dict(showgrid=False, zeroline=False),
    yaxis=dict(zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
)

fig.update_traces(marker=dict(line=dict(width=2, color='DarkSlateGrey')))


# Show the pie chart
fig.show()

## Exploring Categorical Features

In [7]:
fig = px.histogram(df, x='Gender', color='Attrition_Flag',
             title='Churn Rates by Gender',
             labels={'country': 'Country', 'state': 'Project State'},
             template='plotly_white', barmode='group',
             color_discrete_sequence=['#ff7f0e', '#3498db']
            )

# Customizing marker appearance
fig.update_traces(marker=dict(line=dict(width=2, color='DarkSlateGrey')))

# format the layout
fig.update_layout(
    xaxis=dict(showgrid=False, zeroline=False),
    yaxis=dict(zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
)

# Show the pl

In [8]:
fig = px.histogram(df, x='Education_Level', color='Attrition_Flag',
             title='Churn Rates by Education_Level',
             labels={'country': 'Country', 'state': 'Project State'},
             template='plotly_white', barmode='group',
             color_discrete_sequence=['#ff7f0e', '#3498db']
            )

# Customizing marker appearance
fig.update_traces(marker=dict(line=dict(width=2, color='DarkSlateGrey')))

# format the layout
fig.update_layout(
    xaxis=dict(showgrid=False, zeroline=False),
    yaxis=dict(zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
)

# Show the plot
fig.show()

In [9]:
fig = px.histogram(df, x='Marital_Status', color='Attrition_Flag',
             title='Churn Rates by Marital_Status',
             labels={'country': 'Country', 'state': 'Project State'},
             template='plotly_white', barmode='group',
             color_discrete_sequence=['#ff7f0e', '#3498db']
            )

# Customizing marker appearance
fig.update_traces(marker=dict(line=dict(width=2, color='DarkSlateGrey')))

# format the layout
fig.update_layout(
    xaxis=dict(showgrid=False, zeroline=False),
    yaxis=dict(zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
)

# Show the plot
fig.show()

In [10]:
fig = px.histogram(df, x='Income_Category', color='Attrition_Flag',
             title='Churn Rates by Income_Category',
             labels={'country': 'Country', 'state': 'Project State'},
             template='plotly_white', barmode='group',
             color_discrete_sequence=['#ff7f0e', '#3498db']
            )

# Customizing marker appearance
fig.update_traces(marker=dict(line=dict(width=2, color='DarkSlateGrey')))

# format the layout
fig.update_layout(
    xaxis=dict(showgrid=False, zeroline=False),
    yaxis=dict(zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
)

# Show the plot
fig.show()

## Exploring Numerical Features

In [11]:
# Creating a histogram using Plotly Express to visualize the relationship between age and the risk of heart attack
fig = px.histogram(df, x='Customer_Age', color='Attrition_Flag', title='Churn rates by Age',
                   labels={'age': 'Age', 'output': 'Output'}, 
                   marginal='box', barmode='group',
                   color_discrete_sequence=['#ff7f0e', '#3498db']
                 )

# Customizing marker appearance
fig.update_traces(marker=dict(line=dict(width=2, color='DarkSlateGrey')))

# format the layout
fig.update_layout(
    xaxis=dict(showgrid=False, zeroline=False),
    yaxis=dict(zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
)

fig.show()

In [12]:
fig = px.box(df, y='Avg_Utilization_Ratio', x='Attrition_Flag', 
                color='Attrition_Flag', 
                title='Relationship Between Credit Card Utilization and Churn Status',
                color_discrete_sequence=['#ff7f0e', '#3498db'],
                labels={'Avg_Utilization_Ratio': 'Credit Card Utilization Ratio', 'Attrition_Flag': 'Churn Status'})

# Format the layout
fig.update_layout(
    xaxis=dict(showgrid=False, zeroline=False),
    yaxis=dict(title='Credit Card Utilization Ratio', zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
)

fig.show()

In [13]:
fig = px.box(df, y='Credit_Limit', x='Attrition_Flag', 
                 color='Attrition_Flag', 
                 title='Correlation Between Credit Limit and Churn Status',
                 color_discrete_sequence=['#ff7f0e', '#3498db'],
                 labels={'Credit_Limit': 'Credit Limit', 'Attrition_Flag': 'Churn Status'},)

# Format the layout
fig.update_layout(
    yaxis=dict(showgrid=False, zeroline=False),
    xaxis=dict(title='Credit Limit', zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
)

fig.show()

In [14]:
fig = px.box(df, y='Contacts_Count_12_mon', x='Attrition_Flag', 
                color='Attrition_Flag', 
                title='Influence of Contacts Count on Churn Status',
                color_discrete_sequence=['#ff7f0e', '#3498db'],
                labels={'Contacts_Count_12_mon': 'Number of Contacts in Last 12 Months', 'Attrition_Flag': 'Churn Status'},
                category_orders={'Attrition_Flag': ['Existing Customer', 'Attrited Customer']})  

# Customizing marker appearance
fig.update_traces(marker=dict(line=dict(width=2, color='DarkSlateGrey')))

# Format the layout
fig.update_layout(
    yaxis=dict(title='Number of Contacts in Last 12 Months', showgrid=False, zeroline=False),
    xaxis=dict(title='Churn Status', zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233, 233, 233)',
    plot_bgcolor='rgb(233, 233, 233)',
)

fig.show()

# Data Preprocessing

In [15]:
# Data Preprocessing Libraries
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler

1. Categorial data encoding

In [16]:
one_hot_encoded  = pd.get_dummies(df, columns=['Gender', 'Marital_Status'])

In [17]:
cat_cols = ['Gender', 'Marital_Status', 'Education_Level', 'Income_Category', 'Card_Category'] 
one_hot_encoded = pd.get_dummies(df[cat_cols])

In [18]:
df = pd.concat([df, one_hot_encoded], axis=1).drop(cat_cols, axis = 1)

In [19]:
df

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,...,Income_Category_$120K +,Income_Category_$40K - $60K,Income_Category_$60K - $80K,Income_Category_$80K - $120K,Income_Category_Less than $40K,Income_Category_Unknown,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver
0,768805383,0,45,3,39,5,1,3,12691.0,777,...,False,False,True,False,False,False,True,False,False,False
1,818770008,0,49,5,44,6,1,2,8256.0,864,...,False,False,False,False,True,False,True,False,False,False
2,713982108,0,51,3,36,4,1,0,3418.0,0,...,False,False,False,True,False,False,True,False,False,False
3,769911858,0,40,4,34,3,4,1,3313.0,2517,...,False,False,False,False,True,False,True,False,False,False
4,709106358,0,40,3,21,5,1,0,4716.0,0,...,False,False,True,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10122,772366833,0,50,2,40,3,2,3,4003.0,1851,...,False,True,False,False,False,False,True,False,False,False
10123,710638233,1,41,2,25,4,2,3,4277.0,2186,...,False,True,False,False,False,False,True,False,False,False
10124,716506083,1,44,1,36,5,3,4,5409.0,0,...,False,False,False,False,True,False,True,False,False,False
10125,717406983,1,30,2,36,4,3,3,5281.0,0,...,False,True,False,False,False,False,True,False,False,False


2. Train Test Spilt

In [20]:
x = df.drop(['Attrition_Flag'], axis=1)
y = df['Attrition_Flag']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(x, 
                                                    y, 
                                                    test_size = 0.2, 
                                                    random_state = 42)

# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 8101 samples.
Testing set has 2026 samples.


3. Standardizing Continuous Features with StandardScaler

In [22]:
numerical_features = [
    'Customer_Age', 'Dependent_count', 'Months_on_book', 
    'Total_Relationship_Count', 'Months_Inactive_12_mon',
    'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
    'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
    'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio'
]

In [23]:
# Creating a StandardScaler instance
scaler = StandardScaler()

# Fitting the StandardScaler on the training data
scaler.fit(X_train[numerical_features])

# Transforming (standardize) the continuous features in the training and testing data
X_train_cont_scaled = scaler.transform(X_train[numerical_features])
X_test_cont_scaled = scaler.transform(X_test[numerical_features])

# Replacing the scaled continuous features in the original data
X_train[numerical_features] = X_train_cont_scaled
X_test[numerical_features] = X_test_cont_scaled

X_train

Unnamed: 0,CLIENTNUM,Customer_Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,...,Income_Category_$120K +,Income_Category_$40K - $60K,Income_Category_$60K - $80K,Income_Category_$80K - $120K,Income_Category_Less than $40K,Income_Category_Unknown,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver
9066,713071383,0.959061,-1.035122,0.009523,-1.812952,0.644039,0.499433,-0.540791,0.696075,-0.603671,...,False,False,False,False,False,True,True,False,False,False
5814,714246333,1.457688,1.291390,1.505062,-1.812952,1.630121,0.499433,-0.356658,0.788048,-0.427675,...,False,False,False,False,False,True,True,False,False,False
792,718206783,-0.162849,1.291390,0.009523,1.409234,-1.328125,0.499433,0.808999,0.597970,0.755842,...,False,False,False,False,True,False,False,True,False,False
1791,721096983,-1.534073,-0.259618,0.009523,0.120360,0.644039,1.408012,-0.551577,1.663634,-0.701361,...,False,False,False,False,True,False,True,False,False,False
5011,720028683,0.335778,-0.259618,0.383408,0.764797,0.644039,1.408012,-0.651182,0.938885,-0.735944,...,False,True,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,769053033,-0.287506,-1.035122,0.258779,-0.524077,-0.342043,2.316591,-0.494675,1.663634,-0.644421,...,False,True,False,False,False,False,True,False,False,False
5191,714406158,0.834404,0.515886,0.009523,0.120360,0.644039,3.225171,-0.076773,-1.422990,0.050975,...,False,False,False,False,False,True,True,False,False,False
5390,714140133,-0.536819,1.291390,-0.488990,-0.524077,-1.328125,-0.409147,-0.695867,0.474113,-0.738918,...,False,False,False,False,True,False,True,False,False,False
860,720244983,-0.786133,0.515886,-0.987503,0.120360,-1.328125,-1.317726,-0.558401,0.670323,-0.618980,...,False,True,False,False,False,False,True,False,False,False


# Models Training and Evaluating

In [24]:
# Machine Learing (classification models) Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from imblearn.pipeline import Pipeline
from sklearn.feature_selection import SequentialFeatureSelector, SelectKBest, f_regression, RFE, SelectFromModel
import lightgbm as lgb
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report, roc_curve, roc_auc_score

In [25]:
# List of classifiers to evaluate
classifiers = [
    ("Logistic Regression", LogisticRegression(random_state=42, max_iter= 1500, n_jobs=-1)),
    ("Random Forest", RandomForestClassifier(random_state=42, n_jobs =-1)),
    ("XGBoost", xgb.XGBClassifier(random_state=42, n_jobs =-1))
]

In [26]:
# Initialize RFE feature selector
RFE_selector = RFE(lgb.LGBMClassifier(random_state=42, verbose=-1), n_features_to_select=12)


# Creating lists for classifier names, mean_test_accuracy_scores, and results.
results = []
mean_test_accuracy_scores = []
classifier_names = []

for model_name, model in classifiers:
    # Print model name
    print(f"For {model_name}:")
    
    # Steps Creation
    steps = list()
    
    steps.append(('feature_selector', RFE_selector))  # RFE feature selection
    
    steps.append((model_name, model))

    # Create the pipeline
    pipeline = Pipeline(steps=steps)
                        
    # 5-fold Stratified Cross-Validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Perform cross-validation with train scores
    cv_results = cross_validate(pipeline, X_train, y_train, cv=cv, scoring='accuracy',n_jobs=-1, return_train_score=True)
    
    print(f"Cross-validation completed successfully for {model_name}")
    print('*' * 50)

    # Append results to the list
    results.append({
        "Model Name": model_name,
        "Mean Train Accuracy": np.mean(cv_results['train_score']),
        "Mean Test Accuracy": np.mean(cv_results['test_score'])
    })
    
    mean_test_accuracy_scores.append(np.mean(cv_results['test_score']))
    classifier_names.append(model_name)

# Create a DataFrame from the results list
results_df = pd.DataFrame(results)

# Display the DataFrame
display(results_df)

For Logistic Regression:
Cross-validation completed successfully for Logistic Regression
**************************************************
For Random Forest:
Cross-validation completed successfully for Random Forest
**************************************************
For XGBoost:
Cross-validation completed successfully for XGBoost
**************************************************


Unnamed: 0,Model Name,Mean Train Accuracy,Mean Test Accuracy
0,Logistic Regression,0.839526,0.839526
1,Random Forest,1.0,0.964079
2,XGBoost,1.0,0.96914


In [27]:
# Creating a DataFrame from the data
data = pd.DataFrame({'Classifier': classifier_names, 'Test Accuracy': mean_test_accuracy_scores})

# Creating Plotly bar chart
fig = px.bar(data, x='Test Accuracy', y='Classifier', orientation='h', color='Test Accuracy',
             title='Mean Test Accuracy Scores by Classifiers', text='Test Accuracy', color_continuous_scale='viridis')

# Customizing the layout
fig.update_layout(
    xaxis_title='Test Accuracy',
    yaxis_title='Classifier',
    xaxis=dict(range=[0, 1]),
    yaxis=dict(categoryorder='total ascending'),
    showlegend=False,
    height=500,
    width=900
)

fig.show()

selected features

In [28]:
# Initialize RFE feature selector
RFE_selector = RFE(lgb.LGBMClassifier(random_state=42, verbose=-1), n_features_to_select=12)

# Fit RFE selector to the training data
RFE_selector.fit(X_train, y_train)

# Get the indices of the selected features
selected_feature_indices = np.where(RFE_selector.support_)[0]

# Get the names of the selected features based on their indices
selected_feature_names = X_train.columns[selected_feature_indices]

# Print the names of the selected features
print("Selected Features:")
print(selected_feature_names)

Selected Features:
Index(['CLIENTNUM', 'Customer_Age', 'Months_on_book',
       'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Credit_Limit',
       'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1',
       'Total_Trans_Amt', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1'],
      dtype='object')


Training the Chosen Model (XGBoost Classifier)

In [29]:
# Define the pipeline with the feature selector
pipeline = Pipeline(steps=[
    ('feature_selector', RFE_selector),
    ("XGBoost", xgb.XGBClassifier(random_state=42, n_jobs =-1))
])

pipeline.fit(X_train, y_train)

# Predictions on test data
y_pred = pipeline.predict(X_test)

# Calculate F1-score
f1 = f1_score(y_test, y_pred, average='weighted')

# Printing model details
print(f'Model: XGBoost')
print(f'Training Accuracy: {accuracy_score(y_train, pipeline.predict(X_train))}')
print(f'Testing Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'F1-score: {f1}')
print('-----------------------------------------------------------')
print(f'Testing Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
print('-----------------------------------------------------------')
print(f'Testing Classification report: \n{classification_report(y_test, y_pred)}')

Model: XGBoost
Training Accuracy: 1.0
Testing Accuracy: 0.9619940769990128
F1-score: 0.9615815310652124
-----------------------------------------------------------
Testing Confusion Matrix: 
[[1669   30]
 [  47  280]]
-----------------------------------------------------------
Testing Classification report: 
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      1699
           1       0.90      0.86      0.88       327

    accuracy                           0.96      2026
   macro avg       0.94      0.92      0.93      2026
weighted avg       0.96      0.96      0.96      2026



Great, XGBoost demonstrates F1-score of 96%.

In [30]:
z=confusion_matrix(y_test, y_pred)
fig = ff.create_annotated_heatmap(z, x=['Not Churn','Churn'], y=['Predicted Not Churn','Predicted Churn'], colorscale='Fall',xgap=3,ygap=3)
fig['data'][0]['showscale'] = True
fig.update_layout(title='Prediction On Original Data With Random Forest Model Confusion Matrix')
fig.show()