# PROJECT SCOPE

The scope of this project in this code base is to build a customer churn model; here, we will be using the bank churn data, and for this model, we will be focusing on companies in the financial services sector. for this and the remaining model we will be using the BankChurners.csv file saved in the Datasets folder.

Customer churn is the percentage of customers who stopped purchasing your business's products or services during a certain period of time

# IMPORTING THE NECESSARY PACKAGES

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.inspection import permutation_importance
from imblearn.over_sampling import SMOTE
import joblib

# LOADING THE DATA

In [2]:
# Defining file paths
file_path = r'/Users/abduljalaalabubakar/Desktop/Projects/Symply Finance/Customer Insight Model/Fintech Customer Insight Model/Datasets/Bank Churn Dataset/BankChurners.csv'
model_folder_path = r'/Users/abduljalaalabubakar/Desktop/Projects/Symply Finance/Customer Insight Model/Fintech Customer Insight Model/Customer_Churn_Best_Models'
os.makedirs(model_folder_path, exist_ok=True)

# Load dataset
data = pd.read_csv(file_path)

# DATA INVESTIGATION

In [3]:
# Checking for missing values
missing_values = data.isnull().sum()

# Checking the class distribution
class_distribution = data['Attrition_Flag'].value_counts(normalize=True) * 100

# Encoding the categorical variables for correlation analysis
data_encoded = data.copy()
for column in data_encoded.select_dtypes(include=['object']).columns:
    data_encoded[column] = pd.factorize(data_encoded[column])[0]

# Correlation analysis
correlations = data_encoded.corr()['Attrition_Flag'].sort_values(ascending=False)
print("\nHigh Correlation Features:")
print(correlations.head(10))
print("\nLow Correlation Features:")
print(correlations.tail(10))

# Checking for duplicates
duplicate_rows = data.duplicated().sum()


High Correlation Features:
Attrition_Flag                                                                                                                        1.000000
Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1    0.999989
Contacts_Count_12_mon                                                                                                                 0.204491
Months_Inactive_12_mon                                                                                                                0.152449
Gender                                                                                                                                0.037272
Education_Level                                                                                                                       0.025966
Dependent_count                                                                                                   

# CLEANING AND BALANCING THE DATA

In [4]:
# Data Cleaning and Balancing
# DropPING irrelevant columns
columns_to_drop = ['CLIENTNUM', 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
                   'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2']
data = data.drop(columns=columns_to_drop, errors='ignore')

# Encoding categorical variables
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Balancing the dataset using SMOTE
X = data.drop('Attrition_Flag', axis=1)
y = data['Attrition_Flag']
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

# Checking the class distribution after balancing
balanced_class_distribution = pd.Series(y_balanced).value_counts(normalize=True) * 100
print("\nClass Distribution After Balancing:")
print(balanced_class_distribution)


Class Distribution After Balancing:
Attrition_Flag
1    50.0
0    50.0
Name: proportion, dtype: float64


# TRAINING THE DATA 

In [None]:
# Standardizing the numerical features
scaler = StandardScaler()
X_balanced = scaler.fit_transform(X_balanced)

#Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

#Model Training and Evaluation
print("\n### Model Training and Evaluation ###")
models = {
    "Logistic Regression": LogisticRegression(max_iter=300),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Neural Network": MLPClassifier(max_iter=300)
}

best_model = None
best_score = 0
results = {}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob) if y_prob is not None else None

    # Saving the metrics
    results[name] = {
        "Accuracy": accuracy,
        "AUC": auc,
        "Classification Report": classification_report(y_test, y_pred)
    }

    # Updating the best model
    if accuracy > best_score:
        best_score = accuracy
        best_model = model


### Model Training and Evaluation ###
Training Logistic Regression...
Training Random Forest...
Training Gradient Boosting...
Training Neural Network...


# SAVING THE BEST MODEL

In [None]:
# Saving the best model
best_model_path = os.path.join(model_folder_path, "best_churn_model.pkl")
joblib.dump(best_model, best_model_path)
print(f"\nBest model saved to: {best_model_path}")

# DISPLAYING THE RESULT

In [None]:
# Displaying the results
for name, metrics in results.items():
    print(f"\nModel: {name}")
    print(f"Accuracy: {metrics['Accuracy']}")
    if metrics["AUC"]:
        print(f"AUC: {metrics['AUC']}")
    print(f"Classification Report:\n{metrics['Classification Report']}")

# Feature Validation with Permutation Importance
print("\n### Feature Validation with Permutation Importance ###")
from sklearn.inspection import permutation_importance
perm_importance = permutation_importance(best_model, X_test, y_test, scoring='accuracy', n_repeats=10, random_state=42)

feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': perm_importance.importances_mean
}).sort_values(by='Importance', ascending=False)
print("\nTop Features Based on Permutation Importance:")
print(feature_importances.head())

# NEXT STEPS

Data Integration: Develop ingestion pipelines for customer data. Real-Time Feature Engineering: Create dynamic processes for feature updates. Model Deployment: Build and deploy a scalable API for churn prediction. Feedback Loop: Implement systems to capture outcomes and retrain the model. Business Alignment: Define clear interventions based on model predictions