In [1]:
%pip install seaborn

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Matplotlib is building the font cache; this may take a moment.


In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.pipeline import Pipeline

In [4]:
import joblib

In [7]:
# Step 2: Load Data
df = pd.read_csv('Bank-Customer-Churn.csv')
print("Data loaded successfully")
print(df.head())

Data loaded successfully
   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  Complain  Satisfaction Score Card Type  \
0        101348.88       1         1                   2   DIAM

In [10]:
# Step 3: Preprocessing
# Drop unnecessary columns
X = df.drop(['CustomerId', 'Surname', 'Exited'], axis=1)
y = df['Exited']

In [15]:
# Encode 'Gender' using LabelEncoder (binary categorical)
le = LabelEncoder()
if 'Gender' in X.columns:
    X['Gender'] = le.fit_transform(X['Gender'])  # Male=1, Female=0

In [16]:
# One-hot encode 'Geography' only if it exists
if 'Geography' in X.columns:
    X = pd.get_dummies(X, columns=['Geography'], drop_first=True)

In [18]:
X = X.select_dtypes(include=[np.number])

In [19]:
# Confirm the features are numeric after preprocessing
print("\n✅ Columns after one-hot encoding:")
print(X.columns)
print("\n✅ Data types after one-hot encoding:")
print(X.dtypes)


✅ Columns after one-hot encoding:
Index(['RowNumber', 'CreditScore', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Complain', 'Satisfaction Score', 'Point Earned'],
      dtype='object')

✅ Data types after one-hot encoding:
RowNumber               int64
CreditScore             int64
Gender                  int32
Age                     int64
Tenure                  int64
Balance               float64
NumOfProducts           int64
HasCrCard               int64
IsActiveMember          int64
EstimatedSalary       float64
Complain                int64
Satisfaction Score      int64
Point Earned            int64
dtype: object


In [20]:
# Step 4: Split the data into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
# Step 5: Build the ML pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(random_state=42))
])

In [23]:
# Step 6: Hyperparameter tuning using GridSearchCV
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [5, 10, None]
}

grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='roc_auc')
grid.fit(X_train, y_train)

In [24]:
# Step 7: Evaluate the model
y_pred = grid.predict(X_test)
y_proba = grid.predict_proba(X_test)[:, 1]

print("\n📊 Classification Report:")
print(classification_report(y_test, y_pred))
print("\n🧱 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\n🏆 ROC AUC Score:", roc_auc_score(y_test, y_proba))


📊 Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1607
           1       1.00      1.00      1.00       393

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000


🧱 Confusion Matrix:
[[1606    1]
 [   1  392]]

🏆 ROC AUC Score: 0.9993413041860436


In [26]:
# Step 8: Save the trained pipeline
joblib.dump(grid.best_estimator_, 'churn_model.pkl')
print("✅ Model pipeline saved to churn_model.pkl")

✅ Model pipeline saved to churn_model.pkl


In [27]:
import joblib
import numpy as np

# Load the trained model
model = joblib.load('churn_model.pkl')
print("✅ Model loaded successfully!")

# Example: Predict on one row from the test set
sample = X_test.iloc[0:1]
prediction = model.predict(sample)
probability = model.predict_proba(sample)

print("Prediction (0 = stay, 1 = churn):", prediction[0])
print("Churn probability:", round(probability[0][1], 4))


✅ Model loaded successfully!
Prediction (0 = stay, 1 = churn): 0
Churn probability: 0.0082
