# 03 – Modeling & Evaluation
## Bank Customer Churn

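**Objective:** Train a Logistic Regression and a Random Forest classifier to predict churn (`Exited`); evaluate both with a classification report and ROC AUC, and inspect the Random Forest with a confusion matrix and feature importances.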

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- Configuration: everything a reader might want to tune -------------------
RANDOM_STATE = 42                                    # seed shared by the split and both models
DATA_PATH = '../data/raw/Churn_Modelling.csv'
TARGET = 'Exited'
ID_COLUMNS = ['RowNumber', 'CustomerId', 'Surname']  # dropped from the feature matrix
CATEGORICAL_COLUMNS = ['Geography', 'Gender']        # one-hot encoded below
TEST_SIZE = 0.2

# --- Load & encode -----------------------------------------------------------
# The preprocessed data from 02_data_cleaning could be used instead (run that
# notebook first); this cell loads the raw CSV and repeats the preprocessing inline.
df = pd.read_csv(DATA_PATH)

# Build the feature matrix in a single expression so re-running the cell is
# idempotent: drop identifier columns and the target, then one-hot encode the
# categoricals (drop_first=True omits one level per categorical column).
X = pd.get_dummies(
    df.drop(columns=[*ID_COLUMNS, TARGET]),
    columns=CATEGORICAL_COLUMNS,
    drop_first=True,
)
y = df[TARGET]
feature_names = X.columns.tolist()

# --- Split & scale -----------------------------------------------------------
# Stratify on the target so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
)

# Fit the scaler on the training split only, then apply the same transform to
# the test split, so test-set statistics never influence the scaling.
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)

In [None]:
# Logistic Regression: fit on the standardized training features, then report
# the classification report and ROC AUC on the held-out test split.
lr = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE).fit(X_train_sc, y_train)

y_pred_lr = lr.predict(X_test_sc)
# Second predict_proba column -- presumably the churn (Exited == 1) class; confirm via lr.classes_.
lr_test_proba = lr.predict_proba(X_test_sc)[:, 1]
lr_auc = roc_auc_score(y_test, lr_test_proba)

print('--- Logistic Regression ---')
print(classification_report(y_test, y_pred_lr))
print('AUC:', round(lr_auc, 4))

In [None]:
# Random Forest – feature importance
# Trained on the same standardized matrices as the logistic regression so both
# models are scored on identical inputs.
rf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
rf.fit(X_train_sc, y_train)
y_pred_rf = rf.predict(X_test_sc)
# Second predict_proba column -- presumably the churn (Exited == 1) class; confirm via rf.classes_.
y_proba_rf = rf.predict_proba(X_test_sc)[:, 1]

# Importances labelled by feature name, largest first.
importance = pd.Series(rf.feature_importances_, index=feature_names).sort_values(ascending=False)

# A horizontal bar chart draws the first row at the bottom, so invert the
# y-axis to put the most important feature at the top of the figure.
fig, ax = plt.subplots()
importance.plot(kind='barh', ax=ax)
ax.invert_yaxis()
ax.set_title('Feature importance (Random Forest)')
ax.set_xlabel('Importance')
fig.tight_layout()
plt.show()

print('--- Random Forest ---')
print(classification_report(y_test, y_pred_rf))
print('AUC:', round(roc_auc_score(y_test, y_proba_rf), 4))

In [None]:
# Confusion matrix (Random Forest)
# Drawn as an annotated heatmap of integer counts from confusion_matrix(y_test, y_pred_rf).
cm = confusion_matrix(y_test, y_pred_rf)

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix (Random Forest)')
fig.tight_layout()
plt.show()

**Outputs for Power BI / reports:**
- Churn probability per customer (for dashboard)
- Feature importance table
- Segment-level risk (join with SQL segments)