# 02 – Data Cleaning & Feature Engineering
## Bank Customer Churn

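**Objective:** Clean the raw data (drop identifier columns), create model features (one-hot encode the categorical columns), prepare a stratified, scaled train/test split for modeling, and export a cleaned CSV for Power BI.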

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Fix one seed up front: it seeds NumPy's global RNG here and is reused as
# the random_state of the train/test split further down.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Load the raw churn dataset (relative path) and preview it.
raw_csv_path = '../data/raw/Churn_Modelling.csv'
df = pd.read_csv(raw_csv_path)
df.head()  # last expression in the cell, so the notebook renders the preview

In [None]:
# Target vector: the churn flag.
y = df['Exited']

# Predictor matrix: everything except the target and the row/customer
# identifier columns.
non_feature_cols = ['RowNumber', 'CustomerId', 'Surname', 'Exited']
X = df.drop(columns=non_feature_cols)

# One-hot encode the two categorical columns. drop_first=True omits one
# level per column so the indicators are not perfectly collinear.
categorical_cols = ['Geography', 'Gender']
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Keep the final column order as a plain list of names.
feature_names = list(X.columns)

# Hold out 20% of the rows for testing. Stratifying on y keeps the churn
# rate the same in both parts; RANDOM_STATE makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Standardise for Logistic Regression. The scaler is applied to every column
# of X (one-hot indicators included) and learns its statistics from the
# training rows only; the test rows are transformed with those statistics.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)

# Save processed data for Power BI (with Exited and key cols).
# The export keeps every original column of df and adds 0/1 indicator
# columns. Unlike X (encoded with drop_first=True), all three Geography
# levels get their own indicator here.
out = df.copy()
for country in ('France', 'Germany', 'Spain'):
    out[f'Geography_{country}'] = (out['Geography'] == country).astype(int)
out['Gender_Male'] = (out['Gender'] == 'Male').astype(int)

# to_csv does not create missing parent folders, so make sure the target
# directory exists before writing (otherwise a fresh checkout fails here).
processed_csv_path = Path('../data/processed/churn_cleaned.csv')
processed_csv_path.parent.mkdir(parents=True, exist_ok=True)
out.to_csv(processed_csv_path, index=False)

# Print the dtypes and non-null counts of the model feature matrix.
X.info()

In [None]:
# Sanity-check the split: row counts first, then the share of churned
# customers in each part (the two rates should be nearly equal because the
# split was stratified on the target).
n_train_rows = X_train.shape[0]
n_test_rows = X_test.shape[0]
print('Train:', n_train_rows, 'Test:', n_test_rows)

train_churn_rate = y_train.mean().round(4)
test_churn_rate = y_test.mean().round(4)
print('Churn rate train:', train_churn_rate, 'test:', test_churn_rate)

**Next:** `03_analysis.ipynb` – Logistic Regression, Random Forest, confusion matrix, feature importance.