# Customer Churn Prediction with Neural Network (No sklearn)

This notebook trains a neural network **from scratch** on the `Churn_Modelling.csv` dataset, following the style of your intro neural network notebook (manual forward & backprop, no sklearn).

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Apply the "seaborn-v0_8" style sheet to every figure drawn after this point.
# NOTE(review): this style name depends on the installed Matplotlib version -
# confirm it is available in the target environment.
plt.style.use("seaborn-v0_8")

## 1. Load & Inspect Data

In [None]:
# The path is relative to the working directory, so the CSV has to sit in the
# same folder as this notebook.
CSV_PATH = "Churn_Modelling.csv"

df = pd.read_csv(CSV_PATH)

# Sanity check: (rows, columns), followed by a preview of the first rows.
print("Shape:", df.shape)
df.head()

## 2. Basic Exploration

In [None]:
# DataFrame.info() writes its summary directly to stdout and returns None, so
# it is called bare: wrapping it in print() would add a stray "None" line.
df.info()

# Class balance of the target column, as raw counts and as proportions.
print("\nExited value counts:")
print(df["Exited"].value_counts())
print("\nExited value counts (normalized):")
print(df["Exited"].value_counts(normalize=True))

# Summary statistics restricted to four of the numeric columns.
df.describe()[["CreditScore", "Age", "Balance", "EstimatedSalary"]]

## 3. Preprocessing (No sklearn)

- Drop ID-like columns
- Encode `Gender`
- One-hot encode `Geography`
- Standardize features
- Train/test split

In [None]:
# 1. Remove the ID-like columns.
ID_COLUMNS = ["RowNumber", "CustomerId", "Surname"]
data = df.drop(columns=ID_COLUMNS)

# 2. The label vector is the "Exited" column.
y = data["Exited"].to_numpy()

# 4. Indicator columns for Geography (pandas.get_dummies, still no sklearn).
geo_dummies = pd.get_dummies(data["Geography"], prefix="Geo")

# 3 + 4. Feature frame: every remaining column, with Gender mapped to 0/1
#        (1 where the value is "Male") and the Geography indicators appended
#        on the right in place of the original Geography column.
X = pd.concat(
    [
        data.drop(columns=["Exited", "Geography"]).assign(
            Gender=lambda frame: (frame["Gender"] == "Male").astype(int)
        ),
        geo_dummies,
    ],
    axis=1,
)

print("Features:", X.columns.tolist())
print("X shape before scaling:", X.shape)

In [None]:
# 5. Convert to a float NumPy array.
# np.asarray accepts both the DataFrame built in the previous cell and an
# ndarray, so this cell can be re-run safely (an ndarray has no .to_numpy(),
# which made the second execution of this cell fail).
X = np.asarray(X, dtype=float)

# 6. Standardize features (column-wise z-score).
# NOTE: the mean/std below are computed over ALL rows, including the rows
# that the later train/test split assigns to the test set.
X_mean = X.mean(axis=0, keepdims=True)
X_std = X.std(axis=0, keepdims=True)
X_std[X_std == 0] = 1.0  # constant columns: avoid dividing by zero

X_scaled = (X - X_mean) / X_std

# Sanity check on the first five columns: expect values near 0 and 1.
print("Mean (approx):", X_scaled.mean(axis=0)[:5])
print("Std (approx):", X_scaled.std(axis=0)[:5])

In [None]:
# 7. Train-test split (80/20) using numpy only
# The raw (unscaled) feature matrix is split so that the scaler can be fitted
# on the training rows alone. np.asarray is a no-op if X is already a float
# ndarray.
features = np.asarray(X, dtype=float)

N = features.shape[0]
rng = np.random.RandomState(0)  # fixed seed -> the same split on every run
idx = rng.permutation(N)

train_size = int(0.8 * N)
train_idx = idx[:train_size]
test_idx = idx[train_size:]

# 8. Standardize with statistics fitted on the TRAINING rows only.
# Slicing X_scaled (scaled with the mean/std of all rows) would let
# information about the test rows leak into preprocessing. Instead, the test
# set is transformed with the training mean/std, as unseen data would be.
train_mean = features[train_idx].mean(axis=0, keepdims=True)
train_std = features[train_idx].std(axis=0, keepdims=True)
train_std[train_std == 0] = 1.0  # constant columns: avoid dividing by zero

X_train = (features[train_idx] - train_mean) / train_std
y_train = y[train_idx]

X_test = (features[test_idx] - train_mean) / train_std
y_test = y[test_idx]

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

## 4. Helper Functions (Activations, Loss, Metrics)

In [None]:
def ReLU(H):
    """Rectified linear unit: element-wise maximum of ``H`` and zero."""
    floor = 0
    return np.maximum(floor, H)

def sigmoid(H):
    """Logistic function 1 / (1 + exp(-H)), evaluated without overflow.

    The naive formula computes exp(-H), which overflows (RuntimeWarning) for
    large negative H. Here only exp(-|H|) is evaluated, which always lies in
    (0, 1], and the algebraically equivalent branch is chosen per element:

        H >= 0:  1 / (1 + exp(-H))
        H <  0:  exp(H) / (1 + exp(H))

    H may be a scalar or a NumPy array; the result has the same shape.
    """
    e = np.exp(-abs(H))  # never overflows
    nonneg = H >= 0
    # Numerator is 1 where H >= 0 and e elsewhere (boolean arithmetic keeps
    # this working for scalars and arrays alike).
    numerator = nonneg + (1 - nonneg) * e
    return numerator / (1 + e)

def softmax(H):
    """Row-wise softmax of a 2-D array of scores.

    Each row has its maximum subtracted before exponentiation, which leaves
    the result unchanged but keeps the exponentials from overflowing. The
    returned array has the same shape as ``H`` and every row sums to 1.
    """
    row_max = np.max(H, axis=1, keepdims=True)
    exp_shifted = np.exp(H - row_max)
    normaliser = np.sum(exp_shifted, axis=1, keepdims=True)
    return exp_shifted / normaliser

def one_hot(y, K):
    """One-hot encode integer labels.

    y: sequence of integer class indices.
    K: number of classes (width of the encoding).

    Returns a float array of shape (len(y), K) with a single 1 per row, in
    the column given by the corresponding label.
    """
    n_samples = len(y)
    encoded = np.zeros((n_samples, K))
    row_positions = np.arange(n_samples)
    encoded[row_positions, y] = 1
    return encoded

def cross_entropy(Y, P_hat):
    """Mean cross-entropy between one-hot targets ``Y`` and probabilities ``P_hat``.

    Both arguments are 2-D arrays of the same shape (one row per sample).
    A small constant is added inside the logarithm so that a predicted
    probability of exactly 0 does not produce -inf.
    """
    eps = 1e-9
    log_probs = np.log(P_hat + eps)
    per_sample = np.sum(Y * log_probs, axis=1)
    return -np.mean(per_sample)

def accuracy(y, y_hat):
    """Fraction of positions at which ``y`` and ``y_hat`` are equal."""
    hits = (y == y_hat)
    return np.mean(hits)

## 5. General ANN Class (Manual Forward & Backprop)

The `ANN` class below follows the same style as your neural net notebook: a softmax output layer, cross-entropy loss, and ReLU hidden layers by default.

In [None]:
class ANN:
    """Fully connected feed-forward classifier trained by full-batch gradient descent.

    Hidden layers use the configured activations (ReLU by default), the output
    layer is a softmax and the training loss is cross-entropy. The class
    relies on the module-level helpers ``ReLU``, ``sigmoid``, ``softmax``,
    ``one_hot`` and ``cross_entropy``.

    Attributes populated by ``fit``:
        W, B          dicts mapping layer index 1..L to weight matrix / bias vector
        L             number of weight layers (hidden layers + output layer)
        loss_history  training loss, recorded once per epoch
    """

    def __init__(self, architecture, activations=None):
        """
        architecture: list of hidden layer sizes, e.g. [16, 8]
        activations: list of activation functions for each hidden layer
                     if None -> all ReLU
        """
        # Copied into a list so that tuples are accepted as well.
        self.architecture = list(architecture)
        self.activations = activations
        self.W = {}
        self.B = {}
        self.loss_history = []

    def _init_params(self, D, K):
        """Create weights and biases for D inputs and K output classes.

        Weights are drawn as 0.1 * N(0, 1) from NumPy's global random state;
        biases start at zero.

        Raises ValueError if fewer activations than hidden layers were given.
        """
        layers = [D] + self.architecture + [K]
        self.L = len(layers) - 1
        n_hidden = self.L - 1

        if self.activations is None:
            self.activations = [ReLU] * n_hidden
        elif len(self.activations) < n_hidden:
            # Fail early with a clear message instead of an IndexError in the
            # middle of the first forward pass.
            raise ValueError(
                f"Expected {n_hidden} hidden activations, got {len(self.activations)}"
            )

        # Start from empty dicts so that nothing survives from an earlier fit.
        self.W = {}
        self.B = {}
        for l in range(1, self.L + 1):
            fan_in, fan_out = layers[l - 1], layers[l]
            self.W[l] = 0.1 * np.random.randn(fan_in, fan_out)
            self.B[l] = np.zeros(fan_out)

    def _forward(self, X):
        """Run a forward pass and return every layer's output.

        Returns a dict Z where Z[0] is the input X, Z[l] is the activation of
        layer l, and Z[self.L] holds the softmax class probabilities.
        """
        Z = {0: X}
        for l in range(1, self.L + 1):
            H = Z[l - 1] @ self.W[l] + self.B[l]
            if l < self.L:
                Z[l] = self.activations[l - 1](H)
            else:
                Z[l] = softmax(H)
        return Z

    @staticmethod
    def _activation_grad(activation, A):
        """Derivative of a hidden activation, expressed through its output ``A``.

        ReLU, np.tanh and sigmoid are recognised. Any other activation is
        treated as having derivative 1 (i.e. as linear), so custom nonlinear
        activations are NOT differentiated correctly.
        """
        if activation is ReLU:
            return A > 0
        if activation is np.tanh:
            return 1 - A ** 2
        if activation is sigmoid:
            return A * (1 - A)
        return 1.0

    def fit(self, X, y, eta=1e-2, epochs=2000, print_every=200, seed=0):
        """Train with full-batch gradient descent on the cross-entropy loss.

        X: (N, D) float array of features.
        y: length-N array of integer class labels; they are expected to be
           exactly 0..K-1, because K is taken as the number of distinct labels.
        eta: learning rate.
        epochs: number of full passes over the data.
        print_every: print the loss every this many epochs; 0 or None
                     disables printing.
        seed: value passed to np.random.seed before weight initialisation
              (default 0, matching the previous hard-coded behavior);
              None leaves the global random state untouched.

        Returns self. Parameters are re-initialised on every call.
        """
        if seed is not None:
            np.random.seed(seed)
        N, D = X.shape
        K = len(np.unique(y))

        Y = one_hot(y, K)
        self._init_params(D, K)
        self.loss_history = []

        for epoch in range(epochs):
            Z = self._forward(X)
            P_hat = Z[self.L]

            loss = cross_entropy(Y, P_hat)
            self.loss_history.append(loss)

            # Gradient of the mean cross-entropy w.r.t. the softmax inputs.
            dH = (P_hat - Y) / N

            for l in range(self.L, 0, -1):
                dW = Z[l - 1].T @ dH
                dB = np.sum(dH, axis=0)

                # Back-propagate through W[l] BEFORE it is updated: the
                # gradient for the layer below must use the weights that
                # produced this forward pass.
                dH_below = None
                if l > 1:
                    dA_prev = dH @ self.W[l].T
                    dH_below = dA_prev * self._activation_grad(
                        self.activations[l - 2], Z[l - 1]
                    )

                self.W[l] -= eta * dW
                self.B[l] -= eta * dB

                if dH_below is not None:
                    dH = dH_below

            # `print_every and ...` guards against modulo-by-zero.
            if print_every and (epoch + 1) % print_every == 0:
                print(f"Epoch {epoch+1}/{epochs} - Loss: {loss:.4f}")

        return self

    def predict_proba(self, X):
        """Return the (N, K) array of softmax class probabilities for X."""
        return self._forward(X)[self.L]

    def predict(self, X):
        """Return the index of the most probable class for each row of X."""
        P_hat = self.predict_proba(X)
        return np.argmax(P_hat, axis=1)

## 6. Train the ANN on Churn Data

In [None]:
# Hyperparameters for this run, collected in one place.
HIDDEN_LAYERS = [16, 8]
LEARNING_RATE = 1e-2
N_EPOCHS = 2000
LOG_EVERY = 200

np.random.seed(0)

# One ReLU per hidden layer.
ann = ANN(architecture=HIDDEN_LAYERS, activations=[ReLU] * len(HIDDEN_LAYERS))

ann.fit(X_train, y_train, eta=LEARNING_RATE, epochs=N_EPOCHS, print_every=LOG_EVERY)

## 7. Training Loss Curve

In [None]:
# One value per epoch was appended to ann.loss_history during fit().
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(ann.loss_history)
ax.set_xlabel("Epoch")
ax.set_ylabel("Training Loss (Cross-Entropy)")
ax.set_title("Training Loss Curve")
ax.grid(True)
plt.show()

## 8. Evaluation on Train & Test Sets

In [None]:
# Hard class predictions (argmax of the softmax output) for both splits.
y_train_hat, y_test_hat = ann.predict(X_train), ann.predict(X_test)

# Share of correctly classified rows in each split.
train_acc, test_acc = accuracy(y_train, y_train_hat), accuracy(y_test, y_test_hat)

print(f"Training Accuracy: {train_acc:.4f}")
print(f"Test Accuracy:     {test_acc:.4f}")

## 9. Confusion Matrix (Test Set, Manual Implementation)

In [None]:
def confusion_matrix(y_true, y_pred):
    """Count (true class, predicted class) pairs.

    y_true, y_pred: sequences of non-negative integer class labels.

    Returns a (K, K) integer array ``cm`` where ``cm[i, j]`` is the number of
    samples whose true class is ``i`` and predicted class is ``j``.

    K is large enough for every label that occurs in EITHER argument, so a
    class that is predicted but absent from ``y_true`` (or a gap in the label
    values) no longer causes an IndexError.
    """
    true_labels = np.unique(y_true)
    pred_labels = np.unique(y_pred)

    K = len(true_labels)
    for labels in (true_labels, pred_labels):
        if len(labels) > 0:
            # np.unique returns sorted values, so the last one is the maximum.
            K = max(K, int(labels[-1]) + 1)

    cm = np.zeros((K, K), dtype=int)
    for yt, yp in zip(y_true, y_pred):
        cm[yt, yp] += 1
    return cm

# Confusion matrix on the held-out test split: rows are indexed by the true
# label and columns by the predicted label.
cm = confusion_matrix(y_test, y_test_hat)
cm