# Customer Churn Prediction (Telco Dataset)

This notebook builds a complete **machine learning pipeline** to predict customer churn for a telecom company.

**Steps:**
1. Load Telco churn data  
2. Clean & preprocess data  
3. Encode categorical features  
4. Train models (Logistic Regression, Random Forest, XGBoost)  
5. Evaluate performance (ROC-AUC, F1, confusion matrix)  
6. Explain model predictions with SHAP  
7. Export predictions for Power BI and save trained models  


## 1. Setup & Imports

In [None]:

# If you're in Google Colab, you may need to install these:
# !pip install xgboost shap

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb
import shap
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plotting defaults: figure size first, then the seaborn grid style.
plt.rcParams.update({"figure.figsize": (8, 5)})
sns.set(style="whitegrid")


## 2. Load Data

In [None]:

# The path is relative to the notebook's working directory, assumed to be a notebooks/ folder that sits beside data/; adjust it if your layout differs.
import os

# Locate the CSV in the sibling data directory and read it into a DataFrame.
data_dir = os.path.join("..", "data")
data_path = os.path.join(data_dir, "telco_churn_full.csv")
df = pd.read_csv(data_path)

# Show (rows, columns), then preview the first records.
print(df.shape)
df.head()


## 3. Basic Cleaning & Preparation

In [None]:

# Snapshot the frame exactly as loaded, before any cleaning touches it.
df_raw = df.copy(deep=True)

# Column dtypes and non-null counts.
df.info()


In [None]:

# Convert TotalCharges to numeric; errors="coerce" turns any value that cannot
# be parsed into NaN so it can be imputed below.
total_charges = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Fill missing TotalCharges with the column median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df["TotalCharges"]: that chained in-place form acts on
# an intermediate Series, emits a FutureWarning on pandas 2.x and leaves df
# unchanged once copy-on-write is enabled.
df["TotalCharges"] = total_charges.fillna(total_charges.median())

# Separate target
target_col = "Churn"

# Encode target as 0/1 ("Yes" = 1, anything else = 0)
y = (df[target_col] == "Yes").astype(int)

# Features = everything except the target and the customerID row identifier.
# An ID carries no predictive signal; left in, it would be label-encoded into an
# arbitrary integer that tree models can memorise. errors="ignore" keeps this
# working when the file has no customerID column (df itself keeps the column,
# so it is still available for labelling exported predictions).
X_raw = df.drop(columns=[target_col, "customerID"], errors="ignore")

X_raw.head()


## 4. Encode Categorical Features

In [None]:

# Columns stored with object dtype are treated as categorical.
object_columns = X_raw.select_dtypes(include=["object"]).columns
cat_cols = list(object_columns)
cat_cols


In [None]:

# Fit one LabelEncoder per categorical column on its string form, and keep the
# fitted encoders keyed by column name.
label_encoders = {
    name: LabelEncoder().fit(X_raw[name].astype(str)) for name in cat_cols
}

# Work on a copy of the features and swap each categorical column for its codes.
X = X_raw.copy()
for name, encoder in label_encoders.items():
    X[name] = encoder.transform(X_raw[name].astype(str))

X.head()


## 5. Train/Test Split

In [None]:

# 80/20 split, stratified on the target, with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

(X_train.shape, X_test.shape)


## 6. Train Models

In [None]:

def _fit_and_score(model):
    """Fit ``model`` on the training split and score it on the test split.

    Returns a ``(predictions, auc)`` pair: the hard class predictions for
    ``X_test`` and the ROC-AUC computed from the positive-class probabilities.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    positive_scores = model.predict_proba(X_test)[:, 1]
    return predictions, roc_auc_score(y_test, positive_scores)


# Logistic Regression
log_model = LogisticRegression(max_iter=2000)
log_pred, log_auc = _fit_and_score(log_model)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=300, max_depth=10, random_state=42)
rf_pred, rf_auc = _fit_and_score(rf_model)

# XGBoost
xgb_params = {
    "n_estimators": 500,
    "learning_rate": 0.05,
    "max_depth": 5,
    "subsample": 0.9,
    "colsample_bytree": 0.8,
    "eval_metric": "logloss",
    "random_state": 42,
}
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_pred, xgb_auc = _fit_and_score(xgb_model)

# Test-set AUCs in the order: logistic regression, random forest, XGBoost.
(log_auc, rf_auc, xgb_auc)


## 7. Evaluation Metrics

In [None]:

# One line per model: label followed by the test AUC rounded to 4 decimals.
auc_rows = (
    ("Logistic Regression AUC:", log_auc),
    ("Random Forest AUC:", rf_auc),
    ("XGBoost AUC:", xgb_auc),
)
for auc_label, auc_value in auc_rows:
    print(auc_label, round(auc_value, 4))


In [None]:

# Per-class precision / recall / F1 for the XGBoost test predictions.
xgb_report = classification_report(y_test, xgb_pred)
print("XGBoost Classification Report:")
print(xgb_report)


In [None]:

# Confusion matrix of XGBoost test predictions, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, xgb_pred)
heatmap_ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
heatmap_ax.set_title("Confusion Matrix - XGBoost")
heatmap_ax.set_xlabel("Predicted")
heatmap_ax.set_ylabel("Actual")
plt.show()


## 8. SHAP Explainability

In [None]:

# Cap the number of explained rows; SHAP can be slow on the full test set.
sample_size = 1000
rows_to_explain = min(sample_size, len(X_test))
X_test_sample = X_test.sample(n=rows_to_explain, random_state=42)

# Tree-based explainer for the fitted XGBoost model.
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_test_sample)

# show=False defers rendering so the title can be added before plt.show().
shap.summary_plot(shap_values, X_test_sample, show=False)
plt.title("SHAP Summary Plot - XGBoost")
plt.show()


## 9. Export Predictions for Power BI & Save Models

In [None]:

import pickle
import numpy as np
import os

# 1) Save all three models, one pickle file each, in the working directory
models_by_file = {
    "logistic_regression_churn.pkl": log_model,
    "random_forest_churn.pkl": rf_model,
    "xgboost_churn.pkl": xgb_model,
}
for model_file, fitted_model in models_by_file.items():
    with open(model_file, "wb") as f:
        pickle.dump(fitted_model, f)

print("Models saved to current working directory.")

# 2) Score ALL customers with the XGBoost model.
# NOTE: X also contains the rows the model was trained on, so these scores are
# not purely out-of-sample.
all_class_probs = xgb_model.predict_proba(X)
xgb_probs_all = all_class_probs[:, 1]  # column 1 = probability of class 1 (churn)
xgb_preds_all = np.greater_equal(xgb_probs_all, 0.5).astype(int)

# Risk level buckets
def get_risk_level(p):
    """Map a churn probability to a risk bucket.

    Returns "High" when ``p >= 0.75``, "Medium" when ``p >= 0.5`` and "Low"
    for everything else (including values that fail both comparisons).
    """
    # Thresholds are checked from highest to lowest; the first match wins.
    for threshold, bucket in ((0.75, "High"), (0.5, "Medium")):
        if p >= threshold:
            return bucket
    return "Low"

# Bucket every probability into Low / Medium / High.
risk_levels = list(map(get_risk_level, xgb_probs_all))

# Use customerID from original df if available, otherwise number the rows 1..N
customer_ids = (
    df["customerID"] if "customerID" in df.columns else np.arange(1, len(X) + 1)
)

# One row per customer: id, hard prediction (flag and label), probability, bucket.
pred_columns = {
    "customerID": customer_ids,
    "PredictedChurnFlag": xgb_preds_all,
    "PredictedChurnLabel": np.where(xgb_preds_all == 1, "Yes", "No"),
    "ChurnProbability": xgb_probs_all,
    "RiskLevel": risk_levels,
}
pred_df = pd.DataFrame(pred_columns)

pred_df.head()


In [None]:

# 3) Write the predictions next to the input data (for Power BI), without the index
pred_csv_path = os.path.join("..", "data", "churn_predictions.csv")
pred_df.to_csv(path_or_buf=pred_csv_path, index=False)
print("Predictions saved to {}".format(pred_csv_path))


In [None]:

# 4) If running in Google Colab, trigger downloads
# Only the import decides whether we are in Colab. Previously a single broad
# `except Exception` wrapped the downloads as well, so a genuine download
# failure was reported as "likely not in Colab" and aborted the remaining files.
try:
    from google.colab import files
except ImportError as e:
    print("Download step skipped (likely not in Colab).", e)
else:
    print("Attempting to download models and predictions...")

    download_paths = [
        "logistic_regression_churn.pkl",
        "random_forest_churn.pkl",
        "xgboost_churn.pkl",
        pred_csv_path,
    ]
    for download_path in download_paths:
        # Downloads stay best-effort: report the failing file and carry on
        # with the rest instead of stopping the notebook.
        try:
            files.download(download_path)
        except Exception as e:
            print(f"Download failed for {download_path}:", e)
