# Project 3: Churn-Aware Customer Segmentation

## Objective
Combine customer segmentation (K-Means clusters) with churn outcomes to identify:
- high-value segments at high churn risk
- low-risk segments
- recommended retention actions by segment

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [2]:
# Read the raw Telco churn workbook; the path is relative to the working directory.
df = pd.read_excel(io="data/raw/Telco_customer_churn.xlsx")
# Show (rows, columns) as a quick sanity check on the load.
df.shape

(7043, 33)

In [3]:
# Numeric columns used as segmentation features.
seg_cols = ["Tenure Months", "Monthly Charges", "Total Charges", "CLTV"]

# "Total Charges" may hold blank strings: coerce anything non-numeric to NaN,
# then impute the resulting gaps with the column median.
total_charges = pd.to_numeric(df["Total Charges"], errors="coerce")

# Work on an independent copy so the raw frame is not modified by the imputation.
df_seg = df[seg_cols].copy()
df_seg["Total Charges"] = total_charges.fillna(total_charges.median())

In [4]:
# Standardise the segmentation features before clustering.
# Select seg_cols explicitly rather than passing the whole frame: later cells add
# "Cluster" and "Churn" columns to df_seg, so re-running this cell in the same
# kernel would otherwise feed those labels back into the clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_seg[seg_cols])

# Three segments; the fixed random_state keeps the assignment reproducible.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
df["Cluster"] = kmeans.fit_predict(X_scaled)

# Number of customers assigned to each cluster.
df["Cluster"].value_counts()

Cluster
1    2516
0    2451
2    2076
Name: count, dtype: int64

In [5]:
# Encode the churn label as a binary flag: "Yes" -> 1, "No" -> 0.
# Any label outside this mapping becomes NaN.
df["Churn"] = df["Churn Label"].map(
    {
        "Yes": 1,
        "No": 0,
    }
)
# Share of customers in each churn class.
df["Churn"].value_counts(normalize=True)

Churn
0    0.73463
1    0.26537
Name: proportion, dtype: float64

In [7]:
# Inspect the column dtypes of the segmentation frame
# (e.g. to check that "Total Charges" is numeric after the coercion).
df_seg.dtypes

Tenure Months        int64
Monthly Charges    float64
Total Charges      float64
CLTV                 int64
dtype: object

In [8]:
# Copy the K-Means labels from df onto the segmentation frame (pandas aligns
# the assigned Series on index) so the features can be grouped by cluster.
df_seg["Cluster"] = df["Cluster"]

In [9]:
# Per-cluster mean of each segmentation feature, rounded to two decimals.
profile_features = ["Tenure Months", "Monthly Charges", "Total Charges", "CLTV"]
cluster_means = df_seg.groupby("Cluster")[profile_features].mean()
cluster_profile = cluster_means.round(2)

cluster_profile

Unnamed: 0_level_0,Tenure Months,Monthly Charges,Total Charges,CLTV
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,16.1,63.63,1046.61,3101.14
1,26.21,44.43,962.34,5086.44
2,59.05,90.74,5339.61,5102.55


In [10]:
# Attach the cluster label and a binary churn flag ("Yes" -> 1, "No" -> 0)
# to the segmentation frame.
df_seg["Cluster"] = df["Cluster"]
df_seg["Churn"] = df["Churn Label"].map({"Yes": 1, "No": 0})

# Per-cluster customer count and churn rate; the rate is converted to a
# percentage and rounded to two decimals.
segment_churn = (
    df_seg.groupby("Cluster")
    .agg(customers=("Churn", "size"), churn_rate=("Churn", "mean"))
    .assign(churn_rate=lambda stats: (stats["churn_rate"] * 100).round(2))
)
segment_churn

Unnamed: 0_level_0,customers,churn_rate
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2451,38.27
1,2516,24.32
2,2076,15.37
