# ==============================
# Telco Customer Churn - Final Model Training
# ==============================

In [1]:
import pandas as pd
import numpy as np
import joblib
import os

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from xgboost import XGBClassifier


# 1️⃣ Load Dataset



In [3]:
# Location of the raw Telco churn CSV. Set the TELCO_CHURN_CSV environment
# variable to run this notebook on another machine; when it is unset the
# original local path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    "TELCO_CHURN_CSV",
    r"C:\Users\Pragn\Desktop\Telco Customer Churn\data\WA_Fn-UseC_-Telco-Customer-Churn.csv",
)

df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes



# 2️⃣ Data Cleaning


In [5]:
# Convert target to numeric.
# Guarded so that re-running this cell does not re-map the already encoded
# 0/1 values (which would turn the whole column into NaN).
if not pd.api.types.is_numeric_dtype(df["Churn"]):
    df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})

# Fix TotalCharges column: entries that cannot be parsed as numbers become NaN
# and are then replaced by the column median.
# The result is assigned back explicitly; calling fillna(..., inplace=True) on
# the selected column is a chained assignment that pandas warns about and that
# does not update `df` when Copy-on-Write is enabled.
# NOTE(review): the median is computed on the full dataset, i.e. before the
# train/test split below — confirm this small leakage is acceptable.
total_charges = pd.to_numeric(df["TotalCharges"], errors="coerce")
df["TotalCharges"] = total_charges.fillna(total_charges.median())

# Drop unnecessary column (errors="ignore" keeps the cell safe to re-run).
df = df.drop(columns=["customerID"], errors="ignore")

# 3️⃣ Define Features & Target

In [6]:
# Target vector and feature matrix (every column except the target).
y = df["Churn"]
X = df.drop(columns="Churn")

# Partition the feature columns by dtype: object columns are treated as
# categorical, everything else as numerical.
numerical_cols = list(X.select_dtypes(exclude="object").columns)
categorical_cols = list(X.select_dtypes(include="object").columns)



# 4️⃣ Train-Test Split

In [7]:
# Hold out 25% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)



# 5️⃣ Preprocessing + Model Pipeline

In [8]:
# Preprocessing: standardise the numerical columns and one-hot encode the
# categorical ones. handle_unknown="ignore" lets the encoder accept categories
# at predict time that were not present when it was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ]
)

# Class-imbalance weight = (# negatives) / (# positives), computed on the
# training split only. Series.sum() replaces the builtin sum(), which iterates
# the Series element by element.
n_positive = y_train.sum()
n_negative = len(y_train) - n_positive

# `use_label_encoder` is intentionally not passed: it is a deprecated XGBoost
# option that newer releases report as an unused parameter, and the target is
# already encoded as 0/1.
xgb_model = XGBClassifier(
    scale_pos_weight=n_negative / n_positive,
    eval_metric="logloss",
    random_state=42,
)

# Bundle preprocessing and the classifier so both are fitted, applied and
# saved together.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", xgb_model),
])


# 6️⃣ Train Model

In [9]:

# Fit the whole pipeline (preprocessor followed by the XGBoost model) on the
# training split only; the test split is kept for the evaluation step.
pipeline.fit(X_train, y_train)

# 7️⃣ Evaluation

In [13]:
# confusion_matrix, classification_report and roc_auc_score are already
# imported in the imports cell at the top of the notebook.

def report_at_threshold(y_true, y_score, threshold, label):
    """Print the evaluation report for one decision threshold.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_score : numpy.ndarray
        Predicted probability of the positive class.
    threshold : float
        Scores greater than or equal to this value are predicted as class 1.
    label : str
        Name of the threshold shown in the report header.

    Returns
    -------
    numpy.ndarray
        The 0/1 predictions obtained at `threshold`.
    """
    y_hat = (y_score >= threshold).astype(int)

    print(f"===== Evaluation at {label} Threshold ({threshold}) =====")
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_hat))
    print("\nClassification Report:\n", classification_report(y_true, y_hat))
    # ROC-AUC is computed from the scores, so it is the same for every threshold.
    print("\nROC-AUC Score:", roc_auc_score(y_true, y_score))

    return y_hat


# Predict probabilities
y_prob = pipeline.predict_proba(X_test)[:, 1]

# Default prediction (threshold = 0.5)
y_pred_default = report_at_threshold(y_test, y_prob, 0.5, "Default")

# Two blank lines between the reports
print("\n")

# Custom prediction (threshold = 0.3)
y_pred_custom = report_at_threshold(y_test, y_prob, 0.3, "Custom")

===== Evaluation at Default Threshold (0.5) =====
Confusion Matrix:
 [[1031  263]
 [ 148  319]]

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.80      0.83      1294
           1       0.55      0.68      0.61       467

    accuracy                           0.77      1761
   macro avg       0.71      0.74      0.72      1761
weighted avg       0.79      0.77      0.77      1761


ROC-AUC Score: 0.8219570476817729


===== Evaluation at Custom Threshold (0.3) =====
Confusion Matrix:
 [[920 374]
 [104 363]]

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.71      0.79      1294
           1       0.49      0.78      0.60       467

    accuracy                           0.73      1761
   macro avg       0.70      0.74      0.70      1761
weighted avg       0.79      0.73      0.74      1761


ROC-AUC Score: 0.8219570476817729


# 8️⃣ Save Model

In [14]:
# Persist the fitted pipeline (preprocessing + classifier) as a single file,
# creating the target directory first if it does not exist yet.
model_path = "models/churn_pipeline.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(pipeline, model_path)

print(f"\n✅ Model saved successfully in {model_path}")


✅ Model saved successfully in models/churn_pipeline.pkl
