In [26]:
import numpy as np
import pandas as pd
import pickle
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [28]:
df = pd.read_csv('/Users/hemanthchalla/Downloads/WA_Fn-UseC_-Telco-Customer-Churn.csv')     # <-- ADD THIS!!

features = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
            'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
            'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
            'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
            'MonthlyCharges', 'TotalCharges']

target = "Churn"

df["TotalCharges"] = df["TotalCharges"].astype(str).str.strip().replace("", np.nan)
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
df["TotalCharges"].fillna(df["TotalCharges"].median(), inplace=True)

df[target] = df[target].map({"Yes": 1, "No": 0})

In [30]:
categorical_cols = [c for c in features if c not in ["tenure", "MonthlyCharges", "TotalCharges"]]
numeric_cols = ["tenure", "MonthlyCharges", "TotalCharges"]

encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = df[col].astype(str)
    df[col + "_enc"] = le.fit_transform(df[col])
    encoders[col] = le

with open("encoders.pkl", "wb") as f:
    pickle.dump(encoders, f)

X = df[[*(c + "_enc" for c in categorical_cols), *numeric_cols]]
y = df[target]

In [32]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [34]:
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

In [36]:

smote = SMOTE(random_state=42)
X_sm, y_sm = smote.fit_resample(X_train, y_train)

rus = RandomUnderSampler(random_state=42)
X_comb, y_comb = rus.fit_resample(X_sm, y_sm)

In [38]:

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [40]:
models = {
    "LogisticRegression": {
        "model": LogisticRegression(solver="liblinear"),
        "params": {"C": [0.01, 0.1, 1, 5, 10], "penalty": ["l1", "l2"]}
    },
    "RandomForest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": [100, 200, 400],
            "max_depth": [5, 10, 15, None],
            "min_samples_split": [2, 5, 10]
        }
    },
    "XGBoost": {
        "model": XGBClassifier(eval_metric="logloss", random_state=42),
        "params": {
            "n_estimators": [100, 200, 400],
            "max_depth": [3, 5, 7],
            "learning_rate": [0.01, 0.05, 0.1],
            "subsample": [0.8, 1.0],
            "colsample_bytree": [0.8, 1.0]
        }
    }
}

results = []
best_estimator = None
best_auc = -1
best_info = None


In [42]:
for name, m in models.items():
    print(f" Tuning {name}...")
    
    search = RandomizedSearchCV(
        m["model"],
        m["params"],
        n_iter=10,
        scoring="roc_auc",
        cv=cv,
        n_jobs=-1,
        random_state=42
    )
    
    search.fit(X_comb, y_comb)
    best_model = search.best_estimator_
    
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)
    f1 = f1_score(y_test, best_model.predict(X_test))
    acc = accuracy_score(y_test, best_model.predict(X_test))
    
    results.append([name, auc, f1, acc, search.best_params_])
    
    if auc > best_auc:
        best_auc = auc
        best_estimator = best_model
        best_info = search.best_params_


 Tuning LogisticRegression...
 Tuning RandomForest...
 Tuning XGBoost...


In [43]:
comparison = pd.DataFrame(results, columns=["Model", "AUC", "F1", "Accuracy", "Best Params"])
print("\n MODEL COMPARISON:\n")
print(comparison)



 MODEL COMPARISON:

                Model       AUC        F1  Accuracy  \
0  LogisticRegression  0.829236  0.602333  0.733854   
1        RandomForest  0.824968  0.592683  0.762952   
2             XGBoost  0.825187  0.606768  0.760823   
3  LogisticRegression  0.829236  0.602333  0.733854   
4        RandomForest  0.824968  0.592683  0.762952   
5             XGBoost  0.825187  0.606768  0.760823   

                                         Best Params  
0                         {'penalty': 'l2', 'C': 10}  
1  {'n_estimators': 200, 'min_samples_split': 5, ...  
2  {'subsample': 1.0, 'n_estimators': 200, 'max_d...  
3                         {'penalty': 'l2', 'C': 10}  
4  {'n_estimators': 200, 'min_samples_split': 5, ...  
5  {'subsample': 1.0, 'n_estimators': 200, 'max_d...  


In [44]:
with open("best_churn_model.pkl", "wb") as f:
    pickle.dump(best_estimator, f)

print("\n Best model saved to best_churn_model.pkl")


 Best model saved to best_churn_model.pkl


In [46]:
y_pred = best_estimator.predict(X_test)
y_pred_proba = best_estimator.predict_proba(X_test)[:, 1]

print("\n FINAL MODEL RESULTS")
print("AUC:", roc_auc_score(y_test, y_pred_proba))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1-score:", f1_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


 FINAL MODEL RESULTS
AUC: 0.8292360949649953
Accuracy: 0.7338537970191625
F1-score: 0.6023329798515377

Confusion Matrix:
 [[750 285]
 [ 90 284]]

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.72      0.80      1035
           1       0.50      0.76      0.60       374

    accuracy                           0.73      1409
   macro avg       0.70      0.74      0.70      1409
weighted avg       0.79      0.73      0.75      1409



In [47]:
def prepare_input(row):
    row = row.copy()
    for col, le in encoders.items():
        row[col + "_enc"] = le.transform([row[col]])[0]
    row_df = pd.DataFrame([{
        **{c + "_enc": row[c + "_enc"] for c in categorical_cols},
        **{col: row[col] for col in numeric_cols}
    }])
    row_df[numeric_cols] = scaler.transform(row_df[numeric_cols])
    return row_df

def predict_customer(customer):
    x = prepare_input(customer)
    proba = best_estimator.predict_proba(x)[0][1]
    return {"Churn": int(proba >= 0.5), "Probability": float(proba)}

print("\n Prediction pipeline ready! Use predict_customer(sample_customer)")


 Prediction pipeline ready! Use predict_customer(sample_customer)
