In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix


In [2]:
# Load the raw CSV, drop the customerID column, and convert TotalCharges to a
# numeric column. errors="coerce" turns unparseable entries into NaN, which
# the numeric imputer in the preprocessing pipeline fills in later.
df = (
    pd.read_csv("../data/raw_data.csv")
    .drop(columns=["customerID"])
    .assign(
        TotalCharges=lambda frame: pd.to_numeric(
            frame["TotalCharges"], errors="coerce"
        )
    )
)



In [3]:
# Encode the target as integers: "Yes" -> 1 (churned), "No" -> 0.
# The mapping also accepts values that are already 0/1, so re-running this
# cell does not silently turn the whole column into NaN.
churn_codes = {"Yes": 1, "No": 0, 1: 1, 0: 0}
churn_encoded = df["Churn"].map(churn_codes)

# Any label outside the mapping (typo, different casing, missing value) would
# become NaN; fail loudly here instead of passing a corrupted target on.
unmapped_labels = df.loc[churn_encoded.isna(), "Churn"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected values in 'Churn' column: {unmapped_labels.tolist()}"
    )

df["Churn"] = churn_encoded.astype("int64")


In [4]:
# Separate the feature matrix from the binary target column.
y = df.loc[:, "Churn"]
X = df.drop(columns="Churn")


In [5]:
# Hold out 20% of the rows as a test set. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [6]:
# Assign every feature column to exactly one preprocessing branch.
# Columns are selected by the generic "number" kind rather than by the literal
# int64/float64 and object dtypes: ColumnTransformer drops any column that is
# listed in neither branch (remainder="drop" is its default), so a column
# stored as e.g. int32, bool, category or a pandas string dtype would
# otherwise vanish from the model without any warning.
num_cols = X.select_dtypes(include="number").columns
cat_cols = X.select_dtypes(exclude="number").columns

# Numeric branch: fill missing values with the column median, then standardize.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" makes categories that were not
# seen during fit encode as all zeros instead of raising at predict time.
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# Combined preprocessing step reused by every model pipeline below.
preprocess = ColumnTransformer([
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols)
])


In [7]:
# Two candidate classifiers. Each is wrapped together with the preprocessing
# step so that imputation, scaling and encoding are fitted as part of the
# model (and therefore re-fitted inside every cross-validation fold).
lr_pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", LogisticRegression(max_iter=1000)),
])
rf_pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", RandomForestClassifier(random_state=42)),
])


In [8]:
# Baseline comparison: 5-fold cross-validated ROC-AUC on the training split
# for both untuned pipelines.
cv_lr = cross_val_score(lr_pipe, X_train, y_train, cv=5, scoring="roc_auc")
cv_rf = cross_val_score(rf_pipe, X_train, y_train, cv=5, scoring="roc_auc")

# Report the mean score across the five folds for each model.
print("Logistic Regression CV ROC-AUC:", cv_lr.mean())
print("Random Forest CV ROC-AUC:", cv_rf.mean())


Logistic Regression CV ROC-AUC: 0.8455616199670255
Random Forest CV ROC-AUC: 0.8216537302909803


In [9]:
# Tune the regularization strength C of the logistic-regression pipeline.
# The "model__" prefix routes each parameter to the pipeline step named
# "model"; penalty and solver are held fixed at a single value.
param_grid = {
    "model__C": [0.01, 0.1, 1, 10],
    "model__penalty": ["l2"],
    "model__solver": ["liblinear"],
}

# 5-fold grid search scored by ROC-AUC, using all available cores.
grid = GridSearchCV(lr_pipe, param_grid, cv=5, scoring="roc_auc", n_jobs=-1)

grid.fit(X_train, y_train)


0,1,2
,estimator,Pipeline(step..._iter=1000))])
,param_grid,"{'model__C': [0.01, 0.1, ...], 'model__penalty': ['l2'], 'model__solver': ['liblinear']}"
,scoring,'roc_auc'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,1000


In [10]:
# Keep the pipeline that the grid search refitted with the best parameters,
# and report the winning cross-validation score and parameter combination.
best_model = grid.best_estimator_

print("Best CV ROC-AUC:", grid.best_score_)
print("Best Params:", grid.best_params_)


Best CV ROC-AUC: 0.8458816402165059
Best Params: {'model__C': 10, 'model__penalty': 'l2', 'model__solver': 'liblinear'}


In [11]:
# Evaluate the tuned pipeline on the held-out test split.
# Column 1 of predict_proba is the probability of class 1 (churn);
# predict() returns the model's default hard class labels.
y_test_proba = best_model.predict_proba(X_test)[:, 1]
y_test_pred = best_model.predict(X_test)
test_roc_auc = roc_auc_score(y_test, y_test_proba)

print("Final Test ROC-AUC:", test_roc_auc)

print("\nClassification Report:\n")
print(classification_report(y_test, y_test_pred))

print("\nConfusion Matrix:\n")
print(confusion_matrix(y_test, y_test_pred))


Final Test ROC-AUC: 0.841124803017386

Classification Report:

              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.66      0.56      0.60       374

    accuracy                           0.80      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.80      0.80      0.80      1409


Confusion Matrix:

[[925 110]
 [165 209]]


In [12]:
# Churn predictions at a custom 0.35 probability cut-off: a row is labelled 1
# when its predicted churn probability is at least 0.35.
# NOTE(review): the same thresholded predictions are recomputed further down
# as y_pred_035, and y_pred_custom is not read anywhere else in the visible
# notebook -- consider keeping only one of the two.
y_prob = best_model.predict_proba(X_test)[:, 1]
y_pred_custom = np.greater_equal(y_prob, 0.35).astype(int)



In [13]:
# Pair each transformed feature name with its logistic-regression coefficient.
# coef_ holds a single row for this binary model, hence the [0].
feature_names = best_model["preprocess"].get_feature_names_out()
coefficients = best_model["model"].coef_[0]

# Sorted ascending, so the last ten entries are the largest positive
# coefficients, i.e. the features pushing hardest towards class 1 (churn).
importance = pd.Series(data=coefficients, index=feature_names).sort_values()
importance.tail(10)


cat__OnlineSecurity_No                 0.123586
cat__OnlineBackup_Yes                  0.149621
cat__PaymentMethod_Electronic check    0.164155
cat__DeviceProtection_Yes              0.218794
cat__MultipleLines_Yes                 0.330221
cat__StreamingTV_Yes                   0.514886
cat__StreamingMovies_Yes               0.515791
cat__Contract_Month-to-month           0.542657
num__TotalCharges                      0.561540
cat__InternetService_Fiber optic       1.299576
dtype: float64

In [14]:
# Predicted churn probability for each test row, turned into hard 0/1 labels
# with a 0.35 cut-off (1 when the probability is at least 0.35).
y_prob = best_model.predict_proba(X_test)[:, 1]
y_pred_035 = np.greater_equal(y_prob, 0.35).astype(int)


In [15]:
# Per-class precision/recall/F1 for the predictions made with the 0.35
# probability cut-off (y_pred_035).
# classification_report is already imported with the other sklearn.metrics
# names in the notebook's import cell, so it is not re-imported here.
print(classification_report(y_test, y_pred_035))


              precision    recall  f1-score   support

           0       0.88      0.79      0.83      1035
           1       0.55      0.71      0.62       374

    accuracy                           0.77      1409
   macro avg       0.71      0.75      0.72      1409
weighted avg       0.79      0.77      0.77      1409



In [16]:
# Persist the fitted best pipeline (preprocessing + model) to disk.
# The target folder is created first so the dump does not fail with
# FileNotFoundError when "../model" does not exist yet. The plain string path
# is passed to joblib.dump so the returned file list is unchanged.
model_path = "../model/churn_model.pkl"
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_path)


['../model/churn_model.pkl']