In [1]:
import pandas as pd

# Read the cleaned Telco customer-churn table from the processed-data folder.
cleaned_csv_path = "../data/processed/telco_customer_churn_cleaned.csv"
df = pd.read_csv(cleaned_csv_path)


In [2]:
# Fields flagged as leaking the churn outcome into the features.
leakage_cols = ["churn_label", "churn_score", "cltv", "churn_reason", "count"]

# Geographic fields, excluded because they are judged to carry no predictive value.
drop_geo = [
    "country",
    "state",
    "city",
    "zip_code",
    "lat_long",
    "latitude",
    "longitude",
]

# Modelling frame: the loaded data minus both groups of columns above.
# `drop` returns a new frame, so `df` itself is left intact.
df_ml = df.drop(columns=[*leakage_cols, *drop_geo])


In [3]:
# Target vector: the churn indicator column.
y = df_ml["churn_value"]

# Feature matrix: everything except the target and the customer identifier,
# one-hot encoded with the first level of each categorical dropped.
x = pd.get_dummies(
    df_ml.drop(columns=["churn_value", "customerid"]),
    drop_first=True,
)



In [4]:
# Write the encoded features and the target to separate CSV files,
# omitting the row index from both.
for frame, out_path in (
    (x, "../data/processed/telco_features.csv"),
    (y, "../data/processed/telco_target.csv"),
):
    frame.to_csv(out_path, index=False)



In [5]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split, stratified on the target so both partitions keep the
# same class proportions; the fixed seed makes the split reproducible.
split_parts = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
x_train, x_test, y_train, y_test = split_parts

# Display the shape of each partition as the cell output.
tuple(part.shape for part in split_parts)


((5625, 30), (1407, 30), (5625,), (1407,))

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Headline metrics printed for each model, in display order.
headline_metrics = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)

# --- Model 1: logistic regression fitted directly on the split features ---
log_reg = LogisticRegression(max_iter=2000)
log_reg.fit(x_train, y_train)
y_pred_lr = log_reg.predict(x_test)

print("Logistic Regression")
for label, metric in headline_metrics:
    print(label, metric(y_test, y_pred_lr))
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))


# --- Model 2: random forest with 200 trees and a fixed seed ---
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(x_train, y_train)
y_pred_rf = rf.predict(x_test)

print("\nRandom Forest")
for label, metric in headline_metrics:
    print(label, metric(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression
Accuracy: 0.8038379530916845
Precision: 0.6384180790960452
Recall: 0.6042780748663101
F1 Score: 0.6208791208791209

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.88      0.87      1033
           1       0.64      0.60      0.62       374

    accuracy                           0.80      1407
   macro avg       0.75      0.74      0.74      1407
weighted avg       0.80      0.80      0.80      1407

Confusion Matrix:
 [[905 128]
 [148 226]]

Random Forest
Accuracy: 0.7938877043354655
Precision: 0.6409395973154363
Recall: 0.5106951871657754
F1 Score: 0.5684523809523809

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.90      0.86      1033
           1       0.64      0.51      0.57       374

    accuracy                           0.79      1407
   macro avg       0.74      0.70      0.72      1407
weighted avg       0.78      0.79      0.79 

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# Continuous features to standardise; all other columns are left as they are.
numeric_cols = ["tenure_months", "monthly_charges", "total_charges"]

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows so no test-set statistics reach the model.
scaler = StandardScaler()

x_train_scaled = x_train.copy()
x_train_scaled[numeric_cols] = scaler.fit_transform(x_train[numeric_cols])

x_test_scaled = x_test.copy()
x_test_scaled[numeric_cols] = scaler.transform(x_test[numeric_cols])

# Refit logistic regression on the scaled features with a higher iteration cap.
log_reg_scaled = LogisticRegression(max_iter=5000)
log_reg_scaled.fit(x_train_scaled, y_train)

# Hold-out predictions from the scaled model.
y_pred_scaled = log_reg_scaled.predict(x_test_scaled)

# Report the headline metrics, then the full report and confusion matrix.
print("Scaled Logistic Regression")
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, metric(y_test, y_pred_scaled))

print("\nClassification Report:")
print(classification_report(y_test, y_pred_scaled))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_scaled))


Scaled Logistic Regression
Accuracy: 0.8045486851457001
Precision: 0.6410256410256411
Recall: 0.6016042780748663
F1 Score: 0.6206896551724138

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      1033
           1       0.64      0.60      0.62       374

    accuracy                           0.80      1407
   macro avg       0.75      0.74      0.74      1407
weighted avg       0.80      0.80      0.80      1407


Confusion Matrix:
[[907 126]
 [149 225]]


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

#1: Logistic Regression Grid Search

# NOTE(review): x_train_scaled was standardised with statistics from the whole
# training split, so every CV validation fold has already influenced the
# scaler. The effect is usually small; putting the scaler and the model in a
# sklearn Pipeline would remove it. Left as-is here so that best_log_reg keeps
# accepting x_test_scaled.

# max_iter is only a cap on solver iterations, not a model hyperparameter, so
# it is fixed on the estimator instead of being searched. This halves the
# number of fits without changing which C / solver combination can win.
log_reg_params = {
    "C": [0.01, 0.1, 1, 10],
    "solver": ["lbfgs", "liblinear"],
    "penalty": ["l2"],
}

# The estimator is passed inline so the fitted baseline `log_reg` from the
# earlier cell is not replaced by an unfitted model.
log_reg_grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=5000),
    param_grid=log_reg_params,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

log_reg_grid.fit(x_train_scaled, y_train)

print("Best Logistic Regression Params:", log_reg_grid.best_params_)
print("Best Cross-Validation Accuracy:", log_reg_grid.best_score_)

# No manual retraining is needed: GridSearchCV (refit=True by default) has
# already refit the best configuration on all of x_train_scaled, and
# best_estimator_ is that fitted model.
best_log_reg = log_reg_grid.best_estimator_
y_pred_log_reg = best_log_reg.predict(x_test_scaled)

print("\nTuned Logistic Regression Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_log_reg))
print("\nClassification Report:\n", classification_report(y_test, y_pred_log_reg))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log_reg))



#2: Random Forest Grid Search

# NOTE(review): candidates are ranked by accuracy while the classes are
# imbalanced (374 positives out of 1407 test rows), so class_weight="balanced"
# is unlikely to be selected. Consider scoring="f1" or "recall" if catching
# churners matters more than overall accuracy — TODO confirm with the
# project's objective.
rf_params = {
    "n_estimators": [100, 200],
    "max_depth": [5, 10, 15, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "class_weight": ["balanced", None]
}

# 144 parameter combinations x 3 folds = 432 fits, plus the final refit.
# The estimator is passed inline so the fitted baseline `rf` from the earlier
# cell is not replaced by an unfitted model.
rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_params,
    cv=3,
    scoring="accuracy",
    n_jobs=-1
)

rf_grid.fit(x_train, y_train)  # NOTE: random forest does NOT need scaled data

print("\nBest Random Forest Params:", rf_grid.best_params_)
print("Best Cross-Validation Accuracy:", rf_grid.best_score_)

# best_estimator_ is already refit on all of x_train (refit=True default).
# y_pred_rf now holds the tuned forest's predictions, replacing the baseline
# forest's predictions stored under the same name by the earlier cell.
best_rf = rf_grid.best_estimator_
y_pred_rf = best_rf.predict(x_test)

print("\nTuned Random Forest Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
