# Modeling


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
df = pd.read_csv('../data/Churn_Data_Processed.csv')

# Prepare features and target
X = df.drop('Churn', axis=1)
y = df['Churn'].map({'Yes': 1, 'No': 0})

print("Target Distribution:")
print(y.value_counts())
print(f"Churn Rate: {y.mean():.3f}")


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)




print(f"\nTraining set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Churn rate - Train: {y_train.mean():.3f}, Test: {y_test.mean():.3f}")

Target Distribution:
Churn
0    678
1    221
Name: count, dtype: int64
Churn Rate: 0.246

Training set: 629 samples
Test set: 270 samples
Churn rate - Train: 0.246, Test: 0.244


In [2]:
# Calculate class weights to handle imbalance
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)

class_weight_dict = {0: class_weights[0], 1: class_weights[1]}
print(f"Class weights: {class_weight_dict}")

Class weights: {0: np.float64(0.6635021097046413), 1: np.float64(2.029032258064516)}


In [3]:
# Stratified K-Fold for imbalanced data
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [4]:
# LASSO Logistic Regression Pipeline with class weights
lasso_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lasso', LogisticRegression(
        penalty='l1', 
        solver='liblinear', 
        random_state=42,
        max_iter=1000,
        class_weight=class_weight_dict  # Adjusted for imbalance
    ))
])

# Parameter grid for LASSO
lasso_param_grid = {
    'lasso__C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]  # Wider range for fine-tuning
}

# K-fold Cross-validated grid search
lasso_grid = GridSearchCV(
    lasso_pipeline, 
    lasso_param_grid, 
    cv=kfold, 
    scoring='roc_auc',
    n_jobs=-1,
    return_train_score=True
)

lasso_grid.fit(X_train, y_train)

print("LASSO Best Parameters:", lasso_grid.best_params_)
print("LASSO Best CV Score (AUC):", lasso_grid.best_score_)

LASSO Best Parameters: {'lasso__C': 1}
LASSO Best CV Score (AUC): 0.866963118159159


In [5]:
from sklearn.metrics import classification_report, precision_recall_fscore_support

# Get the best LASSO model
best_lasso = lasso_grid.best_estimator_

# Make predictions
y_pred = best_lasso.predict(X_test)
y_pred_proba = best_lasso.predict_proba(X_test)[:, 1]

# Detailed classification report
print("=" * 60)
print("DETAILED CLASSIFICATION REPORT")
print("=" * 60)
print(classification_report(y_test, y_pred, target_names=['No Churn', 'Churn']))

# Focus on churn class (class 1) metrics
precision, recall, f1, support = precision_recall_fscore_support(
    y_test, y_pred, average=None, labels=[0, 1]
)

print("\n" + "=" * 50)
print("CHURN CLASS (Class 1) PERFORMANCE")
print("=" * 50)
print(f"Precision: {precision[1]:.4f}")
print(f"Recall:    {recall[1]:.4f}") 
print(f"F1-Score:  {f1[1]:.4f}")
print(f"Support:   {support[1]} samples")

# Confusion matrix for detailed analysis
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

print("\n" + "=" * 40)
print("CONFUSION MATRIX")
print("=" * 40)
print(cm)
print(f"\nTrue Negatives:  {cm[0,0]} | False Positives: {cm[0,1]}")
print(f"False Negatives: {cm[1,0]} | True Positives:  {cm[1,1]}")

# Calculate additional churn-specific metrics
tn, fp, fn, tp = cm.ravel()
churn_precision = tp / (tp + fp) if (tp + fp) > 0 else 0
churn_recall = tp / (tp + fn) if (tp + fn) > 0 else 0
churn_f1 = 2 * (churn_precision * churn_recall) / (churn_precision + churn_recall) if (churn_precision + churn_recall) > 0 else 0

print(f"\nChurn Class Metrics (Manual Calculation):")
print(f"Precision: {churn_precision:.4f}")
print(f"Recall:    {churn_recall:.4f}")
print(f"F1-Score:  {churn_f1:.4f}")

DETAILED CLASSIFICATION REPORT
              precision    recall  f1-score   support

    No Churn       0.93      0.83      0.88       204
       Churn       0.61      0.82      0.70        66

    accuracy                           0.83       270
   macro avg       0.77      0.82      0.79       270
weighted avg       0.85      0.83      0.83       270


CHURN CLASS (Class 1) PERFORMANCE
Precision: 0.6067
Recall:    0.8182
F1-Score:  0.6968
Support:   66 samples

CONFUSION MATRIX
[[169  35]
 [ 12  54]]

True Negatives:  169 | False Positives: 35
False Negatives: 12 | True Positives:  54

Churn Class Metrics (Manual Calculation):
Precision: 0.6067
Recall:    0.8182
F1-Score:  0.6968


In [6]:
# ElasticNet Logistic Regression Pipeline with class weights
elasticnet_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('elasticnet', LogisticRegression(
        penalty='elasticnet', 
        solver='saga', 
        random_state=42,
        max_iter=1000,
        class_weight=class_weight_dict  # Adjusted for imbalance
    ))
])

# Parameter grid for ElasticNet
elasticnet_param_grid = {
    'elasticnet__C': [0.001, 0.01, 0.1, 1, 10],
    'elasticnet__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
}

elasticnet_grid = GridSearchCV(
    elasticnet_pipeline, 
    elasticnet_param_grid, 
    cv=kfold, 
    scoring='roc_auc',
    n_jobs=-1,
    return_train_score=True
)

elasticnet_grid.fit(X_train, y_train)

print("ElasticNet Best Parameters:", elasticnet_grid.best_params_)
print("ElasticNet Best CV Score (AUC):", elasticnet_grid.best_score_)



ElasticNet Best Parameters: {'elasticnet__C': 0.1, 'elasticnet__l1_ratio': 0.1}
ElasticNet Best CV Score (AUC): 0.8694845211862876


In [7]:
# Standard L2 Logistic Regression Pipeline with class weights
l2_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('l2', LogisticRegression(
        penalty='l2', 
        random_state=42,
        max_iter=1000,
        class_weight=class_weight_dict  # Adjusted for imbalance
    ))
])

# Parameter grid for L2
l2_param_grid = {
    'l2__C': [0.001, 0.01, 0.1, 1, 10, 100]
}

l2_grid = GridSearchCV(
    l2_pipeline, 
    l2_param_grid, 
    cv=kfold, 
    scoring='roc_auc',
    n_jobs=-1,
    return_train_score=True
)

l2_grid.fit(X_train, y_train)

print("L2 Best Parameters:", l2_grid.best_params_)
print("L2 Best CV Score (AUC):", l2_grid.best_score_)

L2 Best Parameters: {'l2__C': 0.1}
L2 Best CV Score (AUC): 0.8684557309540152


In [8]:
# Get best models
best_lasso = lasso_grid.best_estimator_
best_elasticnet = elasticnet_grid.best_estimator_
best_l2 = l2_grid.best_estimator_

# Detailed CV performance comparison
models = {
    'LASSO (L1)': best_lasso,
    'ElasticNet': best_elasticnet,
    'Ridge (L2)': best_l2
}

print("\n" + "="*60)
print("K-FOLD CROSS-VALIDATION RESULTS (AUC)")
print("="*60)

cv_results_comparison = {}
for name, model in models.items():
    # Cross-validation scores
    cv_scores = cross_val_score(model, X_train, y_train, cv=kfold, scoring='roc_auc')
    
    # Train the model on full training data for test evaluation
    model.fit(X_train, y_train)
    
    # Test performance
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, y_pred_proba)
    
    cv_results_comparison[name] = {
        'CV_Mean_AUC': cv_scores.mean(),
        'CV_Std_AUC': cv_scores.std(),
        'Test_AUC': test_auc
    }
    
    print(f"{name:.<20} CV AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f}) | Test AUC: {test_auc:.4f}")

# Create comparison dataframe
results_df = pd.DataFrame(cv_results_comparison).T
results_df = results_df.sort_values('CV_Mean_AUC', ascending=False)
print("\n" + results_df.to_string())


K-FOLD CROSS-VALIDATION RESULTS (AUC)
LASSO (L1).......... CV AUC: 0.8670 (+/- 0.0257) | Test AUC: 0.8948
ElasticNet.......... CV AUC: 0.8695 (+/- 0.0185) | Test AUC: 0.8850
Ridge (L2).......... CV AUC: 0.8685 (+/- 0.0186) | Test AUC: 0.8864

            CV_Mean_AUC  CV_Std_AUC  Test_AUC
ElasticNet     0.869485    0.009239  0.884952
Ridge (L2)     0.868456    0.009279  0.886364
LASSO (L1)     0.866963    0.012855  0.894831


In [9]:
from sklearn.metrics import precision_recall_curve, f1_score, confusion_matrix, classification_report

best_model_name = results_df.index[0]
best_model = models[best_model_name]

print(f"\n SELECTED BEST MODEL: {best_model_name}")
print(f" CV AUC: {results_df.loc[best_model_name, 'CV_Mean_AUC']:.4f}")
print(f" Test AUC: {results_df.loc[best_model_name, 'Test_AUC']:.4f}")

if best_model_name == 'LASSO (L1)':
    best_coefs = best_model.named_steps['lasso'].coef_[0]
elif best_model_name == 'ElasticNet':
    best_coefs = best_model.named_steps['elasticnet'].coef_[0]
else:
    best_coefs = best_model.named_steps['l2'].coef_[0]

final_importance = pd.DataFrame({
    'feature': X.columns,
    'coefficient': best_coefs,
    'importance': np.abs(best_coefs)
}).sort_values('importance', ascending=False)

print(f"\n TOP 10 MOST IMPORTANT FEATURES ({best_model_name}):")
print("="*60)
for i, row in final_importance.head(10).iterrows():
    direction = " INCREASES churn" if row['coefficient'] > 0 else " REDUCES churn"
    print(f"{row['feature']:.<40} {direction} (impact: {row['importance']:.4f})")


y_probs = best_model.predict_proba(X_test)[:, 1]  # probability of churn

precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]

print(f"\n Optimal threshold for churn (max F1): {best_threshold:.2f}")
print(f"Precision at optimal threshold: {precisions[best_idx]:.2f}")
print(f"Recall at optimal threshold: {recalls[best_idx]:.2f}")
print(f"F1-score at optimal threshold: {f1_scores[best_idx]:.2f}")

y_pred_new = (y_probs >= best_threshold).astype(int)

conf_matrix = confusion_matrix(y_test, y_pred_new)
tn, fp, fn, tp = conf_matrix.ravel()

print("\n========================================")
print("CONFUSION MATRIX AT OPTIMAL THRESHOLD")
print("========================================")
print(conf_matrix)
print(f"\nTrue Negatives: {tn} | False Positives: {fp}")
print(f"False Negatives: {fn} | True Positives: {tp}")

churn_precision = tp / (tp + fp)
churn_recall = tp / (tp + fn)
churn_f1 = 2 * (churn_precision * churn_recall) / (churn_precision + churn_recall)

print("\n========================================")
print("CHURN CLASS (Class 1) PERFORMANCE")
print("========================================")
print(f"Precision: {churn_precision:.4f}")
print(f"Recall:    {churn_recall:.4f}")
print(f"F1-Score:  {churn_f1:.4f}")
print(f"Support:   {tp + fn} samples")



 SELECTED BEST MODEL: ElasticNet
 CV AUC: 0.8695
 Test AUC: 0.8850

 TOP 10 MOST IMPORTANT FEATURES (ElasticNet):
MultipleLines_Yes.......................  REDUCES churn (impact: 0.5541)
tenure..................................  REDUCES churn (impact: 0.4678)
InternationalPlan.......................  INCREASES churn (impact: 0.4571)
AvgCallDurationBin_Medium...............  INCREASES churn (impact: 0.4477)
InternetService_Fiber optic.............  INCREASES churn (impact: 0.4361)
AvgCallDurationBin_Very Long............  REDUCES churn (impact: 0.4059)
AvgCallPerMonth.........................  INCREASES churn (impact: 0.3929)
PhoneService_Yes........................  REDUCES churn (impact: 0.3240)
Contract_Two year.......................  REDUCES churn (impact: 0.2693)
StreamingTV_Yes.........................  INCREASES churn (impact: 0.2488)

 Optimal threshold for churn (max F1): 0.62
Precision at optimal threshold: 0.70
Recall at optimal threshold: 0.74
F1-score at optimal threshold

# Save prediction results

In [10]:
test_results = X_test.copy()
test_results['Actual_Churn'] = y_test
test_results['Predicted_Churn'] = y_pred_new
test_results['Predicted_Probability'] = y_probs
test_results.to_csv('../data/Churn_Test_Predictions.csv', index=False)