# Modelling

In [62]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    average_precision_score,
    precision_recall_curve,
    f1_score
)
from xgboost import XGBClassifier

In [63]:
np.random.seed(42)

## Import data

In [64]:
data = pd.read_csv('../data/cleaned_engineered_data.csv')

In [65]:
data.columns

Index(['Partner', 'Dependents', 'MultipleLines', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'PaperlessBilling', 'Churn', 'new_customer',
       'is_autopay', 'AddOnCount', 'ChargePerMonthRatio',
       'Fiber_NoTechSupport', 'FiberOptic_StreamingTV', 'Senior_Contract',
       'M2M_ElectronicCheck', 'InternetService_DSL',
       'InternetService_Fiber optic', 'InternetService_No',
       'Contract_Month-to-month', 'Contract_One year', 'Contract_Two year',
       'tenure_group_0–6', 'tenure_group_6–12', 'tenure_group_12–24',
       'tenure_group_24–48', 'tenure_group_48–60', 'tenure_group_60–72',
       'AddOnGroup_None', 'AddOnGroup_Low', 'AddOnGroup_High',
       'MonthlyCharges_group_Low', 'MonthlyCharges_group_Medium',
       'MonthlyCharges_group_High', 'MonthlyCharges_group_Very High'],
      dtype='object')

In [66]:
X = data.drop(columns=['Churn'])
y = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

## Baseline

This analysis implements a contract-based heuristic baseline for churn prediction, leveraging the relationship between customer contract duration and churn behavior. The approach uses data-driven thresholds rather than simple rule-based predictions

In [67]:
import plotly.express as px
contract_churn_rates = {}
for contract_type in ['Month-to-month', 'One year', 'Two year']:
    if f'Contract_{contract_type}' in X_train.columns:
        mask = X_train[f'Contract_{contract_type}'] == 1
        churn_rate = y_train[mask].mean()
        contract_churn_rates[contract_type] = churn_rate
        print(f"{contract_type} churn rate: {churn_rate:.3f}")

# Use the churn rates to set thresholds
# Predict churn if customer has contract type with churn rate > threshold
thresholds = np.linspace(0, 1, 100)
f1_scores = []

for t in thresholds:
    preds = np.zeros(len(X_test))
    
    for contract_type, churn_rate in contract_churn_rates.items():
        if churn_rate > t and f'Contract_{contract_type}' in X_test.columns:
            mask = X_test[f'Contract_{contract_type}'] == 1
            preds[mask] = 1
    
    f1 = f1_score(y_test, preds)
    f1_scores.append(f1)

fig = px.line(
    x=thresholds,
    y=f1_scores,
    labels={"x": "Churn Rate Threshold", "y": "F1 Score"},
    title="Contract-based Heuristic Baseline: Threshold vs F1 Score"
)
fig.show()

print(f"Best Contract F1 Score: {max(f1_scores):.3f}")

Month-to-month churn rate: 0.427
One year churn rate: 0.111
Two year churn rate: 0.029


Best Contract F1 Score: 0.574


Baseline Achievement:
- Best F1 Score: 0.574
- Optimal Threshold: 0.424

## Model

In [68]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

In [69]:
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [70]:
results = {}
scoring_metrics = ['roc_auc', 'f1', 'precision', 'recall']

for name, model in models.items():
    # Train on training set
    model.fit(X_train, y_train)
    
    # Perform cross-validation with multiple metrics
    cv_results = cross_validate(model, X_train, y_train, cv=5, scoring=scoring_metrics)
    
    results[name] = {
        'model': model,
        'cv_roc_auc': cv_results['test_roc_auc'].mean(),
        'cv_roc_auc_std': cv_results['test_roc_auc'].std(),
        'cv_f1': cv_results['test_f1'].mean(),
        'cv_f1_std': cv_results['test_f1'].std(),
        'cv_precision': cv_results['test_precision'].mean(),
        'cv_recall': cv_results['test_recall'].mean(),
        'cv_results': cv_results
    }
    
    print(f"{name}:")
    print(f"  ROC AUC: {cv_results['test_roc_auc'].mean():.4f} ± {cv_results['test_roc_auc'].std():.4f}")
    print(f"  F1:      {cv_results['test_f1'].mean():.4f} ± {cv_results['test_f1'].std():.4f}")
    print(f"  Precision: {cv_results['test_precision'].mean():.4f}")
    print(f"  Recall:    {cv_results['test_recall'].mean():.4f}")
    print()

# Select best model based on ROC AUC (you can change this criterion)
best_model_name = max(results.keys(), key=lambda x: results[x]['cv_roc_auc'])
best_model = results[best_model_name]['model']
best_scores = results[best_model_name]

print(f"Selected: {best_model_name}")
print(f'ROC AUC (avg 5-fold): {best_scores["cv_roc_auc"]:.4f}')
print(f'F1 Score (avg 5-fold): {best_scores["cv_f1"]:.4f}')

Logistic Regression:
  ROC AUC: 0.8479 ± 0.0148
  F1:      0.5846 ± 0.0295
  Precision: 0.6699
  Recall:    0.5191

Random Forest:
  ROC AUC: 0.8098 ± 0.0118
  F1:      0.5392 ± 0.0178
  Precision: 0.6021
  Recall:    0.4890




Parameters: { "use_label_encoder" } are not used.



Parameters: { "use_label_encoder" } are not used.



Parameters: { "use_label_encoder" } are not used.



Parameters: { "use_label_encoder" } are not used.



Parameters: { "use_label_encoder" } are not used.



Parameters: { "use_label_encoder" } are not used.




XGBoost:
  ROC AUC: 0.8144 ± 0.0141
  F1:      0.5437 ± 0.0294
  Precision: 0.5929
  Recall:    0.5023

Selected: Logistic Regression
ROC AUC (avg 5-fold): 0.8479
F1 Score (avg 5-fold): 0.5846
