# Baseline Model: Logistic Regression for Customer Churn Prediction

This notebook establishes a baseline model using Logistic Regression for predicting customer churn.

## Objectives
- Define success criteria for the baseline model
- Train Logistic Regression with default parameters
- Log hyperparameters and metrics to MLflow
- Evaluate on validation set
- Generate confusion matrix and ROC curve

## Success Criteria
The baseline model should achieve:
- **AUC-ROC ≥ 0.60** on validation set (minimum viable baseline)
- This establishes the performance floor that improved models must beat
- Target for final model: **AUC-ROC ≥ 0.75**

In [None]:
%pip install -e .. --quiet

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    roc_curve,
    roc_auc_score
)

# Import our custom modules
from src.data_loader import create_sample_data
from src.preprocessing import ChurnPreprocessor, create_train_val_test_split
from src.experiment import (
    setup_experiment,
    start_run,
    log_params,
    log_classification_metrics,
    log_dataset_info,
    log_model,
    log_figure
)

# Display settings
%matplotlib inline
plt.style.use('seaborn-v0_8-whitegrid')
pd.set_option('display.max_columns', None)

print("Libraries loaded successfully.")

## 1. Load and Prepare Data

In [None]:
# Build the sample churn dataset: 1000 rows, seed 42, requested churn rate 0.2.
df = create_sample_data(
    n_samples=1000,
    random_state=42,
    churn_rate=0.2,
)

# Report the frame's dimensions, then the share of each churn class
# (proportions rounded to three decimals).
print(f"Dataset shape: {df.shape}")
print("\nChurn distribution:")
class_share = df['churn'].value_counts(normalize=True).round(3)
print(class_share)

In [None]:
# Partition the data 70% / 15% / 15% into train, validation and test frames,
# using 'churn' as the target column and a fixed seed.
split_options = {
    'target_column': 'churn',
    'train_size': 0.7,
    'val_size': 0.15,
    'test_size': 0.15,
    'random_state': 42,
}
train_df, val_df, test_df = create_train_val_test_split(df, **split_options)

# Row counts per split; the test frame is not used again in this notebook.
print(f"Train set: {len(train_df)} samples")
print(f"Validation set: {len(val_df)} samples")
print(f"Test set: {len(test_df)} samples (held out for final evaluation)")

In [None]:
# Fit the preprocessor on the training frame only, so nothing about the
# validation rows is seen during fitting.
preprocessor = ChurnPreprocessor()
preprocessor.fit(train_df)

# Transform the train and validation frames with the fitted preprocessor.
# test_df is deliberately left untransformed here (held out).
(X_train, y_train), (X_val, y_val) = (
    preprocessor.transform(train_df),
    preprocessor.transform(val_df),
)

# Sanity-check the resulting shapes and the feature names.
print(f"Training features shape: {X_train.shape}")
print(f"Validation features shape: {X_val.shape}")
print(f"\nFeature names: {preprocessor.feature_names}")

## 2. Set Up MLflow Experiment

In [None]:
# Configure the MLflow experiment used by this notebook.
# The name is reused later in the baseline summary printout.
experiment_name: str = "churn_prediction_baseline"
# setup_experiment is a project helper; its return value is kept as the
# experiment ID (presumably it creates or selects the experiment — verify
# against src.experiment).
experiment_id = setup_experiment(experiment_name)

# Echo the configuration so the run can be located in the MLflow UI.
print(f"MLflow Experiment: {experiment_name}")
print(f"Experiment ID: {experiment_id}")

## 3. Train Baseline Logistic Regression Model

In [None]:
# Train and evaluate the baseline inside a single MLflow run.
with start_run(
    run_name="logistic_regression_baseline",
    description="Baseline Logistic Regression with default parameters",
) as run:

    # Record the two frames this run works with.
    log_dataset_info(
        train_df,
        name="train",
        description="Training data for baseline model",
    )
    log_dataset_info(
        val_df,
        name="validation",
        description="Validation data for baseline model",
    )

    # Baseline estimator: library defaults apart from a fixed seed and a
    # raised iteration cap (to help the solver converge).
    model = LogisticRegression(random_state=42, max_iter=1000)

    # Log a fixed model-type tag plus the settings read back from the
    # estimator instance, in this order.
    logged_attrs = ("penalty", "C", "solver", "max_iter", "random_state")
    model_params = {"model_type": "LogisticRegression"}
    model_params.update((attr, getattr(model, attr)) for attr in logged_attrs)
    log_params(model_params)

    # Fit on the training features.
    print("Training Logistic Regression model...")
    model.fit(X_train, y_train)
    print("Training complete.")

    # Hard labels and column-1 probabilities for the training split
    # (column 1 is presumably the churn class — confirm label encoding).
    y_train_pred = model.predict(X_train)
    y_train_prob = model.predict_proba(X_train)[:, 1]

    # Same two outputs for the validation split; these are reused by the
    # confusion-matrix and ROC cells.
    y_val_pred = model.predict(X_val)
    y_val_prob = model.predict_proba(X_val)[:, 1]

    # Log and print the training metrics (keys carry the "train_" prefix).
    print("\n=== Training Set Performance ===")
    train_metrics = log_classification_metrics(
        y_train, y_train_pred, y_train_prob, prefix="train_"
    )
    for metric, value in train_metrics.items():
        print(f"{metric}: {value:.4f}")

    # Log and print the validation metrics (keys carry the "val_" prefix).
    print("\n=== Validation Set Performance ===")
    val_metrics = log_classification_metrics(
        y_val, y_val_pred, y_val_prob, prefix="val_"
    )
    for metric, value in val_metrics.items():
        print(f"{metric}: {value:.4f}")

    # Keep the run ID; the summary cell prints it after the run has closed.
    run_id = run.info.run_id
    print(f"\nMLflow Run ID: {run_id}")

    # Persist the fitted estimator, with the first five training rows as the
    # input example.
    log_model(model, artifact_path="model", input_example=X_train[:5])
    print("Model logged to MLflow.")

## 4. Generate Confusion Matrix

In [None]:
# Confusion matrix of validation labels vs. the model's hard predictions.
cm = confusion_matrix(y_val, y_val_pred)

# Draw it as an annotated heatmap; both axes use the same class tick labels.
class_ticks = ['No Churn (0)', 'Churn (1)']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',  # integer counts in each cell
    cmap='Blues',
    xticklabels=class_ticks,
    yticklabels=class_ticks,
    ax=ax,
)

ax.set_xlabel('Predicted Label', fontsize=12)
ax.set_ylabel('True Label', fontsize=12)
ax.set_title(
    'Confusion Matrix - Baseline Logistic Regression (Validation Set)',
    fontsize=14,
)

plt.tight_layout()
plt.show()

# Per-class precision / recall / F1 for the same predictions.
print("\nClassification Report (Validation Set):")
val_report = classification_report(
    y_val,
    y_val_pred,
    target_names=['No Churn', 'Churn'],
)
print(val_report)

## 5. Generate ROC Curve

In [None]:
# Compute the ROC curve points and the area under the curve from the
# validation labels and the predicted column-1 probabilities.
fpr, tpr, thresholds = roc_curve(y_val, y_val_prob)
roc_auc = roc_auc_score(y_val, y_val_prob)

# Create ROC curve visualization
fig, ax = plt.subplots(figsize=(8, 6))

ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.3f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random classifier')

# Horizontal guide line at TPR = 0.75.
# NOTE(review): the notebook's stated 0.75 target is for AUC-ROC, not TPR —
# confirm this guide line and its label are what is intended.
ax.axhline(y=0.75, color='green', linestyle=':', alpha=0.5, label='Target TPR')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('ROC Curve - Baseline Logistic Regression (Validation Set)', fontsize=14)

# Build the legend only after every labelled artist has been added. It was
# previously created before the guide line, so 'Target TPR' was missing from it.
ax.legend(loc='lower right')

plt.tight_layout()
plt.show()

print(f"\nROC-AUC Score: {roc_auc:.4f}")

## 6. Baseline Model Summary

In [None]:
# Print a summary of the baseline run and compare its validation AUC-ROC
# against the baseline and final-target thresholds.
banner = "=" * 60
print(banner)
print("BASELINE MODEL SUMMARY")
print(banner)
print("\nModel: Logistic Regression (default parameters)")
print(f"Experiment: {experiment_name}")
print(f"Run ID: {run_id}")

# Headline validation metrics, read from the dict returned when they were logged.
print("\n--- Validation Set Metrics ---")
print(f"Accuracy:  {val_metrics['val_accuracy']:.4f}")
print(f"Precision: {val_metrics['val_precision']:.4f}")
print(f"Recall:    {val_metrics['val_recall']:.4f}")
print(f"F1 Score:  {val_metrics['val_f1_score']:.4f}")
print(f"ROC-AUC:   {val_metrics['val_roc_auc']:.4f}")

print("\n--- Success Criteria Check ---")
baseline_threshold = 0.60
target_threshold = 0.75
val_auc = val_metrics['val_roc_auc']

# Minimum bar: the baseline passes when its AUC-ROC is at least the floor.
print(
    f"[PASS] Baseline achieved AUC-ROC >= {baseline_threshold}"
    if val_auc >= baseline_threshold
    else f"[FAIL] Baseline did not achieve AUC-ROC >= {baseline_threshold}"
)

# Final target: either already met, or report the remaining gap.
if val_auc >= target_threshold:
    print(f"[PASS] Already meets final target AUC-ROC >= {target_threshold}!")
else:
    improvement_needed = target_threshold - val_auc
    print(f"[INFO] Need {improvement_needed:.4f} improvement to reach target {target_threshold}")

print("\n" + banner)

## 7. Feature Importance (Coefficients)

In [None]:
# Pair each feature name with its fitted coefficient (first row of
# model.coef_), then order the rows by absolute coefficient, largest first.
coef_table = pd.DataFrame({
    'feature': preprocessor.feature_names,
    'coefficient': model.coef_[0],
})
coefficients = coef_table.sort_values('coefficient', key=abs, ascending=False)

print("Feature Coefficients (sorted by absolute value):")
print(coefficients.to_string(index=False))

# Horizontal bar chart of the coefficients: strictly positive values are
# drawn red, everything else blue.
fig, ax = plt.subplots(figsize=(10, 6))

colors = np.where(coefficients['coefficient'] > 0, 'red', 'blue').tolist()
ax.barh(
    coefficients['feature'],
    coefficients['coefficient'],
    color=colors,
    alpha=0.7,
)

ax.set_xlabel('Coefficient Value', fontsize=12)
ax.set_ylabel('Feature', fontsize=12)
ax.set_title(
    'Logistic Regression Coefficients\n(Red = increases churn probability, Blue = decreases)',
    fontsize=14,
)
# Thin vertical rule at zero to separate positive from negative coefficients.
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)

plt.tight_layout()
plt.show()

## Next Steps

This baseline establishes the performance floor for our churn prediction model:

1. **Task 2.3**: Analyze baseline results in detail
2. **Phase 3**: Train Random Forest and Gradient Boosting classifiers
3. Compare all models and select the best performer
4. Evaluate final model on held-out test set

The baseline Logistic Regression model provides interpretable coefficients and a benchmark AUC-ROC score to beat with more complex models.