### Prepare environment

In [0]:
%run ../environment/prepare_environment

# Logistic Regression — Telco Churn Classification

This notebook will cover:
- Loading features for a churn prediction problem
- Training and tuning a logistic regression model using Spark ML
- Tracking experiments and results with MLflow
- Visualizing model performance with ROC curves and confusion matrices

**Why logistic regression?**
- It's a gold standard for interpretable binary classification
- Fast, robust, and a great baseline for many problems
- Easy to explain to business stakeholders

In [0]:
import os
import mlflow
import logging
import numpy as np
import pandas as pd
import mlflow.spark
import matplotlib.pyplot as plt
from pyspark.ml import Pipeline
from mlflow.models import infer_signature
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from sklearn.metrics import roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay

# Set up logging: configure the root logger at INFO level and create the named
# logger that the helper functions in this notebook use for progress messages
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("telco-churn-pipeline")

## 1. Load Feature Table

We use features engineered in the previous workshop. If the feature table is missing, we automatically re-run the data preparation and feature engineering notebooks that create it. This ensures reproducibility and a clean ML pipeline.

In [0]:
# Data loading and preparation as a function for reusability

def load_feature_table():
    """Return the Telco churn feature table, rebuilding it if it cannot be read.

    The table is first read directly with ``spark.table``. If that read raises,
    the failure is logged, the upstream data-preparation and feature-engineering
    notebooks are re-run through the IPython ``%run`` magic, and the read is
    attempted once more.

    Returns:
        The Spark DataFrame returned by ``spark.table`` for the feature table.

    Raises:
        RuntimeError: If the table still cannot be read after the upstream
            notebooks have been re-run. The underlying Spark error is attached
            as ``__cause__``.
    """
    feature_table_name = "ai_ml_in_practice.telco_customer_churn_silver.telco_silver_vectors"

    try:
        df = spark.table(feature_table_name)
    except Exception as exc:
        # Deliberate best-effort fallback, but keep the original reason visible:
        # a permissions or connectivity problem looks the same as a missing
        # table if the exception is swallowed silently.
        # NOTE(review): on Spark Connect the lookup may be resolved lazily, in
        # which case a missing table would not raise here - confirm for the
        # target cluster type.
        logger.warning(f"Could not load feature table {feature_table_name}: {exc!r}")
    else:
        logger.info(f"Loaded feature table: {feature_table_name}")
        return df

    logger.warning("Feature table not found. Re-running feature engineering notebooks to create features.")
    get_ipython().run_line_magic('run', '../2_data_preparations/2.1_telco_raw_to_bronze.ipynb')
    get_ipython().run_line_magic('run', '../2_data_preparations/2.1_telco_bronze_to_silver.ipynb')
    get_ipython().run_line_magic('run', '../3_feature_engineering/3.1_telco_feature_table.ipynb')

    try:
        return spark.table(feature_table_name)
    except Exception as exc:
        # spark.table never returns None, so the failure has to be detected
        # through the exception rather than through a None check.
        raise RuntimeError("Feature dataframe could not be found or created.") from exc

# Load the feature table once; the following cells validate and split `df`
df = load_feature_table()

## 2. Data Quality Checks & Preparation

Before modeling, always check that your label and features are present and correctly typed. This is a key standard for robust ML pipelines.

In [0]:
# Data sanity checks

from pyspark.sql.functions import col
# Fail fast unless both the label column and the feature-vector column exist
if not {'churn', 'customer_features'}.issubset(df.columns):
    raise ValueError("Required columns 'churn' and 'customer_features' not found in feature table.")

# Cast the label to double so the classifier trained later in this notebook
# (labelCol='churn') receives a numeric label column
df = df.withColumn(
    'churn',
    col('churn').cast('double'),
)

# Counting triggers a Spark job; the total is logged as a quick sanity check
logger.info(f"Total rows: {df.count()}")

## 3. Train/Test Split

We split the data into training and test sets (80/20). This is standard practice: evaluating on held-out data shows how well the model generalizes and makes overfitting visible.

In [0]:
# Train/test split as a function

def split_data(df, train_ratio=0.8, seed=42):
    """Randomly split a DataFrame into a training part and a test part.

    Args:
        df: Spark DataFrame to split.
        train_ratio: Fraction of rows intended for the training part; must be
            strictly between 0 and 1. The test part gets ``1 - train_ratio``.
        seed: Seed passed to ``randomSplit``.

    Returns:
        Tuple ``(train_df, test_df)``.

    Raises:
        ValueError: If ``train_ratio`` is not strictly between 0 and 1.
    """
    # Reject ratios that would give an empty split (0 or 1) or a negative
    # weight, instead of failing later with a less obvious error.
    if not 0 < train_ratio < 1:
        raise ValueError(f"train_ratio must be strictly between 0 and 1, got {train_ratio!r}")

    train_df, test_df = df.randomSplit([train_ratio, 1 - train_ratio], seed=seed)
    # Each count() is a Spark action; the sizes are logged as a sanity check
    logger.info(f"Train: {train_df.count()}, Test: {test_df.count()}")
    return train_df, test_df

# Split with the function defaults: 80% train / 20% test, seed 42
train_df, test_df = split_data(df)

## 4. MLflow Setup and Model Training

We use MLflow to track all experiments, parameters, and results. Logistic regression is trained with cross-validation and hyperparameter tuning. All metrics and artifacts are logged for full reproducibility.

In [0]:
# Model training, tuning, and MLflow logging

def _feature_names_from_metadata(metadata, n_features):
    """Return one display name per feature-vector slot, ordered by slot index.

    Spark ML stores vector attributes grouped by type ("numeric", "binary",
    "nominal"), each entry carrying its own ``idx``. Names are placed by that
    index so they line up with the coefficient array; slots without a usable
    attribute keep a generic ``feature_<i>`` name.
    """
    names = [f"feature_{slot}" for slot in range(n_features)]
    attrs = metadata.get("ml_attr", {}).get("attrs", {})
    for attr_type in ("numeric", "binary", "nominal"):
        for attr in attrs.get(attr_type, []):
            slot = attr.get("idx")
            if isinstance(slot, int) and 0 <= slot < n_features and "name" in attr:
                names[slot] = attr["name"]
    return names


def _log_figure_and_close(fig, artifact_file):
    """Log a matplotlib figure to the active MLflow run, then always close it."""
    try:
        mlflow.log_figure(fig, artifact_file)
    finally:
        plt.close(fig)


def train_and_log(train_df, test_df, experiment_name="telco_logistic_regression", seed=42):
    """Tune a logistic regression with cross-validation and log the result to MLflow.

    Inside a single MLflow run named "logistic_regression_cv" this function:
    fits a 3-fold CrossValidator over regParam / elasticNetParam / maxIter
    (selected by area under ROC), scores the best model on ``test_df``, logs
    accuracy / precision / recall / roc_auc, logs ROC-curve, confusion-matrix
    and coefficient plots, and logs + registers the best model.

    Args:
        train_df: Spark DataFrame with a 'customer_features' vector column and
            a numeric 'churn' label column; used for cross-validated fitting.
        test_df: Spark DataFrame with the same columns; used for evaluation.
        experiment_name: NOTE(review): accepted for interface compatibility but
            not passed to MLflow anywhere in this function, so the run is
            created in whichever experiment is currently active.
        seed: Seed for the CrossValidator fold assignment, so repeated runs on
            the same data select folds the same way.

    Returns:
        Tuple ``(best_model, predictions)``: the best model found by the
        cross-validator and its transformed ``test_df``.
    """
    lr = LogisticRegression(featuresCol='customer_features', labelCol='churn')
    param_grid = (ParamGridBuilder()
                  .addGrid(lr.regParam, [0.0, 0.01, 0.1])
                  .addGrid(lr.elasticNetParam, [0.0, 0.5])
                  .addGrid(lr.maxIter, [10, 50])
                  .build())
    evaluator = BinaryClassificationEvaluator(labelCol='churn', rawPredictionCol='rawPrediction', metricName='areaUnderROC')
    multiclass_evaluator = MulticlassClassificationEvaluator(
        labelCol="churn",
        predictionCol="prediction"
    )
    # An explicit seed keeps the fold assignment reproducible between runs
    cv = CrossValidator(estimator=lr, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=3, seed=seed)

    with mlflow.start_run(run_name="logistic_regression_cv"):
        # Train a model
        cv_model = cv.fit(train_df)
        best_model = cv_model.bestModel

        # Evaluate the model using the test set
        predictions = best_model.transform(test_df)
        acc = multiclass_evaluator.evaluate(predictions, {multiclass_evaluator.metricName: "accuracy"})
        # NOTE(review): "precision" and "recall" are the class-weighted variants;
        # weighted recall is mathematically equal to accuracy, so consider also
        # logging positive-class (churn) precision and recall.
        prec = multiclass_evaluator.evaluate(predictions, {multiclass_evaluator.metricName: "weightedPrecision"})
        rec = multiclass_evaluator.evaluate(predictions, {multiclass_evaluator.metricName: "weightedRecall"})
        roc_auc = evaluator.evaluate(predictions)
        mlflow.log_metrics({
            "accuracy": float(acc),
            "precision": float(prec),
            "recall": float(rec),
            "roc_auc": float(roc_auc)
        })
        logger.info(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, ROC_AUC: {roc_auc:.4f}")

        # Collect test-set predictions on the driver for plotting
        pdf = predictions.select('prediction', 'churn', 'probability').toPandas()
        y_true = pdf['churn'].astype(int)
        # Element 1 of the probability vector is the score for class 1
        y_score = pdf['probability'].apply(lambda v: v[1])

        # ROC Curve (AUC recomputed with scikit-learn for the legend only; kept
        # in its own variable so the Spark metric logged above is not shadowed)
        fpr, tpr, _ = roc_curve(y_true, y_score)
        curve_auc = auc(fpr, tpr)
        fig, ax = plt.subplots(figsize=(6, 6))
        ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {curve_auc:.2f})')
        ax.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('Receiver Operating Characteristic (ROC)')
        ax.legend(loc='lower right')
        fig.tight_layout()
        _log_figure_and_close(fig, "roc_curve.png")

        # Confusion matrix
        cm = confusion_matrix(y_true, pdf['prediction'].astype(int))
        fig, ax = plt.subplots(figsize=(4, 4))
        disp = ConfusionMatrixDisplay(confusion_matrix=cm)
        disp.plot(ax=ax, cmap='Blues', colorbar=False)
        ax.set_title('Confusion Matrix')
        fig.tight_layout()
        _log_figure_and_close(fig, "confusion_matrix.png")

        # Feature importance: pair each coefficient with the name stored at the
        # same vector index in the column metadata
        importances = best_model.coefficients.toArray()
        feature_labels = _feature_names_from_metadata(
            train_df.schema['customer_features'].metadata,
            len(importances),
        )
        coef_df = pd.DataFrame({'Feature': feature_labels, 'Coefficient': importances})
        coef_df = coef_df.sort_values('Coefficient', key=abs, ascending=False)
        fig, ax = plt.subplots(figsize=(8, 6))
        coef_df.plot.bar(x='Feature', y='Coefficient', ax=ax, legend=False, color='teal')
        ax.set_title('Feature Importances (Logistic Regression Coefficients)')
        ax.set_ylabel('Coefficient Value')
        fig.tight_layout()
        _log_figure_and_close(fig, "feature_importance.png")

        # Save model to MLflow Model Registry. The output signature is limited
        # to the prediction column rather than the whole predictions frame,
        # which also carries the label, the features and the raw scores.
        signature = infer_signature(
            train_df.select("customer_features"),
            predictions.select("prediction"),
        )

        mlflow.spark.log_model(
            best_model,
            artifact_path='telco_churn_logreg',
            signature=signature,
            registered_model_name='ai_ml_in_practice.telco_customer_churn_silver.telco_churn_logreg_model',
            dfs_tmpdir="/Volumes/ai_ml_in_practice/telco_customer_churn_silver/mlflow_tmp"
        )

        logger.info("Model logged and registered as telco_churn_logreg_model")
        return best_model, predictions

# Run training, evaluation and model registration; keep the best model and its
# test-set predictions for the cells below
best_model, predictions = train_and_log(train_df, test_df)

## 5. Batch Inference and Model Loading

In production, you often need to load a registered model and run batch inference. Here is how to do it with MLflow and the Spark ML library.

In [0]:
# Resolve the newest registered version instead of pinning "/1": every call to
# train_and_log registers a new model version, so a fixed version number would
# keep scoring with the model from the very first run.
registered_model_name = "ai_ml_in_practice.telco_customer_churn_silver.telco_churn_logreg_model"
registered_versions = mlflow.tracking.MlflowClient().search_model_versions(
    f"name='{registered_model_name}'"
)
if not registered_versions:
    raise RuntimeError(f"No registered versions found for model {registered_model_name}")
latest_model_version = max(int(mv.version) for mv in registered_versions)

loaded_model = mlflow.spark.load_model(
    f"models:/{registered_model_name}/{latest_model_version}",
    dfs_tmpdir="/Volumes/ai_ml_in_practice/telco_customer_churn_silver/mlflow_tmp"
)

# Batch inference example
inference_df = loaded_model.transform(test_df)
display(inference_df.select('prediction', 'probability'))