Create a NEW ML feature table

In [0]:
from pyspark.sql.functions import when, col

# Source table for the ML feature set.
silver_df = spark.table("workspace.default.silver_telco_churn")

# Re-encode SeniorCitizen as a 0/1 integer flag.
# NOTE(review): presumes the silver table stores SeniorCitizen as "Yes"/"No"
# strings -- confirm. Every other value (including null) becomes 0.
senior_flag = when(col("SeniorCitizen") == "Yes", 1).otherwise(0)

# Columns carried forward: model inputs plus the raw Churn target.
feature_columns = [
    "tenure",
    "MonthlyCharges",
    "TotalCharges",
    "SeniorCitizen",
    "Contract",
    "InternetService",
    "PaymentMethod",
    "gender",
    "Churn",
]

ml_df = silver_df.withColumn("SeniorCitizen", senior_flag).select(*feature_columns)


In [0]:
# Replace nulls in the numeric inputs with zero so no null reaches the
# feature assembly step. The operation is idempotent, so it only needs to
# run once (this cell previously appeared twice, back to back).
ml_df = ml_df.fillna({
    "TotalCharges": 0.0,
    "MonthlyCharges": 0.0,
    "tenure": 0,
    "SeniorCitizen": 0
})


In [0]:
# Binary training target: 1 when Churn == "Yes", 0 for every other value
# (a null Churn also falls through to 0). Re-running this is idempotent, so a
# single cell is enough (this cell previously appeared twice, back to back).
ml_df = ml_df.withColumn(
    "label",
    when(col("Churn") == "Yes", 1).otherwise(0)
)


In [0]:
# Columns that are indexed and one-hot encoded before being assembled into features.
categorical_cols = ["Contract", "InternetService", "PaymentMethod", "gender"]


In [0]:
# Columns passed to the feature assembler as-is, without encoding.
numerical_cols = ["tenure", "MonthlyCharges", "TotalCharges", "SeniorCitizen"]


Build encoding pipeline

In [0]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

# Derived column names: <col>_idx after indexing, <col>_ohe after one-hot encoding.
indexed_cols = [f"{name}_idx" for name in categorical_cols]
encoded_cols = [f"{name}_ohe" for name in categorical_cols]

# One StringIndexer per categorical column; handleInvalid="keep" is set so
# values not seen during fitting do not raise at transform time.
indexers = [
    StringIndexer(inputCol=raw_col, outputCol=idx_col, handleInvalid="keep")
    for raw_col, idx_col in zip(categorical_cols, indexed_cols)
]

# One OneHotEncoder per indexed column.
encoders = [
    OneHotEncoder(inputCol=idx_col, outputCol=ohe_col)
    for idx_col, ohe_col in zip(indexed_cols, encoded_cols)
]

# Single "features" vector: encoded categoricals first, then the numeric columns.
assembler = VectorAssembler(
    inputCols=encoded_cols + numerical_cols,
    outputCol="features",
)


In [0]:
# 80/20 train/test split; the seed is fixed at 42.
split_weights = [0.8, 0.2]
train_df, test_df = ml_df.randomSplit(split_weights, seed=42)


Train Logistic Regression (lightweight)

In [0]:
from pyspark.ml.classification import LogisticRegression

# Logistic-regression classifier reading the assembled "features" vector and
# the 0/1 "label" column, capped at 20 iterations.
lr = LogisticRegression(maxIter=20, labelCol="label", featuresCol="features")

# Stage order: index categoricals -> one-hot encode -> assemble -> classify.
preprocessing_stages = [*indexers, *encoders, assembler]
pipeline = Pipeline(stages=[*preprocessing_stages, lr])


In [0]:
# Fit every pipeline stage (encoders and classifier) on the training split only.
model = pipeline.fit(dataset=train_df)



In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the held-out split with the fitted pipeline.
predictions = model.transform(test_df)

# Area under the ROC curve, computed against the 0/1 "label" column.
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC", labelCol="label")
auc = evaluator.evaluate(predictions)

print("AUC:", auc)


AUC: 0.8378918198691404


An AUC ≥ 0.70 is perfectly acceptable for a churn model.

In [0]:
# Row counts per (actual label, predicted label) pair, i.e. the confusion matrix.
confusion_counts = predictions.groupBy("label", "prediction").count()
confusion_counts.show()


+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    0|       1.0|  110|
|    1|       0.0|  177|
|    1|       1.0|  195|
|    0|       0.0|  927|
+-----+----------+-----+



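This shows how many churners were correctly identified versus missed, which is more meaningful than accuracy alone. Here, 195 churners were caught and 177 were missed (recall ≈ 52%), while 110 non-churners were incorrectly flagged.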

In [0]:
# Peek at the first five class-probability vectors next to the true label.
scored_sample = predictions.select("probability", "label")
scored_sample.show(5, truncate=False)


+----------------------------------------+-----+
|probability                             |label|
+----------------------------------------+-----+
|[0.8569346242089596,0.1430653757910404] |0    |
|[0.8808792024763595,0.11912079752364046]|0    |
|[0.7865607803792499,0.21343921962075008]|1    |
|[0.6846293434973623,0.31537065650263774]|0    |
|[0.7869204806132195,0.2130795193867805] |0    |
+----------------------------------------+-----+
only showing top 5 rows


The probability output allows the business to choose a different risk threshold than the default 0.5 cut-off, depending on the retention budget.