In [12]:
# Name of the target column the classifier is trained to predict.
label_col = "churn"

# Input columns that are assembled into the model's feature vector.
# The order matters: the fitted coefficients are mapped back to these names
# by position when the coefficient tables are built.
feature_cols = [
    "total_order_count",
    "days_since_last_order",
    "customer_tenure_days",
    "order_count_last_90d",
    "order_interval_mean",
    "total_spent",
    "avg_order_value",
    "spend_std",
    "avg_items_per_order",
    "total_spent_last_90d",
]

StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 14, Finished, Available, Finished)

In [13]:
# Load the table "gold_churn_model_dataset" as a Spark DataFrame.
# NOTE(review): `spark` is not created in the visible cells; presumably it is
# the session object injected by the notebook runtime - confirm.
model_df = spark.table("gold_churn_model_dataset")

StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 15, Finished, Available, Finished)

In [14]:
# Replace nulls with 0 so the feature assembler does not receive missing
# values.
# NOTE(review): no subset is given, so the fill is not limited to
# feature_cols; a null label (`churn`) would also become 0. Confirm that this
# is intended.
model_df_clean = model_df.fillna(value=0)

StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 16, Finished, Available, Finished)

In [15]:
from pyspark.ml.feature import VectorAssembler

# Pack the columns listed in feature_cols into one vector column named
# "features", which is the column the classifier is configured to read.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Assemble on the null-filled frame, then keep only the feature vector and the
# label. Every other column (including customer_unique_id) is dropped here, so
# rows of model_input can no longer be identified by customer.
assembled_df = assembler.transform(model_df_clean)
model_input = assembled_df.select("features", label_col)

StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 17, Finished, Available, Finished)

In [16]:
# 70/30 random split into training and hold-out sets; the fixed seed keeps the
# split repeatable between runs.
train_df, test_df = model_input.randomSplit(weights=[0.7, 0.3], seed=42)

StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 18, Finished, Available, Finished)

In [17]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression over the assembled "features" vector.
# The label column is taken from label_col (the same variable the assembler
# cell uses when selecting the label) instead of repeating the literal, so the
# two cannot drift apart.
lr = LogisticRegression(
    featuresCol="features",
    labelCol=label_col,
    maxIter=20  # upper bound on optimizer iterations
)

# Fit on the training split only; test_df stays unseen for evaluation.
lr_model = lr.fit(train_df)

StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 19, Finished, Available, Finished)

In [18]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the hold-out split; transform() appends rawPrediction, probability and
# prediction columns.
preds = lr_model.transform(test_df)

# Area under the ROC curve against the true label. The label column comes from
# label_col so it stays consistent with the column selected for training.
evaluator = BinaryClassificationEvaluator(
    labelCol=label_col,
    metricName="areaUnderROC"
)

auc = evaluator.evaluate(preds)
# NOTE(review): the recorded run printed an AUC of ~0.99997 and the model puts
# by far its largest weight on order_count_last_90d. A near-perfect score like
# this usually means a feature encodes the label - check how `churn` is derived
# before trusting this number.
print("AUC:", auc)

StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 20, Finished, Available, Finished)

AUC: 0.9999738371108032


In [19]:
import pandas as pd

# One fitted weight per entry of feature_cols, in the same order as the
# assembler's inputCols.
coef = lr_model.coefficients.toArray()

# Pair every feature name with its coefficient, then order the table from the
# most positive to the most negative coefficient.
importance_df = pd.DataFrame({"feature": feature_cols, "coefficient": coef})
importance_df = importance_df.sort_values(by="coefficient", ascending=False)

importance_df

StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 21, Finished, Available, Finished)

Unnamed: 0,feature,coefficient
8,avg_items_per_order,0.160338
2,customer_tenure_days,0.061036
1,days_since_last_order,0.035924
5,total_spent,0.000937
6,avg_order_value,-0.000573
7,spend_std,-0.003457
9,total_spent_last_90d,-0.017674
4,order_interval_mean,-0.059957
0,total_order_count,-2.78606
3,order_count_last_90d,-26.661146


In [20]:
# Alias for feature_cols (same list object, not a copy); used as the "feature"
# column when the coefficient table is built.
feature_names = feature_cols


StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 22, Finished, Available, Finished)

In [21]:
import pandas as pd

# Coefficient table ranked by absolute size, largest first.
# NOTE(review): the features were not rescaled before assembly and the
# coefficients are reported per unit of each feature, so magnitudes of features
# measured in different units are not directly comparable - treat this ranking
# with care.
coef_df = (
    pd.DataFrame({
        "feature": feature_names,
        "coefficient": lr_model.coefficients.toArray(),
    })
    .assign(abs_coef=lambda frame: frame["coefficient"].abs())
    .sort_values(by="abs_coef", ascending=False)
)
coef_df


StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 23, Finished, Available, Finished)

Unnamed: 0,feature,coefficient,abs_coef
3,order_count_last_90d,-26.661146,26.661146
0,total_order_count,-2.78606,2.78606
8,avg_items_per_order,0.160338,0.160338
2,customer_tenure_days,0.061036,0.061036
4,order_interval_mean,-0.059957,0.059957
1,days_since_last_order,0.035924,0.035924
9,total_spent_last_90d,-0.017674,0.017674
7,spend_std,-0.003457,0.003457
5,total_spent,0.000937,0.000937
6,avg_order_value,-0.000573,0.000573


Prepare the scored data for the dashboard

In [23]:
# Score every row of model_input (training and test rows alike) with the
# fitted model. model_input only holds the feature vector and the label, so
# the result carries no customer identifier.
scored_df = lr_model.transform(model_input)


StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 25, Finished, Available, Finished)

In [24]:
# Inspect the columns added by the model (rawPrediction, probability,
# prediction).
scored_df.printSchema()

StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 26, Finished, Available, Finished)

root
 |-- features: vector (nullable = true)
 |-- churn: integer (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [25]:
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import col, when

# The probability column is an ML vector, which cannot be indexed like an
# array column. Convert it to an array so a single class probability can be
# pulled out in the next step. (`when` is imported here for the segmentation
# cell further down.)
scored_df = scored_df.withColumn(
    "prob_array",
    vector_to_array(col("probability"))
)


StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 27, Finished, Available, Finished)

In [26]:
# Element 1 of the probability array is stored as the churn probability.
# NOTE(review): this assumes index 1 corresponds to label 1 (churned), which is
# how Spark orders class probabilities for 0/1 labels - confirm.
positive_class_prob = col("prob_array")[1]
scored_df = scored_df.withColumn("churn_probability", positive_class_prob)


StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 28, Finished, Available, Finished)

In [27]:
# Bucket the churn probability into three labels. Conditions are evaluated in
# order, so ">= 0.4" only applies to rows that failed ">= 0.8"; everything
# else falls through to "Low Risk".
churn_prob = col("churn_probability")
segment_expr = (
    when(churn_prob >= 0.8, "High Risk")
    .when(churn_prob >= 0.4, "Medium Risk")
    .otherwise("Low Risk")
)
scored_df = scored_df.withColumn("churn_segment", segment_expr)

StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 29, Finished, Available, Finished)

In [28]:
# prob_array was only a helper for extracting churn_probability; remove it.
# Note: once it is dropped, the cell that reads prob_array cannot be re-run
# without first re-running the cell that creates it.
scored_df = scored_df.drop("prob_array")

StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 30, Finished, Available, Finished)

In [29]:
# Confirm that churn_probability and churn_segment were added and that the
# helper column prob_array is gone.
scored_df.printSchema()

StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 31, Finished, Available, Finished)

root
 |-- features: vector (nullable = true)
 |-- churn: integer (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)
 |-- churn_probability: double (nullable = true)
 |-- churn_segment: string (nullable = false)



In [31]:
from pyspark.sql import functions as F  # kept in scope for other cells

# Build the customer-level scored table by scoring every customer and joining
# the predictions back on the customer key.
#
# Row ids from F.monotonically_increasing_id() must not be used as the join
# key here: they are unique within one DataFrame but not consecutive, and they
# depend on partitioning, so two different DataFrames do not get matching ids
# for the same customer. `preds` also holds only the 30% test split, so a join
# against the full model_df would pair customers with other customers'
# predictions.
#
# model_input dropped customer_unique_id, so the key is re-attached by running
# the already-fitted assembler and model over model_df_clean, which still
# carries every source column.
# NOTE(review): assumes customer_unique_id is unique and non-null in
# gold_churn_model_dataset; duplicate keys would multiply rows and null keys
# are dropped by the inner join - confirm.
scored_by_customer = lr_model.transform(
    assembler.transform(model_df_clean)
).select(
    "customer_unique_id",
    "features",
    label_col,
    "rawPrediction",
    "probability",
    "prediction",
)

# Joining on the column name keeps a single customer_unique_id column. The
# resulting layout is the one the following cells expect: the model_df columns,
# then features, a second churn column, rawPrediction, probability, prediction.
final_scored_df = model_df.join(
    scored_by_customer,
    on="customer_unique_id",
    how="inner"
)


StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 33, Finished, Available, Finished)

In [32]:
# Render the joined table in the notebook's DataFrame widget for a visual
# check.
display(final_scored_df)

StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 34, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, bb81bbdf-e3ad-4abd-bf4d-ee7e97c595ae)

In [33]:
# List the column names after the join; `churn` appears twice because both
# sides of the join carry it.
final_scored_df.columns

StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 35, Finished, Available, Finished)

['customer_unique_id',
 'last_order_date',
 'first_order_date',
 'total_order_count',
 'days_since_last_order',
 'customer_tenure_days',
 'order_count_last_90d',
 'order_interval_mean',
 'total_spent',
 'avg_order_value',
 'spend_std',
 'avg_items_per_order',
 'total_spent_last_90d',
 'has_complaint',
 'has_delivery_delay',
 'churn',
 'features',
 'churn',
 'rawPrediction',
 'probability',
 'prediction']

In [34]:

# Make the column names unique: the first occurrence of a name is kept as is,
# each later occurrence gets a numeric suffix (name_1, name_2, ...).
new_cols = []
counts = {}

for c in final_scored_df.columns:
    # counts[c] is 0 for the first time a name is seen, 1 for the second, ...
    counts[c] = counts.get(c, -1) + 1
    new_cols.append(c if counts[c] == 0 else f"{c}_{counts[c]}")

# toDF renames columns by position, so it works even though the current names
# are ambiguous.
final_scored_df = final_scored_df.toDF(*new_cols)


StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 36, Finished, Available, Finished)

In [36]:
# Print the schema of the joined table.
# NOTE(review): this cell has execution count 36 while the cell that drops
# churn_1 has count 35, so the recorded output already lacks churn_1. The
# cells are out of run order; a top-to-bottom run would still show churn_1
# here.
final_scored_df.printSchema()

StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 38, Finished, Available, Finished)

root
 |-- customer_unique_id: string (nullable = true)
 |-- last_order_date: timestamp (nullable = true)
 |-- first_order_date: timestamp (nullable = true)
 |-- total_order_count: long (nullable = true)
 |-- days_since_last_order: integer (nullable = true)
 |-- customer_tenure_days: integer (nullable = true)
 |-- order_count_last_90d: long (nullable = true)
 |-- order_interval_mean: double (nullable = true)
 |-- total_spent: double (nullable = true)
 |-- avg_order_value: double (nullable = true)
 |-- spend_std: double (nullable = true)
 |-- avg_items_per_order: double (nullable = true)
 |-- total_spent_last_90d: double (nullable = true)
 |-- has_complaint: integer (nullable = true)
 |-- has_delivery_delay: integer (nullable = true)
 |-- churn: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [35]:
# Remove the second copy of the label, which the de-duplication step renamed
# to churn_1; the original `churn` column is kept.
final_scored_df = final_scored_df.drop("churn_1")

StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 37, Finished, Available, Finished)

In [37]:
# Drop the vector-typed model columns; only the hard `prediction` column is
# kept from the model output.
# NOTE(review): churn_probability and churn_segment were built on scored_df
# but are not part of this table - confirm whether the dashboard needs them.
final_scored_df = final_scored_df.drop("features","rawPrediction","probability")

StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 39, Finished, Available, Finished)

In [39]:
# Final check of the schema that is about to be written out.
final_scored_df.printSchema()

StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 41, Finished, Available, Finished)

root
 |-- customer_unique_id: string (nullable = true)
 |-- last_order_date: timestamp (nullable = true)
 |-- first_order_date: timestamp (nullable = true)
 |-- total_order_count: long (nullable = true)
 |-- days_since_last_order: integer (nullable = true)
 |-- customer_tenure_days: integer (nullable = true)
 |-- order_count_last_90d: long (nullable = true)
 |-- order_interval_mean: double (nullable = true)
 |-- total_spent: double (nullable = true)
 |-- avg_order_value: double (nullable = true)
 |-- spend_std: double (nullable = true)
 |-- avg_items_per_order: double (nullable = true)
 |-- total_spent_last_90d: double (nullable = true)
 |-- has_complaint: integer (nullable = true)
 |-- has_delivery_delay: integer (nullable = true)
 |-- churn: integer (nullable = true)
 |-- prediction: double (nullable = false)



In [40]:
# Persist the scored customer table as a Delta table, replacing any existing
# contents of gold_churn_customers.
(
    final_scored_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("gold_churn_customers")
)

StatementMeta(, 329af25c-025c-46f0-9b95-a4ee14499a14, 42, Finished, Available, Finished)