<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/PySpark-Analytics-Hub/blob/main/Customer%20Churn%20Analysis/Churn_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup Enviroment

In [1]:
# !/bin/bash
%%bash
kaggle datasets download halimedogan/churn-dataset
unzip churn-dataset.zip

Dataset URL: https://www.kaggle.com/datasets/halimedogan/churn-dataset
License(s): unknown
Downloading churn-dataset.zip to /content

Archive:  churn-dataset.zip
  inflating: churn2.csv              


  0%|          | 0.00/262k [00:00<?, ?B/s]100%|██████████| 262k/262k [00:00<00:00, 73.3MB/s]


In [2]:
%%bash
# pip install sparkmagic

# Import Libraries

In [32]:
import pandas  as pd
import seaborn as sns
import numpy as np
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import (StringIndexer,
                                VectorAssembler,
                                OneHotEncoder,
                                StandardScaler,
                                Bucketizer)
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.window import Window

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# Setup pandas option
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Setup Spark Session & Import DataFrame

In [4]:
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [6]:
spark_df = spark.read.csv('/content/churn2.csv',
                          header=True,
                          inferSchema=True)

# Data Analysis

In [7]:
spark_df.show(10)

+---------+----------+--------+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|RowNumber|CustomerId| Surname|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+---------+----------+--------+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|        1|  15634602|Hargrave|        619|   France|Female| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|
|        2|  15647311|    Hill|        608|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|
|        3|  15619304|    Onio|        502|   France|Female| 42|     8| 159660.8|            3|        1|             0|      113931.57|     1|
|        4|  15701354|    Boni|        699|   France|Female| 39|     1|      0.0|            2|        0|             0|       93826.63|

In [8]:
# Shape
print("Shape: ",(spark_df.count(), len(spark_df.columns)))

Shape:  (10000, 14)


In [9]:
# Check Schema
spark_df.printSchema()

root
 |-- RowNumber: integer (nullable = true)
 |-- CustomerId: integer (nullable = true)
 |-- Surname: string (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- Balance: double (nullable = true)
 |-- NumOfProducts: integer (nullable = true)
 |-- HasCrCard: integer (nullable = true)
 |-- IsActiveMember: integer (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Exited: integer (nullable = true)



In [None]:
# Check Statistical Summary
spark_df.describe().toPandas().transpose()

In [10]:
# Check exited customer
spark_df.groupby('exited').count().show()

+------+-----+
|exited|count|
+------+-----+
|     1| 2037|
|     0| 7963|
+------+-----+



In [11]:
# Check customers who have credit card
spark_df.groupby('HasCrCard').count().show()

+---------+-----+
|HasCrCard|count|
+---------+-----+
|        1| 7055|
|        0| 2945|
+---------+-----+



In [12]:
# Groupby exited customer with their estimated salayar and whether have credit card or no
spark_df.groupby("Exited").agg({"EstimatedSalary": "mean", 'HasCrCard': "count"}).show()

+------+----------------+--------------------+
|Exited|count(HasCrCard)|avg(EstimatedSalary)|
+------+----------------+--------------------+
|     1|            2037|   101465.6775306824|
|     0|            7963|   99738.39177194514|
+------+----------------+--------------------+



In [13]:
# Check Missing values
spark_df.select([f.count(f.when(f.col(c).isNull(), c)).alias(c) for c in spark_df.columns]).toPandas().T

Unnamed: 0,0
RowNumber,0
CustomerId,0
Surname,0
CreditScore,0
Geography,0
Gender,0
Age,0
Tenure,0
Balance,0
NumOfProducts,0


## Windowing Function

In [14]:
window_spec = Window.partitionBy("Geography").orderBy("CreditScore")

df_ranked = (
    spark_df
    .withColumn("Rank", f.rank().over(window_spec))
    .withColumn("DenseRank", f.dense_rank().over(window_spec))
    .withColumn("RowNumber", f.row_number().over(window_spec))
    )

df_ranked.select("Geography", "CreditScore", "Rank", "DenseRank", "RowNumber").show()

+---------+-----------+----+---------+---------+
|Geography|CreditScore|Rank|DenseRank|RowNumber|
+---------+-----------+----+---------+---------+
|   France|        350|   1|        1|        1|
|   France|        350|   1|        1|        2|
|   France|        350|   1|        1|        3|
|   France|        359|   4|        2|        4|
|   France|        373|   5|        3|        5|
|   France|        376|   6|        4|        6|
|   France|        405|   7|        5|        7|
|   France|        408|   8|        6|        8|
|   France|        408|   8|        6|        9|
|   France|        410|  10|        7|       10|
|   France|        410|  10|        7|       11|
|   France|        411|  12|        8|       12|
|   France|        411|  12|        8|       13|
|   France|        411|  12|        8|       14|
|   France|        411|  12|        8|       15|
|   France|        412|  16|        9|       16|
|   France|        413|  17|       10|       17|
|   France|        4

In [15]:
# Moving Avg
window_spec_avg = Window.partitionBy("geography").rowsBetween(-3, 3)

df_moving_avg = spark_df.withColumn("moving_avg_creditscore", f.avg("creditscore").over(window_spec_avg))

df_moving_avg.select("geography", "creditscore", "moving_avg_creditscore").show()

+---------+-----------+----------------------+
|geography|creditscore|moving_avg_creditscore|
+---------+-----------+----------------------+
|   France|        619|                 660.5|
|   France|        502|                 628.6|
|   France|        699|     637.8333333333334|
|   France|        822|     622.1428571428571|
|   France|        501|     601.7142857142857|
|   France|        684|     608.4285714285714|
|   France|        528|     612.2857142857143|
|   France|        476|     599.4285714285714|
|   France|        549|     623.4285714285714|
|   France|        726|     646.5714285714286|
|   France|        732|     653.5714285714286|
|   France|        669|     667.1428571428571|
|   France|        846|     647.4285714285714|
|   France|        577|     619.8571428571429|
|   France|        571|     583.1428571428571|
|   France|        411|                 609.0|
|   France|        533|     554.5714285714286|
|   France|        475|     551.5714285714286|
|   France|  

In [16]:
# Cumulative Sum of each Geography

window_spec_cumsum = Window.partitionBy("Geography").orderBy("Age").rowsBetween(Window.unboundedPreceding, 0)

df_cumsum = (
    spark_df
    .withColumn("Cumulative_Balance", f.sum("Balance").over(window_spec_cumsum))
)

df_cumsum.select("Geography", "Balance", "Age", "Cumulative_Balance").show()

+---------+---------+---+------------------+
|Geography|  Balance|Age|Cumulative_Balance|
+---------+---------+---+------------------+
|   France|160980.03| 18|         160980.03|
|   France|151762.74| 18|         312742.77|
|   France| 82767.42| 18|         395510.19|
|   France|133550.67| 18|         529060.86|
|   France|      0.0| 18|         529060.86|
|   France|      0.0| 18|         529060.86|
|   France| 176139.5| 18|         705200.36|
|   France| 98894.39| 18|         804094.75|
|   France|      0.0| 18|         804094.75|
|   France|      0.0| 18|         804094.75|
|   France|102983.91| 18|         907078.66|
|   France|128514.84| 19|         1035593.5|
|   France| 97445.49| 19|        1133038.99|
|   France|143390.51| 19|         1276429.5|
|   France|110928.51| 19|        1387358.01|
|   France|      0.0| 19|        1387358.01|
|   France|      0.0| 19|        1387358.01|
|   France|127649.64| 19|        1515007.65|
|   France|      0.0| 19|        1515007.65|
|   France

# Feature Engineering

In [18]:
spark_df.show(1)

+---------+----------+--------+-----------+---------+------+---+------+-------+-------------+---------+--------------+---------------+------+
|RowNumber|CustomerId| Surname|CreditScore|Geography|Gender|Age|Tenure|Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+---------+----------+--------+-----------+---------+------+---+------+-------+-------------+---------+--------------+---------------+------+
|        1|  15634602|Hargrave|        619|   France|Female| 42|     2|    0.0|            1|        1|             1|      101348.88|     1|
+---------+----------+--------+-----------+---------+------+---+------+-------+-------------+---------+--------------+---------------+------+
only showing top 1 row



In [20]:
spark_df = (
    spark_df
    .drop('RowNumber', 'CustomerId', 'Surname')
    .withColumn("CredictScore_Salary", spark_df.CreditScore / spark_df.EstimatedSalary)
    .withColumn("CredictScore_Tenure", spark_df.CreditScore * spark_df.Tenure)
    .withColumn("Balance_Salary", spark_df.Balance / spark_df.EstimatedSalary)
)

spark_df.show(10)

+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+-------------------+------------------+
|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited| CredictScore_Salary|CredictScore_Tenure|    Balance_Salary|
+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+-------------------+------------------+
|        619|   France|Female| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|0.006107615594765329|               1238|               0.0|
|        608|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|0.005402399696186101|                608|0.7446769036217226|
|        502|   France|Female| 42|     8| 159660.8|            3|        1|             0|      113931.57|     1|0.004406153623618

In [22]:
# Bucketization
bucketizer = Bucketizer(splits=[0, 35, 55, 75, 95],
                        inputCol="Age",
                        outputCol="age_cat")
spark_df = bucketizer.setHandleInvalid("keep").transform(spark_df)

spark_df = spark_df.withColumn("age_cat", spark_df.age_cat + 1)
spark_df = spark_df.withColumn("age_cat", spark_df["age_cat"].cast("integer"))
# Show
spark_df.show(10)

+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+-------------------+------------------+-------+
|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited| CredictScore_Salary|CredictScore_Tenure|    Balance_Salary|age_cat|
+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+-------------------+------------------+-------+
|        619|   France|Female| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|0.006107615594765329|               1238|               0.0|      2|
|        608|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|0.005402399696186101|                608|0.7446769036217226|      2|
|        502|   France|Female| 42|     8| 159660.8|            3|        1|             0|

# Features Vectorization

## StringIndexer

In [24]:
# Gender
indexer = StringIndexer(inputCol="Gender",
                        outputCol="gender_label")
indexer.fit(spark_df).transform(spark_df).show(5)

temp_sdf = indexer.fit(spark_df).transform(spark_df)

spark_df = temp_sdf.withColumn("gender_label", temp_sdf["gender_label"].cast("integer"))
spark_df = spark_df.drop('Gender')

+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+-------------------+------------------+-------+------------+
|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited| CredictScore_Salary|CredictScore_Tenure|    Balance_Salary|age_cat|gender_label|
+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+-------------------+------------------+-------+------------+
|        619|   France|Female| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|0.006107615594765329|               1238|               0.0|      2|         1.0|
|        608|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|0.005402399696186101|                608|0.7446769036217226|      2|         1.0|
|        502|   France|Fe

In [26]:
# Geography
indexer = StringIndexer(inputCol="Geography",
                        outputCol="geography_label")
indexer.fit(spark_df).transform(spark_df).show(5)

temp_sdf = indexer.fit(spark_df).transform(spark_df)
spark_df = temp_sdf.withColumn("geography_label", temp_sdf["geography_label"].cast("integer"))
spark_df = spark_df.drop('geography')

+-----------+---------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+-------------------+------------------+-------+------------+---------------+
|CreditScore|Geography|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited| CredictScore_Salary|CredictScore_Tenure|    Balance_Salary|age_cat|gender_label|geography_label|
+-----------+---------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+-------------------+------------------+-------+------------+---------------+
|        619|   France| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|0.006107615594765329|               1238|               0.0|      2|           1|            0.0|
|        608|    Spain| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|0.005402399696186101|                608|0.7446769036217226|      2|          

## Assembler

In [28]:
# Features to include in the model
feature_columns = [
    "CreditScore", "Age", "Tenure", "Balance", "NumOfProducts",
    "HasCrCard", "IsActiveMember", "EstimatedSalary", "CredictScore_Salary",
    "CredictScore_Tenure", "Balance_Salary", "age_cat", "gender_label", "geography_label"
]

# Assemble features into a single vector
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features"
)

# Standardize features using StandardScaler
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
    withMean=True,
    withStd=True
)

## Define & Train a RandomForestClassifier

In [31]:
rf = RandomForestClassifier(
    featuresCol='scaled_features',
    labelCol='Exited',
    numTrees=150,
    maxDepth=10
)

In [33]:
# Create Spak Pipeline
pipeline = Pipeline(stages=[assembler, scaler, rf])

In [34]:
# Split Train/Test
train, test = spark_df.randomSplit([0.8, 0.2], seed=42)

In [35]:
# Train Model
model = pipeline.fit(train)

### Check Predictions & Accuracy of RandomForestClassifier

In [36]:
predictions = model.transform(test)
predictions.select("scaled_features", "Exited", "prediction", "probability").show()

+--------------------+------+----------+--------------------+
|     scaled_features|Exited|prediction|         probability|
+--------------------+------+----------+--------------------+
|[-3.0853319664765...|     1|       0.0|[0.57035684607025...|
|[-3.0030352594573...|     1|       1.0|[0.07096998422675...|
|[-2.9515998175703...|     1|       1.0|[0.16697341841943...|
|[-2.8178676686640...|     1|       1.0|[0.25151541714830...|
|[-2.560690459229,...|     1|       1.0|[0.26307062727352...|
|[-2.4989679289645...|     1|       0.0|[0.61933038339768...|
|[-2.4578195754549...|     0|       0.0|[0.91711773215675...|
|[-2.4372453987001...|     0|       0.0|[0.96406363876309...|
|[-2.4063841335679...|     0|       0.0|[0.68485738858250...|
|[-2.3960970451905...|     0|       0.0|[0.97263497928513...|
|[-2.3858099568131...|     0|       0.0|[0.90300769951635...|
|[-2.3858099568131...|     0|       0.0|[0.96699296502158...|
|[-2.3858099568131...|     0|       0.0|[0.88954926336853...|
|[-2.365

In [38]:
# Evaluate the Model
evaluator = MulticlassClassificationEvaluator(
    labelCol="Exited",
    predictionCol="prediction",
    metricName="accuracy"
)

accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

Accuracy: 0.8667360749609578


## Define & Train XGBClassifier

In [39]:
xgb = GBTClassifier(
    featuresCol='scaled_features',
    labelCol='Exited',
    maxIter=10
)

# Create Spak Pipeline
pipeline = Pipeline(stages=[assembler, scaler, xgb])

# Train Model
model = pipeline.fit(train)

In [40]:
predictions = model.transform(test)
predictions.select("scaled_features", "Exited", "prediction", "probability").show()

+--------------------+------+----------+--------------------+
|     scaled_features|Exited|prediction|         probability|
+--------------------+------+----------+--------------------+
|[-3.0853319664765...|     1|       0.0|[0.57538314011928...|
|[-3.0030352594573...|     1|       1.0|[0.13911971850206...|
|[-2.9515998175703...|     1|       1.0|[0.10591009361155...|
|[-2.8178676686640...|     1|       1.0|[0.20625707781615...|
|[-2.560690459229,...|     1|       1.0|[0.25369876656995...|
|[-2.4989679289645...|     1|       0.0|[0.78870404282574...|
|[-2.4578195754549...|     0|       0.0|[0.91247891460248...|
|[-2.4372453987001...|     0|       0.0|[0.92352165607129...|
|[-2.4063841335679...|     0|       0.0|[0.85092083986898...|
|[-2.3960970451905...|     0|       0.0|[0.92777739930648...|
|[-2.3858099568131...|     0|       0.0|[0.88039466293922...|
|[-2.3858099568131...|     0|       0.0|[0.92579234486487...|
|[-2.3858099568131...|     0|       0.0|[0.90376126055064...|
|[-2.365

### Check Predictions & Accuracy of XGBClassifier

In [41]:
predictions = model.transform(test)
predictions.select("scaled_features", "Exited", "prediction", "probability").show()

+--------------------+------+----------+--------------------+
|     scaled_features|Exited|prediction|         probability|
+--------------------+------+----------+--------------------+
|[-3.0853319664765...|     1|       0.0|[0.57538314011928...|
|[-3.0030352594573...|     1|       1.0|[0.13911971850206...|
|[-2.9515998175703...|     1|       1.0|[0.10591009361155...|
|[-2.8178676686640...|     1|       1.0|[0.20625707781615...|
|[-2.560690459229,...|     1|       1.0|[0.25369876656995...|
|[-2.4989679289645...|     1|       0.0|[0.78870404282574...|
|[-2.4578195754549...|     0|       0.0|[0.91247891460248...|
|[-2.4372453987001...|     0|       0.0|[0.92352165607129...|
|[-2.4063841335679...|     0|       0.0|[0.85092083986898...|
|[-2.3960970451905...|     0|       0.0|[0.92777739930648...|
|[-2.3858099568131...|     0|       0.0|[0.88039466293922...|
|[-2.3858099568131...|     0|       0.0|[0.92579234486487...|
|[-2.3858099568131...|     0|       0.0|[0.90376126055064...|
|[-2.365

In [48]:
# Evaluate the Model
evaluator = BinaryClassificationEvaluator(
    labelCol="Exited",
    rawPredictionCol="prediction"
)

# Evaluate AUC-PR
evaluator.setMetricName("areaUnderPR")
auc_pr = evaluator.evaluate(predictions)
print(f"AUC-PR: {auc_pr}")

# Evaluate AUC-ROC
evaluator.setMetricName("areaUnderROC")
auc_roc = evaluator.evaluate(predictions)
print(f"AUC-ROC: {auc_roc}")

AUC-PR: 0.6092440665778083
AUC-ROC: 0.7040752544282847


In [51]:
# Create evaluator instance
evaluator = MulticlassClassificationEvaluator(labelCol="Exited", predictionCol="prediction")

# Evaluate F1-Score
evaluator.setMetricName("f1")
f1_score = evaluator.evaluate(predictions)
print(f"F1-Score: {f1_score}")

# Evaluate Precision
evaluator.setMetricName("precisionByLabel")
precision = evaluator.evaluate(predictions)
print(f"Precision: {precision}")

# Evaluate Recall
evaluator.setMetricName("recallByLabel")
recall = evaluator.evaluate(predictions)
print(f"Recall: {recall}")

F1-Score: 0.8538697957263403
Precision: 0.8803967327887982
Recall: 0.9697943444730077


## Handle Imbalanced Dataset

In [55]:
from imblearn.over_sampling import SMOTE

In [54]:
# Frequency table for 'Exited' column
spark_df.groupBy("Exited").count().show()

+------+-----+
|Exited|count|
+------+-----+
|     1| 2037|
|     0| 7963|
+------+-----+



In [71]:
pandas_df = spark_df.toPandas() # Convert to Pandas

# Seperate X, y
X = pandas_df.drop('Exited', axis=1)
y = pandas_df['Exited']

# Apply SMOTE
smote = SMOTE(sampling_strategy=0.5, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Join resampled X, y to spark_df
spark_df_new = spark.createDataFrame(pd.concat([X_resampled, y_resampled], axis=1))

# Recheck dataset
spark_df_new.groupBy("Exited").count().show()

+------+-----+
|Exited|count|
+------+-----+
|     0| 7963|
|     1| 3981|
+------+-----+



### Re-Train XGBClassifier with Resampled Dataframe

In [78]:
train_new, test_new = spark_df_new.randomSplit([0.8, 0.2], seed=42)

pipeline_xgb = Pipeline(stages=[assembler, scaler, xgb])
# Train
model_xgb = pipeline_xgb.fit(train_new)
# Predictions
predictions_xgb = model_xgb.transform(test_new)

### Check Accuracy

In [79]:
predictions = model_xgb.transform(test)
predictions.select("scaled_features", "Exited", "prediction", "probability").show()

+--------------------+------+----------+--------------------+
|     scaled_features|Exited|prediction|         probability|
+--------------------+------+----------+--------------------+
|[-3.1643727503228...|     1|       0.0|[0.57491748813397...|
|[-3.0796924565684...|     1|       1.0|[0.09219619768104...|
|[-3.0267672729719...|     1|       1.0|[0.14949313244633...|
|[-2.8891617956210...|     1|       1.0|[0.26040876922317...|
|[-2.6245358776385...|     1|       1.0|[0.09264518746911...|
|[-2.5610256573227...|     1|       0.0|[0.78770133360803...|
|[-2.5186855104455...|     0|       0.0|[0.89624453068261...|
|[-2.4975154370069...|     0|       0.0|[0.91067598587244...|
|[-2.4657603268490...|     0|       0.0|[0.76819343707422...|
|[-2.4551752901297...|     0|       0.0|[0.90052070929690...|
|[-2.4445902534104...|     0|       0.0|[0.81410918851858...|
|[-2.4445902534104...|     0|       0.0|[0.91222887536113...|
|[-2.4445902534104...|     0|       0.0|[0.86178650843064...|
|[-2.423

In [80]:
# Evaluate the Model
evaluator = BinaryClassificationEvaluator(
    labelCol="Exited",
    rawPredictionCol="prediction"
)

# Evaluate AUC-PR
evaluator.setMetricName("areaUnderPR")
auc_pr = evaluator.evaluate(predictions)
print(f"AUC-PR: {auc_pr}")

# Evaluate AUC-ROC
evaluator.setMetricName("areaUnderROC")
auc_roc = evaluator.evaluate(predictions)
print(f"AUC-ROC: {auc_roc}")

AUC-PR: 0.5490148988205793
AUC-ROC: 0.7549116103813783
