In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("FraudDetection") \
    .getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/08/15 21:43:46 WARN Utils: Your hostname, NolanPC, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/08/15 21:43:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/15 21:43:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/08/15 21:43:48 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
df = spark.read.parquet("/home/nolan/fraud/data/processed")
df.orderBy("timestamp", ascending=False).show(5)



+--------------------+-------------+------+--------------------+--------+------------------+---------+------------------+------------------+------------------+-------------------+------------------+--------------------+-------------+----------------------+--------------+-------------+---------+
|      transaction_id|     merchant|amount|           timestamp|is_fraud|        log_amount|txn_index|      user_cum_sum|      user_cum_avg|      user_cum_std|amount_to_avg_ratio|amount_x_txn_index|             prev_ts|time_diff_sec|log_amount_x_time_diff|merchant_index| merchant_ohe|  user_id|
+--------------------+-------------+------+--------------------+--------+------------------+---------+------------------+------------------+------------------+-------------------+------------------+--------------------+-------------+----------------------+--------------+-------------+---------+
|97d8a706-61dd-45a...|subscriptions| 14.93|2025-08-15 19:52:...|       0| 2.768204123921957|      293| 95596.086

                                                                                

In [6]:
from pyspark.sql.functions import col

counts = df.groupBy("is_fraud").count().collect()
count_fraud = [row["count"] for row in counts if row["is_fraud"] == 1][0]
count_nonfraud = [row["count"] for row in counts if row["is_fraud"] == 0][0]

weight_fraud = count_nonfraud / (count_nonfraud + count_nonfraud)
weight_nonfraud= 1.0

df = df.withColumn("class_weight", col("is_fraud") * weight_fraud + (1 - col("is_fraud")) * weight_nonfraud)
df.show(5)

                                                                                

+--------------------+-----------+------+--------------------+--------+------------------+---------+------------+------------------+------------------+-------------------+------------------+--------------------+-------------+----------------------+--------------+-------------+---------+------------+
|      transaction_id|   merchant|amount|           timestamp|is_fraud|        log_amount|txn_index|user_cum_sum|      user_cum_avg|      user_cum_std|amount_to_avg_ratio|amount_x_txn_index|             prev_ts|time_diff_sec|log_amount_x_time_diff|merchant_index| merchant_ohe|  user_id|class_weight|
+--------------------+-----------+------+--------------------+--------+------------------+---------+------------+------------------+------------------+-------------------+------------------+--------------------+-------------+----------------------+--------------+-------------+---------+------------+
|69f97ba2-1b54-4a7...|     travel|574.48|2025-08-15 18:14:...|       0| 6.355204475168832|       

In [7]:
from pyspark.ml.feature import VectorAssembler

numeric_cols = [
    "amount", "log_amount", "txn_index", "user_cum_sum", "user_cum_avg", "user_cum_std", "amount_to_avg_ratio", "amount_x_txn_index", "time_diff_sec", "log_amount_x_time_diff"
]

assembler = VectorAssembler(
    inputCols=numeric_cols + ["merchant_ohe"],
    outputCol="features"
)

df_model = assembler.transform(df)

In [8]:
train_df, test_df = df_model.randomSplit([0.8, 0.2], seed=33)

In [9]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="is_fraud",
    weightCol="class_weight",
    numTrees=100
)

model = rf.fit(train_df)

25/08/15 21:56:31 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/08/15 21:57:00 WARN MemoryStore: Not enough space to cache rdd_71_83 in memory! (computed 3.7 MiB so far)
25/08/15 21:57:00 WARN BlockManager: Persisting block rdd_71_83 to disk instead.
25/08/15 21:57:00 WARN MemoryStore: Not enough space to cache rdd_71_85 in memory! (computed 1550.8 KiB so far)
25/08/15 21:57:00 WARN BlockManager: Persisting block rdd_71_85 to disk instead.
25/08/15 21:57:00 WARN MemoryStore: Not enough space to cache rdd_71_87 in memory! (computed 3.7 MiB so far)
25/08/15 21:57:00 WARN BlockManager: Persisting block rdd_71_87 to disk instead.
25/08/15 21:57:00 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_71_98 in memory.
25/08/15 21:57:00 WARN MemoryStore: Not enough space to cache rdd_71_83 in memory! (computed 1033.6 KiB 

In [11]:
preds = model.transform(test_df)
preds.select("is_fraud", "probability", "prediction").show(20)

+--------+--------------------+----------+
|is_fraud|         probability|prediction|
+--------+--------------------+----------+
|       0|[0.99450337610762...|       0.0|
|       0|[0.99477901771737...|       0.0|
|       0|[0.99448156032218...|       0.0|
|       0|[0.99471167439225...|       0.0|
|       0|[0.99491742980412...|       0.0|
|       0|[0.99487810075278...|       0.0|
|       0|[0.99538075329297...|       0.0|
|       0|[0.99683284589864...|       0.0|
|       0|[0.99694814630829...|       0.0|
|       0|[0.99491347917077...|       0.0|
|       0|[0.99613870447263...|       0.0|
|       0|[0.99647683388424...|       0.0|
|       0|[0.99388807295998...|       0.0|
|       0|[0.99539957150603...|       0.0|
|       0|[0.99473203340797...|       0.0|
|       0|[0.99153302525458...|       0.0|
|       0|[0.99687834686575...|       0.0|
|       0|[0.99543912783132...|       0.0|
|       0|[0.99689881864985...|       0.0|
|       0|[0.99478285420551...|       0.0|
+--------+-

In [12]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="is_fraud", metricName="areaUnderROC")
roc_auc = evaluator.evaluate(preds)
print(f"ROC_AUC: {roc_auc}")

                                                                                

ROC_AUC: 0.7792508593623445


In [13]:
preds.groupBy("is_fraud", "prediction").count().show()



+--------+----------+------+
|is_fraud|prediction| count|
+--------+----------+------+
|       1|       0.0|  5627|
|       0|       0.0|593465|
|       1|       1.0|   310|
+--------+----------+------+



                                                                                