In [27]:
# Requires PySpark to be installed (e.g. `pip install pyspark`).
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, sum, count, when

In [28]:
# Create (or reuse) the SparkSession used by the rest of the notebook.
# Each executor is configured with 4 GB of memory and 4 cores.
spark = (
    SparkSession.builder
    .appName("CreditCardFraudDetection")
    .config("spark.executor.memory", "4g")
    .config("spark.executor.cores", "4")
    .getOrCreate()
)

In [29]:
# Read the transactions CSV: the first row supplies the column names and
# Spark infers each column's type from the file contents.
file_path = "creditcard.csv"  # adjust to where your copy of the dataset lives
data = spark.read.options(header=True, inferSchema=True).csv(file_path)

                                                                                

In [30]:
# Quick look at what was loaded: column names/types, then the first 5 rows.
data.printSchema()
data.show(n=5)

root
 |-- Time: double (nullable = true)
 |-- V1: double (nullable = true)
 |-- V2: double (nullable = true)
 |-- V3: double (nullable = true)
 |-- V4: double (nullable = true)
 |-- V5: double (nullable = true)
 |-- V6: double (nullable = true)
 |-- V7: double (nullable = true)
 |-- V8: double (nullable = true)
 |-- V9: double (nullable = true)
 |-- V10: double (nullable = true)
 |-- V11: double (nullable = true)
 |-- V12: double (nullable = true)
 |-- V13: double (nullable = true)
 |-- V14: double (nullable = true)
 |-- V15: double (nullable = true)
 |-- V16: double (nullable = true)
 |-- V17: double (nullable = true)
 |-- V18: double (nullable = true)
 |-- V19: double (nullable = true)
 |-- V20: double (nullable = true)
 |-- V21: double (nullable = true)
 |-- V22: double (nullable = true)
 |-- V23: double (nullable = true)
 |-- V24: double (nullable = true)
 |-- V25: double (nullable = true)
 |-- V26: double (nullable = true)
 |-- V27: double (nullable = true)
 |-- V28: double (nulla

In [31]:
# Add a "Time_hours" column: "Time" divided by 3600, stored as a double
# (i.e. "Time" is treated as a number of seconds).
seconds_per_hour = 3600
time_in_hours = col("Time") / seconds_per_hour
data = data.withColumn("Time_hours", time_in_hours.cast("double"))

In [32]:
# Per-class summary: number of transactions, mean "Amount", and how many rows
# in each group carry the fraud label (Class == 1).
# NOTE: `sum` here is pyspark.sql.functions.sum (imported at the top of the
# notebook), which shadows Python's built-in `sum` in this namespace.
is_fraud = when(col("Class") == 1, 1).otherwise(0)
class_metrics = [
    count("*").alias("Total_Transactions"),
    avg("Amount").alias("Avg_Transaction_Amount"),
    sum(is_fraud).alias("Fraud_Count"),
]
agg_stats = data.groupBy("Class").agg(*class_metrics)
agg_stats.show()

[Stage 3:>                                                          (0 + 8) / 8]

+-----+------------------+----------------------+-----------+
|Class|Total_Transactions|Avg_Transaction_Amount|Fraud_Count|
+-----+------------------+----------------------+-----------+
|    1|               492|    122.21132113821139|        492|
|    0|            284315|     88.29102242231887|          0|
+-----+------------------+----------------------+-----------+



                                                                                

In [34]:
# Redistribute the rows across 4 partitions for parallel processing.
# The data is deliberately NOT partitioned by "Class": that column has only
# two values (0 and 1), so hash-partitioning on it can fill at most 2 of the
# 4 partitions, and the class-0 partition would hold nearly every row
# (284,315 vs. 492 in the aggregation output), leaving most tasks idle.
# Repartitioning without a key column spreads rows over all 4 partitions.
data = data.repartition(4)

In [35]:
# Basic data-quality check: summary statistics (count, mean, stddev, min, max)
# for the transaction amount and the derived hours column.
data.describe("Amount", "Time_hours").show()

+-------+------------------+------------------+
|summary|            Amount|        Time_hours|
+-------+------------------+------------------+
|  count|            284807|            284807|
|   mean| 88.34961925087359|26.337183215300488|
| stddev|250.12010924018765| 13.19115165404625|
|    min|               0.0|               0.0|
|    max|          25691.16| 47.99777777777778|
+-------+------------------+------------------+



In [36]:
# Stop the Spark session now that the analysis is finished; any later Spark
# cell needs the session-initialization cell to be re-run first.
spark.stop()