In [0]:
%sql
-- Reset the pipeline tables left over from a previous run.
-- IF EXISTS keeps this cell from failing on a first run, when no table exists yet.
-- fraud_silver and fraud_gold are dropped as well: the next cell deletes
-- dbfs:/mnt/fraud/ (which holds the streaming checkpoints), so re-running the
-- pipeline against surviving tables would append every row a second time.
drop table if exists fraud_bronze;
drop table if exists fraud_silver;
drop table if exists fraud_gold;

In [0]:
# Recursively delete everything stored under the fraud mount so the run starts clean.
dbutils.fs.rm("dbfs:/mnt/fraud/", recurse=True)

In [0]:
# Import needed libraries
from pyspark.sql import functions as F
from pyspark.sql.types import *

# Storage locations used by the pipeline: one folder per medallion layer,
# plus a folder for streaming checkpoints.
bronze_path, silver_path, gold_path, checkpoint_path = (
    "dbfs:/mnt/fraud/bronze",
    "dbfs:/mnt/fraud/silver",
    "dbfs:/mnt/fraud/gold",
    "dbfs:/mnt/fraud/checkpoints",
)

In [0]:
import random
from datetime import datetime, timedelta
from pyspark.sql import functions as F

def generate_transaction():
    """Build one synthetic payment transaction with randomised field values.

    Returns:
        dict: a record with the keys ``transaction_id``, ``timestamp``,
        ``customer_id``, ``amount``, ``merchant_id``, ``country``, ``channel``,
        ``payment_method`` and ``is_international``. ``timestamp`` is the
        current local time rendered as an ISO-8601 string; all other values
        are drawn from the ``random`` module.
    """
    # Values are drawn in the same order as the keys of the returned record.
    txn_id = f"TX{random.randint(1000000,9999999)}"
    created_at = datetime.now().isoformat()
    customer = f"C{random.randint(1000,9999)}"
    amount = round(random.uniform(5,10000),2)
    merchant = f"M{random.randint(10,999)}"
    country = random.choice(["IN", "US", "UK", "CA", "DE", "JP", "BR"])
    channel = random.choice(["ecommerce", "offline", "mobile"])
    method = random.choice(["credit_card", "debit_card", "upi", "wallet"])
    international = random.choice([True, False])

    return {
        "transaction_id": txn_id,
        "timestamp": created_at,
        "customer_id": customer,
        "amount": amount,
        "merchant_id": merchant,
        "country": country,
        "channel": channel,
        "payment_method": method,
        "is_international": international,
    }

# Generate test data: 1000 synthetic transactions.
# generate_transaction() returns the timestamp as an ISO-8601 string, so it is parsed
# back into a datetime here to match the TimestampType column declared below.
transactions = [
    {**txn, "timestamp": datetime.fromisoformat(txn["timestamp"])}
    for txn in (generate_transaction() for _ in range(1000))
]

# Explicit schema for the Spark DataFrame (one field per key of a transaction record)
transaction_schema = StructType([
    StructField("transaction_id", StringType()),
    StructField("timestamp", TimestampType()),
    StructField("customer_id", StringType()),
    StructField("amount", DoubleType()),
    StructField("merchant_id", StringType()),
    StructField("country", StringType()),
    StructField("channel", StringType()),
    StructField("payment_method", StringType()),
    StructField("is_international", BooleanType())
])

# The schema already types `timestamp` as TimestampType, so no further cast is needed.
df = spark.createDataFrame(transactions, schema=transaction_schema)
df.printSchema()

# Write as a single batch to a folder emulating streaming source
df.write.mode("overwrite").parquet(bronze_path + "/raw")

In [0]:
# Read from the simulated real-time source (bronze/raw) as a stream of parquet files.
# The inferred schema is tracked under the checkpoint folder.
stream_df = (spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "parquet")
    .option("cloudFiles.schemaLocation", checkpoint_path + "/schema")
    .load(bronze_path + "/raw"))

# Store raw events to Bronze table (append)
bronze_query = (stream_df
    .writeStream
    .format("delta")
    .option("checkpointLocation", checkpoint_path + "/bronze")
    .outputMode("append")
    .trigger(availableNow=True)
    .table("fraud_bronze"))

# The streaming write starts in the background and returns immediately.
# Block until the availableNow run has finished so that the cells below
# see a fraud_bronze table that exists and is fully populated.
bronze_query.awaitTermination()

In [0]:
%sql
-- Show the column names and types of the bronze table
DESCRIBE TABLE fraud_bronze;

In [0]:
from pyspark.sql import functions as F

bronze_stream = spark.readStream.table("fraud_bronze")

# Cleansing: keep only rows that have both ids and a strictly positive amount
clean_df = bronze_stream.filter(
    "transaction_id IS NOT NULL and customer_id IS NOT NULL and amount > 0"
)

# Deduplication: keep one row per transaction_id.
# NOTE(review): the dedup key does not include the event-time column, so the
# 10-second watermark presumably cannot be used to expire dedup state — confirm
# against the Structured Streaming guide before running this continuously.
dedup_df = clean_df.withWatermark("timestamp", "10 seconds").dropDuplicates(["transaction_id"])

# Enrichment: flag high-value and night-time transactions.
# `timestamp` is already a timestamp column (withWatermark above requires one),
# so the hour can be extracted from it directly.
# NOTE(review): is_night only covers 22:00-23:59; hours after midnight are not
# flagged — confirm this is the intended definition of "night".
enriched_df = (dedup_df
    .withColumn("high_value", F.col("amount") > 5000)
    .withColumn("is_night", F.hour(F.col("timestamp")) >= 22)
)

# Write to Silver table
silver_query = (enriched_df
    .writeStream
    .format("delta")
    .option("checkpointLocation", checkpoint_path + "/silver")
    .outputMode("append")
    .trigger(availableNow=True)
    .table("fraud_silver"))

# The streaming write starts in the background and returns immediately.
# Block until the availableNow run has finished so that downstream cells
# read a fraud_silver table that exists and is fully populated.
silver_query.awaitTermination()

In [0]:
silver_stream = spark.readStream.table("fraud_silver")

# Fraud rules. Each predicate is defined once and reused for both the boolean
# flag and the reason label, so the two columns cannot drift apart.
intl_high_value = F.col("is_international") & F.col("high_value")                # high-value & international
large_ecommerce = (F.col("channel") == "ecommerce") & (F.col("amount") > 8000)   # big ecommerce transactions
night_large = F.col("is_night") & (F.col("amount") > 2000)                       # night time, high amount

fraud_rules_df = (silver_stream
    # A transaction is a suspect when any rule matches.
    .withColumn("fraud_suspect", intl_high_value | large_ecommerce | night_large)
    # When several rules match, the first one in this chain provides the reason;
    # rows matching no rule get a NULL reason.
    .withColumn("fraud_reason",
       F.when(intl_high_value, "International High Value")
        .when(large_ecommerce, "Ecommerce Large Payment")
        .when(night_large, "Night-Time Large Transaction")
        .otherwise(None)
    )
)

# Write to Gold table
gold_query = (fraud_rules_df
    .writeStream
    .format("delta")
    .option("checkpointLocation", checkpoint_path + "/gold")
    .outputMode("append")
    .trigger(availableNow=True)
    .table("fraud_gold"))

# The streaming write starts in the background and returns immediately.
# Block until the availableNow run has finished so that the cells that
# query fraud_gold see a table that exists and is fully populated.
gold_query.awaitTermination()


In [0]:
# Example monitoring query: number of gold rows per (fraud_suspect, fraud_reason) pair
fraud_gold_df = spark.read.table("fraud_gold")
suspect_breakdown = fraud_gold_df.groupBy(["fraud_suspect", "fraud_reason"]).count()
suspect_breakdown.show()


In [0]:
%sql
-- The 10 most recent transactions flagged as fraud suspects, newest first
SELECT
  customer_id,
  channel,
  payment_method,
  fraud_reason
FROM fraud_gold
WHERE fraud_suspect = true
ORDER BY timestamp DESC
LIMIT 10


In [0]:
# Check streaming query health

# Print the progress reports recorded so far on the gold query handle
recent_gold_progress = gold_query.recentProgress
print(recent_gold_progress)