In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, avg, when, month, year



# 1. Spark session (Databricks provides one by default)

In [None]:

# Reuse the active Spark session if there is one, otherwise create it.
spark = (
    SparkSession.builder
    .appName("ETL_Unusual_Spending")
    .getOrCreate()
)

# Read the sample CSV with a header row and an inferred schema, then print a preview.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("Unusual_Spending.csv")
)
df.show()

In [None]:

# 2. Load the raw user and expense CSV files (header row present, no schema inference).
# Point these paths at your ADLS/Blob container or mounted location.

users_path = "/mnt/raw_data/users.csv"
expenses_path = "/mnt/raw_data/expenses.csv"

user_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(users_path)
)
expense_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(expenses_path)
)



In [None]:

# 3. Clean data (remove duplicates, drop nulls, cast columns)

# Users: remove exact duplicate rows, then any row with a null in any column.
user_df = user_df.dropDuplicates().na.drop()

# Expenses: same de-duplication and null removal, then cast the columns the
# aggregation step relies on ("amount" as a number, "date" as a calendar date).
expense_df = (
    expense_df
    .dropDuplicates()
    .na.drop()
    .withColumn("amount", col("amount").cast("double"))
    .withColumn("date", col("date").cast("date"))
    # With ANSI mode off, a value that cannot be parsed turns into null during
    # the cast, i.e. after the na.drop() above has already run. Drop those rows
    # as well so null amounts/dates never reach the monthly grouping.
    .na.drop(subset=["amount", "date"])
)



In [None]:

# 4. Attach user attributes to every expense row.
# Inner join on user_id: rows without a match on the other side are discarded.

combined_df = expense_df.join(
    user_df,
    "user_id",
    "inner",
)



In [None]:

# 5. Aggregate monthly spend per user and flag unusually high months

SAVINGS_RATE = 0.3        # savings_estimate = this fraction of the monthly total
UNUSUAL_MULTIPLIER = 1.5  # flag a month whose total exceeds this multiple of the user's average

# One row per (user, month, year): total spend and mean transaction amount.
dated_df = combined_df.withColumn("month", month("date")).withColumn("year", year("date"))
monthly_df = dated_df.groupBy("user_id", "user_name", "month", "year").agg(
    _sum("amount").alias("total_monthly_spend"),
    avg("amount").alias("avg_transaction"),
)
monthly_df = monthly_df.withColumn(
    "savings_estimate", col("total_monthly_spend") * SAVINGS_RATE
)

# Per-user mean of the monthly totals; this is the baseline for the alert.
user_avg_df = monthly_df.groupBy("user_id").agg(
    avg("total_monthly_spend").alias("user_avg_spend")
)

# The baseline column is only needed to compute the flag, so it is dropped afterwards.
is_unusual = col("total_monthly_spend") > UNUSUAL_MULTIPLIER * col("user_avg_spend")
summary_df = (
    monthly_df
    .join(user_avg_df, on="user_id", how="left")
    .withColumn("alert_flag", when(is_unusual, "UNUSUAL").otherwise("NORMAL"))
    .drop("user_avg_spend")
)


In [None]:

# 6. Persist the summary twice: as Delta and as CSV with a header row (for dashboards).
# Both writes replace whatever is already stored at the target path.

delta_path = "/mnt/processed/summary_delta"
csv_path = "/mnt/processed/summary_csv"

(
    summary_df.write
    .mode("overwrite")
    .format("delta")
    .save(delta_path)
)
(
    summary_df.write
    .mode("overwrite")
    .format("csv")
    .option("header", "true")
    .save(csv_path)
)


# 7. Show the final table in the notebook (Databricks UI)

display(summary_df)

