In [1]:
import pandas as pd
import numpy as np

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, month, year, sum as _sum, avg, stddev

# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("ExpenseAnalysis")
    .getOrCreate()
)

# Read the raw expense transactions. The first row is the header and column
# types are inferred by Spark from the file contents.
df = spark.read.csv("expense_transactions.csv", header=True, inferSchema=True)

# Preview the loaded rows.
df.show()

+--------------+--------------------+------+--------+----------+
|transaction_id|          user_email|amount|category|      date|
+--------------+--------------------+------+--------+----------+
|             1|NikitaYS06@gmail.com|  1200| Grocery|2025-01-01|
|             2|Arivunazi21@gmail...|   800|  Travel|2025-01-16|
|             3|LokeyNikey21@gmai...|   900|  Health|2025-01-31|
|             4|NikitaYS06@gmail.com| 10000|  Luxury|2025-02-15|
|             5|Arivunazi21@gmail...|   850| Grocery|2025-03-02|
|             6|LokeyNikey21@gmai...|   300|  Health|2025-03-17|
|             7|NikitaYS06@gmail.com|  1300| Grocery|2025-04-01|
|             8|Arivunazi21@gmail...|  7000|  Luxury|2025-04-16|
|             9|LokeyNikey21@gmai...|  1800|  Travel|2025-05-01|
|            10|NikitaYS06@gmail.com|   600| Grocery|2025-05-16|
|            11|Arivunazi21@gmail...|   400|  Health|2025-05-31|
|            12|LokeyNikey21@gmai...|  1500| Grocery|2025-06-15|
|            13|NikitaYS0

In [4]:
# Parse the `date` column as a Spark date using the yyyy-MM-dd pattern,
# replacing the column in place on `df`.
df = df.withColumn(
    "date",
    to_date(col("date"), "yyyy-MM-dd"),
)
df.show()

+--------------+--------------------+------+--------+----------+
|transaction_id|          user_email|amount|category|      date|
+--------------+--------------------+------+--------+----------+
|             1|NikitaYS06@gmail.com|  1200| Grocery|2025-01-01|
|             2|Arivunazi21@gmail...|   800|  Travel|2025-01-16|
|             3|LokeyNikey21@gmai...|   900|  Health|2025-01-31|
|             4|NikitaYS06@gmail.com| 10000|  Luxury|2025-02-15|
|             5|Arivunazi21@gmail...|   850| Grocery|2025-03-02|
|             6|LokeyNikey21@gmai...|   300|  Health|2025-03-17|
|             7|NikitaYS06@gmail.com|  1300| Grocery|2025-04-01|
|             8|Arivunazi21@gmail...|  7000|  Luxury|2025-04-16|
|             9|LokeyNikey21@gmai...|  1800|  Travel|2025-05-01|
|            10|NikitaYS06@gmail.com|   600| Grocery|2025-05-16|
|            11|Arivunazi21@gmail...|   400|  Health|2025-05-31|
|            12|LokeyNikey21@gmai...|  1500| Grocery|2025-06-15|
|            13|NikitaYS0

In [6]:
# Derive calendar columns from the parsed date: `month` first, then `year`
# (the order determines the column order of the resulting frame).
df = df.withColumn("month", month(col("date")))
df = df.withColumn("year", year(col("date")))
df.show()

+--------------+--------------------+------+--------+----------+-----+----+
|transaction_id|          user_email|amount|category|      date|month|year|
+--------------+--------------------+------+--------+----------+-----+----+
|             1|NikitaYS06@gmail.com|  1200| Grocery|2025-01-01|    1|2025|
|             2|Arivunazi21@gmail...|   800|  Travel|2025-01-16|    1|2025|
|             3|LokeyNikey21@gmai...|   900|  Health|2025-01-31|    1|2025|
|             4|NikitaYS06@gmail.com| 10000|  Luxury|2025-02-15|    2|2025|
|             5|Arivunazi21@gmail...|   850| Grocery|2025-03-02|    3|2025|
|             6|LokeyNikey21@gmai...|   300|  Health|2025-03-17|    3|2025|
|             7|NikitaYS06@gmail.com|  1300| Grocery|2025-04-01|    4|2025|
|             8|Arivunazi21@gmail...|  7000|  Luxury|2025-04-16|    4|2025|
|             9|LokeyNikey21@gmai...|  1800|  Travel|2025-05-01|    5|2025|
|            10|NikitaYS06@gmail.com|   600| Grocery|2025-05-16|    5|2025|
|           

In [12]:
# Total amount spent by each user in each (year, month).
monthly_spend = (
    df.groupBy("user_email", "year", "month")
      .agg(_sum(col("amount")).alias("monthly_total"))
)
print("=== Monthly Spend by User ===")
# Sort chronologically within each user before displaying.
monthly_spend.sort("user_email", "year", "month").show()

=== Monthly Spend by User ===
+--------------------+----+-----+-------------+
|          user_email|year|month|monthly_total|
+--------------------+----+-----+-------------+
|Arivunazi21@gmail...|2025|    1|          800|
|Arivunazi21@gmail...|2025|    3|          850|
|Arivunazi21@gmail...|2025|    4|         7000|
|Arivunazi21@gmail...|2025|    5|          400|
|Arivunazi21@gmail...|2025|    7|         1000|
|Arivunazi21@gmail...|2025|    8|          200|
|Arivunazi21@gmail...|2025|   10|          750|
|LokeyNikey21@gmai...|2025|    1|          900|
|LokeyNikey21@gmai...|2025|    3|          300|
|LokeyNikey21@gmai...|2025|    5|         1800|
|LokeyNikey21@gmai...|2025|    6|         1500|
|LokeyNikey21@gmai...|2025|    7|          900|
|LokeyNikey21@gmai...|2025|    9|          800|
|NikitaYS06@gmail.com|2025|    1|         1200|
|NikitaYS06@gmail.com|2025|    2|        10000|
|NikitaYS06@gmail.com|2025|    4|         1300|
|NikitaYS06@gmail.com|2025|    5|          600|
|NikitaYS0

In [20]:
from pyspark.sql.functions import avg, stddev, col

# Step 1: per-user mean and standard deviation of the transaction amount.
# These are computed over individual transactions, not monthly totals.
stats = (
    df.groupBy("user_email")
      .agg(
          avg(col("amount")).alias("avg_spend"),
          stddev(col("amount")).alias("std_spend"),
      )
)
stats.show()

+--------------------+------------------+------------------+
|          user_email|         avg_spend|         std_spend|
+--------------------+------------------+------------------+
|NikitaYS06@gmail.com| 4742.857142857143| 6736.432572921948|
|Arivunazi21@gmail...|1571.4285714285713|2409.6779077394594|
|LokeyNikey21@gmai...|1033.3333333333333| 535.4126134736337|
+--------------------+------------------+------------------+



In [19]:

# Step 2: attach each user's avg_spend / std_spend to every one of their
# transactions (inner equi-join on user_email).
df_with_stats = df.join(stats, on=["user_email"], how="inner")
df_with_stats.show()

+--------------------+--------------+------+--------+----------+-----+----+------------------+------------------+
|          user_email|transaction_id|amount|category|      date|month|year|         avg_spend|         std_spend|
+--------------------+--------------+------+--------+----------+-----+----+------------------+------------------+
|NikitaYS06@gmail.com|             1|  1200| Grocery|2025-01-01|    1|2025| 4742.857142857143| 6736.432572921948|
|Arivunazi21@gmail...|             2|   800|  Travel|2025-01-16|    1|2025|1571.4285714285713|2409.6779077394594|
|LokeyNikey21@gmai...|             3|   900|  Health|2025-01-31|    1|2025|1033.3333333333333| 535.4126134736337|
|NikitaYS06@gmail.com|             4| 10000|  Luxury|2025-02-15|    2|2025| 4742.857142857143| 6736.432572921948|
|Arivunazi21@gmail...|             5|   850| Grocery|2025-03-02|    3|2025|1571.4285714285713|2409.6779077394594|
|LokeyNikey21@gmai...|             6|   300|  Health|2025-03-17|    3|2025|1033.33333333

In [22]:

# Step 3: keep only transactions whose amount exceeds the user's own
# mean by more than two standard deviations (amount > avg + 2*std).
anomalies = df_with_stats.where(
    col("amount") > col("avg_spend") + col("std_spend") * 2
)

# Display the flagged rows together with the statistics that triggered them.
print("=== Unusual High Expenses Detected ===")
anomalies.select(
    ["user_email", "date", "amount", "category", "avg_spend", "std_spend"]
).show()


=== Unusual High Expenses Detected ===
+--------------------+----------+------+--------+------------------+------------------+
|          user_email|      date|amount|category|         avg_spend|         std_spend|
+--------------------+----------+------+--------+------------------+------------------+
|Arivunazi21@gmail...|2025-04-16|  7000|  Luxury|1571.4285714285713|2409.6779077394594|
+--------------------+----------+------+--------+------------------+------------------+



In [None]:
# Show unusual transactions.
#
# The original cell referenced a `flagged` frame and an `is_unusual` column that
# are never created in this notebook, and selected `user_id`, which is not a
# column of the data (the user key is `user_email`). Build `flagged` here from
# `df_with_stats` so the cell runs on a fresh kernel.
#
# A transaction is marked unusual when its amount exceeds the user's mean by
# more than two standard deviations — the same rule used for `anomalies`.
flagged = df_with_stats.withColumn(
    "is_unusual",
    col("amount") > (col("avg_spend") + 2 * col("std_spend")),
)

print("=== Unusual Transactions Detected ===")
flagged.filter("is_unusual = true").select("user_email", "date", "category", "amount").show()

In [25]:

# Lifetime spend per user, largest spender first.
user_total_spend = (
    df.groupBy("user_email")
      .agg(_sum(col("amount")).alias("total_spend"))
)
user_total_spend.orderBy(col("total_spend").desc()).show()

# Spend per user broken out by category: one column per category value,
# NULL where the user has no transactions in that category.
pivot_df = (
    df.groupBy("user_email")
      .pivot("category")
      .agg(_sum(col("amount")))
)
pivot_df.show()



+--------------------+-----------+
|          user_email|total_spend|
+--------------------+-----------+
|NikitaYS06@gmail.com|      33200|
|Arivunazi21@gmail...|      11000|
|LokeyNikey21@gmai...|       6200|
+--------------------+-----------+

+--------------------+-------+------+------+------+
|          user_email|Grocery|Health|Luxury|Travel|
+--------------------+-------+------+------+------+
|NikitaYS06@gmail.com|   4000|  NULL| 28000|  1200|
|Arivunazi21@gmail...|   1050|   400|  8000|  1550|
|LokeyNikey21@gmai...|   1500|  2100|  NULL|  2600|
+--------------------+-------+------+------+------+



In [26]:
# Mean transaction amount for every (year, month, category) combination
# across all users.
avg_monthly_category = (
    df.groupBy("year", "month", "category")
      .agg(avg(col("amount")).alias("avg_spend"))
)

# Display in chronological order, then alphabetically by category.
avg_monthly_category.sort("year", "month", "category").show()


+----+-----+--------+---------+
|year|month|category|avg_spend|
+----+-----+--------+---------+
|2025|    1| Grocery|   1200.0|
|2025|    1|  Health|    900.0|
|2025|    1|  Travel|    800.0|
|2025|    2|  Luxury|  10000.0|
|2025|    3| Grocery|    850.0|
|2025|    3|  Health|    300.0|
|2025|    4| Grocery|   1300.0|
|2025|    4|  Luxury|   7000.0|
|2025|    5| Grocery|    600.0|
|2025|    5|  Health|    400.0|
|2025|    5|  Travel|   1800.0|
|2025|    6| Grocery|   1500.0|
|2025|    6|  Travel|   1200.0|
|2025|    7|  Health|    900.0|
|2025|    7|  Luxury|   1000.0|
|2025|    8| Grocery|    200.0|
|2025|    8|  Luxury|  18000.0|
|2025|    9| Grocery|    900.0|
|2025|    9|  Travel|    800.0|
|2025|   10|  Travel|    750.0|
+----+-----+--------+---------+



In [29]:
from pyspark.sql.functions import collect_list, sort_array

# Rebuild `missing_months` in this cell so it no longer depends on a cell that
# is absent from the notebook (on a fresh kernel the name would be undefined).
# NOTE(review): this definition is reconstructed from the recorded output of
# this cell — every (user, month 1-12) pair with no transaction, ignoring the
# year. Confirm it matches the original intent before relying on it for
# multi-year data.
calendar_months = spark.range(1, 13).select(col("id").cast("int").alias("month"))
user_month_grid = df.select("user_email").distinct().crossJoin(calendar_months)
missing_months = user_month_grid.join(
    df.select("user_email", "month").distinct(),
    on=["user_email", "month"],
    how="left_anti",  # keep only grid rows with no matching transaction
)

# collect_list makes no ordering guarantee after a shuffle; sort_array keeps
# each user's list of missing months in ascending order on every run.
missing_months_grouped = missing_months.groupBy("user_email").agg(
    sort_array(collect_list("month")).alias("missing_months")
)

missing_months_grouped.orderBy("user_email").show(truncate=False)


+----------------------+---------------------+
|user_email            |missing_months       |
+----------------------+---------------------+
|Arivunazi21@gmail.com |[2, 6, 9, 11, 12]    |
|LokeyNikey21@gmail.com|[2, 4, 8, 10, 11, 12]|
|NikitaYS06@gmail.com  |[3, 7, 10, 11, 12]   |
+----------------------+---------------------+

