In [30]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Get the Spark session for this notebook: reuse an existing one if present,
# otherwise start a new one under the application name "ExpensePerCustomer".
spark = (
    SparkSession.builder
    .appName("ExpensePerCustomer")
    .getOrCreate()
)

In [31]:
# Final column names for the two fields we keep.
column_names = ["customer_id", "purchase_amount"]

# Load the orders CSV, keep only the first and third columns, and rename them.
# The header option is not enabled, so Spark auto-names the columns
# _c0, _c1, ...; inferSchema asks Spark to derive column types from the data.
orders = (
    spark.read.csv("../data/customer-orders.csv", inferSchema=True)
    .select("_c0", "_c2")
    .toDF(*column_names)
)

# Preview the first few rows.
orders.show(5)

+-----------+---------------+
|customer_id|purchase_amount|
+-----------+---------------+
|         44|          37.19|
|         35|          65.89|
|          2|          40.64|
|         47|          14.98|
|         29|          13.08|
+-----------+---------------+
only showing top 5 rows



In [32]:
# Per-customer statistics over purchase_amount. Each entry maps an output
# column name to the aggregate function that produces it; every result is
# rounded to 2 decimal places.
summary_stats = {
    "total_spent": F.sum,
    "avg_spent": F.mean,
    "max_purchase": F.max,
    "min_purchase": F.min,
}
summary_columns = [
    F.round(aggregate("purchase_amount"), 2).alias(column_name)
    for column_name, aggregate in summary_stats.items()
]

# One row per customer, sorted by customer id.
customers_summary = (
    orders.groupBy("customer_id")
    .agg(*summary_columns)
    .orderBy("customer_id")
)

# Preview the first few customers.
customers_summary.show(5)

+-----------+-----------+---------+------------+------------+
|customer_id|total_spent|avg_spent|max_purchase|min_purchase|
+-----------+-----------+---------+------------+------------+
|          0|    5524.95|    47.22|       99.45|        0.35|
|          1|     4958.6|    44.67|        96.8|         0.1|
|          2|    5994.59|     50.8|       99.54|        1.71|
|          3|    4659.63|     51.2|       99.83|        0.73|
|          4|    4815.05|    49.64|       96.24|        0.38|
+-----------+-----------+---------+------------+------------+
only showing top 5 rows



In [33]:
# Stop the Spark session created in the first cell now that the analysis is done.
spark.stop()