In [1]:
# A notebook that calculates the total amount spent by each customer.

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [3]:
# Build the Spark session named "Customer Total", or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Customer Total")
    .getOrCreate()
)

In [4]:
# Explicit schema applied when reading customer-orders: two integer ids and a
# float amount, all three declared nullable.
# NOTE(review): amounts are read as FloatType, and the un-rounded per-customer
# sums shown later carry long fractional tails (e.g. 4765.050008416176).
# Consider DoubleType or DecimalType if exact cents matter -- confirm first.
customerOrderSchema = (
    StructType()
    .add("cust_id", IntegerType(), True)
    .add("item_id", IntegerType(), True)
    .add("amount_spent", FloatType(), True)
)

In [5]:
# Read the customer-orders CSV into a DataFrame, typed by the explicit schema.
customerDF = spark.read.csv(
    "///customer-orders.csv",
    schema=customerOrderSchema,
)

In [6]:
# Sanity-check the load: print the column layout, then leave the row count as
# the cell's final expression so the notebook displays it.
customerDF.printSchema()
customerDF.count()

root
 |-- cust_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- amount_spent: float (nullable = true)



10000

In [7]:
# Keep only the two columns this exercise needs: the customer id and the amount.
customer_total = customerDF.select(
    ["cust_id", "amount_spent"]
)
customer_total.printSchema()

root
 |-- cust_id: integer (nullable = true)
 |-- amount_spent: float (nullable = true)



In [9]:
# Add up every order amount per customer. The result column keeps Spark's
# generated name, sum(amount_spent), and the values are not rounded here.
customer_total_spent = (
    customer_total
    .groupBy("cust_id")
    .sum("amount_spent")
)
customer_total_spent.show()

+-------+------------------+
|cust_id| sum(amount_spent)|
+-------+------------------+
|     31| 4765.050008416176|
|     85|  5503.42998456955|
|     65| 5140.349995829165|
|     53| 4945.300026416779|
|     78| 4524.510001778603|
|     34|5330.8000039458275|
|     81|   5112.7100045681|
|     28|  5000.71000123024|
|     76| 4904.210003614426|
|     27| 4915.890009522438|
|     26| 5250.399979650974|
|     44| 4756.890008449554|
|     12| 4664.589988231659|
|     91| 4642.259980916977|
|     22| 5019.449993014336|
|     93|5265.7500213086605|
|     47| 4316.299998342991|
|      1| 4958.599974133074|
|     52|  5245.05999673903|
|     13| 4367.619992315769|
+-------+------------------+
only showing top 20 rows



In [10]:
# Formatting: total spend per customer, rounded to 2 decimal places and sorted
# so the highest spender comes first.
# The aggregate helpers are called through the `F` alias because the wildcard
# import of pyspark.sql.functions rebinds Python's built-in `round` and `sum`
# in this notebook; qualifying them makes it explicit that these are Spark
# column functions rather than the built-ins.
customer_total_spent_format = (
    customer_total
    .groupBy("cust_id")
    .agg(F.round(F.sum("amount_spent"), 2).alias("Total Spent"))
    .sort(F.desc("Total Spent"))
)
customer_total_spent_format.show()


+-------+-----------+
|cust_id|Total Spent|
+-------+-----------+
|     68|    6375.45|
|     73|     6206.2|
|     39|    6193.11|
|     54|    6065.39|
|     71|    5995.66|
|      2|    5994.59|
|     97|    5977.19|
|     46|    5963.11|
|     42|    5696.84|
|     59|    5642.89|
|     41|    5637.62|
|      0|    5524.95|
|      8|    5517.24|
|     85|    5503.43|
|     61|    5497.48|
|     32|    5496.05|
|     58|    5437.73|
|     63|    5415.15|
|     15|    5413.51|
|      6|    5397.88|
+-------+-----------+
only showing top 20 rows

