### Agreegate

In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (col, sum as sparkSum)

In [4]:
spark = SparkSession.builder.appName("factsales2").getOrCreate()

In [5]:
fact_sales = spark.read.parquet("spark-warehouse/fact_sales")

In [6]:
dim_store = spark.read.parquet("spark-warehouse/dim_store")

In [12]:
fact_sales2 = fact_sales.join(
    dim_store,
    fact_sales.StoreKey == dim_store.StoreKey,
    "inner"
)

In [13]:
fact_sales2.show()

+---------+---------+--------+----------+----------+--------+-----------+----------+------+------+---------+--------+
|UnitsSold|UnitPrice|Discount|  SaleDate|ProductKey|StoreKey|EmployeeKey|NetRevenue|Region|  Name|     Type|StoreKey|
+---------+---------+--------+----------+----------+--------+-----------+----------+------+------+---------+--------+
|     12.0|      0.0|     5.0|2022-12-14|        12|       7|         21|       0.0|  East|StoreX|Franchise|       7|
|      0.0|   272.49|     0.0|2023-02-24|         0|       9|         11|       0.0|  East|StoreZ|Franchise|       9|
|      0.0|   484.75|    15.0|2025-03-24|         7|      18|         14|       0.0| South|StoreX|   Retail|      18|
|      0.0|   205.74|    10.0|2023-09-30|         1|      12|         13|       0.0|  West|StoreY|   Outlet|      12|
|     46.0|    20.25|     5.0|2022-10-14|        15|      19|          2|   884.925|  East|StoreZ|   Outlet|      19|
|      0.0|   361.06|    10.0|2024-02-23|        15|    

In [21]:
total_revenue_by_store_region = fact_sales2.groupBy("Region") \
    .agg(sparkSum(col("UnitsSold")).alias("TotalUnitSold"), sparkSum(col("NetRevenue")).alias("TotalRevenue")) \
    .select([col("Region").alias("StoreRegion"),"TotalUnitSold", "TotalRevenue"])

In [22]:
total_revenue_by_store_region.show()

+-----------+-------------+------------------+
|StoreRegion|TotalUnitSold|      TotalRevenue|
+-----------+-------------+------------------+
|      South|        156.0|53107.997500000005|
|       East|        158.0|          18755.03|
|       West|          7.0|         1729.2485|
|      North|         64.0|         17629.491|
+-----------+-------------+------------------+



In [20]:
dim_employee = spark.read.parquet("spark-warehouse/dim_employee")

In [24]:
total_revenue_by_employee = fact_sales2.join(
    dim_employee,
    fact_sales.EmployeeKey == dim_employee.EmployeeKey,
    "inner"
).groupBy("SalesRep") \
    .agg(sparkSum(col("UnitsSold")).alias("TotalUnitSold"), sparkSum(col("NetRevenue")).alias("TotalRevenue")) \
    .select(["SalesRep", "TotalUnitSold", "TotalRevenue"])

In [25]:
total_revenue_by_employee.show()

+--------------+-------------+------------+
|      SalesRep|TotalUnitSold|TotalRevenue|
+--------------+-------------+------------+
|Wendy Castillo|         69.0|   4589.0635|
|  James Austin|         41.0|     2155.37|
|   John Harris|         78.0|  28929.8975|
|      Kyle Lin|          0.0|         0.0|
|    Kara Lewis|          0.0|         0.0|
| Emily Vazquez|         34.0|    7635.435|
|   Billy Perez|         59.0|     22845.0|
|   Kelly Moore|          0.0|         0.0|
|Charles Fields|         85.0|  23337.7525|
|   Martha Long|         19.0|   1729.2485|
+--------------+-------------+------------+

