In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (col, hash as hashFcn, upper, year, min as minValue, max as maxValue, sum as sparkSum, date_format, year, month, quarter, when, regexp_replace) 

In [0]:
# Get the active Spark session, or create one registered under the app name "Day2".
spark = (
    SparkSession.builder
    .appName("Day2")
    .getOrCreate()
)

### Basic Inner Joins

In [0]:
# Sales fact table, read from its Delta location.
df_fact = (
    spark.read
    .format("delta")
    .load("/FileStore/tables/FACT_sales")
)

In [0]:
# Render the fact table that was just loaded.
display(df_fact)

UnitsSold,Revenue,DIM_RegionKey,DIM_CategoryKey,DIM_SubCategoryKey,DIM_SalesChannelKey,DIM_CustomerSegmentKey,DIM_SalesRepKey,DIM_StoreTypeKey,SalesDateKey
10,1000,1869582694,1422188909,-1314911383,2000399776,-1594651443,1352887388,-1427273716,20230721
20,1500,-96241737,218386994,-686888261,1815579165,-7382621,-392921166,-1640598988,20240128
15,1200,-1592160435,-871983438,-1029118937,-2034590472,-951279011,591137949,-817774136,20250222
30,3000,-402154387,963552435,-167645305,2128323039,911933479,-1866580689,1735384348,20230124
12,1100,1869582694,1422188909,-1314911383,2000399776,-1594651443,1352887388,-1427273716,20250131
25,2500,-96241737,218386994,-686888261,1815579165,-7382621,-392921166,-1640598988,20230627
22,2200,-1592160435,-871983438,-1029118937,-2034590472,-951279011,591137949,-817774136,20220602
18,1800,-402154387,963552435,-167645305,2128323039,911933479,-1866580689,1735384348,20241229
10,1000,1869582694,1422188909,-1314911383,2000399776,-1594651443,1352887388,-1427273716,20250604
20,1500,-96241737,218386994,-686888261,1815579165,-7382621,-392921166,-1640598988,20240316


In [0]:
# Region dimension table, read from its Delta location.
df_dim_region = (
    spark.read
    .format("delta")
    .load("/FileStore/tables/DIM_Region")
)

In [0]:
# Inner join: keep fact rows whose DIM_RegionKey matches a RegionKey in the
# region dimension, and append that dimension's columns to them.
df_fact2 = df_fact.join(
    df_dim_region,
    on=df_dim_region.RegionKey == df_fact.DIM_RegionKey,
    how="inner",
)

In [0]:
# Render the fact rows joined with the region dimension.
display(df_fact2)

UnitsSold,Revenue,DIM_RegionKey,DIM_CategoryKey,DIM_SubCategoryKey,DIM_SalesChannelKey,DIM_CustomerSegmentKey,DIM_SalesRepKey,DIM_StoreTypeKey,SalesDateKey,Region,RegionKey
10,1000,1869582694,1422188909,-1314911383,2000399776,-1594651443,1352887388,-1427273716,20230721,North,1869582694
20,1500,-96241737,218386994,-686888261,1815579165,-7382621,-392921166,-1640598988,20240128,South,-96241737
15,1200,-1592160435,-871983438,-1029118937,-2034590472,-951279011,591137949,-817774136,20250222,East,-1592160435
30,3000,-402154387,963552435,-167645305,2128323039,911933479,-1866580689,1735384348,20230124,West,-402154387
12,1100,1869582694,1422188909,-1314911383,2000399776,-1594651443,1352887388,-1427273716,20250131,North,1869582694
25,2500,-96241737,218386994,-686888261,1815579165,-7382621,-392921166,-1640598988,20230627,South,-96241737
22,2200,-1592160435,-871983438,-1029118937,-2034590472,-951279011,591137949,-817774136,20220602,East,-1592160435
18,1800,-402154387,963552435,-167645305,2128323039,911933479,-1866580689,1735384348,20241229,West,-402154387
10,1000,1869582694,1422188909,-1314911383,2000399776,-1594651443,1352887388,-1427273716,20250604,North,1869582694
20,1500,-96241737,218386994,-686888261,1815579165,-7382621,-392921166,-1640598988,20240316,South,-96241737


In [0]:
# Category dimension table, read from its Delta location.
df_dim_product_category = (
    spark.read
    .format("delta")
    .load("/FileStore/tables/DIM_Category")
)

# Inner join: keep fact rows whose DIM_CategoryKey matches a CategoryKey in the
# category dimension, and append that dimension's columns to them.
df_fact3 = df_fact.join(
    df_dim_product_category,
    on=df_dim_product_category.CategoryKey == df_fact.DIM_CategoryKey,
    how="inner",
)

In [0]:
# Render the fact rows joined with the category dimension.
display(df_fact3)

UnitsSold,Revenue,DIM_RegionKey,DIM_CategoryKey,DIM_SubCategoryKey,DIM_SalesChannelKey,DIM_CustomerSegmentKey,DIM_SalesRepKey,DIM_StoreTypeKey,SalesDateKey,Title,CategoryKey
10,1000,1869582694,1422188909,-1314911383,2000399776,-1594651443,1352887388,-1427273716,20230721,Electronics,1422188909
20,1500,-96241737,218386994,-686888261,1815579165,-7382621,-392921166,-1640598988,20240128,Clothing,218386994
15,1200,-1592160435,-871983438,-1029118937,-2034590472,-951279011,591137949,-817774136,20250222,Furniture,-871983438
30,3000,-402154387,963552435,-167645305,2128323039,911933479,-1866580689,1735384348,20230124,Groceries,963552435
12,1100,1869582694,1422188909,-1314911383,2000399776,-1594651443,1352887388,-1427273716,20250131,Electronics,1422188909
25,2500,-96241737,218386994,-686888261,1815579165,-7382621,-392921166,-1640598988,20230627,Clothing,218386994
22,2200,-1592160435,-871983438,-1029118937,-2034590472,-951279011,591137949,-817774136,20220602,Furniture,-871983438
18,1800,-402154387,963552435,-167645305,2128323039,911933479,-1866580689,1735384348,20241229,Groceries,963552435
10,1000,1869582694,1422188909,-1314911383,2000399776,-1594651443,1352887388,-1427273716,20250604,Electronics,1422188909
20,1500,-96241737,218386994,-686888261,1815579165,-7382621,-392921166,-1640598988,20240316,Clothing,218386994


### Analytics

In [0]:
# List the entries stored under the tables folder and print each entry's path.
# The folder is given as an absolute path so it agrees with every other
# "/FileStore/tables/..." location used in this notebook.
files = dbutils.fs.ls("/FileStore/tables")
for file in files:
    print(file.path)

dbfs:/FileStore/tables/DIM_Category/
dbfs:/FileStore/tables/DIM_Region/
dbfs:/FileStore/tables/DIM_SubCategory/
dbfs:/FileStore/tables/DIM_customer_segment/
dbfs:/FileStore/tables/DIM_date/
dbfs:/FileStore/tables/DIM_sales_channel/
dbfs:/FileStore/tables/DIM_sales_rep/
dbfs:/FileStore/tables/DIM_store_type/
dbfs:/FileStore/tables/FACT_sales/
dbfs:/FileStore/tables/Read_CSV_Example.ipynb
dbfs:/FileStore/tables/df_DIM_sales_rep/
dbfs:/FileStore/tables/fact_sales-1.csv
dbfs:/FileStore/tables/fact_sales-2.csv
dbfs:/FileStore/tables/fact_sales.csv
dbfs:/FileStore/tables/students.csv


In [0]:
# Units sold and revenue, totalled per sales representative.
# NOTE(review): the result variable is called "df_sales_by_customer", but the
# grouping key is the Name column brought in by the sales-rep dimension join.
df_sales_rep = (
    spark.read
    .format("delta")
    .load("/FileStore/tables/DIM_sales_rep")
)

df_sales_by_customer = (
    df_fact
    .join(
        df_sales_rep,
        on=df_sales_rep.SalesRepKey == df_fact.DIM_SalesRepKey,
        how="inner",
    )
    .groupBy("Name")
    .agg(
        sparkSum(col("UnitsSold")).alias("TotalUnitSold"),
        sparkSum(col("Revenue")).alias("TotalRevenue"),
    )
    .select("Name", "TotalUnitSold", "TotalRevenue")
)

display(df_sales_by_customer)


Name,TotalUnitSold,TotalRevenue
Diana,174,17400
Charlie,126,11400
Bob,180,16000
Alice,88,8400


In [0]:
# Units sold and revenue, totalled per store type and year.
df_store_type = (
    spark.read
    .format("delta")
    .load("/FileStore/tables/DIM_store_type")
)
df_date = (
    spark.read
    .format("delta")
    .load("/FileStore/tables/DIM_date")
)

df_sales_by_store_type = (
    df_fact
    # Resolve the store-type key on each fact row to the store-type dimension.
    .join(
        df_store_type,
        on=df_store_type.StoreTypeKey == df_fact.DIM_StoreTypeKey,
        how="inner",
    )
    # Resolve the sales date key to the date dimension, which supplies Year.
    .join(
        df_date,
        on=df_date.DateKey == df_fact.SalesDateKey,
        how="inner",
    )
    .groupBy("Name", "Year")
    .agg(
        sparkSum(col("UnitsSold")).alias("TotalUnitSold"),
        sparkSum(col("Revenue")).alias("TotalRevenue"),
    )
    # Publish the grouped Name column under the label StoreType.
    .select(
        col("Name").alias("StoreType"),
        "Year",
        "TotalUnitSold",
        "TotalRevenue",
    )
)

display(df_sales_by_store_type)


StoreType,Year,TotalUnitSold,TotalRevenue
Urban,2021,22,2100
Urban,2020,12,1100
Rural,2024,15,1200
Suburban,2024,40,3000
Rural,2025,37,3400
Metro,2024,66,6600
Suburban,2023,25,2500
Urban,2025,22,2100
Suburban,2022,70,6500
Suburban,2021,45,4000


In [0]:
# Units sold and revenue per sales channel and sub-category, restricted to the
# "Mobile" and "Vegetables" sub-categories.

df_saleschannel = spark.read.format("delta").load("/FileStore/tables/DIM_sales_channel")
df_subcategory = spark.read.format("delta").load("/FileStore/tables/DIM_SubCategory")

# Fix: the sales-channel dimension used to be aliased "storeType" and its Name
# column was published as "StoreType", which mislabelled sales-channel names as
# store types. The alias and the output column now say what the data is.
df_sales_by_saleschannel_subcat_mobile = (
    df_fact
    .join(
        df_saleschannel.alias("salesChannel"),
        df_saleschannel.SalesChannelKey == df_fact.DIM_SalesChannelKey,
        "inner",
    )
    .join(
        df_subcategory.alias("subCategory"),
        df_subcategory.SubCategoryKey == df_fact.DIM_SubCategoryKey,
        "inner",
    )
    # The filter is on a grouping key, so applying it before the aggregation
    # yields the same groups while aggregating only the rows of interest.
    .filter(col("subCategory.Title").isin("Mobile", "Vegetables"))
    .groupBy(col("salesChannel.Name"), col("subCategory.Title"))
    .agg(
        sparkSum(col("UnitsSold")).alias("TotalUnitSold"),
        sparkSum(col("Revenue")).alias("TotalRevenue"),
    )
    .select(
        col("salesChannel.Name").alias("SalesChannel"),
        col("subCategory.Title").alias("SubCategory"),
        "TotalUnitSold",
        "TotalRevenue",
    )
)

display(df_sales_by_saleschannel_subcat_mobile)

StoreType,SubCategory,TotalUnitSold,TotalRevenue
Online,Mobile,88,8400
Wholesale,Vegetables,174,17400
