In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (col, hash as hashFcn, upper, year, min as minValue, max as maxValue, date_format, year, month, quarter, when, regexp_replace) 
from pyspark.sql.types import (StructType, StructField, StringType, IntegerType, LongType, DoubleType, DateType)
from datetime import datetime, timedelta

In [0]:
# Obtain the Spark session for this notebook (app name "Day2"), creating it if none exists.
spark = (
    SparkSession.builder
    .appName("Day2")
    .getOrCreate()
)

In [0]:
# Load the raw sales CSV: the first line holds the column names and the
# column types are inferred from the data.
df_sales = spark.read.csv(
    "/FileStore/tables/fact_sales-2.csv",
    header=True,
    inferSchema=True,
)

In [0]:
# Preview the raw sales rows loaded from the CSV.
display(df_sales)

Region,ProductCategory,ProductSubCategory,SalesChannel,CustomerSegment,SalesRep,StoreType,SalesDate,UnitsSold,Revenue
North,Electronics,Mobile,Online,Consumer,Alice,Urban,2023-07-21,10,1000
South,Clothing,Shirt,Offline,Corporate,Bob,Suburban,2024-01-28,20,1500
East,Furniture,Table,Retail,Home Office,Charlie,Rural,2025-02-22,15,1200
West,Groceries,Vegetables,Wholesale,Small Business,Diana,Metro,2023-01-24,30,3000
North,Electronics,Mobile,Online,Consumer,Alice,Urban,2025-01-31,12,1100
South,Clothing,Shirt,Offline,Corporate,Bob,Suburban,2023-06-27,25,2500
East,Furniture,Table,Retail,Home Office,Charlie,Rural,2022-06-02,22,2200
West,Groceries,Vegetables,Wholesale,Small Business,Diana,Metro,2024-12-29,18,1800
North,Electronics,Mobile,Online,Consumer,Alice,Urban,2025-06-04,10,1000
South,Clothing,Shirt,Offline,Corporate,Bob,Suburban,2024-03-16,20,1500


##### Create Dimension Region 

In [0]:
# Distinct region names that occur in the sales data.
df_region = df_sales.select(col("Region")).distinct()

In [0]:
# Preview the distinct region names.
display(df_region)

Region
South
East
West
North


In [0]:
# Surrogate key: Spark hash of the upper-cased region name, cast to bigint.
df_regionAll = df_region.withColumn(
    "RegionKey",
    hashFcn(upper(col("Region"))).cast("bigint"),
)

In [0]:
# Default member for unknown/null regions: label "N/A" with key -1.
df_na_region = spark.createDataFrame(
    [("N/A", -1)],
    ["Region", "RegionKey"],
)
# Append the default member to the hashed regions (columns are matched by position).
df_DIM_region = df_regionAll.union(df_na_region)

In [0]:
# Persist the region dimension as a Delta table, replacing any previous contents.
(
    df_DIM_region.write
    .format("delta")
    .mode("overwrite")
    .save("/FileStore/tables/DIM_Region")
)

In [0]:
# Load the Delta table back to confirm the write worked.
delta_dim_region = spark.read.load(
    "/FileStore/tables/DIM_Region",
    format="delta",
)

In [0]:
# Preview the region dimension as read back from the Delta table.
display(delta_dim_region)

Region,RegionKey
South,-96241737
East,-1592160435
West,-402154387
North,1869582694
,-1


##### Create Dimensions ProductCategory and ProductSubCategory

In [0]:
# Distinct product categories, exposed under the column name "Title".
df_category = df_sales.select(col("ProductCategory").alias("Title")).distinct()

In [0]:
# Surrogate key: Spark hash of the upper-cased category title, cast to bigint.
df_category2 = df_category.withColumn(
    "CategoryKey",
    hashFcn(upper(col("Title"))).cast("bigint"),
)

In [0]:
# Default member for unknown/null categories: label "N/A" with key -1.
df_cat_na = spark.createDataFrame(
    [("N/A", -1)],
    ["Title", "CategoryKey"],
)
# Append the default member to the hashed categories (columns are matched by position).
df_DIM_category = df_category2.union(df_cat_na)

In [0]:
# Preview the category dimension, including its "N/A" / -1 default row.
display(df_DIM_category)

Title,CategoryKey
Groceries,963552435
Electronics,1422188909
Clothing,218386994
Furniture,-871983438
,-1


In [0]:
# Persist the category dimension as a Delta table, replacing any previous contents.
(
    df_DIM_category.write
    .format("delta")
    .mode("overwrite")
    .save("/FileStore/tables/DIM_Category")
)

In [0]:
# Distinct product sub-categories, exposed under the column name "Title".
df_subcategory = df_sales.select(col("ProductSubCategory").alias("Title")).distinct()

In [0]:
# Surrogate key: Spark hash of the upper-cased sub-category title, cast to bigint.
df_subcategory2 = df_subcategory.withColumn(
    "SubCategoryKey",
    hashFcn(upper(col("Title"))).cast("bigint"),
)

In [0]:
# Default member for unknown/null sub-categories: label "N/A" with key -1.
df_na_subcat = spark.createDataFrame(
    [("N/A", -1)],
    ["Title", "SubCategoryKey"],
)
# Append the default member to the hashed sub-categories (columns are matched by position).
df_DIM_subcategory = df_subcategory2.union(df_na_subcat)

In [0]:
# Preview the sub-category dimension, including its "N/A" / -1 default row.
display(df_DIM_subcategory)

Title,SubCategoryKey
Vegetables,-167645305
Table,-1029118937
Shirt,-686888261
Mobile,-1314911383
,-1


In [0]:
# Persist the sub-category dimension as a Delta table, replacing any previous contents.
(
    df_DIM_subcategory.write
    .format("delta")
    .mode("overwrite")
    .save("/FileStore/tables/DIM_SubCategory")
)

##### Create Dimensions SalesChannel, CustomerSegment, SalesRep, StoreType

In [0]:
# One single-column frame of distinct values for each remaining dimension.
# The source column is renamed to the dimension's label column ("Name" or "Segment").
df_sales_channel = df_sales.select(col("SalesChannel").alias("Name")).distinct()

df_customer_segment = df_sales.select(col("CustomerSegment").alias("Segment")).distinct()

df_sales_rep = df_sales.select(col("SalesRep").alias("Name")).distinct()

df_store_type = df_sales.select(col("StoreType").alias("Name")).distinct()

In [0]:
# Add each dimension's surrogate key: Spark hash of the upper-cased label, cast to bigint.
df_sales_channel2 = df_sales_channel.withColumn(
    "SalesChannelKey",
    hashFcn(upper(col("Name"))).cast("bigint"),
)

df_customer_segment2 = df_customer_segment.withColumn(
    "CustomerSegmentKey",
    hashFcn(upper(col("Segment"))).cast("bigint"),
)

df_sales_rep2 = df_sales_rep.withColumn(
    "SalesRepKey",
    hashFcn(upper(col("Name"))).cast("bigint"),
)

df_store_type2 = df_store_type.withColumn(
    "StoreTypeKey",
    hashFcn(upper(col("Name"))).cast("bigint"),
)

In [0]:
# Give every dimension a default member (label "N/A", key -1) for unknown/null values.
# unionByName matches columns by name rather than by position, so a misnamed
# key column in a default row raises an error instead of being silently
# aligned with whatever column happens to sit in the same position.
df_sales_channel_na = spark.createDataFrame(
    [("N/A", -1)],
    ["Name", "SalesChannelKey"],
)
df_DIM_sales_channel = df_sales_channel2.unionByName(df_sales_channel_na)

# The key column was previously misspelled "CsutomerSegmentKey"; the positional
# union hid the typo because the result takes its column names from the left side.
df_customer_segment_na = spark.createDataFrame(
    [("N/A", -1)],
    ["Segment", "CustomerSegmentKey"],
)
df_DIM_customer_segment = df_customer_segment2.unionByName(df_customer_segment_na)

df_sales_rep_na = spark.createDataFrame(
    [("N/A", -1)],
    ["Name", "SalesRepKey"],
)
df_DIM_sales_rep = df_sales_rep2.unionByName(df_sales_rep_na)

df_store_type_na = spark.createDataFrame(
    [("N/A", -1)],
    ["Name", "StoreTypeKey"],
)
df_DIM_store_type = df_store_type2.unionByName(df_store_type_na)

In [0]:
# Preview the four finished dimensions, each including its "N/A" / -1 default row.
display(df_DIM_sales_channel)
display(df_DIM_customer_segment)
display(df_DIM_sales_rep)
display(df_DIM_store_type)

Name,SalesChannelKey
Wholesale,2128323039
Online,2000399776
Retail,-2034590472
Offline,1815579165
,-1


Segment,CustomerSegmentKey
Consumer,-1594651443
Home Office,-951279011
Corporate,-7382621
Small Business,911933479
,-1


Name,SalesRepKey
Diana,-1866580689
Charlie,591137949
Bob,-392921166
Alice,1352887388
,-1


Name,StoreTypeKey
Urban,-1427273716
Suburban,-1640598988
Rural,-817774136
Metro,1735384348
,-1


In [0]:
# Persist each of the four dimensions as its own Delta table, replacing any previous contents.
for dim_df, delta_path in [
    (df_DIM_sales_channel, "/FileStore/tables/DIM_sales_channel"),
    (df_DIM_customer_segment, "/FileStore/tables/DIM_customer_segment"),
    (df_DIM_sales_rep, "/FileStore/tables/DIM_sales_rep"),
    (df_DIM_store_type, "/FileStore/tables/DIM_store_type"),
]:
    dim_df.write.format("delta").mode("overwrite").save(delta_path)

##### Create Date Dimension

In [0]:
# Distinct calendar years that occur in the sales dates, as an int column named "Year".
df_unique_years = df_sales.select(
    year(col('SalesDate')).alias('Year').cast("int")
).distinct()

In [0]:
# Get min and max years
aggYear = df_unique_years.agg(
    minValue(col("Year")).alias("MinYear"),
    maxValue(col("Year")).alias("MaxYear"),
)

# Fetch the single aggregate row once: each collect()/first() call triggers
# its own Spark job, so reading the two values separately ran the aggregation twice.
year_bounds = aggYear.first()
minYear = year_bounds["MinYear"]
maxYear = year_bounds["MaxYear"]

# With no non-null SalesDate values both aggregates are None; fail here with a
# clear message rather than later when the years are used to build dates.
if minYear is None or maxYear is None:
    raise ValueError("Cannot build the date dimension: df_sales has no non-null SalesDate values")


In [0]:
# Schema of the base date rows: integer yyyymmdd key plus the calendar date.
date_schema = StructType([
    StructField("DateKey", LongType(), True),
    StructField("Date", DateType(), True),
])

# Calendar range: 1 January of the earliest sales year through 31 December of the latest.
start_date = datetime(minYear, 1, 1)
end_date = datetime(maxYear, 12, 31)

# Build one (DateKey, Date) row per calendar day on the driver using the datetime library.
date_rows = []
current_date = start_date
while current_date <= end_date:
    date_key = int(current_date.strftime('%Y%m%d'))
    date_rows.append((date_key, current_date.date()))
    current_date += timedelta(days=1)

# Default member for unknown dates. The fact table assigns SalesDateKey = -1 to
# rows whose SalesDate is null, so the date dimension needs a matching -1 row,
# just like the other dimensions have their "N/A" / -1 row.
date_rows.append((-1, None))

df_date = spark.createDataFrame(date_rows, date_schema)

is_unknown_date = col("Date").isNull()
month_number = month(col("Date"))

# Year and Quarter stay null for the default member. The text attributes are
# labelled "N/A" explicitly; without the guard, the null month would fall
# through every comparison and the default row would be labelled "Summer".
df_DIM_date = (
    df_date
    .withColumn("MonthName", when(is_unknown_date, "N/A").otherwise(date_format(col("Date"), "MMMM")))
    .withColumn("Year", year(col("Date")))
    .withColumn("Quarter", quarter(col("Date")))
    .withColumn(
        "semester",
        when(is_unknown_date, "N/A")
        .when(month_number >= 8, "Fall")
        .when(month_number <= 5, "Spring")
        .otherwise("Summer"),
    )
)

In [0]:
# Persist the date dimension as a Delta table, replacing any previous contents.
(
    df_DIM_date.write
    .format("delta")
    .mode("overwrite")
    .save("/FileStore/tables/DIM_date")
)

#### Create Fact Table With DIM Keys

In [0]:
def _dim_key(source_col):
    """Return the bigint dimension key expression for one sales attribute.

    Args:
        source_col: Name of the attribute column in df_sales.

    Returns:
        A Column that is -1 when the attribute is null and otherwise the Spark
        hash of its upper-cased value, cast to bigint. This is the same
        expression the dimension cells use to build their key columns.
    """
    return (
        when(col(source_col).isNull(), -1)
        .otherwise(hashFcn(upper(col(source_col))))
        .cast("bigint")
    )


# (fact key column, source attribute) pairs, in the order the keys appear in the fact table.
dim_key_sources = [
    ("DIM_RegionKey", "Region"),
    ("DIM_CategoryKey", "ProductCategory"),
    ("DIM_SubCategoryKey", "ProductSubCategory"),
    ("DIM_SalesChannelKey", "SalesChannel"),
    ("DIM_CustomerSegmentKey", "CustomerSegment"),
    ("DIM_SalesRepKey", "SalesRep"),
    ("DIM_StoreTypeKey", "StoreType"),
]

# yyyymmdd integer key, -1 for a missing date. The date is formatted explicitly
# instead of stripping "-" from its implicit string form: that shortcut only
# yields pure digits when SalesDate is a plain date, whereas a timestamp would
# keep its time part and cast to null. The column type depends on inferSchema.
sales_date_key = (
    when(col("SalesDate").isNull(), -1)
    .otherwise(date_format(col("SalesDate"), "yyyyMMdd").cast("bigint"))
    .cast("bigint")
)

# Add the key columns in order, then drop the descriptive attributes they replace.
df_fact2 = df_sales
for key_name, source_col in dim_key_sources:
    df_fact2 = df_fact2.withColumn(key_name, _dim_key(source_col))

df_fact2 = (
    df_fact2
    .withColumn("SalesDateKey", sales_date_key)
    .drop(*[source_col for _, source_col in dim_key_sources], "SalesDate")
)


In [0]:
# Preview the fact table: the measures plus the dimension keys and the date key.
display(df_fact2)

UnitsSold,Revenue,DIM_RegionKey,DIM_CategoryKey,DIM_SubCategoryKey,DIM_SalesChannelKey,DIM_CustomerSegmentKey,DIM_SalesRepKey,DIM_StoreTypeKey,SalesDateKey
10,1000,1869582694,1422188909,-1314911383,2000399776,-1594651443,1352887388,-1427273716,20230721
20,1500,-96241737,218386994,-686888261,1815579165,-7382621,-392921166,-1640598988,20240128
15,1200,-1592160435,-871983438,-1029118937,-2034590472,-951279011,591137949,-817774136,20250222
30,3000,-402154387,963552435,-167645305,2128323039,911933479,-1866580689,1735384348,20230124
12,1100,1869582694,1422188909,-1314911383,2000399776,-1594651443,1352887388,-1427273716,20250131
25,2500,-96241737,218386994,-686888261,1815579165,-7382621,-392921166,-1640598988,20230627
22,2200,-1592160435,-871983438,-1029118937,-2034590472,-951279011,591137949,-817774136,20220602
18,1800,-402154387,963552435,-167645305,2128323039,911933479,-1866580689,1735384348,20241229
10,1000,1869582694,1422188909,-1314911383,2000399776,-1594651443,1352887388,-1427273716,20250604
20,1500,-96241737,218386994,-686888261,1815579165,-7382621,-392921166,-1640598988,20240316


In [0]:
# Persist the fact table as a Delta table, replacing any previous contents.
(
    df_fact2.write
    .format("delta")
    .mode("overwrite")
    .save("/FileStore/tables/FACT_sales")
)