#### Create StoreRegion Dimension

In [1]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import (col, monotonically_increasing_id, row_number)

In [2]:
# Reuse the active Spark session if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("fact_mapping")
    .getOrCreate()
)

In [3]:
# Load the raw store-region source file; the first row holds the column names
# and column types are inferred from the data.
df_store_region = spark.read.csv(
    "source/Dim_StoreRegion.csv",
    header=True,
    inferSchema=True,
)
df_store_region.show()

+-----------+--------+---------+
|StoreRegion|RegionID|Territory|
+-----------+--------+---------+
|      North|       1|        A|
|      South|       2|        B|
|       East|       3|        C|
|       West|       4|        D|
+-----------+--------+---------+



In [4]:
# Assign the surrogate key from a row number ordered by the source RegionID so
# keys are consecutive and reproducible across runs. monotonically_increasing_id()
# only guarantees unique, increasing values, which can vary with partitioning.
# "- 1" keeps the keys 0-based and the cast keeps the column a 64-bit long.
# Note: a window without partitionBy moves all rows into a single partition,
# which is acceptable for a small dimension table.
df_store_region2 = (
    df_store_region
    .withColumn(
        "StoreRegionKey",
        (row_number().over(Window.orderBy("RegionID")) - 1).cast("long"),
    )
    .drop("RegionID")
)

In [5]:
# Single placeholder row ("N/A", key -1) to be appended to the dimension.
df_store_region_na = spark.createDataFrame(
    [("N/A", "N/A", -1)],
    schema=["StoreRegion", "Territory", "StoreRegionKey"],
)

In [6]:
# Append the "N/A" placeholder row to the keyed regions. Columns are matched by
# name rather than by position, so a change in column order in either frame
# cannot silently misalign the data.
dim_store_region = df_store_region2.unionByName(df_store_region_na)

In [7]:
# Persist the dimension as parquet, replacing any previous contents at the path.
dim_store_region.write.parquet(
    "warehouse/dim_store_region",
    mode="overwrite",
)

In [8]:
# Read the dimension back from the warehouse to inspect what was written.
df_dim_store_region = (
    spark.read
    .format("parquet")
    .load("warehouse/dim_store_region")
)

In [9]:
# Display the dimension as read back from the parquet files.
df_dim_store_region.show()

+-----------+---------+--------------+
|StoreRegion|Territory|StoreRegionKey|
+-----------+---------+--------------+
|      North|        A|             0|
|      South|        B|             1|
|       East|        C|             2|
|       West|        D|             3|
|        N/A|      N/A|            -1|
+-----------+---------+--------------+



#### Save to CSV

In [10]:
# Convert the dimension to a pandas DataFrame on the driver and export it as a
# single CSV file without the pandas index column.
(
    dim_store_region
    .toPandas()
    .to_csv("output_csv/dim_store_region.csv", index=False)
)