#### Create CustomerType Dimension

In [1]:
import os

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import (col, monotonically_increasing_id, row_number)

In [2]:
# Reuse the active Spark session if one exists; otherwise start a new one
# registered under the application name "fact_mapping".
spark = (
    SparkSession.builder
    .appName("fact_mapping")
    .getOrCreate()
)

In [3]:
# Load the customer-type source extract. The first row supplies the column
# names and the column types are inferred from the data.
df_customer_type = spark.read.csv(
    "source/Dim_CustomerType.csv",
    header=True,
    inferSchema=True,
)
df_customer_type.show()

+------------+--------------+-------+
|CustomerType|CustomerTypeID|Segment|
+------------+--------------+-------+
|      Retail|             1|    B2C|
|   Wholesale|             2|    B2B|
+------------+--------------+-------+



In [4]:
# Assign a deterministic, dense surrogate key (0, 1, 2, ...) ordered by the
# source CustomerTypeID, then drop the source ID.
# monotonically_increasing_id() only guarantees unique, increasing values:
# they are not consecutive across partitions and may differ between runs, so
# re-running the overwrite could silently renumber the dimension.
# The window has no partitionBy, so Spark moves all rows to one partition;
# that is acceptable for a small dimension like this one.
df_customer_type2 = (
    df_customer_type
    .withColumn(
        "CustomerTypeKey",
        # row_number() is 1-based; subtract 1 to keep the 0-based keys, and
        # cast to long to keep the bigint type the previous key column had.
        (row_number().over(Window.orderBy(col("CustomerTypeID"))) - 1).cast("long"),
    )
    .drop("CustomerTypeID")
)

In [5]:
# One-row frame holding the "N/A" placeholder member with key -1
# (presumably the fallback for records without a known customer type — confirm).
df_customer_type_na = spark.createDataFrame(
    data=[("N/A", "N/A", -1)],
    schema=["CustomerType", "Segment", "CustomerTypeKey"],
)

In [6]:
# Append the "N/A" placeholder row to the keyed customer types.
# unionByName matches columns by name, so the result stays correct even if the
# two frames ever list their columns in a different order. unionAll is a
# deprecated alias of union and matches columns purely by position.
dim_customer_type = df_customer_type2.unionByName(df_customer_type_na)

In [7]:
# Persist the dimension as Parquet, replacing any previous contents at the path.
dim_customer_type.write.parquet("warehouse/dim_customer_type", mode="overwrite")

In [8]:
# Read the dimension back from the warehouse location it was just written to.
df_dim_customer_type = spark.read.format("parquet").load("warehouse/dim_customer_type")

In [9]:
# Print the reloaded dimension to check what was stored in Parquet.
df_dim_customer_type.show()

+------------+-------+---------------+
|CustomerType|Segment|CustomerTypeKey|
+------------+-------+---------------+
|      Retail|    B2C|              0|
|   Wholesale|    B2B|              1|
|         N/A|    N/A|             -1|
+------------+-------+---------------+



#### Additionally, write the final result to CSV.

In [11]:
# Export the dimension as a single CSV file via pandas. toPandas() collects
# every row to the driver, which is fine for a small dimension table.
# Create the target directory first: to_csv does not create missing parent
# directories and would otherwise fail on a fresh checkout.
os.makedirs("output_csv", exist_ok=True)
df_dim_customer_type.toPandas().to_csv("output_csv/dim_customer_type.csv", index=False)