#### Create ProductCategory Dimension

In [1]:
import os

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import (col, monotonically_increasing_id, row_number)

In [2]:
# Obtain the Spark session for this notebook (an already-running session is reused).
# NOTE(review): the app name says "fact_mapping" although this notebook builds a
# dimension — confirm whether that is intentional.
spark = (
    SparkSession.builder
    .appName("fact_mapping")
    .getOrCreate()
)

In [3]:
# Load the raw product-category list; the first row supplies the column names
# and column types are inferred from the data.
df_category = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("source/Dim_ProductCategory.csv")
)
df_category.show()

+---------------+-----------------+-------------+
|ProductCategory|ProductCategoryID|CategoryGroup|
+---------------+-----------------+-------------+
|         Laptop|              101|    Computing|
|         Mobile|              102|  Electronics|
|         Tablet|              103|    Computing|
+---------------+-----------------+-------------+



In [4]:
# Give every category a surrogate key and drop the source-system ID.
# row_number() over an explicit ordering produces consecutive, repeatable keys
# (0, 1, 2, ...), whereas monotonically_increasing_id() only guarantees unique,
# increasing values that depend on how the data is partitioned.
# Keys start at 0, so they never collide with the -1 used for the "N/A" member.
# The window has no partitionBy, so all rows are processed in a single
# partition — acceptable for a small dimension table.
category_order = Window.orderBy(col("ProductCategoryID"))
df_category2 = (
    df_category
    .withColumn(
        "ProductCategoryKey",
        # Cast to long so the column type matches what monotonically_increasing_id() produced.
        (row_number().over(category_order) - 1).cast("long"),
    )
    .drop("ProductCategoryID")
)

In [5]:
# Single-row frame holding the "N/A" placeholder member (key -1).
na_columns = ["ProductCategory", "CategoryGroup", "ProductCategoryKey"]
na_rows = [("N/A", "N/A", -1)]
df_category_na = spark.createDataFrame(na_rows, na_columns)

In [6]:
# Append the "N/A" placeholder member to the keyed categories.
# Columns are matched by name rather than by position, so a different column
# order in the source file cannot silently misalign the two frames.
dim_product_category = df_category2.unionByName(df_category_na)

In [7]:
# Persist the dimension as Parquet, replacing the output of any previous run.
dim_product_category.write.parquet("warehouse/dim_product_category", mode="overwrite")

In [8]:
# Read the dimension back from the Parquet location written above.
df_dim_product_category = (
    spark.read
    .format("parquet")
    .load("warehouse/dim_product_category")
)

In [9]:
# Preview the stored dimension (defaults spelled out: up to 20 rows, long values truncated).
df_dim_product_category.show(n=20, truncate=True)

+---------------+-------------+------------------+
|ProductCategory|CategoryGroup|ProductCategoryKey|
+---------------+-------------+------------------+
|         Laptop|    Computing|                 0|
|         Mobile|  Electronics|                 1|
|         Tablet|    Computing|                 2|
|            N/A|          N/A|                -1|
+---------------+-------------+------------------+



#### Additionally, write the final result to CSV

In [10]:
# Export the dimension as a single CSV file via pandas.
# pandas does not create missing parent directories, so make sure the target
# folder exists first; otherwise this cell fails on a fresh checkout.
os.makedirs("output_csv", exist_ok=True)
# toPandas() collects the whole table onto the driver — fine for a small dimension.
df_dim_product_category.toPandas().to_csv("output_csv/dim_product_category.csv", index=False)