### Create Custom Mapping Dimension

In [28]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (col, monotonically_increasing_id, count as sparkCount, when)

In [21]:
# Obtain the Spark session for this notebook (an already-running session is reused).
session_builder = SparkSession.builder.appName("fact_mapping")
spark = session_builder.getOrCreate()

In [22]:
# Load the raw mapping rules; the first CSV line holds the column names and
# column types are inferred from the data.
df_custom_mapping = spark.read.csv(
    "source/Custom_Mapping_DIM.csv",
    header=True,
    inferSchema=True,
)
df_custom_mapping.show()

+---------------+-----------+------------+-----------------+
|ProductCategory|StoreRegion|CustomerType|     MappingLabel|
+---------------+-----------+------------+-----------------+
|         Tablet|       East|      Retail|        Side Head|
|         Laptop|       West|      Retail|      Premium Box|
|         Mobile|      North|   Wholesale|North Dist Mobile|
|         Tablet|      South|      Retail|      Tablet Push|
|         Laptop|      North|   Wholesale|    Laptop Supply|
|         Laptop|        ANY|      Retail|   Regular Laptop|
|            ANY|        ANY|   Wholesale| Normal Wholesale|
|            ANY|        ANY|      Retail|    Normal Retail|
+---------------+-----------+------------+-----------------+



#### Ensure that the mappings are unique

In [23]:
# Count the MappingLabel values recorded for every
# (ProductCategory, StoreRegion, CustomerType) combination.
# NOTE: count() over a column skips nulls, so rows with a null MappingLabel
# do not contribute to "count".
mapping_key_columns = ["ProductCategory", "StoreRegion", "CustomerType"]
label_count = sparkCount("MappingLabel").alias("count")
df_custom_mapping_check = df_custom_mapping.groupBy(*mapping_key_columns).agg(label_count)

In [24]:
# Number of key combinations that carry more than one mapping label.
duplicated_combinations = df_custom_mapping_check.where(col("count") > 1)
count_duplicate = duplicated_combinations.count()

In [25]:
# Only warns; the notebook keeps running even when duplicates are present.
has_duplicate_mappings = count_duplicate > 0
if has_duplicate_mappings:
    print("Handle Duplicate Mapping combinations")

#### Add Priority for common mappings (handling ANY)

In [29]:
# Priority = 1 + number of key columns that are not a concrete (non-"ANY") value.
# Fully specific rules therefore get 1, and each "ANY" wildcard adds 1.
product_flag, region_flag, customer_flag = [
    when(col(key_column) != "ANY", 0).otherwise(1)
    for key_column in ("ProductCategory", "StoreRegion", "CustomerType")
]
priority_expr = 1 + product_flag + region_flag + customer_flag

df_custom_mapping2 = df_custom_mapping.withColumn("Priority", priority_expr)
df_custom_mapping2.show()

+---------------+-----------+------------+-----------------+--------+
|ProductCategory|StoreRegion|CustomerType|     MappingLabel|Priority|
+---------------+-----------+------------+-----------------+--------+
|         Tablet|       East|      Retail|        Side Head|       1|
|         Laptop|       West|      Retail|      Premium Box|       1|
|         Mobile|      North|   Wholesale|North Dist Mobile|       1|
|         Tablet|      South|      Retail|      Tablet Push|       1|
|         Laptop|      North|   Wholesale|    Laptop Supply|       1|
|         Laptop|        ANY|      Retail|   Regular Laptop|       2|
|            ANY|        ANY|   Wholesale| Normal Wholesale|       3|
|            ANY|        ANY|      Retail|    Normal Retail|       3|
+---------------+-----------+------------+-----------------+--------+



#### Add Surrogate key and N/A for Mapping Table

In [32]:
# Surrogate key for the dimension. monotonically_increasing_id() yields unique,
# increasing 64-bit ids; they are not guaranteed to be consecutive.
surrogate_key = monotonically_increasing_id()
df_custom_mapping3 = df_custom_mapping2.withColumn("CustomMappingKey", surrogate_key)

In [33]:
# Placeholder "N/A" member of the dimension (Priority 9999, surrogate key -1).
# The column names are listed in exactly the same order as df_custom_mapping3,
# because the two frames are combined by position, not by name. Previously only
# five names were given for six values and their order did not match the target
# frame, so 9999 was labelled "CustomMappingKey" and -1 received an
# auto-generated name.
df_mapping_na = spark.createDataFrame(
    [("N/A", "N/A", "N/A", "N/A", 9999, -1)],
    [
        "ProductCategory",
        "StoreRegion",
        "CustomerType",
        "MappingLabel",
        "Priority",
        "CustomMappingKey",
    ],
)

In [34]:
# Append the N/A placeholder row. union() resolves columns by position (like
# SQL UNION ALL), and the result keeps the column names of df_custom_mapping3.
dim_custom_mapping = df_custom_mapping3.union(df_mapping_na)

In [35]:
# Preview the assembled dimension (first 20 rows, long values truncated).
dim_custom_mapping.show(n=20, truncate=True)

+---------------+-----------+------------+-----------------+--------+----------------+
|ProductCategory|StoreRegion|CustomerType|     MappingLabel|Priority|CustomMappingKey|
+---------------+-----------+------------+-----------------+--------+----------------+
|         Tablet|       East|      Retail|        Side Head|       1|               0|
|         Laptop|       West|      Retail|      Premium Box|       1|               1|
|         Mobile|      North|   Wholesale|North Dist Mobile|       1|               2|
|         Tablet|      South|      Retail|      Tablet Push|       1|               3|
|         Laptop|      North|   Wholesale|    Laptop Supply|       1|               4|
|         Laptop|        ANY|      Retail|   Regular Laptop|       2|               5|
|            ANY|        ANY|   Wholesale| Normal Wholesale|       3|               6|
|            ANY|        ANY|      Retail|    Normal Retail|       3|               7|
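|            N/A|        N/A|         N/A|              N/A|    9999|              -1|
+---------------+-----------+------------+-----------------+--------+----------------+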

In [36]:
# Persist the dimension as parquet, replacing any previous export at this path.
dim_writer = dim_custom_mapping.write
dim_writer.parquet("warehouse/dim_custom_mapping", mode="overwrite")

In [37]:
# Read the parquet export back and display it as a sanity check.
dim_custom_mapping_readback = spark.read.parquet("warehouse/dim_custom_mapping")
dim_custom_mapping_readback.show()

+---------------+-----------+------------+-----------------+--------+----------------+
|ProductCategory|StoreRegion|CustomerType|     MappingLabel|Priority|CustomMappingKey|
+---------------+-----------+------------+-----------------+--------+----------------+
|         Tablet|       East|      Retail|        Side Head|       1|               0|
|         Laptop|       West|      Retail|      Premium Box|       1|               1|
|         Mobile|      North|   Wholesale|North Dist Mobile|       1|               2|
|         Tablet|      South|      Retail|      Tablet Push|       1|               3|
|         Laptop|      North|   Wholesale|    Laptop Supply|       1|               4|
|         Laptop|        ANY|      Retail|   Regular Laptop|       2|               5|
|            ANY|        ANY|   Wholesale| Normal Wholesale|       3|               6|
|            ANY|        ANY|      Retail|    Normal Retail|       3|               7|
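|            N/A|        N/A|         N/A|              N/A|    9999|              -1|
+---------------+-----------+------------+-----------------+--------+----------------+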

#### Additionally, write the final result to CSV for SSIS

In [18]:
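# Alternative kept for reference: Spark-native CSV export. Spark writes a directory of part files at this path rather than a single .csv file.
#dim_custom_mapping.repartition(1).write.option("header", True).csv("output_csv/dim_custom_mapping2.csv")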

In [38]:
# Collect the dimension into a pandas DataFrame and write it as one CSV file,
# without the pandas index column.
dim_custom_mapping_pdf = dim_custom_mapping.toPandas()
dim_custom_mapping_pdf.to_csv("output_csv/dim_custom_mapping.csv", index=False)

#### Note: Writing a Spark DataFrame directly to CSV produces multiple CSV part files (one per partition). So we either repartition to a single partition first or, if the table is small, convert it to pandas and save it as a single CSV, which produces much cleaner output than the former approach.