In [0]:
# Use four shuffle partitions per unit of the cluster's default parallelism
# for every Spark SQL shuffle (joins, aggregations, windows) in this notebook.
shuffle_partition_count = sc.defaultParallelism * 4
spark.conf.set("spark.sql.shuffle.partitions", shuffle_partition_count)

In [0]:
# Per-source load switches. Each flag is only tested for truthiness by the
# cells below: a truthy value reads the corresponding raw JSON folder, a falsy
# value substitutes an empty DataFrame with the same schema.
# NOTE(review): the "_list" suffix suggests these once held explicit file
# lists -- confirm before repurposing them.
quote_header_v1_list = 1
quote_header_v2_list = 1
quote_line_v1_list = 1
quote_line_v2_list = 1


In [0]:
"""
Quote Header
"""


from pyspark.sql.types import StringType , TimestampType , DoubleType , StructType , StructField , LongType
from pyspark.sql.functions import col  , year , to_timestamp , to_date , row_number , lit
from pyspark.sql import Window

# Schema applied when reading the raw quote-header JSON. Only the fields listed
# here are read from the files; everything is nullable.
quoteSchema = StructType([
    StructField("crmBranch_quote", StringType(), True),
    StructField("lineOfBusiness_quote", StringType(), True),
    StructField("sublineOfBusiness_quote", StringType(), True),
    StructField("crmOpportunityId_quote", StringType(), True),
    StructField("quoteNumber_quote", StringType(), True),
    StructField("status_quote", StringType(), True),
    StructField("_reconfiguration_date", TimestampType(), True),
    StructField("createdDate_quote", TimestampType(), True),
    StructField("orderedDate_quote", TimestampType(), True),
    StructField("submittedDate_quote", TimestampType(), True),
    StructField("transactionID_quote", StringType(), True),
    StructField("branchLaborEfficiency_quote", StringType(), True),
    StructField("_date_modified", TimestampType(), True),
    StructField("oracleBranchID_quote", StringType(), True),
    StructField("_customer_id", StringType(), True),
    StructField("CRMSalesStage_quote", StringType(), True),
    StructField("crmSalesRepId_quote", StringType(), True),
    StructField("crmSalesRep_quote", StringType(), True),
    StructField("proposedDate_quote", TimestampType(), True),
    # added on 3/25/2021 sprint #6
    StructField("jobStatus_quote", StructType([StructField("value", StringType(), True)]), True),
    StructField("maxLink_quote", DoubleType(), True),  # added on 3/25/2021 sprint #6
    StructField("multimediaMonitoring_quote", DoubleType(), True),  # added on 3/25/2021 sprint #6
    StructField("_currency_pref", StructType([
        StructField("currencyCode", StringType(), True),
        StructField("id", LongType(), True),
    ])),
])

# Timestamp pattern for the commerce_v1 export.
# The previous pattern "MM/dd/yyyy HH:mm:SS a" combined the 24-hour field `HH`
# with the AM/PM marker `a` and used `SS` (fraction-of-second) where seconds
# (`ss`) were intended, so hours/seconds could be parsed incorrectly.
# NOTE(review): confirm this pattern against a raw commerce_v1 sample file.
QUOTE_HEADER_V1_TIMESTAMP_FORMAT = "MM/dd/yyyy hh:mm:ss a"


def _flatten_quote_header(raw_df, source_system):
    """Tag a raw quote-header frame with its source and flatten nested fields.

    Adds ``SourceSystem`` (literal ``source_system``), promotes
    ``_currency_pref.currencyCode`` to ``currency_pref`` and
    ``jobStatus_quote.value`` to ``jobStatus``, then drops the two structs.
    """
    return (
        raw_df
        .withColumn("SourceSystem", lit(source_system))
        .withColumn("currency_pref", col("_currency_pref.currencyCode"))
        .select("*", col("jobStatus_quote.value").alias("jobStatus"))
        .drop("_currency_pref", "jobStatus_quote")
    )


def _read_quote_header(path, source_system, timestamp_format=None):
    """Read multi-line quote-header JSON from ``path`` and flatten it.

    ``timestamp_format`` is passed to the JSON reader only when given;
    otherwise Spark's default timestamp parsing is used.
    """
    reader = spark.read.schema(quoteSchema).option("multiline", "true")
    if timestamp_format is not None:
        reader = reader.option("timestampFormat", timestamp_format)
    return _flatten_quote_header(reader.json(path), source_system)


# Zero-row stand-in with the flattened layout, used when a source is switched off.
emptyDF = _flatten_quote_header(spark.createDataFrame([], quoteSchema), "Commerce")

if quote_header_v1_list:
    Quote_Header_v1 = _read_quote_header(
        "/mnt/datalake_raw/batch/sales/bigmachine/commerce_v1/quote_header/*/*.json",
        "Commerce_v1",
        timestamp_format=QUOTE_HEADER_V1_TIMESTAMP_FORMAT,
    )
else:
    Quote_Header_v1 = emptyDF

if quote_header_v2_list:
    # commerce_v2 headers are read with Spark's default timestamp parsing
    # (the explicit format was already disabled for this source).
    Quote_Header_v2 = _read_quote_header(
        "/mnt/datalake_raw/batch/sales/bigmachine/commerce_v2/quote_header/*/*.json",
        "Commerce_v2",
    )
else:
    Quote_Header_v2 = emptyDF

# Both frames come out of the same helper, so a positional union is safe.
Quote_Header = Quote_Header_v1.union(Quote_Header_v2)

# Newest version of each transaction per source system comes first.
window = (
    Window
    .partitionBy("SourceSystem", "transactionID_quote")
    .orderBy(col("_date_modified").desc())
)

(
    Quote_Header
    .dropDuplicates()
    .withColumn("RowNumber", row_number().over(window))
    .filter("RowNumber == 1")
    .drop("RowNumber")
    .withColumn("CreatedYear", year(col("createdDate_quote").cast("date")))
    # coalesce(1) would pull the whole dedupe/window stage into a single task.
    # Repartitioning on the partition column keeps that stage parallel while
    # still producing one output file per CreatedYear directory.
    .repartition("CreatedYear")
    .write
    .format("parquet")
    .mode("overwrite")
    .partitionBy("CreatedYear")
    .save("/mnt/datalake_curated/sales/quote_header")
)

In [0]:
"""
Quote Line
"""


from pyspark.sql.types import StringType , TimestampType , DoubleType , StructType , StructField
from pyspark.sql.functions import col  , year , to_timestamp , to_date , row_number , lit
from pyspark.sql import Window

def _value_currency_field(name):
    """Nullable struct field ``name`` holding ``value`` (double) and ``currency`` (string)."""
    return StructField(name, StructType([
        StructField("value", DoubleType(), True),
        StructField("currency", StringType(), True),
    ]))


# Schema applied when reading the raw quote-line JSON.
quoteLineSchema = StructType([
    StructField("_bs_id", StringType(), False),
    StructField("_id", StringType(), False),
    StructField("_sequence_number", StringType(), False),
    StructField("lineDocNum_line", StringType(), False),
    StructField("buildingName_line", StringType(), True),
    StructField("lineType_line", StructType([StructField("value", StringType(), True)])),  # added on 3/24/2021 - sprint #6
    StructField("crmIntegrationProductName_line", StringType(), True),
    StructField("crmNumberOfFrontOpenings_line", StringType(), True),
    StructField("crmNumberOfRearOpenings_line", StringType(), True),
    StructField("crmNumberOfStops_line", StringType(), True),
    StructField("crmSpeed_line", StringType(), True),
    StructField("crmCapacity_line", StringType(), True),  # added on 4/21/2021 - Sprint #8
    StructField("factoryJobNumber_line", StringType(), True),
    StructField("itemDescription_line", StringType(), True),
    StructField("itemType_line", StringType(), True),
    _value_currency_field("marginAmount_line"),
    StructField("marginPercentage_line", StringType(), True),
    _value_currency_field("markedUpPrice_line"),
    StructField("oemSerialNumber_line", StringType(), True),
    StructField("oracleSerialNumber_line", StringType(), True),
    StructField("ozProduct_line", StringType(), True),
    _value_currency_field("proposalPricePerUnit_line"),
    _value_currency_field("proposalPrice_line"),
    _value_currency_field("totalCost_line"),
    StructField("totalLaborHours_line", StringType(), True),
    StructField("totalTeamLaborHours_line", StringType(), True),
    _value_currency_field("unitLaborCost_line"),
    _value_currency_field("unitMaterialCost_line"),
    StructField("unitTotalLaborHours_line", StringType(), True),
    StructField("unitTravel_line", StringType(), True),
    StructField("voltage_line", StringType(), True),
    StructField("wBSTemplate_line", StringType(), True),
    _value_currency_field("modelLevelConstructionPrice_line"),
    _value_currency_field("extraExpenses_line"),
    StructField("grossMarginWithoutOverhead_line", StringType(), True),
    StructField("_part_id", StringType(), True),
    StructField("_model_id", StringType(), True),
    StructField("productCode_line", StringType(), True),
    StructField("productDescriptionUnitDescCRM_line", StringType(), True),
    StructField("productLine_line", StringType(), True),
    StructField("productType_line", StringType(), True),
    StructField("_part_number", StringType(), True),
    StructField("_part_supplier_company_id", StringType(), True),
    StructField("_part_supplier_company_name", StringType(), True),
    StructField("_model_name", StringType(), True),
    StructField("_model_product_line_id", StringType(), True),
    StructField("_model_product_line_name", StringType(), True),
    StructField("_model_segment_id", StringType(), True),
    StructField("_model_supplier_company_id", StringType(), True),
    StructField("_model_supplier_company_name", StringType(), True),
    StructField("laborEfficiencyPercentage_line", StringType(), True),
    StructField("_date_modified", TimestampType(), True),
    StructField("_date_added", TimestampType(), True),
    StructField("productConfiguration_line", StringType(), True),
    _value_currency_field("freightShippingCharges_line"),
    _value_currency_field("transportCosts_line"),
    _value_currency_field("inspectionPermitFees_line"),
    _value_currency_field("miscellaneousExpenses_line"),
    _value_currency_field("useTax_line"),
    _value_currency_field("factorTax_line"),
    StructField("unitDesignation_line", StringType(), True),
    StructField("numOfUnitsOnTheEstimate_line", StringType(), True),
    StructField("totalNumberOfCarsInBank_line", StringType(), True),
])


def _struct_value(name):
    """Select ``<name>.value`` and expose it under the plain column name ``name``."""
    return col(name + ".value").alias(name)


# Output projection shared by the empty placeholder and both source reads.
# Struct columns are reduced to their ``value`` member; the plain string
# "totalCost_line.currency" yields an output column named ``currency``.
_QUOTE_LINE_OUTPUT_COLUMNS = [
    "_bs_id",
    "_id",
    "_sequence_number",
    "lineDocNum_line",
    "buildingName_line",
    _struct_value("lineType_line"),  # added on 3/24/2021 - sprint #6
    "crmIntegrationProductName_line",
    "crmNumberOfFrontOpenings_line",
    "crmNumberOfRearOpenings_line",
    "crmNumberOfStops_line",
    "crmSpeed_line",
    "factoryJobNumber_line",
    "itemDescription_line",
    "itemType_line",
    _struct_value("marginAmount_line"),
    "marginPercentage_line",
    _struct_value("markedUpPrice_line"),
    "oemSerialNumber_line",
    "oracleSerialNumber_line",
    "ozProduct_line",
    _struct_value("proposalPricePerUnit_line"),
    _struct_value("proposalPrice_line"),
    _struct_value("totalCost_line"),
    "totalCost_line.currency",
    "totalLaborHours_line",
    "totalTeamLaborHours_line",
    _struct_value("unitLaborCost_line"),
    _struct_value("unitMaterialCost_line"),
    "unitTotalLaborHours_line",
    "unitTravel_line",
    "voltage_line",
    "wBSTemplate_line",
    _struct_value("modelLevelConstructionPrice_line"),
    _struct_value("extraExpenses_line"),
    "grossMarginWithoutOverhead_line",
    "_part_id",
    "_model_id",
    "productCode_line",
    "productDescriptionUnitDescCRM_line",
    "productLine_line",
    "productType_line",
    "_part_number",
    "_part_supplier_company_id",
    "_part_supplier_company_name",
    "_model_name",
    "_model_product_line_id",
    "_model_product_line_name",
    "_model_segment_id",
    "_model_supplier_company_id",
    "_model_supplier_company_name",
    "laborEfficiencyPercentage_line",
    "_date_modified",
    "_date_added",
    "SourceSystem",
    "productConfiguration_line",
    _struct_value("freightShippingCharges_line"),
    _struct_value("transportCosts_line"),
    _struct_value("inspectionPermitFees_line"),
    _struct_value("miscellaneousExpenses_line"),
    _struct_value("useTax_line"),
    _struct_value("factorTax_line"),
    "unitDesignation_line",
    "numOfUnitsOnTheEstimate_line",
    "totalNumberOfCarsInBank_line",
    "crmCapacity_line",  # added on 4/21/2021 - Sprint #8
]

# Timestamp pattern passed to the JSON reader for both quote-line sources.
# The previous pattern "MM/dd/yyyy HH:mm:SS a" combined the 24-hour field `HH`
# with the AM/PM marker `a` and used `SS` (fraction-of-second) where seconds
# (`ss`) were intended, so hours/seconds could be parsed incorrectly.
# NOTE(review): confirm this pattern against raw quote_line sample files.
QUOTE_LINE_TIMESTAMP_FORMAT = "MM/dd/yyyy hh:mm:ss a"


def _project_quote_line(raw_df, source_system):
    """Add the ``SourceSystem`` literal and apply the shared output projection."""
    return (
        raw_df
        .withColumn("SourceSystem", lit(source_system))
        .select(*_QUOTE_LINE_OUTPUT_COLUMNS)
    )


def _read_quote_line(path, source_system):
    """Read multi-line quote-line JSON from ``path`` and project it."""
    raw_df = (
        spark.read
        .schema(quoteLineSchema)
        .option("multiline", "true")
        .option("timestampFormat", QUOTE_LINE_TIMESTAMP_FORMAT)
        .json(path)
    )
    return _project_quote_line(raw_df, source_system)


# Zero-row stand-in with the projected layout, used when a source is switched off.
emptyDF = _project_quote_line(spark.createDataFrame([], quoteLineSchema), "Commerce")

if quote_line_v1_list:
    Quote_Line_v1 = _read_quote_line(
        "/mnt/datalake_raw/batch/sales/bigmachine/commerce_v1/quote_line/*/*.json",
        "Commerce_v1",
    )
else:
    Quote_Line_v1 = emptyDF

if quote_line_v2_list:
    Quote_Line_v2 = _read_quote_line(
        "/mnt/datalake_raw/batch/sales/bigmachine/commerce_v2/quote_line/*/*.json",
        "Commerce_v2",
    )
else:
    Quote_Line_v2 = emptyDF

# Both frames share the same projection, so a positional union is safe.
Quote_Line = Quote_Line_v1.union(Quote_Line_v2)

# Newest version of each line per source system comes first.
window = (
    Window
    .partitionBy("SourceSystem", "_id")
    .orderBy(col("_date_modified").desc())
)

(
    Quote_Line
    .dropDuplicates()
    .withColumn("RowNumber", row_number().over(window))
    .filter("RowNumber == 1")
    .drop("RowNumber")
    .withColumn("CreatedYear", year(col("_date_added").cast("date")))
    # coalesce(1) would pull the whole dedupe/window stage into a single task.
    # Repartitioning on the partition column keeps that stage parallel while
    # still producing one output file per CreatedYear directory.
    .repartition("CreatedYear")
    .write
    .format("parquet")
    .mode("overwrite")
    .partitionBy("CreatedYear")
    .save("/mnt/datalake_curated/sales/quote_line")
)

In [0]:
"""
Quote Line Config & Material Summary
"""

from pyspark.sql.types import StringType , StructField , StructType , TimestampType
from pyspark.sql.functions import split , col , lit , explode, from_csv , first , row_number ,  size , arrays_zip , year ,to_timestamp , to_date 
from pyspark.sql.window import Window
# Column names are resolved case-sensitively from here on.
# NOTE(review): presumably needed because pivoted attribute names can differ
# only by case -- confirm; this setting stays active for the rest of the session.
spark.conf.set('spark.sql.caseSensitive', True)

# Minimal schema: line identifiers, audit timestamps and the packed
# configuration string held in `_config_attr_info.value`.
quoteLineSchema = StructType([
    StructField("_bs_id", StringType(), False),
    StructField("_id", StringType(), False),
    StructField("_sequence_number", StringType(), True),
    StructField("lineDocNum_line", StringType(), True),
    StructField("_date_modified", TimestampType(), False),
    StructField("_date_added", TimestampType(), False),
    StructField("_config_attr_info", StructType([StructField("value", StringType(), True)]), True),
])

# Timestamp pattern passed to the JSON reader for both quote-line sources.
# The previous pattern "MM/dd/yyyy HH:mm:SS a" combined the 24-hour field `HH`
# with the AM/PM marker `a` and used `SS` (fraction-of-second) where seconds
# (`ss`) were intended, so hours/seconds could be parsed incorrectly.
# NOTE(review): confirm this pattern against raw quote_line sample files.
QUOTE_LINE_CONFIG_TIMESTAMP_FORMAT = "MM/dd/yyyy hh:mm:ss a"

# Grouping keys carried through the explode/pivot of the configuration string.
_CONFIG_KEY_COLUMNS = ["_bs_id", "_id", "_date_modified", "_date_added", "SourceSystem"]

# Configuration attributes written to quote_line_config.
_CONFIG_DETAIL_ATTRIBUTES = [
    "capacity",
    "carSpeedSelection",
    "equipmentClassification",
    "existingEquipment",
    "frontOpenings",
    "mainLineVoltage",
    "numberOfCarsInGroup",
    "numberOfUnitsInThisEstimate",
    "rearOpenings",
    "seismicZone",
    "totalOpenings",
    "travelInFeet",
    "totalTravelInFeet",
    "unitNumberOfStops",
    "upspeedOfCar",
    "carDownspeed",
    "carUpspeed",
    "hydraulicStrategy",
    "allowGPowerPlus",
    "applicableCode",
    "applicationOfUnit",
    "totalFactoryMaterialCost",
    "otherMaterialCosts",
    "includesMultimediaMonitoring",
    "includeMAXLink",
]

# Material-summary attributes as (pivoted attribute, zipped struct field, output column).
# Each attribute value is a "$,$"-delimited list; the lists are zipped
# positionally so that one output row is produced per list position.
_MATERIAL_SUMMARY_FIELDS = [
    ("proposalPartDescription_materialSummary", "Desc", "proposalPartDescription"),
    ("baseMaterialCost_materialSummary", "BaseMaterialCost", "baseMaterialCost"),
    ("qty_materialSummary", "qty", "qty"),
    ("groupMaterial_materialSummary", "groupMaterial", "groupMaterial"),
    ("includeInTheProposal_materialSummary", "includeInTheProposal", "includeInTheProposal"),
    ("level1_materialSummary", "level1", "level1"),
    ("level2_materialSummary", "level2", "level2"),
    ("level3_materialSummary", "level3", "level3"),
    ("partNumber_materialSummary", "partNumber", "partNumber"),
    ("section_materialSummary", "section", "section"),
    ("selectedPartDescription_materialSummary", "selectedPartDescription", "selectedPartDescription"),
    ("sequence_materialSummary", "sequence", "sequence"),
    ("surveyKeys_materialSummary", "surveyKeys", "surveyKeys"),
    ("tKECost_materialSummary", "tKECost", "tKECost"),
    ("totalMaterialCost_materialSummary", "totalMaterialCost", "totalMaterialCost"),
    ("vendor_materialSummary", "vendor", "vendor"),
]

# Raw string: the regex `\$,\$` matches the literal delimiter "$,$". The
# non-raw form relied on Python passing the invalid escape `\$` through.
_LIST_DELIMITER_REGEX = r'\$,\$'


def _project_config_line(raw_df, source_system):
    """Add ``SourceSystem`` and keep the identifiers plus the packed config ``value``."""
    return (
        raw_df
        .withColumn("SourceSystem", lit(source_system))
        .select(
            "_bs_id",
            "_id",
            "_sequence_number",
            "lineDocNum_line",
            "_date_modified",
            "_date_added",
            "_config_attr_info.value",
            "SourceSystem",
        )
    )


def _read_config_line(path, source_system):
    """Read multi-line quote-line JSON from ``path`` with the minimal schema."""
    raw_df = (
        spark.read
        .schema(quoteLineSchema)
        .option("multiline", "true")
        .option("timestampFormat", QUOTE_LINE_CONFIG_TIMESTAMP_FORMAT)
        .json(path)
    )
    return _project_config_line(raw_df, source_system)


# Zero-row stand-in with the projected layout, used when a source is switched off.
emptyDF = _project_config_line(spark.createDataFrame([], quoteLineSchema), "Commerce")

# NOTE(review): the SourceSystem labels here are lower-case ("commerce_v1" /
# "commerce_v2") while the quote_header and quote_line outputs use
# "Commerce_v1" / "Commerce_v2". Left unchanged because downstream consumers
# may depend on the current values -- confirm and align.
if quote_line_v2_list:
    Quote_Line_v2 = _read_config_line(
        "/mnt/datalake_raw/batch/sales/bigmachine/commerce_v2/quote_line/*/*.json",
        "commerce_v2",
    )
else:
    Quote_Line_v2 = emptyDF

if quote_line_v1_list:
    Quote_Line_v1 = _read_config_line(
        "/mnt/datalake_raw/batch/sales/bigmachine/commerce_v1/quote_line/*/*.json",
        "commerce_v1",
    )
else:
    Quote_Line_v1 = emptyDF

# Both frames share the same projection, so a positional union is safe.
Quote_Line = Quote_Line_v2.union(Quote_Line_v1)

# Drop duplicates and retrieve the most recent version of each line.
window = (
    Window
    .partitionBy("SourceSystem", "_id")
    .orderBy(col("_date_modified").desc())
)

# The null filter runs after the dedupe: a line whose latest version has no
# configuration string is dropped rather than falling back to an older version.
Quote_Line = (
    Quote_Line
    .dropDuplicates()
    .withColumn("RowNumber", row_number().over(window))
    .filter("RowNumber == 1")
    .filter("value is not null")
    .drop("RowNumber")
)

# Unpack the configuration string: entries are separated by "|^|", and each
# entry is "~"-delimited with the attribute name at index 0 and its value at
# index 2 (index 1 is not used).
# Explicit pivot values restrict the pivot to the attributes consumed below.
# This avoids the extra job Spark runs to discover distinct values, avoids
# building a column per attribute name, and yields a null column (instead of a
# failing select) when an attribute is absent from the loaded data.
_PIVOT_ATTRIBUTES = _CONFIG_DETAIL_ATTRIBUTES + [src for src, _, _ in _MATERIAL_SUMMARY_FIELDS]

quote_line_config = (
    Quote_Line
    .select(split(col("value"), '\\|\\^\\|').alias("ColumnArray"), *_CONFIG_KEY_COLUMNS)
    .select(*(_CONFIG_KEY_COLUMNS + [explode("ColumnArray").alias("attr")]))
    .select(*(_CONFIG_KEY_COLUMNS + [
        split("attr", "~")[0].alias("colName"),
        split("attr", "~")[2].alias("value"),
    ]))
    .groupBy(*_CONFIG_KEY_COLUMNS)
    .pivot("colName", _PIVOT_ATTRIBUTES)
    .agg(first("value"))
)

quote_line_config_details = quote_line_config.select(
    "_bs_id",
    "_id",
    "SourceSystem",
    "_date_modified",
    "_date_added",
    *_CONFIG_DETAIL_ATTRIBUTES
)

# One row per material-summary list position of each quote line.
# NOTE(review): SourceSystem is not carried into this output, so v1 and v2
# rows sharing an `_id` cannot be told apart -- confirm whether intended.
quote_line_material_summary = (
    quote_line_config
    .withColumn("zipped", arrays_zip(*[
        split(col(src), _LIST_DELIMITER_REGEX).alias(field)
        for src, field, _ in _MATERIAL_SUMMARY_FIELDS
    ]))
    .withColumn("zipped", explode(col("zipped")))
    .select(
        "_bs_id",
        "_id",
        "_date_modified",
        "_date_added",
        *[col("zipped." + field).alias(out) for _, field, out in _MATERIAL_SUMMARY_FIELDS]
    )
)

# Save quote line config (unpartitioned, full overwrite).
(
    quote_line_config_details
    .write
    .format('parquet')
    .mode("overwrite")
    .save("/mnt/datalake_curated/sales/quote_line_config")
)

# Save quote material summary (unpartitioned, full overwrite).
(
    quote_line_material_summary
    .write
    .format("parquet")
    .mode("overwrite")
    .save("/mnt/datalake_curated/sales/quote_line_material_summary")
)



In [0]:
# End the notebook run and hand this message back as its exit value.
# The text is kept exactly as-is because a calling job may compare it.
exit_message = "Job Completed Successfuly!"
dbutils.notebook.exit(exit_message)

Job Completed Successfuly!