In [1]:
%run "./UTL_Gen"

In [2]:
#################################################################################
""" Set of Functions for Quality Data Structure Generation

"""
 #Who                 When           What
 #Victor Salesa       15/11/2018     Initial Version
 #Victor Salesa       19/11/2018     QA_CTL_PROCESS_FILE_ERROR_DATA: Replaced Field _Err spec with _ERR spec and _Raw Spec with _RAW spec
 #                                   QA_CTL_PROCESS_FILE_ERROR_DATA: Replaced Field ROW_LINE with FILE_LINE_NUM
 #Victor Salesa       20/11/2018     QA_CTL_PROCESS_FILE_ERROR_DATA: Replaced Field FIELDNAME with FIELD_NAME     
 #                                   QA_CTL_PROCESS_FILE_QA_DATA: Initial Version
 #                                   QA_CTL_PROCESS_FILE_ERROR_DATA: Changed expected Type for LANDING_DATE and PROCESS_DATE to be Timestamp
 #                                   QA_CTL_PROCESS_FILE_QA_DATA: Changed expected Type for LANDING_DATE and PROCESS_DATE to be Timestamp
 #Victor Salesa       22/11/2018     QA_GENERATE_DATA: Added condition to avoid storing Errors when no Errors
 #Ana Pérez           26/02/2019     Adapted to new Temporary source Dataframe 
##################################################################################

def QA_CTL_PROCESS_FILE_ERROR_DATA(df,debug=False):
  '''Reshape a wide QA dataframe into one row per (file line, field, error type).
    
    The input carries, for every checked field, a <Field>_RAW column with the raw value
    and a <Field>_ERR column with a JSON string of error codes. The output has one row
    per non-null error entry, together with the raw value of the affected field.
    
    Processing steps (as implemented below):
      1. Split the input columns into "error" columns (name contains "_ERR") and the rest.
      2. Parse the error columns with parseJSONCols and flatten the result with flatten_struct.
      3. Replace "." with "#" in the flattened column names so they can be unpivoted.
      4. Unpivot to key/val rows, drop rows whose val is null and derive
         FIELD (key part before "#", with "_ERR" removed), ERROR_TYPE (key part after "#")
         and ERROR_CODE (val).
      5. Unpivot the <Field>_RAW columns into FIELD / FIELD_VALUE rows (de-duplicated).
      6. Left join the error rows with the raw value rows on
         FILE_NAME, LANDING_DATE, PROCESS_DATE, FILE_LINE_NUM and FIELD.
    
    Parameters:
      df: Dataframe with data to be canonized according the following structure
        Input Dataframe Expected Structure:
          FILE_NAME :   String
          LANDING_DATE: Timestamp
          PROCESS_DATE: Timestamp
          FILE_LINE_NUM:String
          ....................
          ....................
          ....................
          <Field1>_RAW: String
          <Field2>_RAW: String
          ....................
          <Fieldn>_RAW: String 
          <Field1>_ERR: String (JSON STRING)
          <Field2>_ERR: String (JSON STRING)
          ....................
          <Fieldn>_ERR: String (JSON STRING)
        
        The column types above are the documented expectation; this function's own code
        does not validate them.
        Column classification is a substring match: any column whose name contains "_ERR"
        is handled as an error column and any column whose name contains "_RAW" as a raw column.
      
      <Fieldn>_ERR Expected Structure (JSON STRING):
          {ERR_TYPE_1:<Error_Code_1>,ERR_TYPE_2:<Error_Code_2>,....,ERR_TYPE_N:<Error_Code_N>}
      
      debug: Show debug information in the process (Default False).
             NOTE: the value is currently not read anywhere in this function
             (parseJSONCols is always called with debug=False).
    
    Return:
      Dataframe with every non "_RAW" column produced by the error unpivot
      (including FIELD, ERROR_TYPE and ERROR_CODE) plus FIELD_VALUE.
      FIELD_VALUE is null when no raw value row matches the join keys (left join).
    
    Raises:
      Exception: any error raised inside the function is logged with ADP_log_exception and
                 re-raised wrapped in a plain Exception (the original error is attached as its cause).
    
    Side effects:
      The two intermediate dataframes (error rows and raw value rows) are persisted with
      StorageLevel.MEMORY_AND_DISK and materialized with count(). They are not unpersisted here.
    
    Dependencies:
      Relies on names that are not defined in this function: process, logger_name, level_action,
      log_level, sys, the ADP_log_* helpers, parseJSONCols, flatten_struct, unpivot and the
      pyspark functions col, split and regexp_replace (presumably provided by the notebook
      environment / %run "./UTL_Gen" - verify).

    Example:
      Sample Origin Dataframe for QA
  
      input_df = (spark.createDataFrame(
      [
        ("file1_txt","1","aspirinas","bayer","""{"ERR_TYPE_1":"-1","ERR_TYPE_2":"0","ERR_TYPE_3":"-1"}""","""{"ERR_TYPE_4":"-1","ERR_TYPE_5":"0","ERR_TYPE_6":"-1"}"""),
        ("file2_txt","1","aspirinas","pepito", """{"ERR_TYPE_4":"-1","ERR_TYPE_5":"0","ERR_TYPE_6":"-1"}""","""{"ERR_TYPE_1":"-1","ERR_TYPE_2":"0","ERR_TYPE_3":"-1"}"""),
        ("file3_txt","1","aspirinas","pepito", """""","""{"ERR_TYPE_1":"-1","ERR_TYPE_2":"0","ERR_TYPE_3":"-1"}""")
      ],
      ("FILE_NAME","FILE_LINE_NUM","PRODUCT_RAW","MANUFACTURER_RAW","PRODUCT_ERR","MANUFACTURER_ERR"))
      .withColumn("temp_timestamp",from_unixtime(unix_timestamp(current_timestamp())))
          .withColumn("LANDING_DATE",col("temp_timestamp").cast(StringType()))
          .withColumn("PROCESS_DATE",col("temp_timestamp").cast(StringType()))
          .drop("temp_timestamp")
      )
      input_df.show(100000,False)
            
      #Canonize QA Dataframe
      qa_df = QA_CTL_PROCESS_FILE_ERROR_DATA(input_df)
      
      qa_df.show(10000,False)
      
      input_df Show
      
      +---------+-------------+-----------+----------------+------------------------------------------------------+------------------------------------------------------+-------------------+-------------------+
      |FILE_NAME|FILE_LINE_NUM|PRODUCT_RAW|MANUFACTURER_RAW|PRODUCT_ERR                                           |MANUFACTURER_ERR                                      |LANDING_DATE       |PROCESS_DATE       |
      +---------+-------------+-----------+----------------+------------------------------------------------------+------------------------------------------------------+-------------------+-------------------+
      |file1_txt|1            |aspirinas  |bayer           |{"ERR_TYPE_1":"-1","ERR_TYPE_2":"0","ERR_TYPE_3":"-1"}|{"ERR_TYPE_4":"-1","ERR_TYPE_5":"0","ERR_TYPE_6":"-1"}|2018-11-15 15:26:08|2018-11-15 15:26:08|
      |file2_txt|1            |aspirinas  |pepito          |{"ERR_TYPE_4":"-1","ERR_TYPE_5":"0","ERR_TYPE_6":"-1"}|{"ERR_TYPE_1":"-1","ERR_TYPE_2":"0","ERR_TYPE_3":"-1"}|2018-11-15 15:26:08|2018-11-15 15:26:08|
      |file3_txt|1            |aspirinas  |pepito          |                                                      |{"ERR_TYPE_1":"-1","ERR_TYPE_2":"0","ERR_TYPE_3":"-1"}|2018-11-15 15:26:08|2018-11-15 15:26:08|
      +---------+-------------+-----------+----------------+------------------------------------------------------+------------------------------------------------------+-------------------+-------------------+
      
      qa_df Show (output layout; rows captured from an earlier run)
      
      +---------+-------------+-------------------+-------------------+-------+----------+----------+-----------+
      |FILE_NAME|FILE_LINE_NUM|LANDING_DATE       |PROCESS_DATE       |FIELD  |ERROR_TYPE|ERROR_CODE|FIELD_VALUE|
      +---------+-------------+-------------------+-------------------+-------+----------+----------+-----------+
      |file2_txt|1            |2018-11-15 15:58:00|2018-11-15 15:58:00|PRODUCT|ERR_TYPE_4|-1        |aspirinas  |
      |file2_txt|1            |2018-11-15 15:58:00|2018-11-15 15:58:00|PRODUCT|ERR_TYPE_4|-1        |aspirinas  |
      |file2_txt|1            |2018-11-15 15:58:00|2018-11-15 15:58:00|PRODUCT|ERR_TYPE_4|-1        |aspirinas  |
      |file2_txt|1            |2018-11-15 15:58:00|2018-11-15 15:58:00|PRODUCT|ERR_TYPE_4|-1        |aspirinas  |
      |file2_txt|1            |2018-11-15 15:58:00|2018-11-15 15:58:00|PRODUCT|ERR_TYPE_4|-1        |aspirinas  |
      |file2_txt|1            |2018-11-15 15:58:00|2018-11-15 15:58:00|PRODUCT|ERR_TYPE_4|-1        |aspirinas  |
      |file2_txt|1            |2018-11-15 15:58:00|2018-11-15 15:58:00|PRODUCT|ERR_TYPE_5|0         |aspirinas  |
      |file2_txt|1            |2018-11-15 15:58:00|2018-11-15 15:58:00|PRODUCT|ERR_TYPE_5|0         |aspirinas  |
      |file2_txt|1            |2018-11-15 15:58:00|2018-11-15 15:58:00|PRODUCT|ERR_TYPE_5|0         |aspirinas  |
      |file2_txt|1            |2018-11-15 15:58:00|2018-11-15 15:58:00|PRODUCT|ERR_TYPE_5|0         |aspirinas  |
      +---------+-------------+-------------------+-------------------+-------+----------+----------+-----------+
      only showing top 10 rows
      
      
  '''
  #Who                 When           What
  #Victor Salesa       15/11/2018     Initial Version
  #Victor Salesa       19/11/2018     Replaced Field _Err spec with _ERR spec and _Raw Spec with _RAW spec 
  try:
    ADP_log_info(process, logger_name, level_action, log_level, "BEGIN", sys._getframe().f_code.co_name)
    
      
    #Define Key Columns (used to join error rows with their raw values)
    key_columns = ["FILE_NAME","LANDING_DATE","PROCESS_DATE","FILE_LINE_NUM","FIELD"]
  
    ADP_log_debug(process, logger_name, level_action, log_level, "---Before iterator error_columns_generator", sys._getframe().f_code.co_name)
    
    #Loop variable is called "name" (not "col") so it never hides the pyspark col() function used below
    error_columns_generator = (name for name in df.schema.names if "_ERR" in name)
    
    ADP_log_debug(process, logger_name, level_action, log_level, "---Error Columns Iterator", sys._getframe().f_code.co_name)
    not_error_columns_generator = [name for name in df.schema.names if "_ERR" not in name]
    ADP_log_debug(process, logger_name, level_action, log_level, "---Not Error Columns Iterator", sys._getframe().f_code.co_name)
         
    #Parse from described columns to struct
    dfsan = parseJSONCols(df,*error_columns_generator,debug=False)
    ADP_log_debug(process, logger_name, level_action, log_level, "---Parse from Json", sys._getframe().f_code.co_name)
    
    #Flatten Values to Fields
    dfsan_flattened = dfsan.select(flatten_struct(dfsan.schema))
    ADP_log_debug(process, logger_name, level_action, log_level, "---Flatten Structure", sys._getframe().f_code.co_name)     
      
    #Rename Columns to avoid unpivot problem (the "." character would be taken as a struct field accessor)
    new_column_name_list= list(map(lambda x: x.replace(".", "#"), dfsan_flattened.columns))
    dfsan_flattened_renamed = dfsan_flattened.toDF(*new_column_name_list)
    ADP_log_debug(process, logger_name, level_action, log_level, "---Rename columns to avoid . error", sys._getframe().f_code.co_name)
      
    #Unpivot Errortypes: the result is read through its "key" (<Field>_ERR#<ERR_TYPE>) and "val" columns
    dfsan_flattened_unpivot = unpivot(dfsan_flattened_renamed,not_error_columns_generator)

    #Keep only entries that carry a value and split the key into field name and error type
    df_table = (dfsan_flattened_unpivot.filter(col("val").isNotNull())
                          .withColumn('FIELD',regexp_replace(split("key", '#').getItem(0),'_ERR',''))
                          .withColumn('ERROR_TYPE',split("key", '#').getItem(1))
                          .withColumn('ERROR_CODE',col("val"))
                          .drop("val","key")
    )
    ADP_log_debug(process, logger_name, level_action, log_level, "---Unpivot errors", sys._getframe().f_code.co_name)
    
    #Error side of the join: everything except the <Field>_RAW columns
    not_raw_columns_generator = [name for name in df_table.schema.names if "_RAW" not in name]
    df_table_error = df_table.select(*not_raw_columns_generator)
    ADP_log_debug(process, logger_name, level_action, log_level, "---Select all but _Raw column", sys._getframe().f_code.co_name)
         
    #Unpivot to add Raw Field Values and Drop Error Related Field 
    #FIELD is rebuilt from the raw column name so it lines up with the FIELD of the error rows
    df_table_raw = (unpivot(df_table,not_raw_columns_generator)
                            .drop("FIELD","ERROR_TYPE","ERROR_CODE")
                            .withColumn("FIELD",regexp_replace("key","_RAW",""))
                            .withColumn("FIELD_VALUE",col("val"))
                            .drop("key","val")
                         ).distinct()
    ADP_log_debug(process, logger_name, level_action, log_level, "---Select all but _Err column", sys._getframe().f_code.co_name)
      
    #Persist and force evaluation (count) of both join inputs before joining them
    df_table_error = df_table_error.persist(StorageLevel.MEMORY_AND_DISK)
    df_table_error.count()
    ADP_log_debug(process, logger_name, level_action, log_level, "---Cache Error", sys._getframe().f_code.co_name)
 
    df_table_raw = df_table_raw.persist(StorageLevel.MEMORY_AND_DISK)
    df_table_raw.count()
    ADP_log_debug(process, logger_name, level_action, log_level, "---Cache RAW", sys._getframe().f_code.co_name)
                 
    
    #One equality condition per key column, qualified with the aliases used in the join below
    join_cond_generated = [ (col('errors.'+field) == col('raw_values.'+field)) for field in key_columns]
    ADP_log_debug(process, logger_name, level_action, log_level, "---Generate Join condition", sys._getframe().f_code.co_name)    
    
    #Left join keeps every error row even when no raw value is found for it
    df_table_db = df_table_error.alias('errors').join(df_table_raw.alias('raw_values'),join_cond_generated,'left').select("errors.*","raw_values.FIELD_VALUE")
    ADP_log_debug(process, logger_name, level_action, log_level, "---Join Error&Raw Data", sys._getframe().f_code.co_name)

    ADP_log_info(process, logger_name, level_action, log_level, "END", sys._getframe().f_code.co_name)  
    return df_table_db
  
  except Exception as err:
    ADP_log_exception(process, logger_name, level_action, log_level,  "", sys._getframe().f_code.co_name,  sys.exc_info())
    #Same exception type as before for callers; "from err" records the original error as the explicit cause
    raise Exception(err) from err