In [1]:
%run "./ADP_Spain_MDM_Def"

In [2]:
def GenerateValCols(df,valModel):
  """Append per-field validation flags and row-level error summaries to df.

    The returned dataframe contains, in this order:
        - every original column of df
        - one "<FIELD>_ERR" column for each column of df that has a validation function in valModel:
          0 when the validation expression evaluates to True for the row, 1 otherwise
        - one column named ERR_COUNT  with the number of total errors in the row
        - one column named RESULT_ERR with a 0 if no errors in the row or 1 if any error in the row

    Columns of df that are not described in valModel are passed through without an "_ERR" column,
    and valModel entries that do not match any column of df are ignored.

    Parameters:
      df       -- Dataframe with columns with data
      valModel -- Dictionary with a list of pairs "FIELD_NAME":<validation_function> describing which function is used
                  to validate each field. "FIELD_NAME" should be one of df's ones. Each function is called with the
                  column name and its result is used as the condition of a Spark `when`.
        - valModel Example:
          __PRODUCT_MASTER_CAT_STRUCT_VALIDATIONS__ = {
            "PRODUCT_INTERNAL_CODE":digit_val,
            "BAR_CODE":digit_val,
            "PRODUCT_NAME":no_val,
            "MANUFACTURER_CODE":digit_val,
            "MANUFACTURER_NAME":no_val,
            "LEGAL_CATEGORY":no_val,
            "COMMERCIAL_CATEGORY_L1":no_val,
            "COMMERCIAL_CATEGORY_L2":no_val,
            "COMMERCIAL_CATEGORY_L3":no_val,
            "BRAND":no_val
          }

    Return:
      Dataframe with the original columns plus the "_ERR", ERR_COUNT and RESULT_ERR columns

    Raises:
      Exception -- any failure (including no column of df having a validation function) is logged
                   through ADP_log_exception and re-raised wrapped in a plain Exception

    Example:

        Input  Columns: ["PRODUCT_INTERNAL_CODE", "BAR_CODE", "PRODUCT_NAME", "MANUFACTURER_CODE", "MANUFACTURER_NAME", "LEGAL_CATEGORY", "COMMERCIAL_CATEGORY_L1", "COMMERCIAL_CATEGORY_L2", "COMMERCIAL_CATEGORY_L3", "BRAND"]
        Output Columns: ["PRODUCT_INTERNAL_CODE", "BAR_CODE", "PRODUCT_NAME", "MANUFACTURER_CODE", "MANUFACTURER_NAME", "LEGAL_CATEGORY", "COMMERCIAL_CATEGORY_L1", "COMMERCIAL_CATEGORY_L2", "COMMERCIAL_CATEGORY_L3", "BRAND",
                         "PRODUCT_INTERNAL_CODE_ERR", "BAR_CODE_ERR", "PRODUCT_NAME_ERR", "MANUFACTURER_CODE_ERR", "MANUFACTURER_NAME_ERR", "LEGAL_CATEGORY_ERR", "COMMERCIAL_CATEGORY_L1_ERR", "COMMERCIAL_CATEGORY_L2_ERR",
                         "COMMERCIAL_CATEGORY_L3_ERR", "BRAND_ERR", "ERR_COUNT", "RESULT_ERR"
                        ]

        df_validated = GenerateValCols(pharmatic_product_master_df,__PRODUCT_MASTER_CAT_STRUCT_VALIDATIONS__)
  """
  #Who                 When           What
  #Ana Perez           25/03/2019     Included log management and exception management
  try:
    ADP_log_info(process, logger_name, level_action, log_level, "BEGIN", sys._getframe().f_code.co_name)

    # Only columns that actually have a validator get an "_ERR" flag. Iterating df.columns
    # (instead of valModel) keeps the flags in dataframe column order and avoids a KeyError
    # when the dataframe carries a column that valModel does not describe.
    validated_names = [column for column in df.columns if column in valModel]
    if not validated_names:
      raise ValueError("GenerateValCols: none of the dataframe columns has a validation function in valModel")

    #Calculate Validation Columns
    val_cols = [when((valModel[column](column))==True,0).otherwise(1).alias(column+"_ERR") for column in validated_names]

    #Build original Columns
    org_cols = [col(column) for column in df.columns]

    #Generate Result Column
    # __builtin__.sum is Python's own sum, folding the flag columns with "+".
    # NOTE(review): presumably the bare name `sum` is shadowed by a Spark function in this notebook - confirm.
    err_total = __builtin__.sum(val_cols)
    result_col = when(err_total>0,1).otherwise(0).alias("RESULT_ERR")
    err_count_col = err_total.alias("ERR_COUNT")

    #Join original and Validation Columns
    selected_cols = org_cols + val_cols + [err_count_col,result_col]

    #Select original, validation and summary columns
    df_validated = df.select(*selected_cols)

    ADP_log_info(process, logger_name, level_action, log_level, "END", sys._getframe().f_code.co_name)

    return df_validated
  except Exception as err:
    ADP_log_exception(process, logger_name, level_action, log_level, "", sys._getframe().f_code.co_name,  sys.exc_info())
    raise Exception(err)

In [3]:
##################################################################################################################################################################
""" Process functions for Pharmatic MasterData Ingest

"""
 #Who                 When           What
 #Victor Salesa       15/10/2018     Initial Version
 #Victor Salesa       31/10/2018     Forced Schema in pharmacies Load
 #Victor Salesa       05/11/2018     Changed ValidateAndDistributeSpainPharmaciesMasterData for the new Pharmacies format file
 #Victor Salesa       04/04/2019     Changed ValidateAndDistributeSpainProductMasterData: START_DATE AND END_DATE format from dd/mm/yyyy to yyyymmddHHmmss
 #Ana Perez           06/05/2019     Changed source folder from  __PHARMATIC_MASTER_DATA_LANDING_PRODUCT_PATH__ to __PHARMATIC_MASTER_DATA_TOBEPROCESSED_PRODUCT_PATH__
 #Ana Perez           07/05/2019     Added save as Database Table and replace ALPHEGA_FLG value (Yes/No by Y/N)
 #Ana Perez           09/05/2019     Fixed issue with a "\" character at the end of a field in MDM_PRODUCT.csv (the CSV reader must use a character other than "\" as its escape character)
################################################################################

def ValidateAndDistributeSpainProductMasterData():
  """Process Product Master Data Files, Validates Structure and Distributes to the Canonical Folder and the Database

    Steps:
      - reads the product CSV from __PHARMATIC_MASTER_DATA_TOBEPROCESSED_PRODUCT_PATH__ with the raw product schema
      - trims every column
      - builds the validation columns with GenerateValCols (counts are computed, rejection is still a TODO)
      - converts START_DATE / END_DATE to timestamps, keeps the first character of OWNBRAND_FLG and
        converts the three SALES_PRICE_* columns from decimal-comma text to float
      - hands the dataframe to saveAsCanonical, then adds LOAD_DATE and hands it to saveToDB

    Return:
      Number of rows of the dataframe passed to saveToDB

    Raises:
      Exception -- any failure is logged through ADP_log_exception and re-raised wrapped in a plain Exception

    Example 1:
      product_rows = ValidateAndDistributeSpainProductMasterData()
  """

  #Who                 When           What
  #Victor Salesa       23/01/2019     Initial Version
  #Ana Perez           19/03/2019     Included log management and exception management
  #Victor Salesa       04/04/2019     Changed START_DATE AND END_DATE format from dd/mm/yyyy to yyyyMMddHHmmss
  try:

    ADP_log_info(process, logger_name, level_action, log_level, "BEGIN", sys._getframe().f_code.co_name)

    # escape is set to a character other than "\" so that a "\" inside a field is read as plain data
    pharmatic_product_master_df = (spark.read.format('csv')
                                        .options(header='true',delimiter=__SPAIN_MASTERDATA_CSV_DELIMITER__,mode=__SPAIN_MASTERDATA_CSV_MODE_,escape='¬')
                                        .load(__PHARMATIC_MASTER_DATA_TOBEPROCESSED_PRODUCT_PATH__,schema=__M_SP_PRODUCT_RAW_SCHEMA__)
                                )
    #Trim spaces inside fields
    pharmatic_product_master_df = (reduce(
            lambda pharmatic_product_master_df, col_name: pharmatic_product_master_df.withColumn(col_name, trim(col(col_name))),
            pharmatic_product_master_df.columns,
            pharmatic_product_master_df)
    )

    pharmatic_product_master_df_validated = GenerateValCols(pharmatic_product_master_df,__PRODUCT_MASTER_STRUCT_VALIDATIONS__)

    #TODO: implement rejection criteria based on validation
    # error_lines / total_lines are computed but not used yet; they are kept for the pending rejection logic
    error_lines = pharmatic_product_master_df_validated.filter(col("RESULT_ERR")==1).count()
    total_lines = pharmatic_product_master_df_validated.count()


    #TODO: if Not Rejected:
    # Date pattern: "MM" is month-of-year and "mm" is minute-of-hour, so the month position must be
    # upper case. The former 'yyyymmddHHmmss' pattern read the month digits as minutes.
    pharmatic_product_master_df = (pharmatic_product_master_df.withColumn("START_DATE",to_timestamp("START_DATE",'yyyyMMddHHmmss'))
                                                              .withColumn("END_DATE",to_timestamp("END_DATE",'yyyyMMddHHmmss'))
                                                              # keep only the first character of the flag
                                                              .withColumn("OWNBRAND_FLG",substring("OWNBRAND_FLG",1,1))
                                                              # prices arrive with a decimal comma: swap it for a dot before casting to float
                                                              .withColumn("SALES_PRICE_MANUFACTURER",regexp_replace(col("SALES_PRICE_MANUFACTURER"),',','.').cast(FloatType()))
                                                              .withColumn("SALES_PRICE_PHARMACY"    ,regexp_replace(col("SALES_PRICE_PHARMACY"),',','.').cast(FloatType())    )
                                                              .withColumn("SALES_PRICE_PUBLIC"      ,regexp_replace(col("SALES_PRICE_PUBLIC"),',','.').cast(FloatType())      )
                                 )
    ADP_log_debug(process, logger_name, level_action, log_level, "Before saveAsCanonical SP PRODUCT: "\
                                                                  + str(pharmatic_product_master_df.count()) + " rows ", sys._getframe().f_code.co_name)
    saveAsCanonical(pharmatic_product_master_df,__PHARMATIC_MASTER_DATA_CANONICAL_PRODUCT_PATH__,table_name=__PRODUCT_TABLE_NAME__,mode='overwrite')
    ADP_log_debug(process, logger_name, level_action, log_level, "After saveAsCanonical SP PRODUCT", sys._getframe().f_code.co_name)


    #refresh LOAD_DATE to Database
    pharmatic_product_master_df_DB = pharmatic_product_master_df.withColumn('LOAD_DATE', to_timestamp(current_timestamp(), "yyyyMMddHHmmss"))
    items_count = pharmatic_product_master_df_DB.count()
    ADP_log_debug(process, logger_name, level_action, log_level, "Before saveToDB SP PRODUCT: "\
                                                                  + str(items_count) + " rows ", sys._getframe().f_code.co_name)
    saveToDB(pharmatic_product_master_df_DB,__MDM_STG_M_PRODUCT_DB_TABLE_NAME__,mode="overwrite",debug=False,job_id='')
    ADP_log_debug(process, logger_name, level_action, log_level, "After saveToDB SP PRODUCT", sys._getframe().f_code.co_name)

    pharmatic_product_master_df_DB.unpersist()
    pharmatic_product_master_df.unpersist()

    ADP_log_info(process, logger_name, level_action, log_level, "END", sys._getframe().f_code.co_name)

    return items_count
  except Exception as err:
    ADP_log_exception(process, logger_name, level_action, log_level, "", sys._getframe().f_code.co_name,  sys.exc_info())

    raise Exception(err)

##################################################################################################################################################################
  
def ValidateAndDistributeSpainCategoriesMasterData():
  """Load the Spain product-categories master data, validate it and publish it.

    The categories CSV is read from __PHARMATIC_MASTER_DATA_TOBEPROCESSED_CATEGORIES_PATH__ with the raw
    categories schema, every column is trimmed, the validation columns are built with GenerateValCols,
    and the trimmed dataframe is handed to saveAsCanonical and then, with a LOAD_DATE column added, to saveToDB.

    Return:
      Number of rows of the dataframe passed to saveToDB

    Raises:
      Exception -- any failure is logged through ADP_log_exception and re-raised wrapped in a plain Exception

    Example 1:
      categories_rows = ValidateAndDistributeSpainCategoriesMasterData()
  """

  #Who                 When           What
  #Victor Salesa       23/01/2019     Initial Version
  #Ana Perez           25/03/2019     Included log management and exception management
  #Ana Perez           06/05/2019     Changed source folder from  __PHARMATIC_MASTER_DATA_LANDING_CATEGORIES_PATH__ to __MASTER_DATA_TOBEPROCESSED_BASE_PATH__
  #Ana Perez           07/05/2019     Added save as Database Table

  # Name of this function, used as the origin tag of every log entry
  fn_name = sys._getframe().f_code.co_name
  try:
    ADP_log_info(process, logger_name, level_action, log_level, "BEGIN", fn_name)

    # Read the raw CSV with the forced schema
    csv_reader = spark.read.format('csv').options(header='true',
                                                  delimiter=__SPAIN_MASTERDATA_CSV_DELIMITER__,
                                                  mode=__SPAIN_MASTERDATA_CSV_MODE_)
    categories_df = csv_reader.load(__PHARMATIC_MASTER_DATA_TOBEPROCESSED_CATEGORIES_PATH__,
                                    schema=__M_SP_PRODUCT_CAT_RAW_SCHEMA__)

    # Trim surrounding spaces, one column at a time
    for field_name in categories_df.columns:
      categories_df = categories_df.withColumn(field_name, trim(col(field_name)))

    validated_df = GenerateValCols(categories_df, __PRODUCT_MASTER_CAT_STRUCT_VALIDATIONS__)

    # TODO: implement rejection criteria based on validation (the two counts are not used yet)
    error_lines = validated_df.filter(col("RESULT_ERR")==1).count()
    total_lines = validated_df.count()

    # Canonical copy
    canonical_msg = "Before saveAsCanonical SP PRODUCT CATEGORIES: " + str(categories_df.count()) + " rows "
    ADP_log_debug(process, logger_name, level_action, log_level, canonical_msg, fn_name)
    saveAsCanonical(categories_df,
                    __PHARMATIC_MASTER_DATA_CANONICAL_CATEGORIES_PATH__,
                    table_name=__CATEGORIES_TABLE_NAME__,
                    mode='overwrite')
    ADP_log_debug(process, logger_name, level_action, log_level, "After saveAsCanonical SP PRODUCT CATEGORIES", fn_name)

    # Database copy, stamped with the load time
    load_date_col = to_timestamp(current_timestamp(), "yyyyMMddHHmmss")
    categories_db_df = categories_df.withColumn('LOAD_DATE', load_date_col)
    items_count = categories_db_df.count()
    db_msg = "Before saveToDB SP PRODUCT_CAT: " + str(items_count) + " rows "
    ADP_log_debug(process, logger_name, level_action, log_level, db_msg, fn_name)
    saveToDB(categories_db_df, __MDM_STG_M_PRODUCT_CAT_DB_TABLE_NAME__, mode="overwrite", debug=False, job_id='')
    ADP_log_debug(process, logger_name, level_action, log_level, "After saveToDB SP PRODUCT_CAT", fn_name)

    categories_db_df.unpersist()
    categories_df.unpersist()

    ADP_log_info(process, logger_name, level_action, log_level, "END", fn_name)
    return items_count

  except Exception as err:
    ADP_log_exception(process, logger_name, level_action, log_level, "", fn_name, sys.exc_info())
    raise Exception(err)
    
  
##################################################################################################################################################################

def ValidateAndDistributeSpainPharmaciesMasterData():
  """Load the Spain pharmacies master data, validate it and publish it.

    The pharmacies CSV is read from __PHARMATIC_MASTER_DATA_TOBEPROCESSED_PHARMACIES_PATH__ with the raw
    pharmacy schema, every column is trimmed, the validation columns are built with GenerateValCols,
    ALPHEGA_FLG is cut down to its leading character, and the dataframe is handed to saveAsCanonical
    and then, with a LOAD_DATE column added, to saveToDB.

    Return:
      Number of rows of the dataframe passed to saveToDB

    Raises:
      Exception -- any failure is logged through ADP_log_exception and re-raised wrapped in a plain Exception

    Example 1:
      pharmacies_rows = ValidateAndDistributeSpainPharmaciesMasterData()
  """
  #Who                 When           What
  #Victor Salesa       23/01/2019     Initial Version
  #Ana Perez           25/03/2019     Included log management and exception management
  #Ana Perez           06/05/2019     Changed source folder from  __PHARMATIC_MASTER_DATA_LANDING_PHARMACIES_PATH__ to __PHARMATIC_MASTER_DATA_TOBEPROCESSED_PHARMACIES_PATH__
  #Ana Perez           07/05/2019     Added save as Database Table and replace ALPHEGA_FLG value (Yes/No by Y/N)

  # Name of this function, used as the origin tag of every log entry
  fn_name = sys._getframe().f_code.co_name
  try:
    ADP_log_info(process, logger_name, level_action, log_level, "BEGIN", fn_name)

    # Read the raw CSV with the forced schema
    csv_reader = spark.read.format('csv').options(header='true',
                                                  delimiter=__SPAIN_MASTERDATA_CSV_DELIMITER__,
                                                  mode=__SPAIN_MASTERDATA_CSV_MODE_)
    pharmacies_df = csv_reader.load(__PHARMATIC_MASTER_DATA_TOBEPROCESSED_PHARMACIES_PATH__,
                                    schema=__M_SP_PHARMACY_RAW_SCHEMA__)

    # Trim surrounding spaces, one column at a time
    for field_name in pharmacies_df.columns:
      pharmacies_df = pharmacies_df.withColumn(field_name, trim(col(field_name)))

    validated_df = GenerateValCols(pharmacies_df, __PHARMACY_MASTER_STRUCT_VALIDATIONS__)

    # TODO: implement rejection criteria based on validation (the two counts are not used yet)
    error_lines = validated_df.filter(col("RESULT_ERR")==1).count()
    total_lines = validated_df.count()

    # Replace ALPHEGA_FLG value (Yes/No by Y/N) by keeping only its leading character
    alphega_initial = col('ALPHEGA_FLG').substr(0,1)
    pharmacies_df = pharmacies_df.withColumn("ALPHEGA_FLG", alphega_initial)

    # Canonical copy
    canonical_msg = "Before saveAsCanonical SP FARMACIES: " + str(pharmacies_df.count()) + " rows "
    ADP_log_debug(process, logger_name, level_action, log_level, canonical_msg, fn_name)
    saveAsCanonical(pharmacies_df,
                    __PHARMATIC_MASTER_DATA_CANONICAL_PHARMACIES_PATH__,
                    table_name=__PHARMACY_TABLE_NAME__,
                    mode='overwrite')
    ADP_log_debug(process, logger_name, level_action, log_level, "After saveAsCanonical SP FARMACIES", fn_name)

    # Database copy, stamped with the load time
    load_date_col = to_timestamp(current_timestamp(), "yyyyMMddHHmmss")
    pharmacies_db_df = pharmacies_df.withColumn('LOAD_DATE', load_date_col)
    items_count = pharmacies_db_df.count()
    db_msg = "Before saveToDB SP PHARMACY: " + str(items_count) + " rows "
    ADP_log_debug(process, logger_name, level_action, log_level, db_msg, fn_name)
    saveToDB(pharmacies_db_df, __MDM_STG_M_PHARMACY_DB_TABLE_NAME__, mode="overwrite", debug=False, job_id='')
    ADP_log_debug(process, logger_name, level_action, log_level, "After saveToDB SP PHARMACY", fn_name)

    pharmacies_db_df.unpersist()
    pharmacies_df.unpersist()

    ADP_log_info(process, logger_name, level_action, log_level, "END", fn_name)

    return items_count
  except Exception as err:
    ADP_log_exception(process, logger_name, level_action, log_level, "", fn_name, sys.exc_info())
    raise Exception(err)
  
    
###########################################################################################################################################################################


def ValidateAndDistributeSpainManufacturersMasterData():
  """Load the Spain manufacturers master data, validate it and publish it.

    The manufacturers CSV is read from __PHARMATIC_MASTER_DATA_TOBEPROCESSED_MANUFACTURERS_PATH__ with the raw
    manufacturer schema, every column is trimmed, the validation columns are built with GenerateValCols,
    and the trimmed dataframe is handed to saveAsCanonical and then, with a LOAD_DATE column added, to saveToDB.

    Return:
      Number of rows of the dataframe passed to saveToDB

    Raises:
      Exception -- any failure is logged through ADP_log_exception and re-raised wrapped in a plain Exception

    Example 1:
      manufacturer_rows = ValidateAndDistributeSpainManufacturersMasterData()
  """
  #Who                 When           What
  #Victor Salesa       23/01/2019     Initial Version
  #Ana Perez           25/03/2019     Included log management and exception management
  #Ana Perez           06/05/2019     Changed source folder from  __PHARMATIC_MASTER_DATA_LANDING_MANUFACTURERS_PATH__ to __PHARMATIC_MASTER_DATA_TOBEPROCESSED_MANUFACTURERS_PATH__
  #Ana Perez           07/05/2019     Added save as Database Table

  # Name of this function, used as the origin tag of every log entry
  fn_name = sys._getframe().f_code.co_name
  try:
    ADP_log_info(process, logger_name, level_action, log_level, "BEGIN", fn_name)

    # Read the raw CSV with the forced schema
    csv_reader = spark.read.format('csv').options(header='true',
                                                  delimiter=__SPAIN_MASTERDATA_CSV_DELIMITER__,
                                                  mode=__SPAIN_MASTERDATA_CSV_MODE_)
    manufacturers_df = csv_reader.load(__PHARMATIC_MASTER_DATA_TOBEPROCESSED_MANUFACTURERS_PATH__,
                                       schema=__M_SP_MANUFACTURER_RAW_SCHEMA__)

    # Trim surrounding spaces, one column at a time
    for field_name in manufacturers_df.columns:
      manufacturers_df = manufacturers_df.withColumn(field_name, trim(col(field_name)))

    validated_df = GenerateValCols(manufacturers_df, __MANUFACTURER_MASTER_STRUCT_VALIDATIONS__)

    # TODO: implement rejection criteria based on validation (the two counts are not used yet)
    error_lines = validated_df.filter(col("RESULT_ERR")==1).count()
    total_lines = validated_df.count()

    # Canonical copy
    canonical_msg = "Before saveAsCanonical SP MANUFACTURERS: " + str(manufacturers_df.count()) + " rows "
    ADP_log_debug(process, logger_name, level_action, log_level, canonical_msg, fn_name)
    saveAsCanonical(manufacturers_df,
                    __PHARMATIC_MASTER_DATA_CANONICAL_MANUFACTURERS_PATH__,
                    table_name=__MANUFACTURERS_TABLE_NAME__,
                    mode='overwrite')
    ADP_log_debug(process, logger_name, level_action, log_level, "After saveAsCanonical SP MANUFACTURERS", fn_name)

    # Database copy, stamped with the load time
    load_date_col = to_timestamp(current_timestamp(), "yyyyMMddHHmmss")
    manufacturers_db_df = manufacturers_df.withColumn('LOAD_DATE', load_date_col)
    items_count = manufacturers_db_df.count()

    db_msg = "Before saveToDB SP MANUFACTURER: " + str(items_count) + " rows "
    ADP_log_debug(process, logger_name, level_action, log_level, db_msg, fn_name)
    saveToDB(manufacturers_db_df, __MDM_STG_M_MANUFACTURER_DB_TABLE_NAME__, mode="overwrite", debug=False, job_id='')
    ADP_log_debug(process, logger_name, level_action, log_level, "After saveToDB SP MANUFACTURER", fn_name)

    manufacturers_db_df.unpersist()
    manufacturers_df.unpersist()

    ADP_log_info(process, logger_name, level_action, log_level, "END", fn_name)
    return items_count
  except Exception as err:
    ADP_log_exception(process, logger_name, level_action, log_level, "", fn_name, sys.exc_info())
    raise Exception(err)