In [1]:
 %run "../Libraries/ADP_Spain_MDM_Def"

In [2]:
%run "../Libraries/ADP_MDM_SP_QA"

In [3]:
# Identifiers handed to every ADP_log_* call made by this notebook.
process, logger_name = "MDM_SP", "MDM_Spain_Data_ValidateFiles"

# Logging defaults; the main cell replaces them with the values of the
# "level_action" / "log_level" notebook widgets when those exist.
level_action = log_level = "DEFAULT"

In [4]:
#################################################################################
""" Main Process for Filename Validation

Steps performed by this cell:
  1. List the master data ingestion folder.
  2. Validate every file name (known prefix, '_' separator, 14-character
     timestamp, expected extension).
  3. Keep, per master data file name, only the file with the latest timestamp.
  4. Delete from the tobeprocessed folder the previous versions of the master
     data files that are about to be replaced.
  5. Copy the latest files to the tobeprocessed folder.
  6. Move every file found in the ingestion folder to the archive folder.
  7. Serialize the validation result for QA.

Any failure is logged with the message registered for the step that was
running (see exception_messages) and then re-raised as Exception.

"""
 #Who                 When           What
 #Victor Salesa       03/05/2019     Initial Version
 #Victor Salesa       06/05/2019     Added move and archive files
################################################################################

# Process Landing Files
try:
  #Vars to control process
  # exception_source always names the step that is ABOUT to run, so that the
  # handler at the bottom of this cell logs the message of the failing step.
  exception_source = logger_name+'.DEFAULT'
  exception_messages={ logger_name+'.DEFAULT':'Undefined Error'
                       ,logger_name+'.VALIDATION_START':'Fail before Starting the process'
                       ,logger_name+'.VALIDATION_VALIDATE':'Fail before validating the filename'
                       ,logger_name+'.VALIDATION_FILTER_OK':'Fail before filtering the filenames'
                       ,logger_name+'.VALIDATION_FILTER_LATEST_TIMESTAMP':'Fail before filtering latest timestamp'
                       ,logger_name+'.COPY_FILES_TOBEPROCESSED':'Fail before copying files to tobeprocessed'
                       ,logger_name+'.CALCULATE_UPDATED_MASTERDATA_FILES':'Fail before calculating updated masterdata files'
                       ,logger_name+'.DELETE_OLD_MASTERDATA_FILES':'Fail before deleting old masterdata files '
                       ,logger_name+'.ARCHIVING_FILES':'Fail before archiving the files'
                       ,logger_name+'.SERIALIZE_QA':'Fail before serializing the QA data'
  }
  
  #Create a dynamic column with name validation based on masterdata prefixes
  # Result is 0 when the name is valid and 1 otherwise. The fixed offsets assume
  # names shaped as <stem>_<14-char timestamp><4-char extension>.
  # The column alias ("IS_MDF_FILE_ERR") is what the filters below reference.
  IS_MDM_FILE_ERR = (when( (reduce(lambda x, y: (x|y),[col("name").startswith(PREFIX) for PREFIX in __MDM_SP_FILE_PREFIX__],lit(False))==True)   # Column starts with one of the valid prefixes
                            & (col("name").substr(-19,1)=='_')                                                                                   # Separator symbol validation
                            & (udf_isDate_sql(col("name").substr(-18,14),lit(__YYYYMMDDhhmmss__))==True)                                         # Timestamp in FileName Validation
                            & (upper(col("name").substr(-4,4))==__MASTER_DATA_FILES_EXT__)                                                       # File extension
                         ,0)
                        .otherwise(1)
                        .alias("IS_MDF_FILE_ERR")
                      )

  #Create parameter to launch task from notebook launcher
  try:
    level_action = dbutils.widgets.get("level_action") 
    log_level = dbutils.widgets.get("log_level")  
    ADP_log_info(process, logger_name, level_action, log_level, "BEGIN", sys._getframe().f_code.co_name)   
  except Exception:
    # Reading the widgets failed (presumably because they do not exist yet):
    # create them with default values and read them back.
    ADP_log_info(process, logger_name, "DEFAULT", "DEFAULT", "Internal Call parameter was not Created Please Re-run", sys._getframe().f_code.co_name)   

    dbutils.widgets.text("level_action", "DEFAULT","Level Action")
    dbutils.widgets.text("log_level","DEFAULT","Log Level")

    level_action = dbutils.widgets.get("level_action") 
    log_level = dbutils.widgets.get("log_level")
  #end exception 
  
  # Date and Time of the beginning of this process
  start_date = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')

  exception_source = logger_name+'.VALIDATION_START'

  # Create File list from the master data ingestion folder
  filelist = blob_ls(__MASTER_DATA_INGESTION_BASE_PATH__)
  
  ADP_log_debug(process, logger_name, level_action, log_level, "Files in ingestion folder: "+str(len(filelist)), sys._getframe().f_code.co_name)
    
  if len(filelist)!=0:
    
    #Add column with Start Process Timestamp
    file_validation_start_df = (createDFFromList(filelist)
                                                    .withColumn("START_DATE", to_timestamp(lit(start_date), 'yyyy-MM-dd HH:mm:ss'))
                                                    .repartition(__PARTITIONS_DEFAULT__)
                               )
    
    exception_source = logger_name+'.VALIDATION_VALIDATE'
    
    #Validate FileNames
    # START_DATE is already present on file_validation_start_df, so it is not added again here.
    # MASTER_DATA_FILE_NAME is the name without the trailing "_<timestamp>" (stem + extension).
    file_validation_validate_df = (file_validation_start_df
                                                    .withColumn("MASTER_DATA_FILE_NAME",concat(col("name").substr(lit(1),length(col("name"))-19),lit(__MASTER_DATA_FILES_EXT__))   )
                                                    .withColumn("MASTER_DATA_FILE_TIMESTAMP",col("name").substr(-18,14))
                                                    .select('*',IS_MDM_FILE_ERR)
                                                    .repartition(__PARTITIONS_DEFAULT__)
                               )
    
    exception_source = logger_name+'.VALIDATION_FILTER_OK'
    
    #Filter Valid Files
    file_validation_ok_df = file_validation_validate_df.filter(col("IS_MDF_FILE_ERR")==0).drop("IS_MDF_FILE_ERR")
    files_ok_count = file_validation_ok_df.count()
    
    ADP_log_debug(process, logger_name, level_action, log_level, "Files ok ingestion folder: "+str(files_ok_count), sys._getframe().f_code.co_name)
    
    if files_ok_count!=0:
      
      exception_source = logger_name+'.VALIDATION_FILTER_LATEST_TIMESTAMP'
      
      #Create a window function to get the latest timestamp of the file
      windowLatestFile = Window.partitionBy(file_validation_ok_df['MASTER_DATA_FILE_NAME']).orderBy(file_validation_ok_df['MASTER_DATA_FILE_TIMESTAMP'].desc())
      
      #Filter Latest File
      file_latest_df = (file_validation_ok_df
                          .withColumn("LATEST_MASTER_DATA_FILE_TIMESTAMP",first("MASTER_DATA_FILE_TIMESTAMP").over(windowLatestFile))
                          .withColumn("LATEST_MASTER_DATA_FILE_FLG",when(col("LATEST_MASTER_DATA_FILE_TIMESTAMP")==col("MASTER_DATA_FILE_TIMESTAMP"),1).otherwise(0))
                          .filter(col("LATEST_MASTER_DATA_FILE_FLG")==1)
                          .drop("LATEST_MASTER_DATA_FILE_TIMESTAMP","LATEST_MASTER_DATA_FILE_FLG")
                       )
      
      exception_source = logger_name+'.CALCULATE_UPDATED_MASTERDATA_FILES'
      
      #Calculate whether or not a master data file is going to be updated based on the file name.
      updated_mdm_prefixes = list(map(lambda r:str(r.MASTER_DATA_FILE_NAME).replace(__MASTER_DATA_FILES_EXT__,''),file_latest_df.select("MASTER_DATA_FILE_NAME").distinct().collect()))
      
      # NOTE(review): startswith() also matches any other master data file whose
      # stem merely begins with an updated stem - confirm that the configured
      # prefixes never overlap, otherwise unrelated files get deleted below.
      IS_UPDATED_MDM_FILE = (when( (reduce(lambda x, y: (x|y),[col("name").startswith(PREFIX) for PREFIX in updated_mdm_prefixes],lit(False))==True)
                                 ,1
                             ).otherwise(0)
                             .alias("IS_UPDATED_MDM_FILE")
                            )
      
      ADP_log_debug(process, logger_name, level_action, log_level, "Files ready to move to tobeprocessed: "+str(file_latest_df.count()), sys._getframe().f_code.co_name)
      
      exception_source = logger_name+'.DELETE_OLD_MASTERDATA_FILES'
      
      #Delete old files version before moving just for the files that are going to be updated
      # toDF() cannot infer a schema from an empty RDD and raises, so the clean-up
      # is skipped when the tobeprocessed folder has no files (e.g. first run).
      old_master_data_files = blob_ls(__MASTER_DATA_TOBEPROCESSED_BASE_PATH__)
      if len(old_master_data_files)!=0:
        old_master_data_df = (sc.parallelize(old_master_data_files).toDF()
                               .select('*',IS_MDM_FILE_ERR,IS_UPDATED_MDM_FILE)
                               .filter((col("IS_MDF_FILE_ERR")==0) & (col("IS_UPDATED_MDM_FILE")==1))
                               .withColumn("TOBEPROCESSED_FILE_DELETED",blob_delete_file_sql("path"))
                        )
        # cache + count force the evaluation of the delete column right now
        old_master_data_df.cache()
        old_master_data_df.count()
      #end if len(old_master_data_files)!=0:
      
      exception_source = logger_name+'.COPY_FILES_TOBEPROCESSED'
      
      #Copy Master Data Files renamed to TobeProcessed
      # The stem is taken straight from "name" (same expression used to build
      # MASTER_DATA_FILE_NAME) instead of regexp_replace-ing the extension away:
      # the extension used as a regex pattern is unescaped and unanchored, so it
      # could also strip text from inside the stem.
      file_movetobeprocessed_df   = (file_latest_df
                                      .withColumn("MASTER_DATA_FILE_DEST_PATH",concat(lit(__MASTER_DATA_TOBEPROCESSED_BASE_PATH__),col("name").substr(lit(1),length(col("name"))-19),lit('_'),col("MASTER_DATA_FILE_TIMESTAMP"),lit(__MASTER_DATA_FILES_EXT__)))
                                      .withColumn("FILE_TOBEPROCESSED",when(blob_copy_file_sql("path","MASTER_DATA_FILE_DEST_PATH"),1).otherwise(0))
                                     )
      # cache + count force the evaluation of the copy column right now
      file_movetobeprocessed_df.cache()
      file_movedtobeprocessed = file_movetobeprocessed_df.count()
      
      
      ADP_log_debug(process, logger_name, level_action, log_level, "Files moved to tobeprocessed: "+str(file_movedtobeprocessed), sys._getframe().f_code.co_name)
      
      exception_source = logger_name+'.ARCHIVING_FILES'
      
      #Archive all files in ingestion to archive (valid or not: the source is the unfiltered list)
      # NOTE(review): FILE_ARCHIVE is 0 when the move call is truthy, the opposite
      # polarity of FILE_TOBEPROCESSED above - confirm this is intended.
      file_archived_df = (file_validation_start_df
                            .withColumn("DEST_ARCHIVED_PATH",concat(lit(__MASTER_DATA_ARCHIVE_BASE_PATH__),col("name")))
                            .withColumn("FILE_ARCHIVE",when(blob_move_file_sql("path","DEST_ARCHIVED_PATH"),0).otherwise(1))       
                         )
      # cache + count force the evaluation of the move column right now
      file_archived_df.cache()
      files_archived = file_archived_df.count()
      
      ADP_log_debug(process, logger_name, level_action, log_level, "Files archived: "+str(files_archived), sys._getframe().f_code.co_name)
      
      ADP_log_info(process, logger_name, level_action, log_level, "END", sys._getframe().f_code.co_name) 
      
    else:
      ADP_log_warning(process, logger_name, level_action, log_level, "END No files found", sys._getframe().f_code.co_name)
    #end if files_ok_count!=0:
    
    exception_source = logger_name+'.SERIALIZE_QA'
    
    #Serialize qa data to canonical and database
    file_validation_validate_df.transform(QA_SERIALIZE_MDM_SP_FILE_NAME_VALIDATION_DATA)
    
  else:
    ADP_log_warning(process, logger_name, level_action, log_level, "END No files found", sys._getframe().f_code.co_name)
  #end if len(filelist)!=0: 
  
except Exception as err:
  # Translate the last checkpoint into its message; fall back to the generic
  # one when the checkpoint is unknown or the table was never created.
  try:
    error_message = exception_messages[exception_source]
  except (KeyError, NameError):
    error_message    = "Undefined Error"
    exception_source = logger_name+".DEFAULT"
  
  ADP_log_exception(process, logger_name, level_action, log_level, error_message, sys._getframe().f_code.co_name,  sys.exc_info())
  # Same exception type as before (plain Exception), now explicitly chained to
  # the original error so its traceback is kept as the cause.
  raise Exception(err) from err