In [1]:
%run "./ADP_Farmatic_Def"

In [2]:
#################################################################################
""" Process functions for Farmatic Data Ingest

"""
 #Who                 When           What
 #Victor Salesa       02/10/2018     Initial Version
 #Victor Salesa       26/02/2019     ValidateAndDistributePharmaticFiles: Remove archiving from TXT files as it's moved to zip files
 #Victor Salesa       15/03/2018     ValidateAndDistributePharmaticFiles: Included file_date validation
 #Victor Salesa       01/04/2019     Split ValidateAndDistributePharmaticFiles in ValidatePharmaticFiles and DistributePharmaticFiles
 #Victor Salesa       10/04/2019     ValidatePharmaticFiles: Updated getFirstLine function to use ISO-8859-1 encoding
################################################################################

#################################################################################################################################################################

def ManualRecoverPharmaticFilesQAData():
  """Manually re-run the QA file-name validation recovery for both distribution folders.

      Calls RecoverPharmaticFilesQAData twice, first for the "tobeprocessed"
      base path and then for the "error" base path. Both calls use the column
      names "path", "name" and "last_modified" and the `start_date` value,
      which is not defined here and must already exist at notebook scope.
  """
  #Who                 When           What
  #Victor Salesa       01/04/2018     Initial version

  # Files that passed name validation and were moved to "to be processed"
  RecoverPharmaticFilesQAData(
    pathColName="path",
    nameColName="name",
    LastModifiedColName="last_modified",
    recover_source="tobeprocessed",
    recover_start_date=start_date,
    recoverPath=__PHARMATIC_TOBEPROCESSED_BASE_PATH__,
  )

  # Files that failed name validation and were moved to "error"
  RecoverPharmaticFilesQAData(
    pathColName="path",
    nameColName="name",
    LastModifiedColName="last_modified",
    recover_source="error",
    recover_start_date=start_date,
    recoverPath=__PHARMATIC_ERROR_BASE_PATH__,
  )
#end def ManualRecoverPharmaticFilesQAData():

def RecoverPharmaticFilesQAData(pathColName,nameColName,LastModifiedColName,recover_source,recover_start_date,recoverPath,debug=False):
  """Recover QA data in case process fails before writing to DB and blob

        Lists the files under recoverPath, keeps those whose last-modified value
        is greater than or equal to recover_start_date, re-runs the file name
        validation on them and passes the result to
        QA_SERIALIZE_FARMATIC_FILE_NAME_VALIDATION_DATA.
        When no file qualifies, a warning is logged and nothing is serialized.

        Parameters:
          pathColName:         Name of the column holding the files path
          nameColName:         Name of the column holding the files name
          LastModifiedColName: Name of the column holding the last modified file date
          recover_source     : "tobeprocessed" or "error" depending on where we are performing the recover
          recover_start_date : timestamp where the process really started
          recoverPath        : basepath where the files to be used for recovering are located
          debug              : not used by this function

        Raises:
          Exception: any failure is logged with ADP_log_exception and re-raised
                     wrapped in a generic Exception (original error chained as cause).
  """
  #Who                 When           What
  #Victor Salesa       01/04/2018     Initial version
  try:
    # Materialise the listing first: toDF() infers the schema from the first row
    # and fails on an empty RDD, so an empty folder must be handled as
    # "nothing to recover" instead of as an error.
    listed_files = list(blob_ls(recoverPath))
    recover_df = None
    if listed_files:
      recover_df = (sc.parallelize(listed_files).toDF()
                      .filter(col(LastModifiedColName)>=lit(recover_start_date))
                      .orderBy(LastModifiedColName,ascending=False)
                   )
    #end if listed_files

    # Strictly greater than zero: count() is never negative, so ">=0" made the
    # warning branch below unreachable.
    if recover_df is not None and recover_df.count()>0:
      recover_slice_df = recover_df.withColumn("START_DATE",lit(recover_start_date))
      recover_validate_df = recover_slice_df.transform(ValidatePharmaticFiles(pathColName,nameColName))
      # Undo the renaming applied at distribution time: remove the "_<suffix>"
      # inserted before the extension in both name and path, then point the path
      # back to the landing folder.
      recover_validate_df = (recover_validate_df.withColumn(nameColName,regexp_replace(col(nameColName),r"\_(.*?)\.","."))
                                                   .withColumn(pathColName,regexp_replace(col(pathColName),r"\_(.*?)\.","."))
                                                   .withColumn(pathColName,regexp_replace(col(pathColName),recover_source,"landing"))
                               )
      # Return value intentionally ignored; presumably the serializer persists the
      # QA data as a side effect -- TODO confirm.
      recover_validate_df.transform(QA_SERIALIZE_FARMATIC_FILE_NAME_VALIDATION_DATA)
    else:
      ADP_log_warning(process, logger_name, level_action, log_level, "Not Files to Recover Found on "+recover_source, sys._getframe().f_code.co_name)
    #end if recover_df.count()>0
  except Exception as err:
    # NOTE(review): error_message is not defined in this function; it is assumed
    # to exist at notebook scope -- confirm, otherwise this handler raises NameError.
    ADP_log_exception(process, logger_name, level_action, log_level, error_message, sys._getframe().f_code.co_name,  sys.exc_info())
    raise Exception(err) from err
    
#end def RecoverPharmaticFilesQAData():


#################################################################################################################################################################

def DistributePharmaticFiles(pathColName,nameColName,tobeProcessedPath,errorPath,archivePath,debug=False):
  """Build a DataFrame transformation that distributes Pharmatic files to error or tobeprocessed

      Parameters:
        pathColName       -- Column with the path of the file to be distributed
        nameColName       -- Column with the name of the file to be distributed
        tobeProcessedPath -- Path to put the file to be processed
        errorPath         -- Path to put the file if an error in the name
        archivePath       -- Path to put the original file for archiving (not used by this function)
        debug             -- enable debug (not used by this function)

      Return:
        function(Dataframe) -> Dataframe, to be applied with df.transform(...)

      Example 1:
        This is a function intended to be used with the chain Transformation pattern with the quinn library
  """
  ADP_log_info(process, logger_name, level_action, log_level, "BEGIN", sys._getframe().f_code.co_name)
  def inner(df):
    """Add the destination name and distribution result columns to df

        Parameters:
        Dataframe df      -- Dataframe holding at least the path/name columns plus
                             "file_date" and "file_name_ok"

      Return:
        Dataframe -- new dataframe with the columns "file_destname",
                     "file_distributed" and "file_distributed_to"
    """
    #Who                 When           What
    #Victor Salesa       01/04/2018     Initial version
    #Victor Salesa       01/04/2019     Redistributed code between ValidatePharmaticFiles and DistributePharmaticFiles
    #Ana Perez           16/04/2019     Modify log managment and exception managment
    try: 
      ADP_log_info(process, logger_name, level_action, log_level, "BEGIN", sys._getframe().f_code.co_name) 
      ADP_log_info(process, logger_name, level_action, log_level, "END: After this line it will prepare and validate files", sys._getframe().f_code.co_name)
      
      return(
        df.withColumn("formatted_timestamp",col("file_date"))
          # Strip the trailing ".TXT" extension regardless of case. The (?i) flag
          # must come BEFORE the literal: an inline flag only affects the part of
          # the pattern that follows it, so the previous trailing "(?i)" left the
          # match case-sensitive.
          .withColumn("temp_filename_noext",regexp_replace(nameColName,r'(?i)\.TXT$',''))
          # Destination name: <name without extension>_LD<file_date>.TXT
          .withColumn("file_destname",concat(col("temp_filename_noext"),lit("_LD"),col("formatted_timestamp"),lit(".TXT")))
          .drop("temp_filename_noext")
          .drop("formatted_timestamp")
          # distributeFile_sql is defined elsewhere; presumably it performs the
          # actual file move based on "file_name_ok" -- TODO confirm.
          .withColumn("file_distributed",distributeFile_sql(pathColName,lit(tobeProcessedPath),lit(errorPath),"file_name_ok","file_destname"))
          # Full destination: tobeProcessedPath when the name is valid, errorPath otherwise
          .withColumn("file_distributed_to",concat(when(col("file_name_ok")==True,lit(tobeProcessedPath)).otherwise(lit(errorPath)),col("file_destname")))
      )   
    except Exception as err:
      ADP_log_exception(process, logger_name, level_action, log_level,  "", sys._getframe().f_code.co_name,  sys.exc_info())
      raise Exception(err)
  return inner

##################################################################################################################################################################

def ValidatePharmaticFiles(pathColName,nameColName,debug=False):
  """Build a DataFrame transformation that validates Farmatic file names

      Parameters:
        pathColName       -- Column with the path of the file to be validated
        nameColName       -- Column with the name of the file to be validated
        debug             -- enable debug (not used by this function)

      Return:
        function(Dataframe) -> Dataframe, to be applied with df.transform(...)

      Example 1:
        This is a function intended to be used with the chain Transformation pattern with the quinn library
  """
  ADP_log_info(process, logger_name, level_action, log_level, "BEGIN", sys._getframe().f_code.co_name)
  def inner(df):
    """Validate Farmatic File Name to distribute it accordingly

        Parameters:
        Dataframe df      -- Dataframe with the files to validate; it must hold the
                             path/name columns and a "last_modified" column

      Return:
        Dataframe -- new dataframe with the sliced file name fields, one "<field>_ok"
                     column per check, "file_date", "file_name_ok" and "file_timestamp"
    """
    #Who                 When           What
    #Victor Salesa       02/10/2018     Initial version
    #Victor Salesa       08/11/2018     Removed file_id column(not necessary timestamp will go in filename)
    #                                   Added formatted_timestamp,file_timestamp and file_destname to the file dataframe for filename calculation
    #Victor Salesa       11/01/2019     Added debug parameter
    #Victor Salesa       14/02/2019     Removed file archiving as we are archiving as ZIP
    #Victor Salesa       14/03/2019     Removed any reference to timestamp generation except the new coming from file_date
    #Victor Salesa       15/03/2018     Included file_date validation 
    #Ana Perez           28/03/2019     Included log managment and exception managment
    #Victor Salesa       01/04/2019     Splitted Function in two in order to be able to validate without distributing
    #Victor Salesa       01/04/2019     Redistributed code between ValidatePharmaticFiles and DistributePharmaticFiles
    #Victor Salesa       10/04/2019     ValidatePharmaticFiles: Updated getFirstLine function to use ISO-8859-1 encoding
    #Ana Perez           16/04/2019     Modify log managment and exception managment
    try:
      ADP_log_info(process, logger_name, level_action, log_level, "BEGIN", sys._getframe().f_code.co_name)

      ADP_log_debug(process, logger_name, level_action, log_level, "Start ValidatePharmaticFiles", sys._getframe().f_code.co_name)

      # Every header field except "FileDate" is discarded once the header is sliced
      header_cols_to_drop = __SP_HEADER_COLUMN_NAMES__.copy()
      header_cols_to_drop.remove("FileDate")

      ADP_log_info(process, logger_name, level_action, log_level, "END: After this line it will prepare and validate files", sys._getframe().f_code.co_name)

      # Stage 1: slice the file name into its fixed-width fields
      name_df = df.transform(SliceDFColumn(nameColName,__SP_FILENAME_COLUMN_NAMES__,__SP_FILENAME_LENGHTS__))

      # Stage 2: numeric fields must be all digits and have the exact expected length
      digit_fields = (("pharmacy_unique_code",5),("spec_version",3),("release_version",2))
      for field_name,expected_length in digit_fields:
        field_is_valid = (udf_isDigit_sql(field_name)==True) & (length(field_name)==expected_length)
        name_df = name_df.withColumn(field_name+"_ok",field_is_valid)
      #end for field_name,expected_length

      # Stage 3: date and catalogue checks on the remaining name fields
      name_df = name_df.withColumn("data_date_ok",udf_isDate_sql("data_date",lit(__YYYMMDD__)))
      name_df = name_df.withColumn("origin_ok",col("origin").isin(list(file_origins.keys())))
      name_df = name_df.withColumn("file_type_ok",col("file_type").isin(list(file_types.keys())))

      # Stage 4: get file date column from file by slicing first file line data
      header_df = name_df.withColumn("file_header",getFirstLine(pathColName,lit("ISO-8859-1")))
      header_df = header_df.transform(SliceDFColumn("file_header",__SP_HEADER_COLUMN_NAMES__,__SP_HEADER_LENGHTS__))
      header_df = header_df.drop("file_header").drop(*header_cols_to_drop)
      header_df = header_df.withColumnRenamed("FileDate","file_date")
      header_df = header_df.withColumn("file_date_ok",udf_isDate_sql("file_date",lit(__YYYYMMDDhhmmss__)))

      # Stage 5: the file name is valid only when every individual check passed
      all_checks_ok = expr("pharmacy_unique_code_ok==True and spec_version_ok==True and release_version_ok=True and data_date_ok=True and origin_ok=True and file_type_ok=True and file_date_ok=True")
      validated_df = header_df.withColumn("file_name_ok",all_checks_ok)

      # Stage 6: file_timestamp is the parsed file_date, falling back to last_modified
      validated_df = validated_df.withColumn("temp_timestamp",to_timestamp("file_date", "yyyyMMddHHmmss"))
      timestamp_as_text = coalesce(col("temp_timestamp").cast(StringType()),col("last_modified").cast(StringType()))
      validated_df = validated_df.withColumn("file_timestamp",timestamp_as_text)

      return validated_df.drop("temp_timestamp")
    except Exception as err:
      ADP_log_exception(process, logger_name, level_action, log_level,  "", sys._getframe().f_code.co_name,  sys.exc_info())
      raise Exception(err)

  ADP_log_info(process, logger_name, level_action, log_level, "END", sys._getframe().f_code.co_name)
  return inner