In [1]:
%run "./ADP_Farmatic_Def"

In [2]:
#################################################################################
""" Unzip Files Process Functions

"""
 #Who                 When           What
 #Victor Salesa       12/12/2018     Initial Version
 #Victor Salesa       21/01/2019     Moved patch - depatch ftp lib to functions
 #Victor Salesa       19/02/2019     Change code to retrieve zip password as a parameter
 #Victor Salesa       04/03/2019     Added GPR & GST Housekeeping function to delete all files not GSL
 #Victor Salesa       12/03/2019     udf_unzip: Change unzip process to include ZIP file suffix
 #Victor Salesa       13/03/2019     DeleteAllButSelloutFilesFromLanding: Fix function to select files to delete locating 'GSL' string
 #Victor Salesa       22/03/2019     FMT_FTPTriggerFilesDownload: Initial Version
 #Ana Perez           26/03/2019     Included log management and exception management
 #Victor Salesa       04/03/2019     created UnzipPharmaticFilesFromIngestion,MoveUncompressedFilesToArchive,StoreUnzipResultsToCSV 
 #                                   for better code understanding and isolated Exception and Error control of the different blocks.
 #Ana Perez           03/05/2019     Split ADP_Farmatic_Ingestion_Process in ADP_FTP and ADP_UNZIP
 #Victor Salesa       07/05/2019     Rename DeleteAllButSelloutFilesFromLanding to  DeleteFilesFromLandingExcludingPattern and add patterns parameter
################################################################################

import socket
from pyspark.sql import Row
from dateutil import parser
from pyspark.sql.functions import *
from pyspark.sql.types import *
from shutil import copyfile
from datetime import datetime

from pyspark.sql.functions import *
from pyspark.sql.types import *
from shutil import *
import zipfile

###################################################################################  

@udf
def udf_unzip(zip_file,output_path,passwd=None):
  """Unzip a single zip file into output_path from inside a Spark SQL pipeline.

      Every member of the archive is extracted directly into output_path. Members
      whose name contains ".TXT" are renamed to carry the suffix of the zip file:
      the text after the last "_" of the zip file name (upper-cased, without
      ".ZIP"), or "_0101" when the zip file name contains no "_".
      E.g. member "FILE.TXT" of "ABC_0102.ZIP" is written as "FILE_0102.TXT".

      Parameters:
        zip_file: Complete local path of the zip file
        output_path: Output folder for the uncompressed files. It is concatenated
                     as-is with the member name, so it must end with a path separator
        passwd: Password of the zip file (str), or None if the archive is not encrypted
      Return:
        1  if every member was unzipped ok
        0  if anything failed (the error itself is swallowed)
      Example:
        zip_list = (sc.parallelize(dbutils.fs.ls("/mnt/blob/ingestion/pharmatic")).toDF()
                      .filter(col("name")
                      .substr(-3,3)=='ZIP')
                      .select(regexp_replace("path", 'dbfs:', '/dbfs').alias("path"),col("name"),col("size"))
                      .withColumn("uncompress_result",udf_unzip("path",lit("/dbfs/mnt/blob/landing/pharmatic/")))
        )

  """
  #Who                 When           What
  #Victor Salesa       12/12/2018     Initial version
  #Victor Salesa       19/02/2018     Change code to include zip passwd as a parameter
  #Victor Salesa       12/03/2019     Change unzip process to include ZIP file suffix
  #Ana Perez           26/03/2019     Included log managment and exception managment

  # Function-scope imports keep the block self-contained
  import os
  import shutil

  try:
    # zipfile expects the password as bytes; None means "no password", so it
    # must not be encoded (the previous code failed on the default passwd=None)
    pwd_bytes = passwd.encode("UTF-8") if passwd is not None else None

    # Derive the historical-files suffix from the file name only, so that an
    # underscore in a directory name cannot leak into the suffix
    zip_name_parts = os.path.basename(zip_file).upper().split('_')

    # If the file has a suffix we take that suffix, otherwise we use _0101 (1 file of 1)
    if len(zip_name_parts) > 1:
      txt_file_sufix = "_" + zip_name_parts[-1].replace(".ZIP","")
    else:
      txt_file_sufix = "_0101"

    with zipfile.ZipFile(zip_file) as myzip:
      for member_name in myzip.namelist():
        # file.TXT --> file+suffix.TXT
        target_path = output_path + member_name.replace(".TXT", txt_file_sufix + ".TXT")
        # Both handles are closed even if the copy fails; copyfileobj streams
        # the data in chunks instead of loading the whole member in memory
        with myzip.open(name=member_name, pwd=pwd_bytes) as member_file, open(target_path, "wb") as extracted_file:
          shutil.copyfileobj(member_file, extracted_file)
    return 1
  except Exception:
    # Any failure is reported through the 0 return value instead of being
    # raised, so callers have to check the result column
    return 0
  
###################################################################################

def UnzipPharmaticFilesFromIngestion(files_list,uncompress_folder,zip_password,debug=True):
  """Unzip the zip files found in files_list into uncompress_folder.

    Only entries whose name ends with 'ZIP' (upper case) are processed. Each one
    is uncompressed with udf_unzip and the outcome is stored in the
    "uncompress_result" column.

    Parameters:
      files_list: List of Rows describing the candidate files. Every element should
                  be a Row with the columns "name","path","size"
                  "name":"name of the file"
                  "path":"full path of the file in dbfs:/"
                  "size":"size of the file"
      uncompress_folder: folder where to place the unzipped files
      zip_password: zip password for the files to be unzipped
      debug: not used by this function
    Return:
       Dataframe: columns "name","path","size","raw_path","uncompress_result",
                  or None when there is no zip file to process or the process fails
  """
  #Who                 When           What
  #Victor Salesa       02/04/2019     Initial version
  function_name = sys._getframe().f_code.co_name
  try:
    ADP_log_info(process, logger_name, level_action, log_level, "BEGIN", function_name)
    ADP_log_debug(process, logger_name, level_action, log_level, "Before unzip zip_list in uncompress folder", function_name)

    # A DataFrame cannot be inferred from an empty list, so report the
    # "nothing to do" case explicitly instead of logging it as a failure
    if not files_list:
      ADP_log_info(process, logger_name, level_action, log_level, "END No zip Files", function_name)
      return None

    #Filter zip files to be uncompressed
    zip_list_df = (sc.parallelize(files_list).toDF()
                    .filter(col("name").substr(-3,3)=='ZIP')
                    .select("name","path","size")
                    .withColumn("raw_path",regexp_replace("path", 'dbfs:', '/dbfs'))
                  )
    #Mark cache in order to cache list and execute
    zip_list_df.cache()

    if zip_list_df.count() == 0:
      ADP_log_info(process, logger_name, level_action, log_level, "END No zip Files", function_name)
      return None

    ADP_log_debug(process, logger_name, level_action, log_level, "Before unzip zip_list in uncompress folder" , function_name)

    zip_list_df = zip_list_df.withColumn("uncompress_result",udf_unzip("raw_path",lit(uncompress_folder),lit(zip_password)))
    ADP_log_debug(process, logger_name, level_action, log_level, "After unzip zip_list in uncompress folder" , function_name)

    #Cache and execute unzipping operation; the count is kept for the log below
    zip_list_df.cache()
    unzipped_count = zip_list_df.count()

    ADP_log_debug(process, logger_name, level_action, log_level, "After cache zip_list_df " + str(unzipped_count), function_name)
    ADP_log_info(process, logger_name, level_action, log_level, "END", function_name)
    return zip_list_df

  except Exception as e:
    # Include the exception details so the failure can be diagnosed from the log
    ADP_log_warning(process, logger_name, level_action, log_level, "END Fail Unzipping files", function_name, sys.exc_info())
    return None
  #end except Exception as e

##################################################################################################################################################################  
  
def MoveUncompressedFilesToArchive(zip_files_list_df,archiving_folder):
  """Move the zip files listed in zip_files_list_df to the archiving folder.

    Parameters:
      zip_files_list_df: DataFrame with the files to be moved. The columns "path"
                         (passed to blob_move_file) and "name" (used to build the
                         archived file path) are read; all columns are kept in the output
      archiving_folder: destination folder. It is concatenated as-is with the file
                        name, so it must end with a path separator
    Return:
       Dataframe: the input columns plus
                    "archived_result": '1' if blob_move_file returned a truthy value, '0' otherwise
                    "archived_file":   'dbfs:' + archiving folder (without 'dbfs:') + file name
                  or None when there is nothing to move or the process fails
  """
  #Who                 When           What
  #Victor Salesa       02/04/2019     Initial version
  function_name = sys._getframe().f_code.co_name
  try:
    ADP_log_info(process, logger_name, level_action, log_level, "BEGIN", function_name)

    # The unzip step returns None when it has nothing to report
    if zip_files_list_df is None:
      ADP_log_warning(process, logger_name, level_action, log_level, "END No files moved", function_name)
      return None

    ADP_log_debug(process, logger_name, level_action, log_level, "Before move zip_files_moved with ", function_name)

    archive_prefix = 'dbfs:'+archiving_folder.replace('dbfs:','')

    def _archive_row(r):
      # Append the two new values as a plain tuple in the same order as the
      # column names given to toDF below. Building them with Row(**kwargs) is
      # avoided because its field order depends on the PySpark version.
      archived_result = '1' if blob_move_file(r.path,archiving_folder) else '0'
      return tuple(r) + (archived_result, archive_prefix+r.name)

    # Use the DataFrame received as parameter (the previous code read a
    # differently named variable that is not defined in this function)
    zip_files_moved = zip_files_list_df.rdd.map(_archive_row).collect()

    # A DataFrame cannot be inferred from an empty list
    if not zip_files_moved:
      ADP_log_warning(process, logger_name, level_action, log_level, "END No files moved", function_name)
      return None

    #Create dataframe with zips moved
    zip_files_moved_df = sc.parallelize(zip_files_moved).toDF(zip_files_list_df.columns + ['archived_result','archived_file'])

    #Cache and materialize the result
    zip_files_moved_df.cache()
    zip_files_moved_df.count()

    ADP_log_debug(process, logger_name, level_action, log_level, "After create zip_files_moved_df ", function_name)
    ADP_log_info(process, logger_name, level_action, log_level, "END", function_name)
    return zip_files_moved_df

  except Exception as e:
    # Include the exception details so the failure can be diagnosed from the log
    ADP_log_warning(process, logger_name, level_action, log_level, "END Fail Moving Zip Files", function_name, sys.exc_info())
    return None
  #end except Exception as e
  
##################################################################################################################################################################  
  
def StoreUnzipResultsToCSV(zip_files_moved_df,ingestion_path):
  """Store the unzip results as a single timestamped CSV file.

    The DataFrame is written with Spark as one partition into the folder
    ingestion_path+'files_unzipped.csv' ("|" separated, with header, overwrite
    mode). The first file of that folder whose name contains 'csv' is then copied
    to '/dbfs'+ingestion_path+'files_log/files_unzipped_<YYYYmmddHHMMSS>.csv'.

    Parameters:
      zip_files_moved_df: DataFrame with the results to be stored
      ingestion_path: base folder used for both the Spark output and the
                      files_log copy. It is concatenated as-is, so it must end
                      with a path separator
    Return:
      None. Failures are logged as a warning and not raised
  """
  #Who                 When           What
  #Victor Salesa       02/04/2019     Initial version
  function_name = sys._getframe().f_code.co_name
  try:
    ADP_log_info(process, logger_name, level_action, log_level, "BEGIN", function_name)

    # Write and read back under the same base path. The previous code wrote
    # under __INGESTION_BASE_PATH__ but looked for the part file under
    # ingestion_path, which only worked when both were equal.
    spark_csv_folder = ingestion_path+'files_unzipped.csv'

    #Create file with unzipped information
    (zip_files_moved_df
         .coalesce(1)
         .write
         .option("sep","|")
         .option("header", True)
         .option("QuoteMode", "NONE")
         .option("charset", "utf-8")
         .mode('overwrite')
         .csv(spark_csv_folder)
      )

    # Retrieve the internal part-*.csv from the "Spark way" csv folder
    part_files = [entry for entry in dbutils.fs.ls(spark_csv_folder+'/') if 'csv' in entry.name]
    # Local fs path of the partition file
    src_path = part_files[0].path.replace('dbfs:/','/dbfs/')

    # Timestamp that marks the unzipping datetime
    filename_tst = datetime.now().strftime("%Y%m%d%H%M%S")

    # Copy the partition file part*.csv to a new file named with the timestamp
    copyfile(src_path, '/dbfs'+ingestion_path+'files_log/files_unzipped_'+filename_tst+'.csv')
    ADP_log_info(process, logger_name, level_action, log_level, "END", function_name)
  except Exception as e:
    ADP_log_warning(process, logger_name, level_action, log_level, "END Fail Generating Unzip results CSV", function_name,  sys.exc_info())
    return None
  #end except Exception as e
  
  
 ####################################################################################################################################################   
  
def DeleteFilesFromLandingExcludingPattern(patterns=__PHARMATIC_FILES_INCLUDED_PATTERN__,debug=False):
  """Delete from the landing folder every file whose name contains none of the patterns.

    The files are listed with blob_ls(__PHARMATIC_LANDING_BASE_PATH__) and each
    distinct path is deleted with blob_delete_file_sql.

    Parameters:
      patterns: iterable of substrings; a file is kept when its name contains at
                least one of them
      debug: not used by this function
    Return:
      None
    Raises:
      Exception: when the sum of the "deleted" column is lower than the number of
                 selected files, or when any other step fails. The original error
                 is logged and wrapped in a plain Exception.
                 NOTE(review): presumably this also covers the case where no file
                 is selected (the log message of the handler suggests so) - confirm.
  """

  #Who                 When           What
  #Victor Salesa       04/03/2019     Initial version
  #Victor Salesa       13/03/2019     Fix function to select files to delete locating 'GSL' string
  #Ana Perez           26/03/2019     Included log managment and exception managment

  function_name = sys._getframe().f_code.co_name
  try:
    ADP_log_info(process, logger_name, level_action, log_level, "BEGIN", function_name)

    ADP_log_debug(process, logger_name, level_action, log_level, "Select Files", function_name)

    #Select files to be deleted
    unzipped_files_not_sellout_selected_df = sc.parallelize(blob_ls(__PHARMATIC_LANDING_BASE_PATH__)).filter(lambda file: all(pattern not in file.name for pattern in patterns)).toDF()

    #Delete the selected files; one row per distinct path with the delete result
    unzipped_files_not_sellout_deleted = (unzipped_files_not_sellout_selected_df.select("path")
                                     .distinct()
                                     .withColumn("deleted",blob_delete_file_sql("path"))
                                     .collect()
    )

    ADP_log_debug(process, logger_name, level_action, log_level, "Delete Files", function_name)

    if len(unzipped_files_not_sellout_deleted)!=0:

      ADP_log_debug(process, logger_name, level_action, log_level, "Get Results", function_name)

      #Mount a dataframe with deleted files result
      unzipped_files_not_sellout_deleted_df = sc.parallelize(unzipped_files_not_sellout_deleted).toDF()
      unzipped_files_not_sellout_deleted_df.cache()

      ADP_log_debug(process, logger_name, level_action, log_level, "Calculate Deleted", function_name)

      #Calculate Deleted Files
      total_deleted = (unzipped_files_not_sellout_deleted_df.agg(sum("deleted").alias("deleted_total")).collect())[0].deleted_total

      ADP_log_debug(process, logger_name, level_action, log_level, " Deleted Count: " + str(total_deleted), function_name)

      #Calculate Total Files
      total_rows = unzipped_files_not_sellout_deleted_df.count()

      ADP_log_debug(process, logger_name, level_action, log_level, " Row Count: " + str(total_rows), function_name)

      #Generate Error if total_deleted < total_rows
      if total_deleted < total_rows:
        ADP_log_debug(process, logger_name, level_action, log_level, "Fail Deleting", function_name)

        not_deleted = unzipped_files_not_sellout_deleted_df.filter(col("deleted")==0).collect()
        # The result rows only carry "path" and "deleted", so report the path
        not_deleted_names = ", ".join(repr(file.path) for file in not_deleted)
        fail_message = "Fail deleting bad files: "+ not_deleted_names
        ADP_log_exception(process, logger_name, level_action, log_level,  fail_message, function_name,  sys.exc_info())
        # Raise with an explicit message: no exception variable is bound at this point
        raise Exception(fail_message)

      ADP_log_debug(process, logger_name, level_action, log_level, "Files Deleted:" + str(unzipped_files_not_sellout_deleted), function_name)

    ADP_log_info(process, logger_name, level_action, log_level, "END", function_name)

  except Exception as err:
    ADP_log_exception(process, logger_name, level_action, log_level,  "Warning: Not Stock or Purchases files found", function_name,  sys.exc_info())
    raise Exception(err)

    ###################################################################################  