In [1]:
%run "./ADP_Farmatic_Def"

In [2]:
#################################################################################
""" FTP Files Process Functions

"""
 #Who                 When           What
 #Victor Salesa       12/12/2018     Initial Version
 #Victor Salesa       21/01/2019     Moved patch - depatch ftp lib to functions
 #Victor Salesa       19/02/2019     Change code to retrieve zip password as a parameter
 #Victor Salesa       04/03/2019     Added GPR & GST Housekeeping function to delete all files not GSL
 #Victor Salesa       12/03/2019     udf_unzip: Change unzip process to include ZIP file suffix
 #Victor Salesa       13/03/2019     DeleteAllButSelloutFilesFromLanding: Fix function to select files to delete locating 'GSL' string
 #Victor Salesa       22/03/2019     FMT_FTPTriggerFilesDownload: Initial Version
 #Ana Perez           26/03/2019     Included log managment and exception managment
 #Victor Salesa       04/03/2019     created UnzipPharmaticFilesFromIngestion,MoveUncompressedFilesToArchive,StoreUnzipResultsToCSV 
 #                                   for better code understanding and isolated Exception and Error control of the different blocks.
 #Ana Perez           03/05/2019     Split ADP_Farmatic_Ingestion_Process in ADP_FTP and ADP_UNZIP 
 #Victor Salesa       07/05/2019     Rename DeleteAllButSelloutFilesFromLanding to  DeleteFilesFromLandingExcludingPattern and add patterns parameter
 #Victor Salesa       07/05/2019     Moved DeleteFilesFromLandingExcludingPattern to ADP_Unzip
################################################################################

import socket
from pyspark.sql import Row
from dateutil import parser
from pyspark.sql.functions import *
from pyspark.sql.types import *
from shutil import copyfile
from datetime import datetime

from pyspark.sql.functions import *
from pyspark.sql.types import *
from shutil import *
#import zipfile

from ftplib import FTP_TLS
from ftplib import FTP
import os
from multiprocessing.pool import ThreadPool
import inspect

###################################################################################  
def __patch_ftp_lib__():
  """Replace FTP_TLS.makepasv so passive connections target the peer address.

    The replacement calls the previous makepasv to obtain the passive port and
    substitutes the host it returned with the address reported by
    self.sock.getpeername() (the address of the control connection).

    Return:
       The makepasv function that was installed before patching
    Raises:
       Exception: wrapping any error raised while patching
  """
  #Who                 When           What
  #Victor Salesa       xx/xx/2018     Initial version
  #Ana Perez           27/03/2019     Included log managment and exception managment
  #Ana Perez           06/05/2019     Change parameters of ADP_log functions by generic variables
  try:
    original_makepasv = FTP_TLS.makepasv

    def _new_makepasv(self):
      try:
        # Only the port announced by the server is kept; the host is discarded
        _, passive_port = original_makepasv(self)
        peer_host = self.sock.getpeername()[0]
        return peer_host, passive_port
      except Exception as err:
        ADP_log_exception(process, logger_name, level_action, log_level,  "", sys._getframe().f_code.co_name,  sys.exc_info())
        raise Exception(err)

    FTP_TLS.makepasv = _new_makepasv
  except Exception as err:
    ADP_log_exception(process, logger_name, level_action, log_level,  "", sys._getframe().f_code.co_name,  sys.exc_info())
    raise Exception(err)
  return original_makepasv

###################################################################################  
  
#Patch FTP lib to avoid cross-network issue (uses peer ip instead of "internal ip")
# The marker variable makes sure the patch is applied only once per session,
# even when this notebook is executed several times.
if '__ftplibpatched__' in locals() or '__ftplibpatched__' in globals():
  ADP_log_info("ADP", "ADP_FTP", "DEFAULT", "INFO", "ftplib already patched", sys._getframe().f_code.co_name)
else:
  ADP_log_info("ADP", "ADP_FTP", "DEFAULT", "INFO", "patching ftplib", sys._getframe().f_code.co_name)
  __patch_ftp_lib__()
  __ftplibpatched__ = 1
  
  
###################################################################################  

def FMT_GenerateFilesDownloadResultCsvAndUpload(results_download_df,debug=False, suffix_file_name="FMT"):
  """Save the download results as a timestamped CSV file and upload it to the FTP.

    The dataframe is written with Spark as a single-part, pipe separated CSV
    folder under __INGESTION_BASE_PATH__. The part file inside that folder is
    copied to the 'files_log' folder with a timestamped name and that copy is
    uploaded with upload_ftp_raw.

      Parameters:
        results_download_df: Dataframe with the download results. The columns
                             "size" and "downloaded_size" are read to compute "pending_size"
        debug: not used
        suffix_file_name: suffix included in the names of the generated folder and files
    Return:
       None
    Raises:
       Exception: wrapping any error raised while writing, copying or uploading
    Example:
       FMT_GenerateFilesDownloadResultCsvAndUpload(results_download_df, suffix_file_name="MDM_SP")
  """
  #Who                 When           What
  #Victor Salesa       xx/xx/2018     Initial version
  #Ana Perez           26/03/2019     Included log managment and exception managment
  #Ana Perez           06/05/2019     Included suffix_file_name to be used by several process (FMT, MDM_SP, ...)

  try:
    fn_name = sys._getframe().f_code.co_name
    ADP_log_info(process, logger_name, level_action, log_level, "BEGIN", fn_name)

    ADP_log_debug(process, logger_name, level_action, log_level, "Save results to csv", fn_name)

    csv_folder = __INGESTION_BASE_PATH__+'files_downloaded_' +suffix_file_name+ '.csv'

    (results_download_df
       .withColumn("size",column("size").cast(LongType()))
       .withColumn("pending_size",column("size")-column("downloaded_size"))
       .coalesce(1)
       .write
       .option("sep","|")
       .option("header", True)
       .option("QuoteMode", "NONE")
       .option("charset", "utf-8")
       .mode('overwrite')
       .csv(csv_folder)
    )

    ADP_log_debug(process, logger_name, level_action, log_level, "List raw csv inside spark csv folder", fn_name)

    # Only a name ending in '.csv' is the data file; a plain substring match
    # could also pick companion files such as '*.csv.crc'.
    part_files = [entry for entry in dbutils.fs.ls(csv_folder + '/') if entry.name.endswith('.csv')]
    if not part_files:
      raise FileNotFoundError("No csv part file found in " + csv_folder)

    # Local (FUSE) path of the part file so it can be copied with shutil
    src_path = part_files[0].path.replace('dbfs:/','/dbfs/')

    filename_tst = datetime.now().strftime("%Y%m%d%H%M%S")

    ADP_log_debug(process, logger_name, level_action, log_level, "filename_tst is: " + suffix_file_name + '_' + filename_tst, fn_name)

    filename = 'files_downloaded_' + suffix_file_name+ '_' + filename_tst+'.csv'
    file_path = '/dbfs'+__INGESTION_BASE_PATH__+'files_log/' + filename

    ADP_log_debug(process, logger_name, level_action, log_level, "file_path is: " + file_path, fn_name)

    ADP_log_debug(process, logger_name, level_action, log_level, "filename is: " + filename, fn_name)

    ADP_log_debug(process, logger_name, level_action, log_level, "src_path is: " + src_path, fn_name)

    copyfile(src_path, file_path)

    ADP_log_debug(process, logger_name, level_action, log_level, "Timestamp csv file: ", fn_name)

    upload_ftp_raw(__FARMATIC_FTP_HOST__,__FARMATIC_FTP_USER__,__FARMATIC_FTP_PASSWD__,filename,file_path)

    ADP_log_info(process, logger_name, level_action, log_level, "END", fn_name)
  except Exception as err:
    ADP_log_exception(process, logger_name, level_action, log_level,  "Generate Download Result Csv has failed with Error Message:", sys._getframe().f_code.co_name,  sys.exc_info())
    raise Exception(err)
  
###################################################################################  
  
def FMT_FTPTriggerFilesDownload(debug=False):
  """Download in parallel every ZIP file listed in the Farmatic FTP.

    The FTP listing is converted to rows, the destination path of each file is
    built under '/dbfs' + __PHARMATIC_INGESTION_BASE_PATH__ and the files whose
    name contains '.ZIP' or '.zip' are downloaded with a pool of 32 threads
    through download_and_delete_ftp_raw.

    If an error happens once the download results are available, the results
    are saved and uploaded with FMT_GenerateFilesDownloadResultCsvAndUpload
    before the error is raised.

      Parameters:
        debug: not used
    Return:
       Dataframe: Dataframe with the download results, or None when there is nothing to download
    Raises:
       Exception: wrapping any error raised during the process
    Example:
       results_download_df = FMT_FTPTriggerFilesDownload()
  """
  #Who                 When           What
  #Victor Salesa       22/03/2019     Initial version
  #Ana Perez           26/03/2019     Included log managment and exception managment
  #Victor Salesa       02/04/2019     Added total files downloaded control
  fn_name = sys._getframe().f_code.co_name
  results_download = None

  def _warn_if_incomplete(results_df):
    #Check all files were downloaded correctly
    downloaded_ok = results_df.filter(col("download_ok")==True).count()
    total_files_to_download = results_df.count()
    if downloaded_ok < total_files_to_download:
      ADP_log_warning(process, logger_name, level_action, log_level, "END Not all files were downloaded correctly", fn_name)

  try:
    ADP_log_info(process, logger_name, level_action, log_level, "BEGIN", fn_name)

    ADP_log_debug(process, logger_name, level_action, log_level, "Get List of Files", fn_name)

    #Get Listing of files in ftp
    listing = list_ftp_raw(__FARMATIC_FTP_HOST__,__FARMATIC_FTP_PORT__,__FARMATIC_FTP_USER__,__FARMATIC_FTP_PASSWD__)

    #Convert Listing to pyspark rows. file_listing_to_row returns None for the
    #lines it cannot parse, and those cannot be loaded into a dataframe.
    files_rows = [row for row in map(file_listing_to_row, listing) if row is not None]

    if not files_rows:
      ADP_log_warning(process, logger_name, level_action, log_level, "END No files found", fn_name)
      return None

    ADP_log_debug(process, logger_name, level_action, log_level, "Build the destination path", fn_name)

    #Build file destination path
    ftp_files_df = (spark.createDataFrame(files_rows)
                    .withColumn("path",concat(lit('/dbfs'+__PHARMATIC_INGESTION_BASE_PATH__),col("name")))
    )

    ADP_log_debug(process, logger_name, level_action, log_level, "Create DF", fn_name)

    #Just take zip files
    zip_files = [fr for fr in ftp_files_df.collect() if '.ZIP' in fr.name or '.zip' in fr.name]

    ADP_log_debug(process, logger_name, level_action, log_level, "Filter ZIP Files", fn_name)

    if not zip_files:
      #An empty result list cannot be converted to a dataframe
      ADP_log_warning(process, logger_name, level_action, log_level, "END No files found", fn_name)
      return None

    parameter_list = [
      (__FARMATIC_FTP_HOST__,__FARMATIC_FTP_PORT__,__FARMATIC_FTP_USER__,__FARMATIC_FTP_PASSWD__,fr.size,fr.time,fr.name,fr.path)
      for fr in zip_files
    ]

    #Configure spark scheduler mode
    spark.conf.set("spark.scheduler.mode",'FAIR')

    ADP_log_debug(process, logger_name, level_action, log_level, "Start Download", fn_name)

    #Run thread pool with the list of files to be downloaded; the pool is
    #always released, also when a worker raises
    threadPool = ThreadPool(32)
    try:
      results_download = threadPool.starmap(download_and_delete_ftp_raw, parameter_list)
    finally:
      threadPool.close()
      threadPool.join()

    ADP_log_debug(process, logger_name, level_action, log_level, "Create DF", fn_name)

    results_download_df = spark.createDataFrame(results_download)
    _warn_if_incomplete(results_download_df)

    ADP_log_info(process, logger_name, level_action, log_level, "END", fn_name)
    return results_download_df

  except Exception as err:
    #Log the original error first so it is never hidden by the salvage step
    ADP_log_exception(process, logger_name, level_action, log_level,  "Download has failed with Error Message:", fn_name,  sys.exc_info())
    if results_download:
      try:
        results_download_df = spark.createDataFrame(results_download)
        _warn_if_incomplete(results_download_df)
        FMT_GenerateFilesDownloadResultCsvAndUpload(results_download_df)
      except Exception:
        ADP_log_exception(process, logger_name, level_action, log_level,  "Saving partial download results has failed with Error Message:", fn_name,  sys.exc_info())
    raise Exception(err)
  
###################################################################################  

def file_listing_to_row(file):
  """Generates a Dataframe Row from one line of a ftp file listing

    The line is split on whitespace: the 5th token is taken as the size, the
    6th to 8th tokens as the modification time and the rest of the line as
    the file name (so names containing spaces are kept complete).

      Parameters:
        file = string value containing 1 line file listing
    Return:
         pyspark.sql.Row with the fields name, size (string as listed) and time (parsed with dateutil),
         or None when the line cannot be parsed (the error is logged)
    Example:
         #Get Listing of files in ftp
            files = list_ftp_raw(host,port,user,password)
        #Convert Listing to pyspark rows
            files_rows = list(map(lambda file: file_listing_to_row(file),files))
  """
  #Who                 When           What
  #Victor Salesa       12/12/2018     Initial version
  #Ana Perez           26/03/2019     Included log managment and exception managment
  try:
    # Split only the first 8 columns: everything after them is the file name,
    # which may itself contain spaces.
    tokens = file.split(maxsplit = 8)
    # The last piece of a limited split keeps its trailing whitespace
    name = tokens[8].rstrip()
    size = tokens[4]
    time = parser.parse(" ".join(tokens[5:8]))
    return Row(name=name,size=size,time=time)
  except Exception as err:
    ADP_log_exception(process, logger_name, level_action, log_level,  "", sys._getframe().f_code.co_name,  sys.exc_info())
    return None
  
###################################################################################  

def getSize(fileobject):
  """Returns the size of an open file by moving its cursor to the end

    The cursor of the handle is left at the end of the file after the call.

      Parameters:
        fileobject = open file handle supporting seek and tell
    Return:
         position reported at the end of the file, or 0 when it cannot be
         obtained (the error is logged)
    Example:
      with open(path, 'rb') as fhandle:
        source_size=getSize(fhandle)

  """
  #Who                 When           What
  #Victor Salesa       12/12/2018     Initial version
  #Ana Perez           26/03/2019     Included log managment and exception managment
  try:
    fileobject.seek(0, os.SEEK_END)
    return fileobject.tell()
  except Exception as err:
    ADP_log_exception(process, logger_name, level_action, log_level,  "", sys._getframe().f_code.co_name,  sys.exc_info())
    return 0

###################################################################################  
  
@udf
def download_and_delete_ftp(host,port,user,password,size,time,file,path):
  """udf wrapper around download_and_delete_ftp_raw

    Every argument is forwarded unchanged; ftp_path is not passed, so
    download_and_delete_ftp_raw uses its default value for it.

      Parameters:
        Please see download_and_delete_ftp_raw
    Return:
         Please see download_and_delete_ftp_raw
    Example:
         Please see download_and_delete_ftp_raw
  """
  #Who                 When           What
  #Victor Salesa       12/12/2018     Initial version
  download_result = download_and_delete_ftp_raw(
    host=host,
    port=port,
    user=user,
    password=password,
    size=size,
    time=time,
    file=file,
    path=path,
  )
  return download_result

###################################################################################  

def list_ftp_raw(host,port,user,password, ftp_path = ""):
  """Lists a directory of the ftp (FTP over TLS) detailed by the below parameters

    The connection is always released, also when one of the ftp commands fails.

      Parameters:
        host: name of the ftp host
        port: port of the ftp host
        user: string containing user to login to the ftp host
        password: string containing password to login to the ftp host
        ftp_path: string containing the path in the ftp where the files to be processed are (relative to the root folder of the FTP)
    Return:
         List of strings, one per line returned by the LIST command.
         An empty list is returned when any error happens (the error is logged)
    Example:
         ftp_path = '/masterdata'
         files = list_ftp_raw(host,port,user,password, ftp_path)
  """
  #Who                 When           What
  #Victor Salesa       12/12/2018     Initial version
  #Ana Perez           26/03/2019     Included log managment and exception managment
  #Ana Perez           02/05/2019     Included new ftp_path parameter

  try:
    ADP_log_info(process, logger_name, level_action, log_level, "BEGIN", sys._getframe().f_code.co_name)

    files = []
    # The context manager sends QUIT / closes the socket on exit, so the
    # connection does not leak when a command raises.
    with FTP_TLS() as ftps:
      ftps.connect(host, port)
      ftps.auth()
      ftps.login(user, password)
      ftps.set_pasv(True)
      ftps.prot_p()
      ftps.cwd(ftp_path)
      ftps.retrlines('LIST',files.append)

    ADP_log_info(process, logger_name, level_action, log_level, "END", sys._getframe().f_code.co_name)
    return files
  except Exception as err:
    ADP_log_exception(process, logger_name, level_action, log_level,  "", sys._getframe().f_code.co_name,  sys.exc_info())
    return []
  
##################################################################################  
def upload_ftp_raw(host,user,password,filename,path,port=21,debug=False):
  """Uploads a file to the root folder of a ftp

      Parameters:
        host: string containing name of the ftp host
        user: string containing user to login to the ftp host
        password: string containing password to login to the ftp host
        filename: name the file will have in the ftp.
        path: full local path of the file to be uploaded
        port: port of the ftp host (21 by default)
        debug: not used
    Return:
       Boolean: True when the file has been uploaded, False when the source file does not exist
    Raises:
       Exception: wrapping any error raised while connecting or uploading
    Example:
         uploaded = upload_ftp_raw(host,user,password,filename,path)
  """
  #Who                 When           What
  #Victor Salesa       xx/xx/2018     Initial version
  #Ana Perez           26/03/2019     Included log managment and exception managment

  fn_name = sys._getframe().f_code.co_name
  try:
    ADP_log_info(process, logger_name, level_action, log_level, "BEGIN", fn_name)

    if not os.path.isfile(path):
      ADP_log_debug(process, logger_name, level_action, log_level, "Source File does not exist", fn_name)
      ADP_log_info(process, logger_name, level_action, log_level, "END", fn_name)
      return False

    source_size = os.path.getsize(path)
    ADP_log_debug(process, logger_name, level_action, log_level, "Source Size: "+str(source_size), fn_name)

    # The context manager closes the connection on success and on failure, so
    # the error handler never touches a connection that was not created.
    with FTP() as ftp:
      ftp.connect(host, int(port))
      ftp.login(user, password)
      ftp.cwd('/')
      # A single handle is used for the transfer and closed right after it
      with open(path, 'rb') as fhandle:
        ftp.storbinary( "STOR " +filename,fhandle)
      uploaded_size = ftp.size(filename)
      ADP_log_debug(process, logger_name, level_action, log_level, "Uploaded Size: "+str(uploaded_size), fn_name)

    ADP_log_info(process, logger_name, level_action, log_level, "END", fn_name)
    return True
  except Exception as err:
    ADP_log_exception(process, logger_name, level_action, log_level,  "Upload Resulting Csv has failed with Error Message: " + filename, fn_name,  sys.exc_info())
    raise Exception(err)

################################################################################## 

def download_and_delete_ftp_raw(host,port,user,password,size,time,file,path, ftp_path=""):
  """Download a single file from a ftp path (FTP over TLS)

    The function never raises: any error is logged and reported in the
    returned Row, so a failing file does not stop the download of the rest.
    The deletion of the file in the ftp after the download is currently disabled.

      Parameters:
        host: name of the ftp host
        port: port of the ftp host
        user: string containing user to login to the ftp host
        password: string containing password to login to the ftp host
        size: size of the file as listed by the ftp (compared with the downloaded size)
        time: time of the file as listed by the ftp (only copied to the result)
        file: name of the file in the ftp
        path: full local path where the file is written
        ftp_path: path in the ftp where the file is located
    Return:
         Row with the fields size, time, file, path, ftp_path, download_ok and downloaded_size.
         download_ok is True when the transfer ended without any error and False otherwise.
    Example:
         download_row = download_and_delete_ftp_raw(host,port,user,password,size,time,file,path)
  """
  #Who                 When           What
  #Victor Salesa       12/12/2018     Initial version
  #Ana Perez           26/03/2019     Included log managment and exception managment
  #Victor Salesa       28/03/2019     Corrected 
  #Ana Perez           02/05/2019     Included new ftp_path parameter

  fn_name = sys._getframe().f_code.co_name

  #Everything the error handler needs is created before the try block, so the
  #handler can always build the failure row
  result = {'size': size, 'time': time, 'file': file, 'path': path, 'ftp_path': ftp_path}
  downloaded_size = 0

  try:
    #Set up download thread connection; the context manager releases the
    #connection on success and on failure
    with FTP_TLS() as ftps:
      ftps.connect(host, port)
      ftps.auth()
      ftps.login(user, password)
      ftps.set_pasv(True)
      ftps.prot_p()
      ftps.cwd(ftp_path)

      #Set up download file
      with open(path, 'wb') as fhandle:
        ftps.retrbinary('RETR ' + file, fhandle.write)
        downloaded_size=getSize(fhandle)

      if downloaded_size==int(size):
        ADP_log_debug(process, logger_name, level_action, log_level, str(file), fn_name)
        #Deletion of the remote file is disabled: ftps.delete(file)
      #NOTE(review): download_ok stays True when the sizes differ - confirm this is intended

    result_true = dict(result, download_ok=True, downloaded_size=downloaded_size)
    ADP_log_debug(process, logger_name, level_action, log_level, str(result_true), fn_name)
    return Row(**result_true)
  except Exception as err:
    ADP_log_exception(process, logger_name, level_action, log_level,  str(file) + " generate an issue but it continue with the next file:", fn_name,  sys.exc_info())
    result_false = dict(result, download_ok=False, downloaded_size=downloaded_size)
    return Row(**result_false)
  
 ####################################################################################################################################################   

  
def MDM_SP_FTPTriggerFilesDownload(debug=False):
  """Download in parallel every file listed in the MDM SP folder of the FTP.

    The listing of __MASTERDATA_FTP_FOLDER__ is converted to rows, the
    destination path of each file is built under
    '/dbfs' + __MASTER_DATA_INGESTION_BASE_PATH__ and all the files are
    downloaded with a pool of 32 threads through download_and_delete_ftp_raw.

    If an error happens once the download results are available, the results
    are saved and uploaded with FMT_GenerateFilesDownloadResultCsvAndUpload
    before the error is raised.

      Parameters:
        debug: not used
    Return:
       Dataframe: Dataframe with the download results, or None when there is nothing to download
    Raises:
       Exception: wrapping any error raised during the process
    Example:
       results_download_df = MDM_SP_FTPTriggerFilesDownload()
  """
  #Who                 When           What
  #Ana Perez           02/05/2019     First Version
  fn_name = sys._getframe().f_code.co_name
  results_download = None

  def _warn_if_incomplete(results_df):
    #Check all files were downloaded correctly
    downloaded_ok = results_df.filter(col("download_ok")==True).count()
    total_files_to_download = results_df.count()
    if downloaded_ok < total_files_to_download:
      ADP_log_warning(process, logger_name, level_action, log_level, "END Not all files were downloaded correctly", fn_name)

  try:
    ADP_log_info(process, logger_name, level_action, log_level, "BEGIN", fn_name)

    ADP_log_debug(process, logger_name, level_action, log_level, "Get List of Files", fn_name)

    #Get Listing of files in ftp
    listing = list_ftp_raw(__FARMATIC_FTP_HOST__,__FARMATIC_FTP_PORT__,__FARMATIC_FTP_USER__,__FARMATIC_FTP_PASSWD__, ftp_path=__MASTERDATA_FTP_FOLDER__)

    #Convert Listing to pyspark rows. file_listing_to_row returns None for the
    #lines it cannot parse, and those cannot be loaded into a dataframe.
    files_rows = [row for row in map(file_listing_to_row, listing) if row is not None]

    if not files_rows:
      ADP_log_warning(process, logger_name, level_action, log_level, "END No files found", fn_name)
      return None

    ADP_log_debug(process, logger_name, level_action, log_level, "Build the destination path", fn_name)

    #Build file destination path
    ftp_files_df = (spark.createDataFrame(files_rows)
                    .withColumn("path",concat(lit('/dbfs'+__MASTER_DATA_INGESTION_BASE_PATH__),col("name")))
    )

    ADP_log_debug(process, logger_name, level_action, log_level, "Create DF", fn_name)

    #Collect Files
    files = ftp_files_df.collect()

    parameter_list = [
      (__FARMATIC_FTP_HOST__,__FARMATIC_FTP_PORT__,__FARMATIC_FTP_USER__,__FARMATIC_FTP_PASSWD__,fr.size,fr.time,fr.name,fr.path, __MASTERDATA_FTP_FOLDER__)
      for fr in files
    ]

    #Configure spark scheduler mode
    spark.conf.set("spark.scheduler.mode",'FAIR')

    ADP_log_debug(process, logger_name, level_action, log_level, "Start Download", fn_name)

    #Run thread pool with the list of files to be downloaded; the pool is
    #always released, also when a worker raises
    threadPool = ThreadPool(32)
    try:
      results_download = threadPool.starmap(download_and_delete_ftp_raw, parameter_list)
    finally:
      threadPool.close()
      threadPool.join()

    ADP_log_debug(process, logger_name, level_action, log_level, "Create DF", fn_name)

    results_download_df = spark.createDataFrame(results_download)
    _warn_if_incomplete(results_download_df)

    ADP_log_info(process, logger_name, level_action, log_level, "END", fn_name)
    return results_download_df

  except Exception as err:
    #Log the original error first so it is never hidden by the salvage step
    ADP_log_exception(process, logger_name, level_action, log_level,  "Download has failed with Error Message:", fn_name,  sys.exc_info())
    if results_download:
      try:
        results_download_df = spark.createDataFrame(results_download)
        _warn_if_incomplete(results_download_df)
        #NOTE(review): the results are saved with the default "FMT" suffix - confirm whether "MDM_SP" is expected here
        FMT_GenerateFilesDownloadResultCsvAndUpload(results_download_df)
      except Exception:
        ADP_log_exception(process, logger_name, level_action, log_level,  "Saving partial download results has failed with Error Message:", fn_name,  sys.exc_info())
    raise Exception(err)
  
###################################################################################  