In [1]:
%run "./ADP_Logging"

In [2]:
################################################################################
"""General Utils.

"""
 #Who                 When           What
 #Ana Perez           02/10/2018     Initial version
 #Victor Salesa Sanz  18/02/2018     Added saveAsCanonical Function
 #Victor Salesa Sanz  25/02/2018     Unified import libraries and added new
 #Victor Salesa Sanz  29/19/2018     SliceDFColumn: Optimized to avoid using UDF's
 #Victor Salesa Sanz  02/11/2018     Added isDate & isDigit functions optimized for pyspark
 #Victor Salesa Sanz  06/11/2018     Included sync files before executing shutil operations to avoid unsync errors
 #Victor Salesa Sanz  06/11/2018     Added destName to move files functions (copyFile,distributeFile,....)
 #Victor Salesa Sanz  07/11/2018     Add Window functions library
 #Victor Salesa Sanz  15/11/2018     Add pyspark.sql.functions as psf for ParseJSONCols compatibility
 #                                   Add unpivot function
 #                                   Add ParseJSONCols  function
 #                                   Add flatten_struct function
 #Victor Salesa Sanz  20/11/2018     ParseJSONCols: Add debug Paramater to ParseJSONCols
 #                                   ParseJSONCols: Drop Malformed _ERR fields (no data)
 #Victor Salesa Sanz  26/11/2018     Added jaydebeapi library to perform UPDATE operation that it's not allowed with default spark
 #                                   Added urllib library to encode folder path for "update" operations on partitioning    
 #Ana Perez           08/02/2019     Added traceback library and register blob_delete_file_sql
 #Victor Salesa       08/02/2019     Moved blob_delete_file_sql register to UTL_Blob
 #Victor Salesa       11/02/2019     Commented out the quinn library as it was conflicting with list code, and added the specific code from the quinn library we were using
 #                                         import functools that is contained in quinn
 #Victor Salesa       01/03/2019    Create GenDataFrameAsSchema and CopyNullableStateFromSchema to adapt schemas
 #Victor Salesa       02/03/2019    Modify GenDataFrameAsSchema to avoid copying schema
 #Victor Salesa       02/03/2019    Move pyspark import to Utl gen 
 #Victor Salesa       27/02/2019    SaveAsCanonical: Changed function to write to parquet
 #Victor Salesa       06/03/2019    SaveAsCanonical: Changed function to write to hive-parquet
 #Ana Perez           07/03/2019    SaveAsCanonical: Moved save operation from if debug==True inside
 #Victor Salesa       08/03/2019    SaveAsCanonical: Added lower() to check table exists as tables query return names in lower case
 #Victor Salesa       14/03/2019    GetFirstLine: Added function
 #Ana Perez           19/03/2019    Added sys library to exception managment
 #Victor Salesa       28/03/2019    Replace sqlContext.refreshTable with sql.catalog.refreshTable
 #Victor Salesa       04/03/2019    Added callSafe method
 #Victor Salesa       10/04/2019    GetFirstLine: Added charset detection in order to be able to read all files and forcing encode enabled.
 #Victor Salesa       10/04/2019    getEncoding: Initial Version
 #Victor Salesa       14/04/2019    Added getNextJobId,getBasePath,RollbackCanonicalTable
 #Victor Salesa       14/04/2019    SaveAsCanonical: Changed saveAsCanonical to include Rollback adn process Id
 #Victor Salesa       17/04/2019    saveToDB: ADDED ROLLBACK ID
 #Victor Salesa       17/04/2019    created RollbackDBTable
 #Victor Salesa       21/04/2019    getNextJobId: Moved id generation to DB
 #Victor Salesa       21/04/2019    Add StartAdpProcess and EndAdpProcess
 #Victor Salesa       29/04/2019    AdpProcess: Changed code to use CTL.SP_GET_JOB STORED PROCEDURE
################################################################################

from datetime import datetime 
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from functools import *
from pprint import pprint

#Quinn library transform
from pyspark.sql.dataframe import DataFrame
def transform(self,f):
  """Apply ``f`` to ``self`` and return whatever ``f`` returns.

      Parameters:
        self -- object handed to ``f`` (the DataFrame once this is attached as a method)
        f    -- callable taking a single argument

      Return:
        Result of ``f(self)``

      Example:
        df.transform(lambda d: d.limit(10))
  """
  outcome = f(self)
  return outcome
DataFrame.transform = transform
################################################################################

from multiprocessing.pool import ThreadPool
from itertools import chain
import time
import urllib
import traceback
import sys
from pyspark import StorageLevel

#Import functions library as psf for ParseJSONCols compatibility
import pyspark.sql.functions  as psf

import ast  #Abstract Syntax Trees
import os    #execute sh commands
import numpy as np 
import shutil #move files library

from pyspark.sql import Row
import cchardet as chardet
import math
import jaydebeapi
import decimal


#Define general Settings for the app

# Module-wide defaults. None of the four values below is read inside the visible part of
# this notebook; their meaning is inferred from the names only -- TODO confirm against callers.
__PARTITIONS_DEFAULT__      = 50
__DEBUG_DEFAULT__           = False
__SAMPLE_DEFAULT__          = False
__SAMPLE_QUANTITY_DEFAULT__ = 100

# strftime/strptime patterns.
# NOTE(review): the first name is spelled with three Y's (__YYYMMDD__); it is kept as-is
# because other notebooks may reference it.
__YYYMMDD__ = "%Y%m%d"
__YYYYMMDD_p2__ = "%Y-%m-%d"
__YYYYMMDDhhmmss__ = "%Y%m%d%H%M%S"


################################################################################### 

def GetDataFrameAsSchema(df,source_schema,debug=False,copy_schema=True):
  """Project df onto the fields of source_schema, in schema order.

      Every field name in source_schema must exist in df.

      Parameters:
        Dataframe df      -- Dataframe holding (at least) the columns named in source_schema
        source_schema     -- iterable of fields exposing .name and .dataType (e.g. a StructType)
        debug             -- forwarded to CopyNullableStateFromSchema when copy_schema is True
        copy_schema       -- True: cast each column to the schema type and copy the nullable flags
                             otherwise: only select the columns, no cast, no nullable change
      Return:
        Dataframe -- new dataframe containing only the schema columns

      Example:
        df_typed = GetDataFrameAsSchema(df_raw, target_schema)
  """
  cast_to_schema = (copy_schema == True)

  if not cast_to_schema:
    # Plain projection: keep the column order of the schema, leave types untouched
    plain_columns = [col(field.name) for field in source_schema]
    return df.select(*plain_columns)

  # Projection + cast, then align nullability with the schema
  casted_columns = [col(field.name).cast(field.dataType) for field in source_schema]
  df_casted = df.select(*casted_columns)
  return CopyNullableStateFromSchema(df_casted,source_schema,debug=debug)

################################################################################### 

def CopyNullableStateFromSchema(df,source_schema,debug=False):
  """Return a copy of df whose nullable flags follow source_schema.

      For every field of df whose name also appears in source_schema the nullable flag is
      taken from source_schema; all other fields keep their current flag. Names and data
      types are never changed.

      The adapted schema is built from new StructField objects, so the schema object held
      by the input dataframe is left untouched.

      Parameters:
        Dataframe df      -- Dataframe with the proper structure
        source_schema     -- StructType providing the nullable flags (indexed by field name)
        debug             -- print the nullable state before/after and the resulting schema
      Return:
        Dataframe -- new dataframe (rebuilt from df.rdd) with the adapted schema

      Example:
        df_strict = CopyNullableStateFromSchema(df, target_schema)
  """
  #Who                 When           What
  #Victor Salesa       20/11/2018     Initial version
  source_schema_fields = [field.name for field in source_schema]

  adapted_fields = []
  for struct_field in df.schema:
    nullable = struct_field.nullable
    if struct_field.name in source_schema_fields:
      if debug==True:
        print(struct_field.name+" is "+str(struct_field.nullable))
      nullable = source_schema[struct_field.name].nullable
      if debug==True:
        print(struct_field.name+" changes to "+str(nullable))
    # Fresh StructField instead of mutating the one owned by df.schema
    adapted_fields.append(
      StructField(struct_field.name, struct_field.dataType, nullable, struct_field.metadata)
    )

  adapted_schema = StructType(adapted_fields)
  if debug==True:
    print("Schema Modified: "+str(adapted_schema))

  df_mod = spark.createDataFrame(df.rdd, adapted_schema)
  return df_mod

################################################################################### 

def callSafe(object,method,default,*args):
  """Invoke object.method(*args), falling back to a default instead of failing.

    Parameters:
      object:  Object expected to expose the method
      method:  Name of the method to be called
      default: Value returned when the method is not listed in dir(object)
               or when the call raises any Exception
      *args:   Positional arguments forwarded to the method (Optional)

    Return:
       result of calling object.method(*args), or default
  """
  #Who                 When           What
  #Victor Salesa       04/04/2019     Initial version
  try:
    # Guard clause: unknown method name -> default
    if method not in dir(object):
      return default
    bound_method = getattr(object, method)
    # An empty *args simply expands to a call without arguments
    return bound_method(*args)
  except Exception:
    return default
    
  
################################################################################### 
def parseJSONCols(df, *cols, sanitize=True,debug=False):
  """Auto infer the schema of a json column and parse into a struct.
    rdd-based schema inference works if you have well-formatted JSON,
    like ``{"key": "value", ...}``, but breaks if your 'JSON' is just a
    string (``"data"``) or is an array (``[1, 2, 3]``). In those cases you
    can fix everything by wrapping the data in another JSON object
    (``{"data": [1, 2, 3]}``). The ``sanitize`` option (default True)
    automatically performs the wrapping and unwrapping.

    The schema inference is based on https://stackoverflow.com/a/45880574
    Written by Nolan Conaway: https://stackoverflow.com/a/51072232

      Parameters:
      df : pyspark dataframe
          Dataframe containing the JSON cols.
      *cols : string(s)
          Names of the columns containing JSON.
      sanitize : boolean
          Flag indicating whether you'd like to sanitize your records
          by wrapping and unwrapping them in another JSON object layer.
      debug: boolean
          Flag to include debug information or Not. Debug mode collects the
          whole column to the driver once per column in order to print it.

      Return:
      pyspark dataframe
        A dataframe with the decoded columns. A column whose inferred schema is
        empty is dropped when sanitize is True and left untouched otherwise.
        If any step raises, the error is printed and the dataframe as
        transformed up to that point is returned.
  """
  #Who                 When           What
  #Victor Salesa Sanz  15/11/2018     Initial version
  #Victor Salesa Sanz  20/11/2018     Add debug Paramater to ParseJSONCols
  #                                   Drop Malformed _ERR fields (no data)
  ################################################################################

  # Bound before the try so the except branch can always return it
  res = df
  try:
    empty_schema = StructType([])

    if(debug==True):
      print("parseJSONCols: Start Col iteration ")

    for i in cols:

      if(debug==True):
        print("parseJSONCols: Reading col: "+str(i))

      # sanitize if requested: wrap the raw value as {"data": <value>}
      if sanitize:
        res = res.withColumn(
          i,
          psf.concat(psf.lit('{"data": '), i, psf.lit('}'))
        )

      if(debug==True):
        print("parseJSONCols: Read from json string")
        collect = res.rdd.map(lambda x: x[i]).collect()
        print("parseJSONCols: collect:"+str(collect))

      # infer schema; malformed records are dropped during inference
      schema = spark.read.json(res.rdd.map(lambda x: x[i]),mode='DROPMALFORMED').schema
      has_schema = schema!=empty_schema

      # apply the schema only when something could be inferred
      if has_schema:
        if(debug==True):
          print("parseJSONCols: Read Schema")
          print("parseJSONCols: schema:"+str(schema))
          print("parseJSONCols: Convert json to map")

        res = res.withColumn(i, psf.from_json(psf.col(i), schema))

      if(debug==True):
        print("parseJSONCols: Sanitize Unpack")

      # unpack the wrapped object if needed
      if sanitize:
        if has_schema:
          if(debug==True):
            print("parseJSONCols: Sanitize: "+str(i))
          res = res.withColumn(i, psf.col(i).data)
        else:
          if(debug==True):
            print("parseJSONCols: Drop: "+str(i))
          # Drop Error column if not valid
          res = res.drop(i)

    return res
  except Exception as e:
    print('cols cannot be parsed: ' + str(e))
    return res

################################################################################

def unpivot(df, by):
  """Turn every column of df that is not listed in "by" into (key, val) rows.
      Written by Zero323:
          https://stackoverflow.com/a/37865645

        Parameters:
        df -- Dataframe containing columns to unpivot
        by -- list of column names kept as they are (excluded from the unpivot)

        Return:
          Dataframe with the "by" columns plus "key" (former column name) and
          "val" (former column value). If anything fails (no column left to
          unpivot, columns of different types, ...) the error is printed and
          df is returned unchanged.

        Example:
          from pyspark.sql.functions import array, col, explode, struct, lit
          input_df = sc.parallelize([(1, 0.0, 0.6), (1, 0.6, 0.7)]).toDF(["A", "col_1", "col_2"])
          output_df = unpivot(input_df, ["A"])

          input_df show:
            +---+-----+-----+
            |  A|col_1|col_2|
            +---+-----+-----+
            |  1|  0.0|  0.6|
            |  1|  0.6|  0.7|
            +---+-----+-----+

          output_df show
            +---+-----+---+
            |  A|  key|val|
            +---+-----+---+
            |  1|col_1|0.0|
            |  1|col_2|0.6|
            |  1|col_1|0.6|
            |  1|col_2|0.7|
            +---+-----+---+
  """
  #Who                 When           What
  #Victor Salesa       15/11/2018     Initial version
  try:
    # (name, type) pairs of the columns that will be unpivoted
    candidates = [(name, dtype) for (name, dtype) in df.dtypes if name not in by]
    cols, dtypes = zip(*candidates)

    # Spark SQL supports only homogeneous columns
    assert len(set(dtypes)) == 1, "All columns have to be of the same type"

    # One (key, val) struct per column, gathered in an array and exploded into rows
    key_value_structs = [struct(lit(c).alias("key"), col(c).alias("val")) for c in cols]
    kvs = explode(array(key_value_structs)).alias("kvs")

    exploded = df.select(by + [kvs])
    return exploded.select(by + ["kvs.key", "kvs.val"])

  except Exception as e:
    print('Df cannot be unpivoted: ' + str(e))
    return df
  
################################################################################
def flatten_struct(schema, prefix=""):
  """Build the list of column selectors that flattens a (nested) StructType.
      Written by Zz'Rot:
          https://stackoverflow.com/a/46942723

        Parameters:
          schema -- Schema (StructType) whose fields are to be flattened
          prefix -- Dotted path prepended to every field name (used by the recursion)

        Return:
          List of columns, one per leaf field, each aliased with its dotted path.
          On error a message is printed and the selectors collected so far are returned.

        Example:
          df = sc.parallelize([Row(r=Row(a=1, b=Row(foo="b", bar="12")))]).toDF()
          df.show()

          df_expanded = df.select("r.*")
          df_flattened = df_expanded.select(flatten_struct(df_expanded.schema))

          df show:
            +----------+
            |         r|
            +----------+
            |[1,[12,b]]|
            +----------+

          df_flattened show
            +---+-----+-----+
            |  a|b.bar|b.foo|
            +---+-----+-----+
            |  1|   12|    b|
            +---+-----+-----+
  """
  #Who                 When           What
  #Victor Salesa       15/11/2018     Initial version

  try:
    result = []
    for elem in schema:
      qualified_name = prefix + elem.name
      if isinstance(elem.dataType, StructType):
        # Nested struct: recurse with the extended dotted prefix
        result.extend(flatten_struct(elem.dataType, qualified_name + "."))
      else:
        # Leaf field: select it and keep the dotted path as column name
        result.append(col(qualified_name).alias(qualified_name))
    return result
  except Exception as e:
    print('Schema cannot be flattened')
    return result
  
################################################################################  


def udf_isDate_Def(date_text, my_format):
  """Check that a string is a date written exactly in the given strftime format.

      The text is parsed with my_format and formatted back; it is accepted only
      when the round trip reproduces the input (so '2018-2-22' is rejected for
      "%Y-%m-%d").

      Parameters:
      date_text -- string with the value to validate
      my_format -- expected date format (strftime/strptime directives)

      Return:
        True: the date format is correct
        False: the date format is not correct, or the value could not be parsed

      UdfName:
        udf_isDate_sql

      Example:
        udf_isDate_Def('2018-02-22', "%Y-%m-%d")
        %sql select udf_isDate_sql('2018-02-22',  "%Y-%m-%d")

  """

  #Who                 When           What
  #Ana Perez           02/10/2018     Initial version
  try:
    parsed = datetime.strptime(date_text, my_format)
    return parsed.strftime(my_format) == date_text
  except Exception:
    return False
      
#register function for %sql (callable by name as udf_isDate_sql)
spark.udf.register("udf_isDate_sql", udf_isDate_Def, BooleanType())
#register function for %python %scala (column-level UDF object returning a boolean)
udf_isDate_sql = udf(lambda d, f: udf_isDate_Def(d,f),BooleanType())

################################################################################

def isDate(date,format='yyyy-mm-dd'):
  """Column expression telling whether a value can be parsed as a date.

      Built on to_date (no Python UDF involved): the value counts as a date
      when to_date(date, format) is not null.

      NOTE(review): in Spark datetime patterns 'mm' is minute-of-hour and 'MM'
      is month-of-year, so the default 'yyyy-mm-dd' probably does not validate
      the month part -- confirm and pass 'yyyy-MM-dd' explicitly where needed.

      Parameters:
      date -- column (or column name) with the date to validate
      format -- date pattern expected (Spark datetime pattern)

      Return:
        Boolean column: True when the value parses with the pattern, False otherwise

      Example:
        spark.createDataFrame([('19970228220707',)], ['t']).select((isDate('t','yyyyMMddhhmmss')).alias('is_date')).collect()

  """
  #Who                 When           What
  #Victor Salesa       02/11/2018     Initial version
  parsed_date = to_date(date,format)
  return parsed_date.isNotNull()

################################################################################

def udf_slices_Def(s, *args):

  """Cut a string into consecutive substrings of the given sizes.

      Each size consumes the next characters of s, starting where the previous
      substring ended. Characters beyond the sum of the sizes are ignored and a
      substring may be shorter than requested (or empty) when s runs out.

      Parameters:
      s     -- string to be split
      *args -- size of each substring, in order

      Return:
        List of strings (one per size)

      UdfName:
        udf_slices_sql

      Example 1:
        %sql select udf_slices_sql('testingthestring',  "[5,4,7]")
        Out: ["testi","ngth","estring"]

      Example 2:
        my_file = spark.read.text("/FileStore/tables/quijote.txt")
        my_file.select(udf_slices_sql('value', lit("[10,10,12]"))).show()
        Out:
           [Yo, Juan , Gallo de A, ndrada e...|
           [los que r, esiden en , su Conse...|

  """
  #Who                 When           What
  #Victor Salesa       03/10/2018     Initial version
  pieces = []
  start = 0
  for size in args:
    end = start + size
    pieces.append(s[start:end])
    start = end
  return pieces
  
#register function for %sql; the sizes arrive as a string such as "[5,4,7]" and are parsed with ast.literal_eval
spark.udf.register("udf_slices_sql", lambda x, y: udf_slices_Def(x,*ast.literal_eval(y)), ArrayType(StringType()))
#register function for %python %scala (column-level UDF object returning an array of strings)
udf_slices_sql = udf(lambda x, y: udf_slices_Def(x,*ast.literal_eval(y)), ArrayType(StringType()))

################################################################################

def udf_isDigit_Def(value):
  """Tell whether a string is made of digits only (str.isdigit).

      Empty strings and None are reported as False.

      Parameters:
      value -- string to be checked

      Return:
        Boolean

      UdfName:
        udf_isDigit_sql

      Example 1:
        %sql select udf_isDigit_sql('12345678')
        Out: True

      Example 2:
        my_file = spark.read.text("/FileStore/tables/numbers.txt")
        my_file.select("value",udf_isDigit_sql('value').alias('isDigit')).show()
        Out:
            +------+-------+
            | value|isDigit|
            +------+-------+
            |123454|   true|
            |Lalala|  false|
            | 13223|   true|
            | Lal24|  false|
            | 33524|   true|
            +------+-------+
  """
  #Who                 When           What
  #Victor Salesa       03/10/2018     Initial version
  return value.isdigit() if value else False
  
#register function for %sql (callable by name as udf_isDigit_sql)
spark.udf.register("udf_isDigit_sql",lambda x: udf_isDigit_Def(x),BooleanType())
#register function for %python %scala (column-level UDF object returning a boolean)
udf_isDigit_sql = udf(lambda x: udf_isDigit_Def(x),BooleanType())

################################################################################

def isDigit(column):
  """Column expression telling whether a column holds a non-empty string of digits 0-9. (just for pyspark)

      The digits are removed with translate; the value qualifies when nothing
      is left and the original value was not empty.

      Parameters:
      column -- column to be checked

      Return:
        Boolean column

      Example 1
        spark.createDataFrame([('123456789045442305823234582349058934',)], ['a']).select(isDigit('a').alias('r')).collect()
  """
  #Who                 When           What
  #Victor Salesa       02/11/2018     Initial Version
  #Victor Salesa       09/01/2019     Added condition length(column)>0 to avoid empty strings considered as digit

  leftover = translate(column, "1234567890", "")
  only_digits = (length(leftover)==0)
  not_empty = (length(column)>0)
  return (only_digits&not_empty)

################################################################################

def distributeFile_Def(sourcefile,destpath,errorpath,isOk,destName=None):
  """Move sourcefile to destpath if isOk=True or to errorpath otherwise

      All paths are translated to the local /dbfs mount ('dbfs:' prefix removed,
      '/dbfs' prepended). destpath / errorpath are concatenated with the file
      name as-is, so they are expected to end with '/'.

      Parameters:
        sourcefile                  -- file with path to be distributed
        destpath                    -- destination folder when file is ok
        errorpath                   -- destination folder when file is not ok
        isOk                        -- the file is moved to destpath if True, to errorpath otherwise
        destName                    -- file name in destination folder (In case renaming)
      Return
        Boolean                     -- True when shutil.move reported a destination

      Raises:
        Whatever shutil.move raises when the file cannot be moved

      Example 1:
        distributeFile_Def('dbfs:/landing/a.csv', 'dbfs:/ok/', 'dbfs:/error/', True)

  """
  #Who                 When           What
  #Victor Salesa       15/10/2018     Initial version
  #Victor Salesa       06/11/2018     Added destName to move files functions
  import subprocess  # local import: not part of the notebook import cell

  def _sync_parent(path):
    # Force filesync in dbfs for the folder holding "path". The path is passed as
    # an argument list (no shell), so spaces or shell metacharacters in file names
    # are harmless. Best effort: a missing "sync" binary must not block the move.
    try:
      subprocess.call(["sync", os.path.dirname(path)])
    except OSError:
      pass

  name = sourcefile.split('/')[-1]
  destname = destName if destName is not None else name

  sourcefile = '/dbfs' + sourcefile.replace('dbfs:','')
  target_folder = destpath if isOk == True else errorpath
  target = '/dbfs' + target_folder.replace('dbfs:','') + destname

  _sync_parent(sourcefile)
  _sync_parent(target)
  moved_to = shutil.move(sourcefile,target)
  _sync_parent(sourcefile)
  _sync_parent(target)

  return (len(moved_to)>0)

#register function for %sql (callable by name as distributeFile_sql; all five arguments are required here)
spark.udf.register("distributeFile_sql",lambda s,d,e,ok,dest: distributeFile_Def(s,d,e,ok,dest),BooleanType())
#register function for %python %scala (column-level UDF object returning a boolean)
distributeFile_sql = udf(lambda s,d,e,ok,dest: distributeFile_Def(s,d,e,ok,dest),BooleanType())

#####################################################################################

def copyFile_Def(sourcefile,destpath,destName=None):
  """Copy file from sourcefile to destpath

      Both paths are translated to the local /dbfs mount ('dbfs:' prefix removed,
      '/dbfs' prepended). destpath is concatenated with the file name as-is, so
      it is expected to end with '/'.

      Parameters:
        sourcefile                  -- file with path to be copied
        destpath                    -- destination folder
        destName                    -- file name in destination folder (In case renaming)

      Return
        Boolean                     -- True when shutil.copy reported a destination

      Raises:
        Whatever shutil.copy raises when the file cannot be copied

      Example 1:
        copyFile_Def('dbfs:/landing/a.csv', 'dbfs:/backup/', 'a_copy.csv')

  """
  #Who                 When           What
  #Victor Salesa       15/10/2018     Initial version
  #Victor Salesa       06/11/2018     Added destName to move files functions
  import subprocess  # local import: not part of the notebook import cell

  def _sync_parent(path):
    # Force filesync in dbfs for the folder holding "path". The path is passed as
    # an argument list (no shell), so spaces or shell metacharacters in file names
    # are harmless. Best effort: a missing "sync" binary must not block the copy.
    try:
      subprocess.call(["sync", os.path.dirname(path)])
    except OSError:
      pass

  name = sourcefile.split('/')[-1]
  destname = destName if destName is not None else name

  sourcefile = '/dbfs' + sourcefile.replace('dbfs:','')
  destpath  = '/dbfs' + destpath.replace('dbfs:','') + destname

  _sync_parent(sourcefile)
  _sync_parent(destpath)
  copied_to = shutil.copy(sourcefile,destpath)
  _sync_parent(sourcefile)
  _sync_parent(destpath)

  return (len(copied_to)>0)

#register function for %sql (callable by name as copyFile_sql; all three arguments are required here)
spark.udf.register("copyFile_sql",lambda s,d,dest: copyFile_Def(s,d,dest),BooleanType())
#register function for %python %scala (column-level UDF object returning a boolean)
copyFile_sql = udf(lambda s,d,dest: copyFile_Def(s,d,dest),BooleanType())

################################################################################
def deleteFile_Def(pathFile, nameFile):
  """Delete file

      The folder is translated to the local /dbfs mount ('dbfs:' prefix removed,
      '/dbfs' prepended) and concatenated with nameFile as-is, so pathFile is
      expected to end with '/'.

      Parameters:
        pathFile                    -- Folder containing the file to be deleted
        nameFile                    -- Name of the file to be deleted
      Return
        Boolean                     -- True when the file was removed or did not exist,
                                       False when the removal failed (OSError / ValueError)

      Example 1:
        deleteFile_Def('dbfs:/landing/', 'a.csv')

  """
  #Who                 When           What
  #Ana Perez           15/10/2018     Initial version
  #Victor Salesa       06/11/2018     Added destName to move files functions
  import subprocess  # local import: not part of the notebook import cell

  try:

    pathFileTr = '/dbfs' + pathFile.replace('dbfs:','')
    target = pathFileTr+nameFile

    if os.path.exists(target):
      os.remove(target)
      # Force filesync in dbfs. Argument list instead of a shell string so unusual
      # characters in the path are harmless; best effort, the delete already happened.
      try:
        subprocess.call(["sync", pathFileTr])
      except OSError:
        pass
    #end if path.exists

    return True

  except (OSError, ValueError):
    # os.remove reports failures (permissions, directory, ...) as OSError
    return False
  
#register function for %sql (callable by name as deleteFile_sql)
spark.udf.register("deleteFile_sql",lambda s,d: deleteFile_Def(s,d),BooleanType())
#register function for %python %scala (column-level UDF object returning a boolean)
deleteFile_sql = udf(lambda s,d: deleteFile_Def(s,d),BooleanType())

################################################################################

@udf
def getEncoding(filePath,debug=True):
  """Get File Charset information

      The charset is detected (chardet) on the first line of the file only.

      Parameters:
        filePath                    -- Path of the file to be read ('dbfs:' prefix allowed)
        debug                       -- Show debug information
      Return
        String                     -- json text {"charset":"<name>","confidence":<number>};
                                      "" when anything fails (unreadable file, no charset detected, ...)

      Example 1:
        df.select(getEncoding('path'))

  """
  #Who                 When           What
  #Victor Salesa       10/03/2019     Initial version

  def _log(message):
    print("["+datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')+"]"+message)

  # Name of the last step completed; bound before the try so the handler can always use it
  exception_state = ''
  try:
    filePath = "/dbfs"+filePath.replace("dbfs:","")

    with open(filePath,'rb') as f:
      if debug==True:
        _log(" Read First Line")
      #end if debug==True

      FirstLineBytes = f.readline()
    #end with open(filePath,'rb') as f
    exception_state = 'getEncoding.READ_LINE'

    if debug==True:
      _log(" Detect Encoding")
    #end if debug==True

    # Single detection call; both values come from the same result
    detection = chardet.detect(FirstLineBytes)
    charset = detection['encoding']
    confidence = detection['confidence']

    if debug==True:
      _log(" Before Json")

    # An undetected charset (None) makes this concatenation fail and is reported as ""
    encoding_det_json = '{"charset":"'+charset+'","confidence":'+str(confidence)+'}'

    if debug==True:
      _log(" After Json")
    exception_state = 'getEncoding.DETECT_ENCODING'
    return encoding_det_json

  except Exception as e:
    _log(" Fail on "+exception_state+": "+str(e))
    return ""
#end def getEncoding(filePath,debug=True):

################################################################################
@udf
def getFirstLine(filePath,force_decode="",debug=True):
  """Get first line of the file in the path

      The line is read as bytes, its charset is detected (chardet) and the bytes
      are decoded with force_decode when given, otherwise with the detected charset.
      Trailing whitespace (including CR/LF) is removed.

      Parameters:
        filePath                    -- Path of the file to be read ('dbfs:' prefix allowed)
        force_decode                -- Charset used to decode the line instead of the detected one (Optional)
        debug                       -- Show debug information
      Return
        String                     -- first line of the file; "" when anything fails

      Example 1:
        df.select(getFirstLine('path'))

  """
  #Who                 When           What
  #Victor Salesa       13/03/2019     Initial version
  #Victor Salesa       10/04/2019     Added charset detection in order to be able to read all files.

  def _log(message):
    print("["+datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')+"]"+message)

  # Name of the last step completed; bound before the try so the handler can always use it
  exception_state = ''
  try:
    filePath = "/dbfs"+filePath.replace("dbfs:","")

    with open(filePath,'rb') as f:
      if debug==True:
        _log(" Read First Line")
      #end if debug==True

      FirstLineBytes = f.readline()
    #end with open(filePath,'rb') as f
    exception_state = 'getFirstLine.READ_LINE'

    if debug==True:
      _log(" Detect Encoding")
    #end if debug==True

    # Single detection call; both values come from the same result
    detection = chardet.detect(FirstLineBytes)
    charset = detection['encoding']
    confidence = detection['confidence']
    exception_state = 'getFirstLine.DETECT_ENCODING'

    if debug==True:
      _log(" Decode First Line")
      print("\n"+"encoding: "+str(charset)+" | confidence:"+str(confidence))
      print("-----------------------")
    #end if debug==True

    # A forced charset wins over the detected one
    decode_charset = force_decode if force_decode else charset

    FirstLineDecodedString = FirstLineBytes.decode(decode_charset)
    exception_state = 'getFirstLine.DECODE_LINE'

    if debug==True:
      _log(" Remove CRLF")
    #end if debug==True

    FirstLineDecodedStringRemoveCRLF = FirstLineDecodedString.rstrip()
    exception_state = 'getFirstLine.STRIP_LINE'

    return FirstLineDecodedStringRemoveCRLF
  except Exception as e:
    if debug==True:
      _log("Fail on "+exception_state+": "+str(e))
    return ""
#end def getFirstLine(filePath,debug=True):


################################################################################

def createDFFromList(filelist):
  """Build a Dataframe out of a python list.

      The list is distributed with sc.parallelize and converted with toDF(),
      so the schema is inferred from the elements.

      Parameters:
      filelist                  -- python list of records (e.g. the result of dbutils.fs.ls)

      Return:
        pyspark.sql.Dataframe

      Example 1:
        filelist = dbutils.fs.ls("wasbs://container1@adppocstoragev2.blob.core.windows.net/01-Landing")
        createDFFromList(filelist).show()

  """
  #Who                 When           What
  #Victor Salesa       03/10/2018     Initial version

  distributed_list = sc.parallelize(filelist)
  return distributed_list.toDF()

################################################################################################################################################
def saveToDB(df,table_name,mode="overwrite",alternate_db_url='',debug=False,job_id=''):
  """Save Dataframe in Database

        When job_id is given, a JOB_ID column (DecimalType(31,0)) holding that value is
        appended before writing, and RollbackDBTable(table_name, rollback_id=job_id) is
        called if the write fails.

        Parameters:
        df               -- dataframe to be saved
        table_name       -- destination table name
        mode             -- save mode overwrite/append/....
        alternate_db_url -- jdbc url to use instead of the default __JDBC_URL__ (empty/None = default)
        debug            -- print progress and show the written rows
        job_id           -- digits-only string or integer identifying the job ('' or None = no job id)
        Return:
          None

        Raises:
          Exception when job_id is not made of digits
          Any exception raised while writing (re-raised after the rollback attempt)

        Example 1:
          saveToDB(df,"T_SELL_OUT",mode="overwrite")
  """
  #Who                 When           What
  #Victor Salesa       14/04/2019     Add job id to save Todb

  has_job_id = job_id is not None and job_id != ''

  # Validate before triggering any Spark job; str() lets integer ids through
  if has_job_id and not str(job_id).isdigit():
    raise Exception("job_id should be an integer")

  df = df.cache()
  df.count()

  if has_job_id:
    #Append a job id column to the dataframe
    df = df.withColumn("JOB_ID",lit(job_id).cast(DecimalType(31,0)))
    df = df.cache()
    df.count()
  #end if has_job_id

  if alternate_db_url!=None and len(alternate_db_url.strip())!=0:
    current_jdbcurl = alternate_db_url
  else:
    current_jdbcurl = __JDBC_URL__

  try:
    #Write to Database
    if debug==True:
      print("["+datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')+"]"+"Save to "+table_name+" with mode "+mode)
    (df.write
       .mode(mode)
       .jdbc(current_jdbcurl,table_name)
    )
    if debug==True:
      print("["+datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')+"]"+'Write to db table:'+str(table_name))
      df.show(10000,truncate=False)

    # The dataframe with JOB_ID is local to this function and is not returned, so
    # nothing is dropped or re-cached after the write (that extra Spark job could
    # only fail and roll back a write that had already succeeded).

  except Exception:
    if has_job_id:
      RollbackDBTable(table_name,rollback_id=job_id)
    #end if has_job_id
    raise
  
########################################################################################################################
def StartADPProcess(process_name,debug=False):
  """Open a new job for a process by calling the CTL.SP_GET_JOB stored procedure.

        Parameters:
          process_name -- name of the process being started; it is sent to the stored procedure
                          as a quoted string together with the current timestamp
          debug        -- kept for interface compatibility; not used in the body

        Return:
          Job id reported by the stored procedure (second column of the result row)

        Raises:
          Exception -- when the stored procedure result code is not 1; the message carries the
                       executed query plus the error code and error message it returned
          Any other exception (connection, execution, ...) is logged and re-raised

        Example 1:
          result = StartADPProcess('process_name')
  """
  #Who                 When           What
  #Victor Salesa       18/04/2019     Initial Version
  #Victor Salesa       29/04/2019     Changed code to use CTL.SP_GET_JOB STORED PROCEDURE

  #Defined before the try block so the exception handler can always log it,
  #even when the failure happens before the stored procedure has answered
  message = ""
  db = None
  curs = None

  try:

    ADP_log_info(process, logger_name, level_action, log_level, "BEGIN", sys._getframe().f_code.co_name)

    start_date = datetime.fromtimestamp(time.time()).strftime(__YYYYMMDDhhmmss__)
    #Double every single quote so the name cannot terminate the T-SQL string literal
    quoted_process_name = str(process_name).replace("'", "''")
    query      = '''DECLARE @result SMALLINT, @sequence INT, @errorCode INT, @errorMsg NVARCHAR(4000);
                    EXECUTE @result = [CTL].[SP_GET_JOB] NULL, '{0}', '{1}', NULL, NULL, NULL, @out_job_id = @sequence OUTPUT, @out_error_code = @errorCode OUTPUT, @out_error_msg = @errorMsg OUTPUT;
                    SELECT @result, @sequence, @errorCode, @errorMsg
                 '''.format(start_date,quoted_process_name)

    #exception_status values are progress markers; they are not read inside this function
    exception_status = 'StartADPProcess.START_PROCESS'

    db = jaydebeapi.connect(__JDBC_CLASSNAME__, __JAYDEBE_URL__)

    exception_status = "StartADPProcess.DB_CONNECT"

    curs = db.cursor()

    exception_status = "StartADPProcess.GET_CURSOR"

    curs.execute(query)

    exception_status = "StartADPProcess.EXECUTE_QUERY"

    #Single row: result code, job id, error code, error message
    query_result_array = curs.fetchone()

    query_result = query_result_array[0]
    job_id = query_result_array[1]
    errorCode = query_result_array[2]
    errorMessage = query_result_array[3]

    #This function treats 1 as the success code of SP_GET_JOB
    if query_result != 1:
      #Collapse the query to one statement per line and build a multi-line report
      query = " ".join(query.split()).replace("; ",";").replace(";",";\n")
      message = 'Stored Procedure Execution Failed:||"'+query+'"||errorCode:'+str(errorCode)+'|errorMessage:'+str(errorMessage)
      message = " ".join(message.split())
      message = message.replace("|","\n")
      raise Exception('Stored Procedure Execution Failed: \n'+message)
    #end if query_result != 1

    exception_status = "StartADPProcess.JOB_OPENED"

    ADP_log_info(process, logger_name, level_action, log_level, "END", sys._getframe().f_code.co_name)

    return job_id
  except Exception:
    ADP_log_exception(process, logger_name, level_action, log_level,message, sys._getframe().f_code.co_name,  sys.exc_info())
    raise
  finally:
    #Always release the cursor and the connection, whatever the outcome
    for resource in (curs, db):
      if resource is not None:
        try:
          resource.close()
        except Exception:
          #Closing is best effort: it must never replace the real result or error
          pass
    
########################################################################################################################

def EndADPProcess(rows_target,job_id='',log_error_code='NULL',log_error_message='',debug=False):
  """Close a job by calling the CTL.SP_GET_JOB stored procedure with its end data.

        Parameters:
          rows_target       -- rows finally written to file / database (inserted unquoted in the call)
          job_id            -- id of the job that is ending (inserted unquoted in the call)
          log_error_code    -- error code to record; the default 'NULL' is inserted unquoted
          log_error_message -- error message to record; sent as a quoted string
          debug             -- kept for interface compatibility; not used in the body

        Return:
          Job id reported by the stored procedure (second column of the result row)

        Raises:
          Exception -- when the stored procedure result code is not 1; the message carries the
                       executed query plus the error code and error message it returned
          Any other exception (connection, execution, ...) is logged and re-raised

        Example 1:
          result = EndADPProcess(5556666,job_id=11111111111111111)
  """
  #Who                 When           What
  #Victor Salesa       18/04/2019     Initial Version
  #Victor Salesa       29/04/2019     Changed code to use CTL.SP_GET_JOB STORED PROCEDURE

  #Defined before the try block so the exception handler can always log it
  message = ""
  db = None
  curs = None

  try:
    ADP_log_info(process, logger_name, level_action, log_level, "BEGIN", sys._getframe().f_code.co_name)

    end_date = datetime.fromtimestamp(time.time()).strftime(__YYYYMMDDhhmmss__)
    #Error texts frequently contain single quotes; double them so the message
    #stays inside its T-SQL string literal instead of breaking the statement
    quoted_error_message = str(log_error_message).replace("'", "''")
    query      = '''DECLARE @result SMALLINT, @sequence INT, @errorCode INT, @errorMsg NVARCHAR(4000);
                    EXECUTE @result = [CTL].[SP_GET_JOB] {0}, '{1}', NULL, {2}, {3}, '{4}', @out_job_id = @sequence OUTPUT, @out_error_code = @errorCode OUTPUT, @out_error_msg = @errorMsg OUTPUT;
                    SELECT @result, @sequence, @errorCode, @errorMsg
                 '''.format(job_id,end_date,rows_target,log_error_code,quoted_error_message)

    #exception_status values are progress markers; they are not read inside this function
    exception_status = 'EndADPProcess.START_PROCESS'

    db = jaydebeapi.connect(__JDBC_CLASSNAME__, __JAYDEBE_URL__)

    exception_status = "EndADPProcess.DB_CONNECT"

    curs = db.cursor()

    exception_status = "EndADPProcess.GET_CURSOR"

    curs.execute(query)

    exception_status = "EndADPProcess.EXECUTE_QUERY"

    #Single row: result code, job id, error code, error message
    query_result_array = curs.fetchone()

    query_result = query_result_array[0]
    job_id = query_result_array[1]
    errorCode = query_result_array[2]
    errorMessage = query_result_array[3]

    #This function treats 1 as the success code of SP_GET_JOB
    if query_result != 1:
      #Collapse the query to one statement per line and build a multi-line report
      query = " ".join(query.split()).replace("; ",";").replace(";",";\n")
      message = 'Stored Procedure Execution Failed:||"'+query+'"||errorCode:'+str(errorCode)+'|errorMessage:'+str(errorMessage)
      message = " ".join(message.split())
      message = message.replace("|","\n")
      raise Exception('Stored Procedure Execution Failed: \n'+message)
    #end if query_result != 1

    exception_status = "EndADPProcess.JOB_CLOSED"

    ADP_log_info(process, logger_name, level_action, log_level, "END", sys._getframe().f_code.co_name)

    return job_id
  except Exception:
    ADP_log_exception(process, logger_name, level_action, log_level,message, sys._getframe().f_code.co_name,  sys.exc_info())
    raise
  finally:
    #Always release the cursor and the connection, whatever the outcome
    for resource in (curs, db):
      if resource is not None:
        try:
          resource.close()
        except Exception:
          #Closing is best effort: it must never replace the real result or error
          pass

########################################################################################################################    
    
def getNextJobId(process):
  """Get the next job id from the CTL.SP_GET_JOB_SEQ stored procedure.

        Parameters:
        process     -- process name; only used as the first argument of the log calls

        Return:
          Job id reported by the stored procedure (second column of the result row)

        Raises:
          Exception -- when the stored procedure result code is not 0; the message carries the
                       executed query plus the error code and error message it returned
          Any other exception (connection, execution, ...) is logged and re-raised

        Example 1:
          job_id = getNextJobId('PROCESS_NAME')
  """

  #Who                 When           What
  #Victor Salesa       12/04/2019     Initial version
  #Victor Salesa       21/04/2019     Moved Job Id Generation to DB

  #Defined before the try block so the exception handler can always log it,
  #even when the failure is not a stored procedure error
  message = ""
  db = None
  curs = None

  try:

    ADP_log_info(process, logger_name, level_action, log_level, "BEGIN", sys._getframe().f_code.co_name)

    query = '''DECLARE @result SMALLINT, @sequence INT, @errorCode INT, @errorMsg NVARCHAR(4000);
               EXECUTE @result = [CTL].[SP_GET_JOB_SEQ] @out_sequence = @sequence OUTPUT, @out_error_code = @errorCode OUTPUT, @out_error_msg = @errorMsg OUTPUT;
               SELECT @result, @sequence, @errorCode, @errorMsg
            '''

    db = jaydebeapi.connect(__JDBC_CLASSNAME__, __JAYDEBE_URL__)

    #exception_status values are progress markers; they are not read inside this function
    exception_status = "getNextJobId.CONNECT_DATABASE"

    curs = db.cursor()

    exception_status = "getNextJobId.GET_CURSOR"

    curs.execute(query)

    exception_status = "getNextJobId.EXECUTE_QUERY"

    #Single row: result code, job id, error code, error message
    query_result_array = curs.fetchone()

    exception_status = "getNextJobId.GET_ROW"

    query_result = query_result_array[0]
    job_id = query_result_array[1]
    errorCode = query_result_array[2]
    errorMessage = query_result_array[3]

    #This function treats 0 as the success code of SP_GET_JOB_SEQ
    if query_result != 0:
      #Collapse the query to one statement per line and build a multi-line report
      query = " ".join(query.split()).replace("; ",";").replace(";",";\n")
      message = 'Stored Procedure Execution Failed:||"'+query+'"||errorCode:'+str(errorCode)+'|errorMessage:'+str(errorMessage)
      message = " ".join(message.split())
      message = message.replace("|","\n")
      raise Exception('Stored Procedure Execution Failed: \n'+message)
    #end if query_result != 0

    ADP_log_info(process, logger_name, level_action, log_level, "END", sys._getframe().f_code.co_name)

    return job_id
  except Exception:
    ADP_log_exception(process, logger_name, level_action, log_level,message, sys._getframe().f_code.co_name,  sys.exc_info())
    raise
  finally:
    #Always release the cursor and the connection, whatever the outcome
    for resource in (curs, db):
      if resource is not None:
        try:
          resource.close()
        except Exception:
          #Closing is best effort: it must never replace the real result or error
          pass

################################################################################################################################################
    
@udf
def getBasePath(file):
  """Return the part of a path that precedes its last '/'.

        Parameters:
        file -- path text, for example 'a/b/c.txt'

        Return:
          string -- everything before the last '/' ('a/b' for 'a/b/c.txt'); an empty
                    string when the path has no '/'. A value that cannot be processed
                    as text is returned unchanged
  """

  #Who                 When           What
  #Victor Salesa       19/10/2018     Initial version

  try:
    parent_dir, _separator, _leaf = file.rpartition('/')
  except Exception:
    #Values that are not text (a null path, for instance) are handed back as they came
    return file
  return parent_dir
  
################################################################################################################################################

def RollbackDBTable(table_name,rollback_id=''):
  """Delete the rows of a job from a database table through the CTL.SP_DEL_JOB_ID stored procedure.

      Parameters:
      table_name  -- database table name; sent to the stored procedure as a quoted string
      rollback_id -- job id whose rows are removed; digits only (str or int)
      Return:
        Number of deleted rows reported by the stored procedure. A warning is logged
        when that number is 0 or less
      Raises:
        Exception -- when rollback_id is not made of digits only, or when the stored
                     procedure result code is not 0
        Any other exception (connection, execution, ...) is logged and re-raised
   """

  #Who                 When           What
  #Victor Salesa       15/04/2019     Initial version
  ADP_log_info(process, logger_name, level_action, log_level, "BEGIN", sys._getframe().f_code.co_name)

  query_delete_tmpt = '''DECLARE @result SMALLINT, @rowsDeleted INT, @errorCode INT, @errorMsg NVARCHAR(4000);
                        EXECUTE @result = [CTL].[SP_DEL_JOB_ID] '{0}', {1}, @out_rows_deleted = @rowsDeleted OUTPUT, @out_error_code = @errorCode OUTPUT, @out_error_msg  = @errorMsg OUTPUT;
                        SELECT @result, @rowsDeleted, @errorCode, @errorMsg
                     '''
  #exception_status values are progress markers; they are not read inside this function
  exception_status = ''

  #Defined before the try block so the exception handler can always log it;
  #otherwise a validation or connection failure would be hidden by a NameError
  message = ""
  db = None
  curs = None

  try:

    #Accept the id both as text and as a number (isdigit needs a string)
    rollback_id = str(rollback_id)
    if not rollback_id.isdigit():
      raise Exception("rollback_id should be an integer")
    #end if not rollback_id.isdigit()

    #Only a digits-only id ever reaches the statement text
    query_delete = query_delete_tmpt.format(table_name,rollback_id)

    exception_status = "RollbackDBTable.CHECK_ROLLBACK_ID"

    db = jaydebeapi.connect(__JDBC_CLASSNAME__, __JAYDEBE_URL__)

    exception_status = "RollbackDBTable.CONNECT_DATABASE"

    curs = db.cursor()

    exception_status = "RollbackDBTable.GET_CURSOR"

    curs.execute(query_delete)

    #Single row: result code, rows deleted, error code, error message
    query_result_array = curs.fetchone()

    query_result = query_result_array[0]
    rowsDeleted = query_result_array[1]
    errorCode = query_result_array[2]
    errorMessage = query_result_array[3]

    exception_status = "RollbackDBTable.EXECUTE_QUERY"

    #This function treats 0 as the success code of SP_DEL_JOB_ID
    if query_result != 0:
      #Collapse the query to one statement per line and build a multi-line report
      query_delete = " ".join(query_delete.split()).replace("; ",";").replace(";",";\n")
      message = 'Stored Procedure Execution Failed:||"'+query_delete+'"||errorCode:'+str(errorCode)+'|errorMessage:'+str(errorMessage)
      message = " ".join(message.split())
      message = message.replace("|","\n")
      raise Exception('Stored Procedure Execution Failed: \n'+message)
    # end if query_result != 0:

    exception_status = "RollbackDBTable.ROLLBACK_DB_TABLE"

    if rowsDeleted <= 0:
      ADP_log_warning(process, logger_name, level_action, log_level, "Job id {0} doesn't exist in table {1}".format(str(rollback_id),table_name), sys._getframe().f_code.co_name)

    ADP_log_info(process, logger_name, level_action, log_level, "END", sys._getframe().f_code.co_name)

    return rowsDeleted
  except Exception:
    ADP_log_exception(process, logger_name, level_action, log_level,message, sys._getframe().f_code.co_name,  sys.exc_info())
    raise
  finally:
    #Always release the cursor and the connection, whatever the outcome
    for resource in (curs, db):
      if resource is not None:
        try:
          resource.close()
        except Exception:
          #Closing is best effort: it must never replace the real result or error
          pass

################################################################################################################################################

def RollbackCanonicalTable(table_name,rollback_id='latest'):
  """Remove from a canonical table the files that hold the rows of one PROCESS_ID.

      Parameters:
      table_name  -- hive table name; it is looked up (lower-cased) in the "default" database
      rollback_id -- PROCESS_ID value to remove, digits only (str or int). With 'latest'
                     the highest PROCESS_ID found in the table is used
      Return:
        None. When the table holds no row for the id a warning is logged and nothing is deleted
      Raises:
        Exception -- when the table does not exist, when 'latest' is requested and the table
                     has no PROCESS_ID column, or when rollback_id is not made of digits only
        Any exception is logged and re-raised
   """

  #Who                 When           What
  #Victor Salesa       15/04/2019     Initial version
  #Victor Salesa       02/05/2019     Avoid fail when id not exists
  try:
    ADP_log_info(process, logger_name, level_action, log_level, "BEGIN", sys._getframe().f_code.co_name)

    table_exists = table_name.lower() in sqlContext.tableNames("default")

    if rollback_id=='latest':
      if not table_exists:
        raise Exception("Table name:"+table_name+" doesn't exist")
      if "PROCESS_ID" not in spark.read.table(table_name).columns:
        raise Exception("PROCESS_ID field doesn't exist in table: "+table_name)
      #The maximum does not depend on duplicates or on row order, so it is
      #aggregated directly over the column
      rollback_id = (spark.read.table(table_name).select("PROCESS_ID")
                                             .groupBy()
                                             .agg({"PROCESS_ID": "max"})
                                             .collect()[0][0])
    elif not str(rollback_id).isdigit():
      #str() lets a numeric id through the same digits-only check as a text id
      raise Exception("rollback_id should be an integer")
    #end if rollback_id=='latest':

    if not table_exists:
      raise Exception("Table name:"+table_name+" doesn't exist")
    #end if not table_exists

    process_rows = spark.read.table(table_name).filter(col("PROCESS_ID")==rollback_id)
    rollback_count = process_rows.count()

    ADP_log_debug(process, logger_name, level_action, log_level, "Rollback count:"+str(rollback_count), sys._getframe().f_code.co_name)

    if rollback_count > 0:
      #Folder of the files that contain the rows of this id.
      #NOTE(review): only the first distinct folder is used. If the rows of one id can
      #live in several folders (PROCESS_ID below another partition column, for example)
      #the remaining folders are left untouched -- confirm the table layout before
      #widening this, as every file listed in the folder is deleted.
      rollback_path = (process_rows
                         .select(getBasePath(input_file_name()).alias("ROLLBACK_PATH"))
                      ).distinct().collect()[0].ROLLBACK_PATH

      ADP_log_debug(process, logger_name, level_action, log_level, "Rollback path:"+rollback_path, sys._getframe().f_code.co_name)

      #The deletion is expressed as a column, so it only runs when collect() evaluates it
      rollback_files = sc.parallelize(blob_ls(rollback_path)).toDF().select("path").withColumn("deleted",blob_delete_file_sql(col("path")))

      ADP_log_debug(process, logger_name, level_action, log_level, "Files about to be deleted", sys._getframe().f_code.co_name)

      deleted_files = rollback_files.collect()

      ADP_log_debug(process, logger_name, level_action, log_level, "Files deleted: " + str(deleted_files), sys._getframe().f_code.co_name)

      spark.catalog.refreshTable(table_name)

      ADP_log_debug(process, logger_name, level_action, log_level, "Table Refreshed: " + table_name, sys._getframe().f_code.co_name)
    else:
      ADP_log_warning(process, logger_name, level_action, log_level, "No data for the rollback_id: "+str(rollback_id)+" in table "+table_name, sys._getframe().f_code.co_name)
    #end if rollback_count > 0

    ADP_log_info(process, logger_name, level_action, log_level, "END", sys._getframe().f_code.co_name)
  except Exception:
    ADP_log_exception(process, logger_name, level_action, log_level, "", sys._getframe().f_code.co_name,  sys.exc_info())
    raise

################################################################################################################################################

def saveAsCanonical(df,path,mode="overwrite",table_name="",field_partitions=[],debug=False,job_id=''):
  """Save a dataframe in canonical format as a table.

        Parameters:
        df               -- dataframe to be saved
        path             -- path to store the table files; used when the table is not already
                            registered or when mode is 'overwrite'
        mode             -- save mode overwrite/append/....
        table_name       -- hive table name. If "" or None the last segment of path is used
        field_partitions -- columns to partition by; the list passed in is never modified
        debug            -- when True prints timestamped progress messages and shows the dataframe
        job_id           -- optional job id, digits only (str or int). When given, a PROCESS_ID
                            column holding it is added as the last partition and, if saving
                            fails, RollbackCanonicalTable is called for that id

        Return:
          Boolean -- True when the table has been written and refreshed

        Raises:
          Any exception raised while saving (including a job_id that is not made of digits
          only) is printed, logged and re-raised after the rollback attempt

        Example 1:
          result = saveAsCanonical(df,__PHARMATIC_MASTER_DATA_CANONICAL_PRODUCTS_PATH__,mode="overwrite")
  """

  #Who                 When           What
  #Victor Salesa       19/10/2018     Initial version
  #Victor Salesa       19/11/2018     Removed repartition (1) from canonical
  #Victor Salesa       27/02/2019     Changed function to write to parquet
  #Victor Salesa       06/03/2019     Changed function to write to hive-parquet
  #Ana Perez           07/03/2019     Moved save operation from if debug==True inside
  #Victor Salesa       08/03/2019     Added lower() to check table exists as tables query return names in lower case
  #Victor Salesa       27/03/2019     Move the refresh table to the right position
  #Victor Salesa       09/04/2019     Added "and mode!='overwrite'" to force overwrite mode saves data in blob instead of hive metastore
  #Victor Salesa       12/04/2019     Add a new job id column to the dataframe before writing if specified
  #Victor Salesa       15/04/2019     Added rollback function call to remove all data regarding the id

  #Accept the job id both as text and as a number; normalised before the try block
  #so the exception handler always sees the text form
  job_id = str(job_id)

  try:
    #Work on a private copy so neither the caller's list nor the shared default is changed
    field_partitions = list(field_partitions)

    #Progress marker; it is not read anywhere inside this function
    exception_status = "saveAsCanonical.START_PROCESS"

    #Cache and evaluate the dataframe before writing
    df = df.cache()
    df.count()

    if job_id != '':
      if not job_id.isdigit():
        raise Exception("job_id should be an integer")

      #Append a job id partition to be able to rollback
      field_partitions.append("PROCESS_ID")

      #Append a job id column to the dataframe
      df = df.withColumn("PROCESS_ID",lit(job_id))
      df = df.cache()
      df.count()
    #end if job_id != '':

    exception_status = "saveAsCanonical.PROCESS_ID_ADDED"

    if table_name is None or len(table_name.strip()) == 0:
      table_name = (path.split('/'))[-1]
      if debug==True:
        print("["+datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')+"]"+" Table name not specified defaulted to '"+table_name+"'")
    else:
      if debug==True:
        print("["+datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')+"]"+" Table name is '"+table_name+"'")

    #An already registered table is reused unless mode is 'overwrite'; in every other
    #case the data is written to the explicit path
    use_registered_table = table_name.lower() in sqlContext.tableNames("default") and mode!='overwrite'

    if debug==True:
      if use_registered_table:
        print("["+datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')+"]"+" table '"+table_name+"' exists")
      else:
        print("["+datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')+"]"+" table '"+table_name+"' doesn't exist")

    #Build the writer step by step: partitions first, then the path, then the save
    writer = df.write
    if len(field_partitions) != 0:
      writer = writer.partitionBy(*field_partitions)
    if not use_registered_table:
      writer = writer.option('path',path)
    writer.saveAsTable(table_name, mode=mode)

    if debug==True:
      print("["+datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')+"]"+'Write to canonical table:'+str(table_name)+' path: '+str(path))
      df.show(10000,truncate=False)

    spark.catalog.refreshTable(table_name)
    if debug==True:
      print("["+datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')+"]"+'Refresh canonical table:'+str(table_name))

    #The PROCESS_ID column only exists on the local dataframe, so nothing has to be
    #dropped or re-cached here: the caller's dataframe never received the column
    return True
  except Exception as e:
    print("["+datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')+"]"+" Error Trying to save: "+path+" - Exception was: " + str(e) )
    ADP_log_exception(process, logger_name, level_action, log_level, " Error Trying to save: "+path+" - Exception was: ", sys._getframe().f_code.co_name,  sys.exc_info())
    #If writing process fails and any register in canonical table exists it will be deleted and table refreshed
    if job_id != '':
      RollbackCanonicalTable(table_name,rollback_id=job_id)
    #end if job_id != '':

    traceback.print_exc()
    raise
    
################################################################################

def SliceDFColumn(column_to_split_name,splitted_column_names,splitted_column_lengths,purge = '',debug=False):
  """Build a dataframe transformation that slices one column into fixed-width columns.

        The returned function takes a dataframe, removes every match of the purge
        pattern from the source column and then adds one column per slice. The first
        slice starts at position 1 and every following slice starts where the
        previous one ends.

        Parameters:
        column_to_split_name      -- name of the column to slice (it is overwritten with the purged text)
        splitted_column_names     -- list with the names of the columns to be produced
        splitted_column_lengths   -- list with the width of each column to be produced
        purge                     -- regular expression removed from the column before slicing
        debug                     -- enable debug

        Return:
          function that receives a dataframe and returns it with all its columns
          plus the sliced ones

        Example 1:

          __SP_FILENAME_LENGHTS__        = [5,3,2,8,1,2]
          __SP_FILENAME_COLUMN_NAMES__   = ['pharmacy_unique_code','spec_version','release_version','data_date','origin','file_type']

          slicer = SliceDFColumn("name",__SP_FILENAME_COLUMN_NAMES__,__SP_FILENAME_LENGHTS__,'SELL-OUT_')
          slicer(filelist_df).show()

          With a name such as 'SELL-OUT_6055229301' + '20171106' + 'G' + 'SL' the new columns are:

          +--------------------+------------+---------------+---------+------+---------+
          |pharmacy_unique_code|spec_version|release_version|data_date|origin|file_type|
          +--------------------+------------+---------------+---------+------+---------+
          |               60552|         293|             01| 20171106|     G|       SL|
          +--------------------+------------+---------------+---------+------+---------+
  """
  def inner(df):
    """Apply the configured slicing to df and return the resulting dataframe."""
    if debug==True:
      print("["+datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')+"]"+ '---------------------------Start Slicing----------------------------------------')

    #Running total of the widths, seeded with 1: entry i is the start position of slice i
    slice_offsets = np.cumsum([1] + splitted_column_lengths).tolist()

    slice_columns = []
    for position, width in enumerate(splitted_column_lengths):
      piece = col(column_to_split_name).substr(slice_offsets[position], width)
      slice_columns.append(piece.alias(splitted_column_names[position]))

    #Purge first, so the slices are taken from the cleaned text
    purged_df = df.withColumn(column_to_split_name,regexp_replace(column_to_split_name,purge,''))
    return purged_df.select(expr("*"),*slice_columns)

  return inner

##################################################################################################################################################################
##################################################################################################################################################################
##################################################################################################################################################################
##################################################################################################################################################################

# Define validation functions
def no_val(val):
  """Validation placeholder for a field that needs "No Validation".

    The argument is ignored: the result is always lit(True).

    Parameters:
      val    -- column to be validated (not used)
    Return:
      Value of lit(True)

    Example 1:
     For examples on this function please see "GenerateValCols"
  """
  #Who                 When           What
  #Victor Salesa       23/01/2019     Initial Version

  always_valid = lit(True)
  return always_valid

##################################################################################################################################################################

def digit_val(val):
  """Validation Model function for the "Is Digit Value" check.

    The work is delegated to isDigit, which is defined outside this function.

    Parameters:
      val    -- column to be validated
    Return:
      Whatever isDigit(val) returns

    Example 1:
     For examples on this function please see "GenerateValCols"
  """
  #Who                 When           What
  #Victor Salesa       23/01/2019     Initial Version

  digit_check = isDigit(val)
  return digit_check

##################################################################################################################################################################
  
def date_val_ddmmyy(val):
  """Validation Model function for the "Is Date Value" check with the 'dd/mm/yy' pattern.

    The work is delegated to isDate, which is defined outside this function.
    NOTE(review): in Java/Spark date patterns a lowercase 'mm' means minutes and
    'MM' means month. How isDate interprets the pattern is not visible here --
    confirm that 'dd/mm/yy' is the intended pattern.

    Parameters:
      val    -- column to be validated

    Return:
      Whatever isDate(val,format='dd/mm/yy') returns

    Example 1:
     For examples on this function please see "GenerateValCols"
  """
  #Who                 When           What
  #Victor Salesa       23/01/2019     Initial Version

  date_pattern = 'dd/mm/yy'
  return isDate(val,format=date_pattern)

##################################################################################################################################################################

def date_val_yyyymmddHHmmss(val):
  """Validation Model function for the "Is Date Value" check with the 'yyyymmddHHmmss' pattern.

    The work is delegated to isDate, which is defined outside this function.
    NOTE(review): the pattern uses a lowercase 'mm' both for the month and for the
    minutes. How isDate interprets the pattern is not visible here -- confirm that
    'yyyymmddHHmmss' is the intended pattern.

    Parameters:
      val    -- column to be validated

    Return:
      Whatever isDate(val,format='yyyymmddHHmmss') returns

    Example 1:
     For examples on this function please see "GenerateValCols"
  """
  #Who                 When           What
  #Victor Salesa       23/01/2019     Initial Version

  timestamp_pattern = 'yyyymmddHHmmss'
  return isDate(val,format=timestamp_pattern)

##################################################################################################################################################################
  
def yesno_val(val):
  """Validation Model function for the "Is a Yes or No Value" check.

    The column passes when its value is one of "Yes", "No", "Y" or "N"
    (exactly these spellings are listed).

    Parameters:
      val    -- column to be validated

    Return:
      Result of col(val).isin(...) for the accepted values

    Example 1:
     For examples on this function please see "GenerateValCols"
  """
  #Who                 When           What
  #Victor Salesa       23/01/2019     Initial Version

  accepted_answers = ["Yes","No","Y","N"]
  checked_column = col(val)
  return checked_column.isin(accepted_answers)

##################################################################################################################################################################
##################################################################################################################################################################
##################################################################################################################################################################
##################################################################################################################################################################