# Metadata-driven Extract, Transform and Load process
#### Project Tapestry - https://github.com/mrjonlunn/tapestry
##### Created by: Jon Lunn
##### Version 0.2

In [None]:
# Notebook parameters.
# Add your own parameters here, and call it from a pipeline if you like
#
# filePath: the landing file this run works on. It is passed to metadataLoader,
#   to the file loader, and finally to moveLandingFile.
filePath = 'Files/Landing/testsource/testentity/NYC1000.csv'
# configFolder: passed to metadataLoader together with filePath to obtain the
#   config metadata (presumably the folder holding the JSON config files - verify
#   against metadataLoader).
configFolder = '/lakehouse/default/Files/PipelineMetaData/config/'

In [None]:
%run Standard_NoteBooks

In [None]:
# Gets the config metadata for the landing file
config = metadataLoader(filePath, configFolder)

# Sets a load of config variables for later use.
# Every value below is read straight from the config; a missing key raises
# KeyError here, before any data is loaded.

# Standard options
configFileType = config["fileType"] #string
configSource = config["dataset"]["source"] #string
configEntity = config["dataset"]["entity"] #string
customSchema = jsonToSchema(config["fileOptions"]["customSchema"]) #struct

# File options
# The reader options are only set for csv; they are the arguments that the
# csv loader is called with later on.
if configFileType == 'csv':
    header = config["fileOptions"]["header"] #string
    delimiter = config["fileOptions"]["delimiter"] #string
    multiLine = config["fileOptions"]["multiLine"] #string
    escape = config["fileOptions"]["escape"] #string

# This branch tests configFileType: the bare name `fileType` that was used
# here before is not assigned anywhere in this notebook.
elif configFileType == 'json':
    print('not ready yet')

# Table options
# Raw layer
rawLakehouseName = config["tableOptions"]["raw"]["lakehouseName"] #string
rawLayerName = config["tableOptions"]["raw"]["layerName"] #string
rawTableName = config["tableOptions"]["raw"]["tableName"] #string
rawInsertType = config["tableOptions"]["raw"]["insertType"] #string
rawCreateTableIfNotExists = config["tableOptions"]["raw"]["createTableIfNotExists"] #boolean
rawPartitionType = config["tableOptions"]["raw"]["partitionType"] #string
rawPartitionDateFormat = config["tableOptions"]["raw"]["partitionDateFormat"] #string
rawDataframePartitionColumns = config["tableOptions"]["raw"]["dataframePartitionColumns"] #List
rawTablePartitionColumns = config["tableOptions"]["raw"]["tablePartitionColumns"] #List
rawPartitionRowSize = config["tableOptions"]["raw"]["partitionRowSize"] #string
rawMergeOnColumns = config["tableOptions"]["raw"]["mergeOnColumns"] #List
rawMergeUpdateColumns = config["tableOptions"]["raw"]["mergeUpdateColumns"] #List

In [None]:
# Loaders based on the file type
# Loader interfaces in loaders note book
#
# Produces `df`, the dataframe that the following cells partition and write.

if configFileType == 'csv':
    df = loadCSV(filePath, header, delimiter, multiLine, customSchema, escape)

elif configFileType in ('json', 'xml', 'parquet'):
    # There is no loader for these types yet. Stop here instead of printing a
    # message and carrying on: `df` would otherwise be undefined (or left over
    # from an earlier run of the notebook) when the next cells use it.
    raise NotImplementedError(f"not done yet: there is no loader for file type '{configFileType}'")

else:
    raise Exception('File extension not recognised, or the extension of the file is not the same as the file type defined in the config file')

In [None]:
# Process the dataframe, and get it ready for writing to a table

# Extend the dataframe with the columns to partition on
if rawPartitionType.lower() == 'date':
    df = createDatePartitions(df, rawPartitionDateFormat, rawDataframePartitionColumns)
# Partition types that are not available yet. The literals are lower case
# because they are compared with rawPartitionType.lower().
# elif rawPartitionType.lower() == 'reference':
#     df = createReferencePartitions(df, rawPartitionDateFormat, rawDataframePartitionColumns)
# elif rawPartitionType.lower() == 'businesskeys':
#     df = createBusinessKeysPartitions(df, rawPartitionDateFormat, rawDataframePartitionColumns)

# Get dataframe schema (taken after the partitioning step above, so it
# reflects whatever that step returned)
writeSchema = df.schema

# Does the table exist?
# Notes: In checking the schema we could do schema merge, however that should be handled by a version.
# If you are happy using schema merge into Raw remove the raise exception,
# and create a merge schema flag in the JSON config and change the write process to the table for append and merge
#
# NOTE(review): rawCreateTableIfNotExists is read from the config but is not
# consulted here - the table is created whenever it is missing. Confirm
# whether the flag is honoured inside createTableIfNotExists.
# NOTE(review): the check uses rawTableName without the layer prefix that the
# write step puts in front of the table name - confirm that both refer to the
# same table.
if not spark.catalog.tableExists(rawTableName, rawLakehouseName):
    createTableIfNotExists(rawLakehouseName, rawLayerName, rawTableName, writeSchema, rawTablePartitionColumns, rawPartitionType)
else:
    # The catalog is asked once only; the table exists on this path.
    # The explicit comparison with False is kept on purpose: the return type
    # of schemaDFToTable is not visible from this notebook.
    if schemaDFToTable(rawLakehouseName, rawTableName, writeSchema) == False:
        raise Exception('There is a schema mis-match between the dataframe and the table')




In [None]:
# Write the dataframe to the raw table, then move the landing file.
#
# The target name is <lakehouse>.<layer>_<table> when a layer name is
# configured, otherwise just the table name.
if rawLayerName != "":
    adjustedTableName = rawLakehouseName + '.' + rawLayerName + '_' + rawTableName
else:
    adjustedTableName = rawTableName

try:
    # Insert types are matched case-insensitively
    insertType = rawInsertType.lower()

    if insertType == 'append':
        df.write.format("delta").mode("append").saveAsTable(adjustedTableName)

    elif insertType == 'overwrite':
        df.write.format("delta").mode("overwrite").saveAsTable(adjustedTableName)

    elif insertType == 'merge':
        # Join condition on the merge keys, e.g. "target.a = source.a AND target.b = source.b"
        mergeOnColumns = " AND ".join([f"target.{name} = source.{name}" for name in rawMergeOnColumns])
        # Columns to overwrite on matched rows, each taken from the same column in the source
        updateColumns = {f"target.{name}": f"source.{name}" for name in rawMergeUpdateColumns}

        # NOTE(review): deltaTable is not assigned in any cell of this notebook -
        # confirm it is provided by Standard_NoteBooks and points at adjustedTableName.
        deltaTable.alias("target").merge(
            source=df.alias("source"),
            condition = mergeOnColumns
        ).whenMatchedUpdate(
            set = updateColumns
        ).whenNotMatchedInsertAll().execute()

    else:
        # Without this branch an unknown insert type wrote nothing and the
        # file was still moved with 'Success'.
        raise ValueError(f"Unknown insertType '{rawInsertType}' in the raw table options, expected 'append', 'overwrite' or 'merge'")

    moveLandingFile(filePath, 'Success')
except Exception:
    # Move the file with 'Failure', then re-raise so the error is visible and
    # the notebook run fails instead of finishing as if nothing had happened.
    moveLandingFile(filePath, 'Failure')
    raise
    
