## Input parameters.

In [1]:
# SourceTablesCSVFolderABFSPath = 'Files/MSM Data'
# SourceTablesModelJsonABFSPath = 'Files/MSM Data/model.json'
# SourceTablesTargetFolderABFSPath = 'Files/MSM Bronze Layer/msm'

StatementMeta(, e221b5a4-77a3-4d82-a8f2-c740a60ad94c, 5, Finished, Available)

In [1]:
# Input locations for the load, all given as ABFS (abfss://) URIs.
# NOTE(review): `#STORAGE_ACCOUNT_NAME#` looks like a placeholder that has to be
# replaced with a real storage account name before the notebook runs — confirm
# where that substitution happens.

# Root folder of the source CSVs; each table is read from <folder>/<tableName>/*.csv.
SourceTablesCSVFolderABFSPath = 'abfss://dataverse-sustainability@#STORAGE_ACCOUNT_NAME#.dfs.core.windows.net/'
# model.json whose `entities` list determines which tables are loaded and their column schemas.
SourceTablesModelJsonABFSPath = 'abfss://dataverse-sustainability@#STORAGE_ACCOUNT_NAME#.dfs.core.windows.net/model.json'
# Root folder of the Delta output; each table is written to <folder>/<tableName>.
SourceTablesTargetFolderABFSPath = 'abfss://bronze@#STORAGE_ACCOUNT_NAME#.dfs.core.windows.net/msm'

StatementMeta(, 31760ad3-7712-404b-b419-7116d8d3aba5, 5, Finished, Available)

## Import required libraries.

In [2]:
import os, json
from pyspark.sql.types import *

# Session-level Spark settings, applied before any source table is read or written.
# NOTE(review): per the Spark SQL configuration docs, LEGACY selects the
# pre-Spark-3.0 date/time parsing behaviour — presumably required by the timestamp
# format of the source CSVs; confirm.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")
# NOTE(review): per the Spark SQL configuration docs, this governs calendar rebasing
# of INT96 timestamps when writing Parquet files — presumably relevant to the Delta
# writes below; confirm it is still needed.
spark.conf.set("spark.sql.parquet.int96RebaseModeInWrite", "LEGACY")

StatementMeta(, 31760ad3-7712-404b-b419-7116d8d3aba5, 6, Finished, Available)

In [3]:
def readFile(filePath):
    """Return the text content of the first file found at `filePath`.

    The path is read with `SparkContext.wholeTextFiles`, which produces
    (path, content) pairs; only the content of the first pair is returned.

    Args:
        filePath: A path accepted by `SparkContext.wholeTextFiles`.

    Returns:
        The full text content of the first file returned for `filePath`.

    Raises:
        IndexError: If no file is returned for `filePath`.
    """
    # take(1) brings at most one (path, content) pair to the driver, whereas
    # collect() would materialise every matched file only to discard the rest.
    firstMatch = spark.sparkContext.wholeTextFiles(filePath).take(1)
    if not firstMatch:
        # Same exception type as indexing an empty result, but with a usable message.
        raise IndexError(f"No file found at path: {filePath}")
    return firstMatch[0][1]

StatementMeta(, , , Waiting, )

## Load source tables.

In [4]:

# model.json dataTypes that map one-to-one onto a Spark SQL type.
_SIMPLE_SPARK_TYPES = {
    'string': StringType,
    'guid': StringType,
    'dateTimeOffset': TimestampType,
    'dateTime': TimestampType,
    'int64': LongType,
    'boolean': BooleanType,
    'double': DoubleType,
}


def _decimalPrecisionAndScale(field):
    """Return (precision, scale) for a decimal attribute, read from its `cdm:traits`."""
    traits = field["cdm:traits"]
    # Prefer a trait whose arguments are explicitly named `precision` and `scale`,
    # so the result does not depend on trait or argument ordering.
    for trait in traits:
        if not isinstance(trait, dict):
            continue
        namedArgs = {
            arg.get("name"): arg.get("value")
            for arg in trait.get("arguments", [])
            if isinstance(arg, dict)
        }
        if "precision" in namedArgs and "scale" in namedArgs:
            return int(namedArgs["precision"]), int(namedArgs["scale"])
    # Fallback: first trait, first two arguments taken positionally as
    # precision and scale.
    positionalArgs = traits[0]["arguments"]
    return int(positionalArgs[0]["value"]), int(positionalArgs[1]["value"])


def _sparkTypeFor(field):
    """Map one model.json attribute to a Spark SQL type; raise ValueError if unsupported."""
    ftype = field["dataType"]
    if ftype in _SIMPLE_SPARK_TYPES:
        return _SIMPLE_SPARK_TYPES[ftype]()
    if ftype == 'decimal':
        precision, scale = _decimalPrecisionAndScale(field)
        return DecimalType(precision, scale)
    # Report the offending column and its dataType (the previous message
    # referenced an undefined name and failed with NameError instead).
    raise ValueError(
        "Invalid column 'type' found in config file for column = "
        + str(field.get("name")) + " (dataType = " + str(ftype) + ")"
    )


# Parse model.json; its `entities` list drives which tables are loaded.
modelJson = json.loads(readFile(SourceTablesModelJsonABFSPath))

# tableName -> exception raised while reading or writing that table.
failedFiles = {}
for entity in modelJson['entities']:
    tableName = entity["name"]

    # Build an explicit, all-nullable schema from the entity's attributes.
    # An unsupported dataType raises ValueError and aborts the whole load.
    structFields = [
        StructField(field["name"], _sparkTypeFor(field), nullable=True)
        for field in entity["attributes"]
    ]

    try:
        # Read every CSV in the table's folder with the declared schema and
        # overwrite the table's Delta output with the result.
        sourceCsvPath = os.path.join(SourceTablesCSVFolderABFSPath, tableName, '*.csv')
        df = spark.read.csv(sourceCsvPath, schema=StructType(structFields))
        df.write\
            .format("delta")\
            .mode("overwrite")\
            .save(os.path.join(SourceTablesTargetFolderABFSPath, tableName))
    except Exception as e:
        # Best effort: record the failure and continue with the next table.
        failedFiles[tableName] = e
        # Tables with no source folder are recorded silently;
        # any other failure is also printed.
        if 'Path does not exist:' not in str(e):
            print("An exception occurred:", str(e))

StatementMeta(, , , Waiting, )