In [None]:
from delta.tables import *
from notebookutils import mssparkutils
from pyspark.sql.functions import row_number,max, lit, lower, regexp_replace
from pyspark.sql.window import Window
import requests
import os
# Session-level setting: needed for automatic schema evolution in Delta MERGE operations.
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled","true")

StatementMeta(, 2a7b4a42-cab8-490e-a92a-f4fa65f1f1fc, 26, Finished, Available, Finished)

In [None]:
# Root folder that is searched (recursively) for the reservation-transaction CSV files.
rawSourcePath = "Files/reservation-transactions"

StatementMeta(, 2a7b4a42-cab8-490e-a92a-f4fa65f1f1fc, 27, Finished, Available, Finished)

In [None]:
def find_first_csv_file(path):
    """
    Depth-first search for the first .csv file below a directory.

    Entries are visited in the order returned by ``mssparkutils.fs.ls``; a
    sub-directory that is listed before a .csv file is fully explored first.

    Args:
        path (str): The root directory to start the search.
    Returns:
        str or None: The full path to the first .csv file found, or None if
        no .csv file exists (or the directory could not be read).
    """
    try:
        listing = mssparkutils.fs.ls(path)
        for item in listing:
            # A matching file ends the search immediately.
            if item.isFile and item.name.endswith(".csv"):
                return item.path
            # Anything that is not a directory cannot contain further files.
            if not item.isDir:
                continue
            nested_hit = find_first_csv_file(item.path)
            if nested_hit:
                return nested_hit
    except Exception as e:
        # Best effort: report the unreadable location and treat it as "nothing found".
        print(f"Error accessing {path}: {e}")
    return None

def generate_wildcard_path(full_path: str, raw_source_path: str) -> str:
    """
    Build a ``*.csv`` glob pattern from one concrete CSV file path.

    The part of ``full_path`` before ``raw_source_path`` is kept as the base
    URI. One ``/*`` wildcard level is inserted for every directory level that
    the file lies deeper than the reference layout
    ``<raw_source_path>/date-date/Guid/name.csv``.

    Args:
        full_path (str): Full path of a sample CSV file.
        raw_source_path (str): Root folder that must occur inside ``full_path``.
    Returns:
        str: ``<base_uri><raw_source_path>[/*...]/*.csv``.
    Raises:
        ValueError: If ``raw_source_path`` does not occur in ``full_path``.
    """
    # Find the index where the raw source path starts
    idx = full_path.find(raw_source_path)
    if idx == -1:
        raise ValueError("rawSourcePath not found in full path")

    # Everything before the raw source path (scheme, workspace, lakehouse, ...)
    base_uri = full_path[:idx]
    # Remainder after the raw source path. Derived from the function's own
    # arguments, not from notebook-level globals, so the function is
    # self-contained and works for any caller.
    detail_path = full_path[idx + len(raw_source_path):]

    # Number of '/' separators in the reference layout below the source root.
    reference_depth = '/date-date/Guid/name.csv'.count('/')
    # May be zero or negative for shallow files; '/*' * n is then an empty string.
    extra_levels = detail_path.count('/') - reference_depth

    # Construct the wildcard path
    return f"{base_uri}{raw_source_path}{'/*' * extra_levels}/*.csv"

StatementMeta(, 2a7b4a42-cab8-490e-a92a-f4fa65f1f1fc, 28, Finished, Available, Finished)

In [None]:
# Locate one sample CSV file; its location determines how deep the glob must reach.
structurePath = find_first_csv_file(rawSourcePath)
if structurePath is None:
    # Fail with a clear message instead of an AttributeError further down.
    raise FileNotFoundError(f"No .csv file found under '{rawSourcePath}'")

wildcard = generate_wildcard_path(structurePath, rawSourcePath)
print("Used path to load data: " + wildcard)

# header=True takes the column names from the first line of each file; no
# schema is supplied, so the columns keep Spark's default CSV typing.
source_df = spark.read.csv(wildcard, header=True)
# Keep only the rows for the Fabric capacity SKU (case-insensitive match).
source_df = source_df.where("lower(ArmSkuName) = 'fabric_capacity_cu_hour'")

StatementMeta(, 2a7b4a42-cab8-490e-a92a-f4fa65f1f1fc, 29, Finished, Available, Finished)

Used path to load data: abfss://b5e66845-fd73-4d5d-b28c-da7dbc2d17ec@onelake.dfs.fabric.microsoft.com/e3e0ae4e-208c-4a38-bf39-9671d2e2e0e6/Files/reservation-transactions/*.csv


# DIM Reservation

In [None]:
# Load (create or merge) the "reservations" dimension table.
tableName = "reservations"
logicalKeyColumn = "ReservationOrderId"      # business key coming from the source files
technicalKeyColumn = "ReservationOrderKey"   # surrogate key generated in this cell
# NOTE(review): existence is checked in database 'FCA' via the private JVM session,
# while the table is read from "Tables/<name>" and created with saveAsTable(<name>).
# Confirm that all three refer to the same lakehouse table.
tableAlreadyExists = spark._jsparkSession.catalog().tableExists('FCA', tableName)


# Columns that make up the dimension.
source_merge_df = source_df.select("Amount","BillingFrequency","Currency","Description","EventDate","PurchasingSubscriptionGuid","PurchasingSubscriptionName","Quantity","Region","ReservationOrderId","ReservationOrderName")

if tableAlreadyExists:
    #Merge to table

    print(f"Merge Data for {tableName} table Started")

    target_table = DeltaTable.forPath(spark, f"Tables/{tableName}")
    target_df = target_table.toDF()
    # Only the key mapping (logical -> technical) is needed from the target.
    target_df = target_df.select(logicalKeyColumn,technicalKeyColumn)


    # `max` is the Spark aggregate imported at the top of the notebook. On an
    # existing but empty table the aggregate yields None; fall back to 0 so the
    # new keys start at 1 instead of becoming NULL.
    max_key = target_df.agg(max(technicalKeyColumn)).collect()[0][0] or 0

    # Attach the existing technical key to every source row that already has one.
    # NOTE(review): rows are not de-duplicated on the logical key before the
    # merge — confirm that the source holds one row per ReservationOrderId.
    combined_df = source_merge_df.join(target_df,logicalKeyColumn,"leftouter")
    existingRows_df = combined_df.where(combined_df[technicalKeyColumn].isNotNull())
    newRows_df = combined_df.where(combined_df[technicalKeyColumn].isNull())
    # Unpartitioned window: numbers all new rows 1..n in logical-key order.
    window_spec = Window.orderBy(logicalKeyColumn)
    # Continue the key sequence after the highest key already in the target.
    newRows_df = newRows_df.withColumn(technicalKeyColumn, row_number().over(window_spec) + max_key )

    # Both frames derive from combined_df, so their columns line up positionally.
    Src_Merge_df = existingRows_df.union(newRows_df)


    # Upsert on the technical key: update known rows, insert new ones.
    merge = (target_table.alias("target")
        .merge(
            Src_Merge_df.alias("source"),
            f"target.{technicalKeyColumn} = source.{technicalKeyColumn}"
        )
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        )
    merge.execute()

    print(f"Merge Data for {tableName} Ended")


else:
    # First load: number all rows starting at 1 and create the table.
    print(f"Table {tableName} creation started")
    window_spec = Window.orderBy(logicalKeyColumn)
    source_merge_df = source_merge_df.withColumn(technicalKeyColumn, row_number().over(window_spec))
    source_merge_df.write.mode("overwrite").option("mergeSchema", "true").format("delta").saveAsTable(tableName)
    print(f"Table {tableName} creation Ended")

StatementMeta(, 2a7b4a42-cab8-490e-a92a-f4fa65f1f1fc, 31, Finished, Available, Finished)

Merge Data for reservations table Started
Merge Data for reservations Ended
