In [None]:
from delta.tables import *
from notebookutils import mssparkutils
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from pyspark.sql.utils import AnalysisException
from pyspark.sql.functions import col, when, from_json, date_format, lit, row_number,max, lower
from pyspark.sql.types import StructType, StringType
from pyspark.sql.window import Window

StatementMeta(, f92ff0f1-c298-45e6-bda4-9ad94e766612, 3, Finished, Available, Finished)

In [None]:
# Month offsets relative to the month that contains yesterday (today minus one day):
# 0 = that month, -1 = the month before, -2 = two months before, ...
# Every month from fromMonth through toMonth (inclusive) is loaded.
fromMonth = 0 # offset of the first month to load
toMonth = 0 # offset of the last month to load
# Folder searched for the exported .parquet files; also the anchor used to build the per-period wildcard path.
rawSourcePath = "Files/focuscost"

StatementMeta(, f92ff0f1-c298-45e6-bda4-9ad94e766612, 4, Finished, Available, Finished)

## FUNCTIONS

In [3]:
def find_first_parquet_file(path):
    """
    Depth-first lookup of a single .parquet file underneath `path`.

    Entries are visited in the order returned by `mssparkutils.fs.ls`; a
    sub-directory is explored completely before its next sibling is examined.

    Args:
        path (str): Directory where the lookup starts.
    Returns:
        str or None: Path of the first matching file, or None when nothing
        matches or when an error occurs (the error is printed, not raised).
    """
    try:
        listing = mssparkutils.fs.ls(path)
        for item in listing:
            is_parquet_file = item.isFile and item.name.endswith(".parquet")
            if is_parquet_file:
                return item.path
            if item.isDir:
                # Descend first; only move on to the next sibling if nothing was found below.
                nested_match = find_first_parquet_file(item.path)
                if nested_match:
                    return nested_match
    except Exception as err:
        # Best effort: report the problem and let the caller see "not found".
        print(f"Error accessing {path}: {err}")
    return None

def generate_wildcard_path(full_path: str, raw_source_path: str, snapshot_folder: str) -> str:
    """
    Build a wildcard path matching every .parquet file of one snapshot folder.

    `full_path` is the path of one existing export file and is used only as a
    template: it tells how many folder levels sit between `raw_source_path`
    and the snapshot folder. Each of those levels becomes a `*` wildcard.

    Args:
        full_path: Full path of a sample .parquet file located under raw_source_path.
        raw_source_path: Root folder of the export; must occur in full_path.
        snapshot_folder: Name of the snapshot folder to target.
    Returns:
        "<base><raw_source_path>[/*...]/<snapshot_folder>/*/*.parquet"
    Raises:
        ValueError: If raw_source_path does not occur in full_path.
    """
    # Number of path levels from the snapshot folder down to the file itself.
    trailing_levels = '/date-date/Guid/name.parquet'.count('/')

    # Find the index where the raw source path starts
    idx = full_path.find(raw_source_path)
    if idx == -1:
        raise ValueError("rawSourcePath not found in full path")

    # Everything before the raw source path (scheme, workspace, lakehouse, ...)
    base_uri = full_path[:idx]
    # Everything after it. Use the function arguments here, not notebook globals,
    # so the result depends only on what the caller passed in.
    detail_path = full_path[idx + len(raw_source_path):]

    # Folder levels between the raw source path and the snapshot folder.
    # A non-positive count simply adds no wildcard level.
    extra_levels = detail_path.count('/') - trailing_levels

    return f"{base_uri}{raw_source_path}{'/*' * extra_levels}/{snapshot_folder}/*/*.parquet"

def generateArrayOFPeriod(from_Month: int, to_month: int):
    """
    List the first day of every month in an offset range.

    Offsets are counted from the month that contains yesterday (today minus
    one day): 0 is that month, -1 the month before, and so on.

    Args:
        from_Month: Offset of the first month to include.
        to_month: Offset of the last month to include.
    Returns:
        list of datetime.date: One entry per offset from from_Month through
        to_month inclusive; empty when from_Month is greater than to_month.
    """
    # First day of the month that contains yesterday
    anchor = (datetime.today() - timedelta(days=1)).replace(day=1)

    # A single-month request (from_Month == to_month) is just a range of length one.
    return [
        (anchor + relativedelta(months=offset)).date()
        for offset in range(from_Month, to_month + 1)
    ]


def AddCapacityPauseColumn(dfsource):
    """
    Append a boolean `CapacityPause` column derived from the `x_SkuDetails` JSON string.

    Only the `BillingType` attribute of the JSON is parsed (Fabric billing type).
    The flag is True when that attribute equals "Capacity Pause/Delete Surcharge"
    and False otherwise.

    Args:
        dfsource: Spark DataFrame containing an `x_SkuDetails` column.
    Returns:
        The DataFrame with the extra `CapacityPause` column; the temporary
        `parsed_json` column used for parsing is dropped again.
    """
    # Minimal schema: only the attribute needed for the flag
    sku_details_schema = StructType().add("BillingType", StringType())
    is_pause_surcharge = col("parsed_json.BillingType") == "Capacity Pause/Delete Surcharge"

    return (
        dfsource
        .withColumn("parsed_json", from_json(col("x_SkuDetails"), sku_details_schema))
        .withColumn("CapacityPause", when(is_pause_surcharge, True).otherwise(False))
        .drop("parsed_json")
    )

def AddBillingTypeColumn(dfsource):
    """
    Append a `BillingType` column extracted from the `x_SkuDetails` JSON string.

    Only the `BillingType` attribute of the JSON is parsed (Fabric billing type)
    and it is copied as-is into the new column.

    Args:
        dfsource: Spark DataFrame containing an `x_SkuDetails` column.
    Returns:
        The DataFrame with the extra `BillingType` column; the temporary
        `parsed_json` column used for parsing is dropped again.
    """
    # Minimal schema: only the attribute that is extracted
    sku_details_schema = StructType().add("BillingType", StringType())

    return (
        dfsource
        .withColumn("parsed_json", from_json(col("x_SkuDetails"), sku_details_schema))
        .withColumn("BillingType", col("parsed_json.BillingType"))
        .drop("parsed_json")
    )



StatementMeta(, f92ff0f1-c298-45e6-bda4-9ad94e766612, 5, Finished, Available, Finished)

## STEP 1 Load Silver:
Load into bronze table
Identify context and prepare load in silver
- Delete previous data
- Clean date format



In [4]:
# Locate one existing export file; its path is the template used to build the per-period wildcard paths.
structurePath = find_first_parquet_file(rawSourcePath)
if structurePath is None:
    # find_first_parquet_file returns None when nothing is found or the listing failed.
    # Stop here with a clear message instead of failing later on a None path.
    raise FileNotFoundError(f"No .parquet file found under '{rawSourcePath}'")

# First day of every month to load, from fromMonth to toMonth inclusive
periodsToLoad = generateArrayOFPeriod(fromMonth, toMonth)

StatementMeta(, f92ff0f1-c298-45e6-bda4-9ad94e766612, 6, Finished, Available, Finished)

In [65]:
# Load each requested period: raw parquet -> focus_staging -> focus (silver).
for per in periodsToLoad:
    print("Start Period : " + per.strftime("%Y-%m-%d"))

    #drop staging table if exists, so the write below always creates it from scratch
    spark.sql("DROP TABLE IF EXISTS focus_staging")

    #generate storage path date part: "<first day>-<last day>" of the month, as yyyymmdd
    fromFormatedDate = per.strftime("%Y%m%d")
    toFormatedDate = (per + relativedelta(months=1) + relativedelta(days=-1)).strftime("%Y%m%d")
    snapshot_folder = fromFormatedDate + "-" + toFormatedDate

    wildcard = generate_wildcard_path(structurePath, rawSourcePath, snapshot_folder)
    print("Used path to load data: " + wildcard)

    try:
        df = spark.read.parquet(wildcard)
        df.write.format('delta').saveAsTable("focus_staging")

        #identify period loaded, taken from the data itself
        first_row = spark.sql("SELECT BillingPeriodStart FROM focus_staging LIMIT 1").first()

        if first_row is None:
            # first() returns None on an empty result: nothing to delete or append for this period
            print(f"No rows found for path: {wildcard}")
        else:
            value = first_row['BillingPeriodStart']

            #clean existing data in silver so the period is replaced, not duplicated
            if spark._jsparkSession.catalog().tableExists('Focus', "focus"):
                print("Table exists, snapshot will be clean.")
                spark.sql(f"DELETE FROM focus WHERE BillingPeriodStart = '{value}'")

            #Load data in silver
            focus_staging_df = DeltaTable.forPath(spark,"Tables/focus_staging").toDF()
            focus_staging_df.write.mode("append").option("mergeSchema", "true").format("delta").saveAsTable("focus")

    except AnalysisException as e:
        # A period without an export folder is skipped; any other analysis error is a real failure
        if "PATH_NOT_FOUND" in str(e):
            print(f"Path not found: {wildcard}")
        else:
            raise # re-raise if it's a different AnalysisException

    print("End Period : " + per.strftime("%Y-%m-%d"))


StatementMeta(, 175f81b0-a6b8-4761-9ef7-d5ebf4bff478, 67, Finished, Available, Finished)

Start Period : 2025-07-01
Used path to load data: abfss://d7ae03b3-c53b-4ee6-af7e-0091be2e7cc4@onelake.dfs.fabric.microsoft.com/edea360e-6c6d-4761-aeca-d57df8bd7b91/Files/focuscost/*/*/20250701-20250731/*/*.parquet
Table exists, snapshot will be clean.
End Period : 2025-07-01


## STEP 2 Load Gold:

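- Filter on Fabric data only (`ServiceName = 'Microsoft.Fabric'`)
- Remove the `x_` columns, except `x_SkuMeterName`, `x_SkuMeterSubcategory` and `x_ResourceGroupName`
- Add the computed columns `BillingType`, `ChargePeriodStart_DateKey`, `MeterKey` and `CommitmentSavings` (the capacity-pause flag from `AddCapacityPauseColumn` is not applied in this step)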

In [6]:
# Preparation of predicate for all ingested periods
# Preparation of predicate for all ingested periods
date_condition = ", ".join([f"'{date.strftime('%Y-%m-%d')} 00:00:00'" for date in periodsToLoad])

print("Start loading Period(s) in " + date_condition)

#clean existing data in gold (focus_fabric) so the periods are replaced, not duplicated
if spark._jsparkSession.catalog().tableExists('Focus', "focus_fabric"):
    print("Table exists, snapshot will be clean.")
    Delete_sql_query = f"""DELETE FROM focus_fabric WHERE BillingPeriodStart IN ({date_condition})"""
    spark.sql(Delete_sql_query)
    print("Clean performed")

# x_ columns kept in gold, and columns computed in this cell
x_columns_to_keep = ['x_SkuMeterName','x_SkuMeterSubcategory','x_ResourceGroupName']
compute_columns_to_keep = ["BillingType","ChargePeriodStart_DateKey","MeterKey","CommitmentSavings"]

focus_df = DeltaTable.forPath(spark,"Tables/focus").toDF()
Meters_df = DeltaTable.forPath(spark,"Tables/meters").toDF().select("Name_Lower","MeterKey")
max_key = Meters_df.agg(max("MeterKey")).collect()[0][0]
if max_key is None:
    # The aggregate is None when the meters table has no row: start numbering new keys from 1
    max_key = 0

# Get all column names that do NOT start with 'x_'
# (loop variable is not named `col`, to avoid confusion with the pyspark `col` function)
columns_to_keep = [column_name for column_name in focus_df.columns if not column_name.startswith("x_")]
columns_to_keep = columns_to_keep + x_columns_to_keep + compute_columns_to_keep

focus_df = focus_df.where(f"""ServiceName = 'Microsoft.Fabric' and BillingPeriodStart IN ({date_condition})""")
focus_df = AddBillingTypeColumn(focus_df)

#identify missing Meters in the referencial: one candidate row per lower-cased meter name
missing_Meters_df = focus_df.select("x_SkuMeterName","ChargeDescription","BillingType") \
                            .withColumn("Name_Lower",lower("x_SkuMeterName")) \
                            .drop("x_SkuMeterName") \
                            .dropDuplicates(["Name_Lower"]) \
                            .withColumn("Category",when(col("ChargeDescription") == "Fabric Cap","Fabric CU").otherwise(col("ChargeDescription"))) \
                            .drop("ChargeDescription") \
                            .withColumn("State",lit("Unknow")) \
                            .withColumn("Main",lit("FALSE"))  \
                            .withColumn("Included",when(col("BillingType").isNotNull() & (col("BillingType") != "Capacity Pause/Delete Surcharge"),lit("TRUE")).otherwise(lit("FALSE"))) \
                            .drop("BillingType")

# Keep only the names without a match in the meters table, then give them new keys after the current maximum
missing_Meters_df = missing_Meters_df.join(Meters_df,"Name_Lower","leftouter")
missing_Meters_df = missing_Meters_df.where(missing_Meters_df.MeterKey.isNull())
window_spec = Window.orderBy("Name_Lower")
missing_Meters_df = missing_Meters_df.withColumn("MeterKey", row_number().over(window_spec) + max_key)

missing_Meters_df.write.mode("append").option("mergeSchema", "true").format("delta").saveAsTable("Meters")


# Re-read the meters table so the keys appended just above are available for the join
Meters_df = DeltaTable.forPath(spark,"Tables/meters").toDF()
Meters_df = Meters_df.select("Name_Lower","MeterKey")
focus_df = focus_df.withColumn("Name_Lower", lower("x_SkuMeterName")).join(Meters_df,"Name_Lower", "leftouter").drop("Name_Lower") #retrieve MeterKey
focus_df = focus_df.withColumn("ChargePeriodStart_DateKey", date_format("ChargePeriodStart", "yyyyMMdd").cast("int"))
focus_df = focus_df.withColumn("CommitmentSavings", col("ContractedCost") - col("EffectiveCost"))
focus_df = focus_df.select(columns_to_keep)

focus_df.write.mode("append").option("mergeSchema", "true").format("delta").saveAsTable("focus_fabric")

print("End Loading")


StatementMeta(, f92ff0f1-c298-45e6-bda4-9ad94e766612, 8, Finished, Available, Finished)

Start loading Period(s) in '2025-01-01 00:00:00', '2025-02-01 00:00:00', '2025-03-01 00:00:00', '2025-04-01 00:00:00', '2025-05-01 00:00:00', '2025-06-01 00:00:00', '2025-07-01 00:00:00'
Table exists, snapshot will be clean.
Clean performed
End Loading
