In [None]:
from delta.tables import *
from notebookutils import mssparkutils
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from pyspark.sql.utils import AnalysisException
from pyspark.sql.functions import col, when, from_json, date_format, lit, row_number,max, lower
from pyspark.sql.types import StructType, StringType
from pyspark.sql.window import Window

In [None]:
# Month offsets delimiting the range of monthly periods to load (both bounds
# inclusive): 0 = reference month, -1 = the month before, -2, ...
# The reference month is the one containing yesterday's date (now minus one day).
fromMonth = 0 # first month offset of the range
toMonth = 0 # last month offset of the range
# Root folder that is searched for the parquet files to load.
rawSourcePath = "Files/focuscost"

## FUNCTIONS

In [None]:
def find_first_parquet_file(path):
    """
    Depth-first lookup of a single .parquet file below a directory.

    Entries are visited in the order returned by ``mssparkutils.fs.ls``; a
    sub-directory is explored completely before its next sibling is examined.

    Args:
        path (str): Directory where the lookup starts.
    Returns:
        str or None: Path of the first .parquet file encountered, or None when
        no such file exists or the directory could not be listed.
    """
    try:
        for item in mssparkutils.fs.ls(path):
            if item.isFile and item.name.endswith(".parquet"):
                return item.path
            if not item.isDir:
                continue
            nested_match = find_first_parquet_file(item.path)
            if nested_match:
                return nested_match
    except Exception as err:
        # Best effort: report the failure and treat this directory as a miss.
        print(f"Error accessing {path}: {err}")
    return None

def generate_wildcard_path(full_path: str, raw_source_path: str, snapshot_folder: str) -> str:
    """
    Build the wildcard path matching every parquet file of one snapshot folder.

    The folder depth is inferred from ``full_path``, the path of one existing
    parquet file, which is expected to look like
    ``<base><raw_source_path>[/<extra levels>]/<date-date>/<Guid>/<name>.parquet``.
    Every extra level between ``raw_source_path`` and the date folder is
    replaced by a ``*`` wildcard.

    Args:
        full_path (str): Full path of a sample parquet file under the raw source.
        raw_source_path (str): Root folder of the raw data, contained in ``full_path``.
        snapshot_folder (str): Name of the date-range folder to target.
    Returns:
        str: Wildcard path selecting all parquet files of ``snapshot_folder``.
    Raises:
        ValueError: If ``full_path`` is empty/None or does not contain
            ``raw_source_path``.
    """
    # A missing sample file (e.g. nothing found under the raw source) would
    # otherwise surface as an obscure AttributeError on None.
    if not full_path:
        raise ValueError("full path is empty: no sample parquet file was found")

    # Find the index where the raw source path starts
    idx = full_path.find(raw_source_path)
    if idx == -1:
        raise ValueError("rawSourcePath not found in full path")

    # Split the sample path around the raw source path. Only the function
    # arguments are used here, never the notebook-level globals.
    base_uri = full_path[:idx]
    detail_path = full_path[idx + len(raw_source_path):]

    # Number of folder levels sitting between the raw source path and the
    # standard '/date-date/Guid/name.parquet' tail.
    extra_levels = detail_path.count('/') - '/date-date/Guid/name.parquet'.count('/')

    # Construct the wildcard path (a non-positive level count adds no wildcard)
    wildcard_path = f"{base_uri}{raw_source_path}{'/*' * extra_levels}/{snapshot_folder}/*/*.parquet"
    return wildcard_path

def generateArrayOFPeriod(from_Month: int, to_month: int):
    """
    List the first day of every month covered by a range of month offsets.

    Offsets are applied to the first day of the month that contains
    yesterday's date (0 = that month, -1 = the month before, ...).

    Args:
        from_Month (int): First month offset of the range (inclusive).
        to_month (int): Last month offset of the range (inclusive).
    Returns:
        list: One ``datetime.date`` per month, in ascending offset order; empty
        when ``from_Month`` is greater than ``to_month``.
    """
    # Reference point: first day of the month containing "now minus one day".
    anchor = (datetime.today() - timedelta(days=1)).replace(day=1)

    # Equal bounds designate a single month; otherwise walk the inclusive range.
    if from_Month == to_month:
        offsets = [to_month]
    else:
        offsets = range(from_Month, to_month + 1)

    return [(anchor + relativedelta(months=offset)).date() for offset in offsets]


def AddCapacityPauseColumn(dfsource):
    """
    Append a boolean ``CapacityPause`` column derived from ``x_SkuDetails``.

    ``x_SkuDetails`` is parsed as JSON and only its ``BillingType`` key is read.
    ``CapacityPause`` is True when that key equals
    "Capacity Pause/Delete Surcharge" and False in every other case.

    Args:
        dfsource: DataFrame exposing an ``x_SkuDetails`` JSON string column.
    Returns:
        DataFrame: ``dfsource`` with the extra ``CapacityPause`` column.
    """
    # Minimal JSON schema: in this version only the Fabric billing type is read.
    sku_schema = StructType().add("BillingType", StringType())
    sku_details = from_json(col("x_SkuDetails"), sku_schema)
    is_pause_surcharge = col("parsed_json.BillingType") == "Capacity Pause/Delete Surcharge"

    return (
        dfsource
        .withColumn("parsed_json", sku_details)  # temporary parsed struct
        .withColumn("CapacityPause", when(is_pause_surcharge, True).otherwise(False))
        .drop("parsed_json")  # the helper column is not part of the result
    )

def AddBillingTypeColumn(dfsource):
    """
    Append a ``BillingType`` column extracted from the ``x_SkuDetails`` JSON.

    ``x_SkuDetails`` is parsed as JSON and the value of its ``BillingType`` key
    is copied into a top-level ``BillingType`` column.

    Args:
        dfsource: DataFrame exposing an ``x_SkuDetails`` JSON string column.
    Returns:
        DataFrame: ``dfsource`` with the extra ``BillingType`` column.
    """
    # Minimal JSON schema: in this version only the Fabric billing type is read.
    sku_schema = StructType().add("BillingType", StringType())
    sku_details = from_json(col("x_SkuDetails"), sku_schema)

    return (
        dfsource
        .withColumn("parsed_json", sku_details)  # temporary parsed struct
        .withColumn("BillingType", col("parsed_json.BillingType"))
        .drop("parsed_json")  # the helper column is not part of the result
    )



## STEP 1: Load Silver
Load the raw data into the bronze (staging) table.
Identify the context (the period that was loaded) and prepare the load into silver:
- Delete the previously loaded data
- Clean the date format



In [None]:
# Path of one existing parquet file under rawSourcePath; it is later used as a
# sample to infer the folder layout. NOTE: this is None when no parquet file
# was found (or the folder could not be listed).
structurePath = find_first_parquet_file(rawSourcePath)
# First day of every month to load, derived from the configured month offsets.
periodsToLoad = generateArrayOFPeriod(fromMonth, toMonth)

In [None]:
# Load every requested period: read its snapshot into the staging table, then
# replace the matching billing period in the silver table "focus".
for per in periodsToLoad:
    print("Start Period : " + per.strftime("%Y-%m-%d"))

    #drop staging table if exists, so each period starts from a clean staging table
    spark.sql("DROP TABLE IF EXISTS focus_staging")

    #generate storage path date part: <first day>-<last day> of the month, as yyyymmdd
    fromFormatedDate = per.strftime("%Y%m%d")
    toFormatedDate = (per + relativedelta(months=1) + relativedelta(days=-1)).strftime("%Y%m%d")
    snapshot_folder = fromFormatedDate + "-" + toFormatedDate

    wildcard = generate_wildcard_path(structurePath, rawSourcePath, snapshot_folder)
    print("Used path to load data: " + wildcard)

    try:
        df = spark.read.parquet(wildcard)
        df.write.format('delta').saveAsTable("focus_staging")

        #identify period loaded
        df = spark.sql("SELECT BillingPeriodStart FROM focus_staging LIMIT 1")
        first_row = df.first()

        if first_row is None:
            # first() returns None on an empty result: without this guard an
            # empty snapshot would crash with a TypeError when subscripting it.
            print(f"No rows found for snapshot {snapshot_folder}, nothing to load.")
        else:
            value = first_row['BillingPeriodStart']

            #clean existing data in silver
            if spark.catalog.tableExists("focus"):
                print("Table exists, snapshot will be clean.")
                spark.sql(f"DELETE FROM focus WHERE BillingPeriodStart = '{value}'")

            #Load data in silver
            focus_staging_df = DeltaTable.forPath(spark,"Tables/focus_staging").toDF()
            focus_staging_df.write.mode("append").option("mergeSchema", "true").format("delta").saveAsTable("focus")

    except AnalysisException as e:
        # A missing snapshot folder is expected for periods not available yet.
        if "PATH_NOT_FOUND" in str(e):
            print(f"Path not found: {wildcard}")
        else:
            raise # re-raise if it's a different AnalysisException

    print("End Period : " + per.strftime("%Y-%m-%d"))
