#### Capacity Metrics (by Kind)
Aggregated by Capacity, by Workspace, by Item Kind, by Day

##### Data ingestion strategy:
<mark style="background: #D69AFE;">**MERGE**</mark>

##### Related pipeline:

**Load_Capacity_Metrics_E2E**

##### Source:

**CapacityMetricCloneDQ** semantic model, queried with DAX via the SemPy `fabric.evaluate_dax` function

##### Target:

**1 Delta table** in FUAM_Lakehouse 
- The table name is the value of the **gold_table_name** variable

In [None]:
import sempy.fabric as fabric
from datetime import datetime, timedelta
import datetime as dt
import pyspark.sql.functions as f
from delta.tables import *

In [None]:
## Parameters
# Number of days to look back from today. The day iteration is inclusive on
# both ends, so metric_days_in_scope + 1 calendar days are queried per capacity.
metric_days_in_scope = 14
# Workspace and semantic model (dataset) that the DAX queries are executed against.
metric_workspace = "Microsoft Fabric Capacity Metrics v30"
metric_dataset = "Fabric Capacity Metrics"
# When True, intermediate DataFrames are rendered with display() for inspection.
display_data = True

In [None]:
## Variables
# Staging (silver) table that collects the per-capacity, per-day query results
# of the current run. It is emptied with DELETE before and after each run, so
# it MUST NOT be the same table as the gold target below; otherwise the cleanup
# steps would wipe the accumulated gold history. The "_silver" suffix keeps the
# two tables distinct inside the same lakehouse.
silver_table_name = "FUAM_Lakehouse.capacity_metrics_by_item_kind_by_day_silver"
# Gold (target) table that the staged rows are merged into.
gold_table_name = "capacity_metrics_by_item_kind_by_day"
# Relative path of the gold table, used to open it as a DeltaTable by path.
gold_table_name_with_prefix = f"Tables/{gold_table_name}"

In [None]:
# Fetch the IDs of all capacities in state 'Active' whose SKU is not 'PP3'.
# NOTE(review): the original comment called the excluded ones "PPT capacities";
# presumably Premium-Per-User capacities are meant - confirm.
query = """
SELECT 
    CapacityId
FROM FUAM_Lakehouse.capacities
WHERE SKU != 'PP3'
AND state = 'Active'
"""
# Result has a single column (CapacityId); it drives the per-capacity loop below.
capacities = spark.sql(query)

In [None]:
# Optionally render the capacity list for inspection (controlled by the
# display_data parameter).
if display_data:
    display(capacities)

In [None]:
# Iterate days
def iterate_dates(start_date, end_date):
    """Return one entry per calendar day from start_date through end_date.

    Both bounds are 'YYYY-MM-DD' strings and are inclusive. Each entry is a
    dict with the keys "date" (a datetime.date), "year", "month" and "day".
    An empty list is returned when start_date lies after end_date.
    """
    first_day = dt.datetime.strptime(start_date, '%Y-%m-%d').date()
    last_day = dt.datetime.strptime(end_date, '%Y-%m-%d').date()

    # Number of day steps between the bounds; negative when the range is empty,
    # in which case range() below yields nothing.
    span = (last_day - first_day).days
    days = (first_day + dt.timedelta(days=offset) for offset in range(span + 1))

    return [
        {
            "date": current,
            "year": current.year,
            "month": current.month,
            "day": current.day
        }
        for current in days
    ]

In [None]:
# Clean Silver table
# Best-effort removal of rows left over from a previous run, so that this run
# starts with an empty staging table. A failure here does not stop the notebook.
try:
    query = "DELETE FROM " + silver_table_name
    spark.sql(query)

except Exception as ex:
    # The usual cause is that the table has not been created yet (first run),
    # but the real error is printed so that other failures are not hidden
    # behind that assumption.
    print(f"Silver table could not be cleaned (it may not exist yet): {ex}")

In [None]:
# Iterate capacities and days
# For every capacity and every day in scope, run one DAX query against the
# metrics semantic model and append the non-empty results to the silver table.

# The date window is identical for all capacities, so it is computed once up
# front. This also guarantees that every capacity is queried for the same days
# even if the run crosses midnight.
today = datetime.now()
days_ago = today - timedelta(days=metric_days_in_scope)

# Format dates in 'yyyy-mm-dd'
today_str = today.strftime('%Y-%m-%d')
days_ago_str = days_ago.strftime('%Y-%m-%d')

date_array = iterate_dates(days_ago_str, end_date=today_str)

# Column names assigned positionally to the DAX result: the four group-by
# columns, the ten aggregates in the order they are declared in the query,
# and finally the DateKey column added by ADDCOLUMNS.
result_columns = ['CapacityId', 'WorkspaceId', 'Date',
                  'ItemKind', 'DurationInSec', 'TotalCUs', 'ThrottlingInMin',
                  'UserCount', 'SuccessOperationCount', 'RejectedOperationCount', 'OperationCount',
                  'InvalidOperationCount', 'FailureOperationCount', 'CancelledOperationCount', 'DateKey']

for cap in capacities.collect():
    capacity_id = cap[0]
    print(capacity_id)

    # A failure for one capacity is printed and the loop moves on to the next
    # capacity; the remaining days of the failed capacity are skipped.
    try:
        # Iterate days for current capacity
        for date in date_array:

            year = date['year']
            month = date['month']
            day = date['day']
            date_label = str(year) + '-' + str(month) + '-' + str(day)

            dax_query = f"""
                    DEFINE 
                    MPARAMETER 'CapacityID' = \"{capacity_id}\"

                    VAR __DS0FilterTable = 
                                        FILTER(
                                            KEEPFILTERS(VALUES('MetricsByItemandDay'[Date])),
                                            'MetricsByItemandDay'[Date] = DATE({year}, {month}, {day})
                                        )

                    VAR __DS0Core = 
                                    SUMMARIZECOLUMNS(
                                            Capacities[capacityId],
                                            Items[WorkspaceId],
                                            'MetricsByItemandDay'[Date],
                                            'Items'[ItemKind],
                                            FILTER(Capacities, Capacities[capacityId] = \"{capacity_id}\" ),
                                            __DS0FilterTable,
                                            "S_Dur", SUM('MetricsByItemandDay'[sum_duration]),
                                            "S_CU", SUM('MetricsByItemandDay'[sum_CU]),
                                            "TH_M", SUM('MetricsByItemandDay'[Throttling (min)]),
                                            "C_U", SUM('MetricsByItemandDay'[count_users]),
                                            "C_SO", SUM('MetricsByItemandDay'[count_successful_operations]),
                                            "C_RO", SUM('MetricsByItemandDay'[count_rejected_operations]),
                                            "C_O", SUM('MetricsByItemandDay'[count_operations]),
                                            "C_IO", SUM('MetricsByItemandDay'[count_Invalid_operations]),
                                            "C_FO", SUM('MetricsByItemandDay'[count_failure_operations]),
                                            "C_CO", SUM('MetricsByItemandDay'[count_cancelled_operations])
                                            )
                    EVALUATE
                        ADDCOLUMNS(
                            FILTER(__DS0Core, [S_CU] > 0),
                            "DateKey", FORMAT([Date], "yyyymmdd")
                        ) ORDER BY [S_CU] DESC
                    """

            # Execute DAX query
            capacity_df = fabric.evaluate_dax(workspace=metric_workspace, dataset=metric_dataset, dax_string=dax_query)

            if capacity_df.empty:
                print(f"No data. Capacity: {capacity_id}. Date: {date_label}")
                continue

            # Rename only when there is data: an empty result is never written,
            # and renaming it could fail if it came back without columns
            # (NOTE(review): not confirmed whether evaluate_dax can do that).
            capacity_df.columns = result_columns

            # Transfer pandas df to spark df
            capacity_df = spark.createDataFrame(capacity_df)

            if display_data:
                display(capacity_df)

            # Append the day's rows to the silver delta table
            print(f"Appending data. Capacity: {capacity_id}. Date: {date_label}")
            capacity_df.write.mode("append").option("mergeSchema", "true").format("delta").saveAsTable(silver_table_name)

    except Exception as ex:
        print(ex)

In [None]:
# Load every row staged in the silver table during this run; this DataFrame is
# the source for the write to gold.
query = f"SELECT * FROM {silver_table_name}"
silver_df = spark.sql(query)

In [None]:
# Write the staged (silver) rows to gold: create the gold table from them when
# it does not exist yet, otherwise upsert them with a Delta MERGE.

# Columns that identify a row; they form the MERGE join condition.
key_columns = ["CapacityId", "WorkspaceId", "Date", "ItemKind"]
# Columns overwritten with the staged values when a matching gold row exists.
measure_columns = [
    "DurationInSec",
    "TotalCUs",
    "ThrottlingInMin",
    "UserCount",
    "SuccessOperationCount",
    "RejectedOperationCount",
    "OperationCount",
    "InvalidOperationCount",
    "FailureOperationCount",
    "CancelledOperationCount",
]

gold_exists = spark._jsparkSession.catalog().tableExists('FUAM_Lakehouse', gold_table_name)

if not gold_exists:
    # First load -> the silver rows become the initial content of gold
    print("Gold table will be created.")

    silver_df.write.mode("append").option("mergeSchema", "true").format("delta").saveAsTable(gold_table_name)

else:
    # Gold already exists -> MERGE silver (s = source) into gold (t = target)
    print("Gold table exists and will be merged.")

    gold_df = DeltaTable.forPath(spark, gold_table_name_with_prefix)

    merge_condition = " AND ".join(f"s.{name} = t.{name}" for name in key_columns)
    # On match only the measures are refreshed; keys and DateKey stay as they are.
    update_map = {name: f"s.{name}" for name in measure_columns}
    # New rows take every column from the source, including DateKey.
    insert_map = {name: f"s.{name}" for name in key_columns + measure_columns + ["DateKey"]}

    (
        gold_df.alias('t')
        .merge(silver_df.alias('s'), merge_condition)
        .whenMatchedUpdate(set=update_map)
        .whenNotMatchedInsert(values=insert_map)
        .execute()
    )

In [None]:
# Empty the silver staging table now that its rows have been written to gold.
query = f"DELETE FROM {silver_table_name}"
spark.sql(query)