#### Domains

##### Data ingestion strategy:
<mark style="background: #88D5FF;">**REPLACE**</mark>

##### Related pipeline:

**Load_Capacities_E2E**

##### Source:

**Files** from the FUAM_Lakehouse folder defined by the **bronze_file_location** variable

##### Target:

**1 Delta table** in FUAM_Lakehouse
- named by the value of the **gold_table_name** variable


In [None]:
from datetime import datetime, timedelta
from pyspark.sql.functions import col, explode, upper
from delta.tables import *
# Enable Delta automatic schema evolution for merge operations.
# NOTE(review): the writes visible in this notebook use overwrite + mergeSchema
# rather than MERGE — confirm this session setting is still required.
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled","true") # needed for automatic schema evolution in merge

In [None]:
## Parameters
# When True, the intermediate DataFrames are rendered with display() after
# each preparation step; set to False to skip these previews.
display_data = True

In [None]:
## Variables
# Folder holding the raw (bronze) domain JSON files
bronze_file_location = "Files/raw/domains/"
# Fully qualified name of the staging (silver) table
silver_table_name = "FUAM_Staging_Lakehouse.domains_silver"
# Name of the flattened (gold) table and its path under Tables/
gold_table_name = "domains_flatten"
gold_table_name_with_prefix = "Tables/" + gold_table_name

In [None]:
# Empty the Silver table (all rows deleted) when it already exists,
# so the load below starts from a clean state
silver_already_exists = spark.catalog.tableExists(silver_table_name)
if silver_already_exists:
    del_query = f"DELETE FROM {silver_table_name}"
    spark.sql(del_query)

In [None]:
# Load the raw JSON files from the bronze folder.
# The "multiline" option is set so JSON documents spanning several lines are parsed.
bronze_reader = spark.read.option("multiline", "true")
bronze_df = bronze_reader.json(bronze_file_location)

In [None]:
# Optional preview of the raw bronze data (controlled by display_data)
if display_data:
    display(bronze_df)

In [None]:
# Explode the "domains" array so that each domain object becomes its own row
exploded_df = bronze_df.select(explode("domains").alias("d"))

# Promote the fields of each domain struct to top-level columns
extracted_df = exploded_df.select(col("d.*"))

# Rename the source fields to the Domain* column names used by the later steps
column_renames = {
    "id": "DomainId",
    "parentDomainId": "ParentDomainId",
    "displayName": "DomainName",
    "contributorsScope": "DomainContributorsScope",
    "description": "DomainDescription",
}
for source_name, target_name in column_renames.items():
    extracted_df = extracted_df.withColumnRenamed(source_name, target_name)

# Upper-case both identifiers so parent/child ids share the same casing
silver_df = extracted_df.withColumn("DomainId", upper("DomainId")).withColumn("ParentDomainId", upper("ParentDomainId"))

# Preview the frame that is actually written to the silver table (silver_df);
# previously the intermediate frame without the upper-cased ids was shown.
if display_data:
    display(silver_df)

In [None]:
# Persist the prepared silver_df as the silver Delta table
# (overwrite mode, with mergeSchema enabled on the write)
(
    silver_df.write
    .mode("overwrite")
    .option("mergeSchema", "true")
    .format("delta")
    .saveAsTable(silver_table_name)
)

In [None]:
# Flatten the domain hierarchy by self-joining the silver table on
# ParentDomainId = DomainId (one parent level is resolved):
#   - MainDomainName: the row's own name when it has no parent, otherwise the
#     parent's name (NULL if the parent id has no match, because of the LEFT JOIN)
#   - SubDomainName:  the row's own name when it has a parent, otherwise the
#     literal 'Without Subdomain'
#   - IsSubDomain:    1 when the row has a parent, 0 otherwise
domains_generic_query = f"""
SELECT 
     gen.DomainId,
     gen.DomainContributorsScope,
     gen.DomainName AS OriginalDomainName,
     gen.ParentDomainId AS OriginalParentDomainId,
     CASE 
          WHEN gen.ParentDomainId IS NULL THEN gen.DomainName 
          ELSE md.DomainName
     END AS MainDomainName,
     CASE 
          WHEN gen.ParentDomainId IS NOT NULL THEN gen.DomainName 
          ELSE 'Without Subdomain'
     END AS SubDomainName,
     CASE 
          WHEN gen.ParentDomainId IS NOT NULL THEN 1 
          ELSE 0 
          END AS IsSubDomain     
FROM {silver_table_name} AS gen LEFT OUTER JOIN {silver_table_name} AS md on gen.ParentDomainId = md.DomainId """

domains_generic_silver_df = spark.sql(domains_generic_query)

# Optional preview of the flattened result
if display_data:
    display(domains_generic_silver_df)



In [None]:
# Persist the flattened domains as the gold Delta table
# (overwrite mode, with mergeSchema enabled on the write)
(
    domains_generic_silver_df.write
    .mode("overwrite")
    .option("mergeSchema", "true")
    .format("delta")
    .saveAsTable(gold_table_name)
)

In [None]:
# Copy the processed bronze files into a dated history folder:
# "Files/raw/..." becomes "Files/history/.../<YYYY>/<MM>/<DD>/"
history_root = bronze_file_location.replace("Files/raw/", "Files/history/")
history_target = history_root + datetime.now().strftime('%Y/%m/%d') + "/"
mssparkutils.fs.cp(bronze_file_location, history_target, True)