In [0]:
import dlt
import json
from pyspark.sql.functions import col, regexp_replace, from_json, udf, col, lower, initcap, upper, split, lit, concat, trim, when, isnull, expr, get, size, concat_ws,slice
from pyspark.sql.types import ArrayType, StringType, DoubleType, BooleanType,IntegerType,DateType

# Resolve the target environment (e.g. "dev" or "prod") from the pipeline configuration.
env = spark.conf.get("pipeline.env")
catalog = "principal_lab_db"
silver_schema = f"{env}_silver"
bronze_schema = f"{env}_bronze"

# Select the catalog and the silver schema of this environment as defaults.
spark.sql(f"USE CATALOG {catalog}")
spark.sql(f"USE SCHEMA {silver_schema}")

# Load the metadata row for dim_policies from the lookup table.
lookup_table = f"{catalog}.config_{env}.table_lookup"
config = spark.table(lookup_table) \
    .filter(col("table_name") == "dim_policies") \
    .select("keys", "scd_type", "description") \
    .first()

# first() returns None when no row matches; fail with an explicit message
# instead of a "'NoneType' object is not subscriptable" TypeError below.
if config is None:
    raise ValueError(f"Konfigurace pro tabulku 'dim_policies' nebyla nalezena v {lookup_table}")

# Pull the individual values out of the row.
keys_raw = config["keys"]
scd_type_raw = config["scd_type"]
description = config["description"]

# If 'keys' is stored as a JSON string, parse it; otherwise use the value as-is.
if isinstance(keys_raw, str):
    business_keys = json.loads(keys_raw)
else:
    business_keys = keys_raw

# Map of accepted SCD type spellings to their numeric code.
# The lookup below stringifies and upper-cases the raw value, so a NULL
# scd_type becomes "NONE", which is not a key and falls back to the default 0.
scd_type_map = {
    "SCD1": 1,
    "SCD2": 2,
    "1": 1,
    "2": 2,
    None: 0,
    "": 0
}
scd_type = scd_type_map.get(str(scd_type_raw).upper(), 0)

# Debug output
print(f"Prostředí: {env}")
print(f"Business Keys: {business_keys}")
print(f"SCD Type: {scd_type}")

# Streaming view over the Bronze policies table with basic cleaning applied:
# `premium` is cast to double and the `_rescued_data`, `source_file` and
# `ingestion_ts` columns are dropped.
@dlt.view(name="policies_bronze_clean")
def policies_bronze_clean():
    dropped_columns = ("_rescued_data", "source_file", "ingestion_ts")
    source_table = f"{catalog}.{bronze_schema}.policies_bronze"

    policies_stream = spark.readStream.table(source_table)
    policies_stream = policies_stream.withColumn("premium", col("premium").cast(DoubleType()))
    return policies_stream.drop(*dropped_columns)



# Silver table with SCD2 history. Any other configured SCD type is rejected.
if scd_type != 2:
    raise ValueError(f"Nepodporovaný nebo chybějící SCD typ: {scd_type_raw}")

# dim_policies table in the silver layer (SCD2)
dlt.create_streaming_table(
    name="dim_policies",
    comment=description,
    table_properties={"quality": "silver"},
)

# This is where the history is built via __START_AT and __END_AT.
dlt.apply_changes(
    target="dim_policies",
    source="policies_bronze_clean",
    keys=business_keys,
    sequence_by=col("snapshot_date"),
    ignore_null_updates=False,
    stored_as_scd_type="2",
    track_history_except_column_list=["snapshot_date"],
)

#---- expectation rules for the customers and agents tables ------#
# Each inner dict maps an expectation name to the SQL constraint it checks.
_customer_rules = {
    "valid_last_name": "last_name IS NOT NULL",
    "valid_income": "income > 0",
}
_agents_rules = {
    "valid_last_name": "last_name IS NOT NULL",
}
expectation_rules = {
    "customer": {"expectations": [_customer_rules]},
    "agents": {"expectations": [_agents_rules]},
}
#------------------------------- customers table start  -------#
# Load the metadata row for dim_customers from the lookup table.
lookup_table_customers = f"{catalog}.config_{env}.table_lookup"
config_customers = spark.table(lookup_table_customers) \
    .filter(col("table_name") == "dim_customers") \
    .select("keys", "scd_type", "description") \
    .first()

# first() returns None when no row matches; fail with an explicit message
# instead of a "'NoneType' object is not subscriptable" TypeError below.
if config_customers is None:
    raise ValueError(f"Konfigurace pro tabulku 'dim_customers' nebyla nalezena v {lookup_table_customers}")

keys_raw_customers = config_customers["keys"]
scd_type_raw_customers = config_customers["scd_type"]
description_customers = config_customers["description"]

# 'keys' may be stored as a JSON string; parse it in that case.
if isinstance(keys_raw_customers, str):
    business_keys_customer = json.loads(keys_raw_customers)
else:
    business_keys_customer = keys_raw_customers

# Unknown or missing SCD type spellings map to 0.
scd_type_customers = scd_type_map.get(str(scd_type_raw_customers).upper(), 0)

# SQL expression for the quarantine flag: TRUE when a record fails at least one
# customer expectation.
# - Each rule is parenthesised so a rule containing OR keeps its meaning once
#   the rules are AND-ed together.
# - COALESCE maps an indeterminate (NULL) outcome, e.g. `income > 0` on a NULL
#   income, to TRUE. Without it the flag is NULL and the row matches neither the
#   `is_quarantined=false` nor the `is_quarantined=true` filter downstream.
customer_expectations_expr = "COALESCE(NOT({0}), TRUE)".format(
    " AND ".join(
        f"({rule})" for rule in expectation_rules["customer"]["expectations"][0].values()
    )
)

def clean_last_name(value:str) -> str:
    """
    Return the last word of a last name, ignoring academic titles.

    Values such as 'Jimmie Smith Phd' may occur; for those only 'Smith' is
    wanted. The titles 'md', 'phd' and 'dds' are matched case-insensitively.
    The result is lower-cased with its first character capitalised. Returns
    None for None input or when no word is left after removing the titles.
    """
    if value is None:
        return None
    titles = ('md', 'phd', 'dds')
    # Walk from the end: the first token that is not a title is the answer.
    for token in reversed(value.split()):
        lowered = token.lower()
        if lowered not in titles:
            return lowered.capitalize()
    return None


def clean_first_name(value:str) -> str:
    """
    Return the first word of a first name, ignoring courtesy prefixes.

    The prefixes 'mr', 'mrs', 'mr.' and 'mrs.' are matched case-insensitively.
    The result is lower-cased with its first character capitalised. Returns
    None for None input or when the value holds nothing but prefixes.
    """
    if value is None:
        return None
    prefixes = ('mr', 'mrs', 'mr.', 'mrs.')  # add new prefixes here
    for token in value.split():
        lowered = token.lower()
        if lowered in prefixes:
            continue
        return lowered.capitalize()
    return None

# Wrap the Python name cleaners as Spark UDFs with a string return type.
clean_last_name_udf = udf(f=clean_last_name, returnType=StringType())
clean_first_name_udf = udf(f=clean_first_name, returnType=StringType())

#--- table on which the customer expectations are applied ---#
@dlt.table(
    name="customers_clean_quarantine_rules",
    comment="apply expectations rules for customers",
    partition_cols =["is_quarantined"]
)
@dlt.expect_all(expectation_rules["customer"]["expectations"][0])
def customer_data_clean_quarantine():
    """
    Stream customers_bronze, normalise its columns and add `is_quarantined`.

    `is_quarantined` is `customer_expectations_expr` evaluated per row. The
    helper columns created along the way are dropped before returning.
    """
    bronze_customers = dlt.readStream(f"{catalog}.{bronze_schema}.customers_bronze")

    # Applied strictly in this order: later expressions read columns produced by
    # earlier ones (e.g. "address" is rebuilt from "address_splt").
    column_steps = [
        ("customer_id", trim("customer_id")),
        ("first_name", clean_first_name_udf(col("first_name"))),
        ("last_name", clean_last_name_udf(col("last_name"))),
        ("email", lower(trim(col("email")))),
        ("address_splt", split(trim(col("address")), ',')),
        # First two comma-separated parts in initcap, third part upper-cased.
        ("address", concat(
            initcap(get("address_splt", 0)),
            lit(','),
            initcap(get("address_splt", 1)),
            lit(','),
            upper(get("address_splt", 2)),
        )),
        ("income", col("income").cast(IntegerType())),
        # Turn backslash-escaped quotes into plain quotes before parsing the JSON array.
        ("contact_methods_raw", regexp_replace(col("preferences.contact_methods"), r'\\"', '"')),
        ("contact_methods", from_json(col("contact_methods_raw"), ArrayType(StringType()))),
        ("preferred_language", trim(col("preferences.preferred_language"))),
        ("newsletter_opt_in", col("preferences.newsletter_opt_in").cast(BooleanType())),
        ("is_quarantined", expr(customer_expectations_expr)),
    ]

    cleaned = bronze_customers
    for column_name, column_expr in column_steps:
        cleaned = cleaned.withColumn(column_name, column_expr)

    helper_columns = (
        "address_splt",
        "contact_methods_raw",
        "preferences",
        "_rescued_data",
        "source_file",
        "ingestion_ts",
    )
    return cleaned.drop(*helper_columns)

# Customers that are not quarantined (is_quarantined = false).
# NOTE: the function name `customer_clean` is reused by several tables in this
# file; each decorator passes its dataset name explicitly via `name`.
@dlt.table(name='customers_clean_good_records',comment='customers cleanded and validate data')
def customer_clean():
    """Stream customers_clean_quarantine_rules rows with is_quarantined=false, without the flag column."""
    flagged_customers = dlt.readStream('customers_clean_quarantine_rules')
    good_customers = flagged_customers.filter("is_quarantined=false")
    return good_customers.drop("is_quarantined")

# Customers that are quarantined (is_quarantined = true).
# NOTE: the function name `customer_clean` is reused by several tables in this
# file; each decorator passes its dataset name explicitly via `name`.
@dlt.table(name='customers_clean_bad_records',comment='customers cleaned and bad data')
def customer_clean():
    """Stream customers_clean_quarantine_rules rows with is_quarantined=true, without the flag column."""
    flagged_customers = dlt.readStream('customers_clean_quarantine_rules')
    bad_customers = flagged_customers.filter("is_quarantined=true")
    return bad_customers.drop("is_quarantined")

# SCD2 table for customers. Any other configured SCD type is rejected.
if scd_type_customers != 2:
    raise ValueError(f"Nepodporovaný nebo chybějící SCD typ: {scd_type_raw_customers}")

# Target streaming table in the silver layer.
dlt.create_streaming_table(
    name="dim_customers",
    comment=description_customers,
    table_properties={"quality": "silver"},
)

# Feed the target from the good customer records, sequenced by snapshot_date.
dlt.apply_changes(
    target="dim_customers",
    source="customers_clean_good_records",
    keys=business_keys_customer,
    sequence_by=col("snapshot_date"),
    ignore_null_updates=False,
    stored_as_scd_type="2",
    track_history_except_column_list=['snapshot_date'],
)

#------------------------------- agents table start  -------#
# Load the metadata row for dim_agents from the lookup table.
lookup_table_agents = f"{catalog}.config_{env}.table_lookup"
config_agents = spark.table(lookup_table_agents) \
    .filter(col("table_name") == "dim_agents") \
    .select("keys", "scd_type", "description") \
    .first()

# first() returns None when no row matches; fail with an explicit message
# instead of a "'NoneType' object is not subscriptable" TypeError below.
if config_agents is None:
    raise ValueError(f"Konfigurace pro tabulku 'dim_agents' nebyla nalezena v {lookup_table_agents}")

keys_raw_agents = config_agents["keys"]
scd_type_raw_agents = config_agents["scd_type"]
description_agents = config_agents["description"]

# 'keys' may be stored as a JSON string; parse it in that case.
if isinstance(keys_raw_agents, str):
    business_keys_agents = json.loads(keys_raw_agents)
else:
    business_keys_agents = keys_raw_agents

# Unknown or missing SCD type spellings map to 0.
scd_type_agents = scd_type_map.get(str(scd_type_raw_agents).upper(), 0)

# SQL expression for the quarantine flag: TRUE when a record fails at least one
# agents expectation.
# - Each rule is parenthesised so a rule containing OR keeps its meaning once
#   the rules are AND-ed together.
# - COALESCE maps an indeterminate (NULL) outcome to TRUE, so such a row is not
#   missed by both the `is_quarantined=false` and `is_quarantined=true` filters
#   downstream. The current single `IS NOT NULL` rule never yields NULL, so the
#   result is unchanged today; this guards rules added later.
agents_expectations_expr = "COALESCE(NOT({0}), TRUE)".format(
    " AND ".join(
        f"({rule})" for rule in expectation_rules["agents"]["expectations"][0].values()
    )
)

def clean_full_name(value:str) -> str:
    """
    Remove courtesy prefixes and title suffixes from a full name.

    The words 'mr.', 'mrs.', 'mr', 'mrs', 'dds', 'phd' and 'md' are dropped
    (case-insensitive); every remaining word is lower-cased with its first
    character capitalised, and the words are joined by single spaces.

    Returns None for None input, and an empty string when no word remains.
    """
    if value is None:
        return None
    forbidden_prefixes = {'mr.', 'mrs.', 'mr', 'mrs', 'dds', 'phd', 'md'}
    # split() without a separator splits on any run of whitespace, so repeated
    # spaces or tabs do not produce empty tokens or glued words.
    words = value.lower().split()
    return ' '.join(word.capitalize() for word in words if word not in forbidden_prefixes)

# Wrap the full-name cleaner as a Spark UDF with a string return type.
clean_full_name_udf = udf(f=clean_full_name, returnType=StringType())

#--- agents table on which the expectations are applied ---#
@dlt.table(
    name="agents_clean_quarantine_rules",
    comment="apply expectations rules for agents",
    partition_cols =["is_quarantined"]
)
@dlt.expect_all(expectation_rules["agents"]["expectations"][0])
def agents_data_clean_quarantine():
    """
    Stream agents_bronze, normalise its columns and add `is_quarantined`.

    The cleaned `name` is split on spaces: `first_name` is every part but the
    last and `last_name` is the last part. Both are NULL when the split is NULL
    or has exactly one element. `is_quarantined` is `agents_expectations_expr`
    evaluated per row.
    """
    # Reusable column expressions over the temporary "name_splt" array column.
    name_parts = col("name_splt")
    part_count = size(name_parts)

    all_but_last = concat_ws(" ", slice(name_parts, 1, part_count - 1))
    last_part = name_parts[part_count - 1]

    first_name_expr = (
        when(isnull(name_parts), None)
        .when(part_count == 1, None)
        .otherwise(all_but_last)
    )
    last_name_expr = (
        when(isnull(name_parts), None)
        .when(part_count == 1, None)
        .otherwise(last_part)
    )

    agents = dlt.readStream(f"{catalog}.{bronze_schema}.agents_bronze")

    # Identifier and name handling.
    agents = agents.withColumn("agent_id", trim(col("agent_id")))
    agents = agents.withColumn("name", clean_full_name_udf(col("name")))
    agents = agents.withColumn("name_splt", split(col("name"), " "))
    agents = agents.withColumn("first_name", first_name_expr)
    agents = agents.withColumn("last_name", last_name_expr)

    # Plain attribute normalisation.
    agents = agents.withColumn("region", initcap(trim(col("region"))))
    agents = agents.withColumn("email", lower(trim(col("email"))))
    agents = agents.withColumn("start_date", col("start_date").cast(DateType()))

    # Turn backslash-escaped quotes into plain quotes, then parse the JSON arrays.
    agents = agents.withColumn("languages_raw", regexp_replace(col("metadata.languages"), r'\\"', '"'))
    agents = agents.withColumn("languages", from_json(col("languages_raw"), ArrayType(StringType())))
    agents = agents.withColumn("certifications_raw", regexp_replace(col("metadata.certifications"), r'\\"', '"'))
    agents = agents.withColumn("certifications", from_json(col("certifications_raw"), ArrayType(StringType())))

    agents = agents.withColumn("is_quarantined", expr(agents_expectations_expr))

    helper_columns = (
        "name",
        "metadata",
        "name_splt",
        "languages_raw",
        "certifications_raw",
        "_rescued_data",
        "source_file",
        "ingestion_ts",
    )
    return agents.drop(*helper_columns)

# Agents that are not quarantined (is_quarantined = false).
# NOTE: despite its name, this `customer_clean` definition builds an agents
# table; the dataset name comes from the decorator's `name` argument.
@dlt.table(name='agents_clean_good_records',comment='agents cleanded and validate data')
def customer_clean():
    """Stream agents_clean_quarantine_rules rows with is_quarantined=false, without the flag column."""
    flagged_agents = dlt.readStream('agents_clean_quarantine_rules')
    good_agents = flagged_agents.filter("is_quarantined=false")
    return good_agents.drop("is_quarantined")

# Agents that are quarantined (is_quarantined = true).
# NOTE: despite its name, this `customer_clean` definition builds an agents
# table; the dataset name comes from the decorator's `name` argument.
@dlt.table(name='agents_clean_bad_records',comment='agents cleaned and bad data')
def customer_clean():
    """Stream agents_clean_quarantine_rules rows with is_quarantined=true, without the flag column."""
    flagged_agents = dlt.readStream('agents_clean_quarantine_rules')
    bad_agents = flagged_agents.filter("is_quarantined=true")
    return bad_agents.drop("is_quarantined")

# SCD2 table for agents.
# The decision is based on the SCD type configured for dim_agents (the agents
# lookup row), and the error reports the raw agents value.
if scd_type_agents == 2:
    # Target streaming table in the silver layer.
    dlt.create_streaming_table(
        name="dim_agents",
        comment=description_agents,
        table_properties={"quality": "silver"}
        )
    # Feed the target from the good agent records, sequenced by snapshot_date.
    dlt.apply_changes(
        target = "dim_agents",
        source = "agents_clean_good_records",
        keys = business_keys_agents,
        sequence_by = col("snapshot_date"),
        ignore_null_updates=False,
        stored_as_scd_type="2",
        track_history_except_column_list=['snapshot_date']
    )
else:
    raise ValueError(f"Nepodporovaný nebo chybějící SCD typ: {scd_type_raw_agents}")