In [0]:
"""
  DROP TABLE IF EXISTS principal_lab_db.dev_bronze.agents;
  DROP TABLE IF EXISTS principal_lab_db.dev_bronze.customers;
  DROP TABLE IF EXISTS principal_lab_db.dev_bronze.policies;
  DROP TABLE IF EXISTS principal_lab_db.dev_bronze.claims;
  DROP TABLE IF EXISTS principal_lab_db.dev_bronze.products;
"""
import dlt
from pyspark.sql.functions import current_timestamp, col, to_date, regexp_extract, regexp_replace, from_json, when
from pyspark.sql.types import ArrayType, StringType, MapType

# Unity Catalog catalog name for this pipeline.
catalog = "principal_lab_db"

# Target environment, read from the pipeline's Spark configuration key
# "pipeline.env" (the original author's note says the value is DEV or PROD).
# Widget-based alternative, kept for reference:
#   dbutils.widgets.text("pipeline_env", "test_marek")
#   env = dbutils.widgets.get("pipeline_env")
env = spark.conf.get("pipeline.env")

# Bronze schema name is the environment name with a "_bronze" suffix.
bronze_schema = f"{env}_bronze"

@dlt.table(
    name="customers_bronze",
    comment="Bronze table for raw Customers data",
)
def customers_bronze():
    """Stream raw customer CSV files from the landing volume into bronze.

    Files are read incrementally through the ``cloudFiles`` source. All
    source columns are kept; ``preferences`` is replaced by its JSON-parsed
    form (a string-to-string map). Three columns are added:

    * ``source_file``   - ``_metadata.file_path`` of the originating file
    * ``ingestion_ts``  - ``current_timestamp()`` at processing time
    * ``snapshot_date`` - date parsed from the ``yyyy/MM/dd`` folder segments
      that follow ``/operational_data/<entity>/`` in ``source_file``

    Returns:
        The streaming DataFrame backing the ``customers_bronze`` table.
    """
    landing_dir = "/Volumes/principal_lab_db/landing/operational_data/customers"

    # Quoted fields may span several lines and use a doubled quote as the
    # escape character (presumably because of the embedded JSON column).
    reader_options = (
        ("cloudFiles.format", "csv"),
        ("header", "true"),
        ("multiLine", True),
        ("quote", '"'),
        ("escape", '"'),
    )
    reader = spark.readStream.format("cloudFiles")
    for option_name, option_value in reader_options:
        reader = reader.option(option_name, option_value)

    raw_with_source = reader.load(landing_dir).select(
        "*",
        col("_metadata.file_path").alias("source_file"),
    )

    preferences_map = from_json(
        col("preferences"),
        MapType(StringType(), StringType()),
    )
    date_folder = regexp_extract(
        col("source_file"),
        r'/operational_data/[^/]+/(\d{4}/\d{2}/\d{2})/',
        1,
    )

    return (
        raw_with_source
        .withColumn("preferences", preferences_map)
        .withColumn("ingestion_ts", current_timestamp())
        .withColumn("snapshot_date", to_date(date_folder, "yyyy/MM/dd"))
    )

@dlt.table(
    name="agents_bronze",
    comment="Bronze table for raw Agents data",
)
def agents_bronze():
    """Stream raw agent CSV files from the landing volume into bronze.

    Reads the agents landing folder with the ``cloudFiles`` source, keeps
    every source column, parses the ``metadata`` JSON string into a
    string-to-string map, and adds ``source_file`` (originating file path),
    ``ingestion_ts`` (current timestamp) and ``snapshot_date`` (parsed from
    the ``yyyy/MM/dd`` folder segments found in ``source_file``).

    Returns:
        The streaming DataFrame backing the ``agents_bronze`` table.
    """
    source_dir = "/Volumes/principal_lab_db/landing/operational_data/agents"
    string_map = MapType(StringType(), StringType())

    # CSV parsing: header row, multi-line quoted fields, '"' as both the
    # quote and the escape character.
    stream_reader = (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "csv")
        .option("header", "true")
        .option("multiLine", True)
        .option("quote", '"')
        .option("escape", '"')
    )

    with_source = stream_reader.load(source_dir).select(
        "*",
        col("_metadata.file_path").alias("source_file"),
    )
    with_metadata = with_source.withColumn(
        "metadata",
        from_json(col("metadata"), string_map),
    )
    with_timestamp = with_metadata.withColumn("ingestion_ts", current_timestamp())

    # The snapshot date is encoded in the folder structure of the file path.
    folder_date = regexp_extract(
        col("source_file"),
        r'/operational_data/[^/]+/(\d{4}/\d{2}/\d{2})/',
        1,
    )
    return with_timestamp.withColumn(
        "snapshot_date",
        to_date(folder_date, "yyyy/MM/dd"),
    )

@dlt.table(
    name="products_bronze",
    comment="Bronze table for raw Products data",
)
def products_bronze():
    """Stream raw product CSV files from the landing volume into bronze.

    Reads the products landing folder with the ``cloudFiles`` source (CSV
    with a header row), keeps every source column, and adds ``source_file``
    (originating file path) and ``ingestion_ts`` (current timestamp).

    Returns:
        The streaming DataFrame backing the ``products_bronze`` table.
    """
    landing_dir = "/Volumes/principal_lab_db/landing/operational_data/products"

    csv_stream = (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "csv")
        .option("header", "true")
        .load(landing_dir)
    )
    with_source = csv_stream.select(
        "*",
        col("_metadata.file_path").alias("source_file"),
    )
    return with_source.withColumn("ingestion_ts", current_timestamp())

@dlt.table(
    name="policies_bronze",
    comment="Bronze table for raw Policies data",
)
def policies_bronze():
    """Stream raw policy CSV files from the landing volume into bronze.

    Reads the policies landing folder with the ``cloudFiles`` source, keeps
    every source column, parses the ``coverages`` JSON string into an array
    of strings, and adds ``source_file`` (originating file path),
    ``ingestion_ts`` (current timestamp) and ``snapshot_date`` (parsed from
    the ``yyyy/MM/dd`` folder segments found in ``source_file``).

    Returns:
        The streaming DataFrame backing the ``policies_bronze`` table.
    """
    landing_dir = "/Volumes/principal_lab_db/landing/operational_data/policies"

    # Applied in this order; '"' serves as both quote and escape character
    # and quoted fields are allowed to span multiple lines.
    csv_settings = [
        ("cloudFiles.format", "csv"),
        ("header", "true"),
        ("multiLine", True),
        ("quote", '"'),
        ("escape", '"'),
    ]
    reader = spark.readStream.format("cloudFiles")
    for setting, value in csv_settings:
        reader = reader.option(setting, value)

    policies = reader.load(landing_dir).select(
        "*",
        col("_metadata.file_path").alias("source_file"),
    )

    coverage_list = from_json(col("coverages"), ArrayType(StringType()))
    path_date = regexp_extract(
        col("source_file"),
        r'/operational_data/[^/]+/(\d{4}/\d{2}/\d{2})/',
        1,
    )

    policies = policies.withColumn("coverages", coverage_list)
    policies = policies.withColumn("ingestion_ts", current_timestamp())
    return policies.withColumn("snapshot_date", to_date(path_date, "yyyy/MM/dd"))

@dlt.table(
    name="claims_bronze",
    comment="Bronze table for raw Claims data",
)
def claims_bronze():
    """Stream raw claim CSV files from the landing volume into bronze.

    Reads the claims landing folder with the ``cloudFiles`` source (CSV with
    a header row), keeps every source column, and adds ``source_file``
    (originating file path) and ``ingestion_ts`` (current timestamp).

    Returns:
        The streaming DataFrame backing the ``claims_bronze`` table.
    """
    landing_dir = "/Volumes/principal_lab_db/landing/operational_data/claims"

    reader = spark.readStream.format("cloudFiles")
    reader = reader.option("cloudFiles.format", "csv")
    reader = reader.option("header", "true")

    claims = reader.load(landing_dir)
    claims = claims.select(
        "*",
        col("_metadata.file_path").alias("source_file"),
    )
    return claims.withColumn("ingestion_ts", current_timestamp())

@dlt.table(
    name="premium_transactions_bronze",
    comment="Bronze table for raw Premium Transactions data",
)
def premium_transactions_bronze():
    """Stream raw premium-transaction CSV files from the landing volume into bronze.

    Reads the premium landing folder with the ``cloudFiles`` source (CSV with
    a header row), keeps every source column, and adds ``source_file``
    (originating file path) and ``ingestion_ts`` (current timestamp). Unlike
    the tables that derive the date from the file path, ``snapshot_date``
    here is the file's own ``snapshot_date`` column, converted to a date
    using the ``yyyy-MM-dd`` pattern.

    Returns:
        The streaming DataFrame backing the ``premium_transactions_bronze``
        table.
    """
    landing_dir = "/Volumes/principal_lab_db/landing/operational_data/premium"

    csv_stream = (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "csv")
        .option("header", "true")
        .load(landing_dir)
    )
    with_source = csv_stream.select(
        "*",
        col("_metadata.file_path").alias("source_file"),
    )
    with_timestamp = with_source.withColumn("ingestion_ts", current_timestamp())

    # Overwrites the existing string column in place with its parsed date.
    parsed_snapshot = to_date(col("snapshot_date"), "yyyy-MM-dd")
    return with_timestamp.withColumn("snapshot_date", parsed_snapshot)
