In [0]:
from pyspark.sql.functions import current_timestamp, input_file_name

# Parameters: landing folder to read from and the table to write to.
input_path = "/Volumes/principal_lab_db/landing/operational_data/agents/"
target_table = "principal_lab_db.dev_bronze.agents"

# Load whatever the glob matches three path levels below input_path, using the
# first row of each file as the header, then tag every row with the load
# timestamp and the file it was read from.
df = (
    spark.read
    .option("header", True)
    .csv(f"{input_path}*/*/*")
    .withColumn("ingestion_ts", current_timestamp())
    .withColumn("source_file", input_file_name())
)

# Store the result as a Delta table, replacing the table's current contents.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(target_table)
)


In [0]:
%sql
-- Preview all columns and rows of the bronze agents table.
SELECT
  *
FROM
  principal_lab_db.dev_bronze.agents

In [0]:
from pyspark.sql.functions import current_timestamp, input_file_name, regexp_extract, to_date

# --- Configuration: where the raw files live and which table they feed ---
# The raw path is written with the plain '/Volumes/...' prefix, not 'dbfs:/Volumes'.
RAW_PATH = "/Volumes/principal_lab_db/landing/operational_data"
CATALOG = "principal_lab_db"
SCHEMA = "dev_bronze"
TABLE = "agents"

target_table = f"{CATALOG}.{SCHEMA}.{TABLE}"

# Glob for the *.csv files located three wildcard folder levels (year/month/day)
# below the table's folder.
csv_glob = f"{RAW_PATH}/{TABLE}/*/*/*/*.csv"
df = spark.read.format("csv").option("header", True).load(csv_glob)

# Column expressions shared by the audit columns below:
#   file_path     - path of the file each row was read from
#   snapshot_text - the '/dddd/dd/dd/' fragment captured from that path
file_path = input_file_name()
snapshot_text = regexp_extract(file_path, r"/(\d{4}/\d{2}/\d{2})/", 1)

# Add the audit columns and the snapshot date parsed with the yyyy/MM/dd format.
df_enriched = (
    df
    .withColumn("ingestion_ts", current_timestamp())
    .withColumn("source_file", file_path)
    .withColumn("snapshot_date", to_date(snapshot_text, "yyyy/MM/dd"))
)

# Print the first 5 rows without truncating values as a quick ingestion check.
df_enriched.show(5, truncate=False)


# Write step: store the enriched agents data in the Bronze table (plain Delta
# write, no DLT).
# Prerequisite noted by the author: the schema must already exist and have its
# managed location configured in Unity Catalog.
# df_enriched and target_table are the values assigned earlier in this cell.
(
    df_enriched.write
    .format("delta")
    .mode("overwrite")                  # replace the existing rows
    .option("overwriteSchema", "true")  # and let the table schema be replaced too
    .saveAsTable(target_table)
)

print(f"✅ Data written to table '{target_table}'.")

In [0]:
from pyspark.sql.functions import current_timestamp, input_file_name, regexp_extract, to_date

# --- Configuration ---
RAW_PATH = "/Volumes/principal_lab_db/landing/operational_data"
CATALOG = "principal_lab_db"
SCHEMA = "dev_bronze"
TABLE = "customers"
TARGET_TABLE = f"{CATALOG}.{SCHEMA}.{TABLE}"

# Read the CSV files from every year/month/day subfolder; the three '*'
# folder levels in the glob stand for YYYY/MM/DD.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .load(f"{RAW_PATH}/{TABLE}/*/*/*/*.csv")
)

# Column expressions reused below: the path of the source file and the
# '/dddd/dd/dd/' fragment captured from it.
file_path = input_file_name()
snapshot_text = regexp_extract(file_path, r"/(\d{4}/\d{2}/\d{2})/", 1)

# Add the audit columns and the snapshot date parsed with the yyyy/MM/dd format.
df_enriched = (
    df
    .withColumn("ingestion_ts", current_timestamp())
    .withColumn("source_file", file_path)
    .withColumn("snapshot_date", to_date(snapshot_text, "yyyy/MM/dd"))
)

# Print the first 5 rows, untruncated, to eyeball the loaded data.
df_enriched.show(5, truncate=False)


In [0]:
# Overwrite the customers table (data and schema) with the enriched DataFrame.
(
    df_enriched.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(TARGET_TABLE)
)

print(f"✅ Data written to {TARGET_TABLE}")


In [0]:
%sql
-- List the distinct snapshot dates loaded into the bronze agents table.
-- The table is referenced by its full catalog.schema.table name, matching the
-- rest of this notebook, so the query does not depend on the session's
-- current catalog.
SELECT DISTINCT
  snapshot_date
FROM
  principal_lab_db.dev_bronze.agents

In [0]:
# Notebook cell 1: Load and enrich all tables

from pyspark.sql.functions import current_timestamp, input_file_name, regexp_extract, to_date

# --- Configuration ---
RAW_PATH = "/Volumes/principal_lab_db/landing/operational_data"
CATALOG = "principal_lab_db"
SCHEMA = "dev_bronze"

# Table specification: one (name, glob pattern, snapshot?) tuple per table.
spec = [
    # Patterns with three wildcard folder levels (YYYY/MM/DD) before the file.
    ("agents", "agents/*/*/*/*.csv", True),
    ("customers", "customers/*/*/*/*.csv", True),
    ("policies", "policies/*/*/*/*.csv", True),
    # Patterns with the CSV files directly inside the table's folder.
    ("claims", "claims/*.csv", False),
    ("products", "products/*.csv", False),
]

# Enrichment helper
def enrich(df, snapshot: bool):
    """Return ``df`` with audit columns added.

    Always adds:
      * ``ingestion_ts`` - ``current_timestamp()``
      * ``source_file``  - ``input_file_name()``

    When ``snapshot`` is truthy, additionally adds ``snapshot_date``: the
    ``/dddd/dd/dd/`` fragment captured from the source file path, parsed with
    the ``yyyy/MM/dd`` format.

    Args:
        df: DataFrame to enrich; only its ``withColumn`` method is used.
        snapshot: Whether to add the ``snapshot_date`` column.

    Returns:
        The DataFrame produced by the chained ``withColumn`` calls.
    """
    # NOTE(review): input_file_name() is reportedly deprecated on newer
    # Databricks runtimes in favour of the `_metadata.file_path` column -
    # confirm against the runtime this notebook targets.
    file_path = input_file_name()

    audited = (
        df
        .withColumn("ingestion_ts", current_timestamp())
        .withColumn("source_file", file_path)
    )
    if not snapshot:
        return audited

    # Capture group 1 is the date fragment without the surrounding slashes.
    date_fragment = regexp_extract(file_path, r"/(\d{4}/\d{2}/\d{2})/", 1)
    return audited.withColumn("snapshot_date", to_date(date_fragment, "yyyy/MM/dd"))

# Load and enrich every table listed in spec; each result is kept in the
# dfs dict under its table name.
dfs = {}
for table, pattern, is_snap in spec:
    path = f"{RAW_PATH}/{pattern}"
    print(f"Ingesting {table} from {path}")

    df = (
        spark.read
        .format("csv")
        .option("header", True)
        .load(path)
    )
    df_enriched = enrich(df, is_snap)
    dfs[table] = df_enriched

    # Print three untruncated sample rows for this table.
    df_enriched.show(3, truncate=False)


In [0]:
# Save each DataFrame held in dfs as the Delta table <CATALOG>.<SCHEMA>.<table>,
# overwriting both the existing data and the existing schema.
for table, _, _ in spec:
    TARGET = f"{CATALOG}.{SCHEMA}.{table}"
    print(f"Writing {table} → {TARGET}")
    (
        dfs[table].write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(TARGET)
    )
    print(f"✅ {table} written")