In [0]:
# Declare the notebook's text widgets as (widget name, default value) pairs,
# created in the order listed.
widget_defaults = (
    ("catalog_name", "adventureworks"),
    ("schema_name", "raw"),
    ("source_volume_base_path", "/Volumes/adventureworks/raw/source_data"),
    ("tables_to_collate", "sales"),
)
for widget_name, default_value in widget_defaults:
    dbutils.widgets.text(widget_name, default_value)



In [0]:
def remove_tables_to_collate(tables_to_collate, tables_to_create):
    """Drop entries whose name starts with one of the collate prefixes.

    Args:
        tables_to_collate: Iterable of name prefixes. Blank (empty or ``None``)
            entries are ignored; ``None`` is treated as "no prefixes".
        tables_to_create: Iterable of objects exposing a string ``name``
            attribute.

    Returns:
        A new list, in the original order, containing the entries of
        ``tables_to_create`` whose ``name`` does not start with any non-blank
        prefix.
    """
    # Every string starts with "", so a blank prefix (e.g. produced by
    # "".split(",")) would otherwise remove every table. Filter blanks out.
    prefixes = tuple(prefix for prefix in (tables_to_collate or ()) if prefix)
    # str.startswith accepts a tuple of prefixes; an empty tuple never matches.
    return [
        table for table in tables_to_create
        if not table.name.startswith(prefixes)
    ]

def ingest_table(source_path, catalog_name, schema_name, table_name):
    """Load CSV data from ``source_path`` and save it as a table.

    The data is read with the ``header`` and ``inferSchema`` options enabled
    and written with ``overwrite`` mode to
    ``{catalog_name}.{schema_name}.{table_name}``. Start and completion
    messages are printed around the work.

    Args:
        source_path: Path handed to the Spark CSV reader.
        catalog_name: First part of the target table name.
        schema_name: Second part of the target table name.
        table_name: Last part of the target table name; also used in the
            progress messages.
    """
    print(f"Ingestion of {table_name} started.")

    # Reader settings: first row is a header, column types are inferred.
    csv_options = {"header": "true", "inferSchema": "true"}
    csv_reader = spark.read.format("csv").options(**csv_options)
    csv_frame = csv_reader.load(source_path)

    # Replace the target table's contents if it already exists.
    target_table = f"{catalog_name}.{schema_name}.{table_name}"
    csv_frame.write.mode("overwrite").saveAsTable(target_table)

    print(f"Ingestion of {table_name} completed. \n")

In [0]:
# Get the base path for source data from widget
source_volume_base_path = dbutils.widgets.get("source_volume_base_path")

# Get the comma-separated list of table-name prefixes to collate from widget.
# Each entry is trimmed and blank entries are dropped: an empty widget value
# would otherwise split into [""], and whitespace after a comma
# (e.g. "sales, orders") would become part of the prefix and of the wildcard
# source path built from it.
tables_to_collate = [
    name.strip()
    for name in dbutils.widgets.get("tables_to_collate").split(",")
    if name.strip()
]

# Get the catalog name from widget
catalog_name = dbutils.widgets.get("catalog_name")

# Get the schema name from widget
schema_name = dbutils.widgets.get("schema_name")

In [0]:
# List all files and directories in the source data base path
tc = dbutils.fs.ls(source_volume_base_path)

# Remove tables to collate from the list of tables to create
tables_to_create = remove_tables_to_collate(tables_to_collate, tc)

# Ingest each table that is not in tables_to_collate
for table_info_obj in tables_to_create:
    # Extract table name by removing trailing slash
    table_name = table_info_obj.name[:-1]
    # Construct the source path for the table
    source_path = f"{source_volume_base_path}/{table_name}/"
    # Ingest the table from the source path
    ingest_table(source_path, catalog_name, schema_name, table_name)

# Ingest each table specified in tables_to_collate using wildcard path
for table_name in tables_to_collate:
    # Construct the source path with wildcard for collated tables
    source_path = f"{source_volume_base_path}/{table_name}*"
    # Ingest the collated table from the source path
    ingest_table(source_path, catalog_name, schema_name, table_name)