In [None]:
import importlib.util
import sys

# Bootstrap cell: install the notebook's dependencies only when `dlt` cannot be
# found in the current environment. The check looks at `dlt` alone; boto3 is
# installed alongside it rather than being checked separately.
if importlib.util.find_spec("dlt") is None:
    # boto3 is pinned to an exact version.
    %pip install -q boto3==1.40.18
    # Pinned-release alternative, currently disabled in favour of the devel branch below:
    #%pip install -q dlt[filesystem,az,deltalake]==1.17.0
    # NOTE(review): this installs from the moving `devel` branch, so the dlt
    # version can differ between runs — consider pinning a tag or commit.
    %pip install -q "dlt[filesystem,az,deltalake] @ git+https://github.com/dlt-hub/dlt.git@devel"

    # Stop after installing. Presumably the notebook has to be re-run so the
    # freshly installed packages become importable — TODO confirm.
    sys.exit(0)

In [None]:
import dlt
import os
import tqdm
import tqdm.auto

# Rebind the top-level `tqdm.tqdm` to the `tqdm.auto` implementation so that any
# code looking up `tqdm.tqdm` after this point gets the auto-selected progress bar.
# NOTE(review): presumably done so dlt's tqdm progress collector renders properly
# inside the notebook — confirm.
tqdm.tqdm = tqdm.auto.tqdm

In [None]:
# Azure Key Vault parameters: the vault URL and the *names* of the secrets that
# hold the Azure tenant id, client id and client secret.
# Only secret names are stored here; the values are looked up at runtime.
KEYVAULT = "https://mattiasthalen-fabric.vault.azure.net"
SECRET__TENANT_ID = "credentials--azure-tenant-id"
SECRET__CLIENT_ID = "credentials--azure-client-id"
SECRET__CLIENT_SECRET = "credentials--azure-client-secret"

In [None]:
# Set environment variables for dlt.
# The destination bucket is the "Tables" folder below the ABFS path that
# notebookutils reports for the lakehouse.
lakehouse_info = notebookutils.lakehouse.get() or {}
abfs_path = (lakehouse_info.get("properties") or {}).get("abfsPath")

# Fail fast: without this check a missing path would be formatted into the
# literal bucket URL "None/Tables" (or raise an opaque AttributeError on None)
# and only surface later as a confusing load failure.
if not abfs_path:
    raise RuntimeError(
        "Could not determine the lakehouse abfsPath from notebookutils.lakehouse.get(); "
        "make sure a lakehouse is attached to this notebook."
    )

os.environ["DESTINATION__BUCKET_URL"] = f"{abfs_path}/Tables"
os.environ["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "onelake"
os.environ["CREDENTIALS__AZURE_ACCOUNT_HOST"] = "onelake.blob.fabric.microsoft.com"

# The Azure credentials are fetched from Key Vault at runtime so that no secret
# value is ever written into the notebook itself.
os.environ["CREDENTIALS__AZURE_TENANT_ID"] = notebookutils.credentials.getSecret(KEYVAULT, SECRET__TENANT_ID)
os.environ["CREDENTIALS__AZURE_CLIENT_ID"] = notebookutils.credentials.getSecret(KEYVAULT, SECRET__CLIENT_ID)
os.environ["CREDENTIALS__AZURE_CLIENT_SECRET"] = notebookutils.credentials.getSecret(KEYVAULT, SECRET__CLIENT_SECRET)

In [None]:
# Define general pipeline function to feed with sources
def run_pipeline(source, table_format="delta", loader_file_format="parquet", dev_mode=True):
    """Run ``source`` through the "raw" dlt pipeline and return the run result.

    The pipeline is always named "raw", writes to the "filesystem" destination
    into the "raw" dataset, and reports progress through dlt's tqdm collector.

    Args:
        source: Whatever should be loaded; handed to ``pipeline.run`` unchanged.
        table_format: Forwarded to ``pipeline.run`` as ``table_format``.
        loader_file_format: Forwarded to ``pipeline.run`` as ``loader_file_format``.
        dev_mode: Forwarded to ``dlt.pipeline`` as ``dev_mode``.

    Returns:
        The value returned by ``pipeline.run``.
    """
    pipeline_settings = {
        "pipeline_name": "raw",
        "destination": "filesystem",
        "dataset_name": "raw",
        "progress": dlt.progress.tqdm(),
        "dev_mode": dev_mode,
    }
    raw_pipeline = dlt.pipeline(**pipeline_settings)

    return raw_pipeline.run(
        source,
        table_format=table_format,
        loader_file_format=loader_file_format,
    )

In [None]:
# Define the tpch source
@dlt.source()
def tpch(source_path="/lakehouse/default/Files/tpch"):
    """Yield one dlt resource per tpch table located under ``source_path``.

    For every table, a filesystem listing matching ``<table>/<table>.*.parquet``
    is piped into the parquet reader, renamed to ``raw__tpch__<table>`` and
    given the ``append`` write disposition. Each listing is incremental on
    ``modification_date`` and pages one file at a time.

    Args:
        source_path: Passed as ``bucket_url`` to the filesystem listing.
    """
    from dlt.sources.filesystem import filesystem, read_parquet

    table_names = (
        "customer",
        "lineitem",
        "nation",
        "orders",
        "part",
        "partsupp",
        "region",
        "supplier",
    )

    for table_name in table_names:
        # A separate listing (with its own incremental object) is built per table.
        parquet_files = filesystem(
            bucket_url=source_path,
            file_glob=f"{table_name}/{table_name}.*.parquet",
            files_per_page=1,
            incremental=dlt.sources.incremental("modification_date"),
        )

        reader = parquet_files | read_parquet(use_pyarrow=True)
        renamed = reader.with_name(f"raw__tpch__{table_name}")

        yield renamed.apply_hints(write_disposition="append")

In [None]:
# Run the pipeline: load the tpch source with dev_mode disabled.
# The result is bound to `_` so the cell does not echo it as output.
_ = run_pipeline(tpch(), dev_mode=False)