Set up a pipeline that can be made to fail at each step (extract, normalize, or load).

In [None]:
import dlt


# define a resource with an intentional failure
@dlt.resource(name="example_resource")
def example_resource(should_fail_extraction: bool = False):
    """Yield the rows ``{"value": 0}`` through ``{"value": 9}``.

    When ``should_fail_extraction`` is truthy, the generator raises an
    exception on reaching value 5, i.e. after rows 0-4 have been yielded.
    """
    # None never compares equal to an int, so no failure is triggered
    # unless the flag is set.
    failing_value = 5 if should_fail_extraction else None

    for value in range(10):
        if value == failing_value:
            raise Exception("Intentional failure at extraction step (i=5)")
        yield {"value": value}

def run_pipeline(should_fail_extraction: bool, should_fail_normalization: bool, should_fail_load: bool):
    """Run extract, normalize and load as separate steps, optionally failing at one.

    Args:
        should_fail_extraction: Forwarded to ``example_resource``; when true,
            the resource raises while it is being extracted.
        should_fail_normalization: When true, an exception is raised in place
            of the normalize step.
        should_fail_load: When true, an exception is raised in place of the
            load step.

    Returns:
        The pipeline object, whether or not every step succeeded. Any
        exception raised during the steps is caught and printed rather than
        propagated.
    """
    pipeline = dlt.pipeline(
        pipeline_name="failure_simulation_pipeline",
        destination="duckdb",
        dataset_name="test_dataset",
        pipelines_dir="pipeline_jobs",  # directory to store .dlt files
    )

    try:
        # extract
        extraction = pipeline.extract(example_resource(should_fail_extraction=should_fail_extraction))
        print("Extraction completed successfully.")
        print(f"Extracted data: {extraction}")

        # normalize
        if should_fail_normalization:
            # Raise directly instead of overwriting pipeline.normalize: the
            # pipeline is returned to the caller, and a replaced method would
            # stay broken on that object (no retry or inspection possible).
            raise Exception("Intentional failure at normalization step")

        normalization = pipeline.normalize()
        print("Normalization completed successfully.")
        print(f"Normalized data: {normalization}")

        # load
        if should_fail_load:
            # Same reasoning as above: keep pipeline.load intact.
            raise Exception("Intentional failure at load step")

        load_result = pipeline.load()
        print("Load completed successfully.")
        print(f"Load result: {load_result}")

        print("Pipeline completed successfully.")

    except Exception as e:
        # Deliberate catch-all: this demo reports the failure and still
        # returns the pipeline so its state can be examined afterwards.
        print(f"EXCEPTION during pipeline execution: {e}")

    return pipeline

# Run the pipeline; flip any switch to True to simulate a failure at that step.
failure_switches = {
    "should_fail_extraction": False,
    "should_fail_normalization": False,
    "should_fail_load": False,
}
my_pipeline = run_pipeline(**failure_switches)


Print the tables in the destination and the loaded data.

In [None]:
# List the tables visible through the pipeline's SQL client.
with my_pipeline.sql_client() as client:
    show_tables_result = client.execute("SHOW TABLES")
    tables_df = show_tables_result.df()
print("TABLES:", tables_df, sep="\n")

# Fetch every row of the example_resource table into a DataFrame.
with my_pipeline.sql_client() as client:
    select_result = client.execute("SELECT * FROM example_resource")
    df = select_result.df()
df