# Delta experiments

## Import libraries

In [31]:
import polars as pl
from deltalake import DeltaTable, write_deltalake
from poor_man_lakehouse.config import settings

delta_path = "s3://warehouse/default/test_delta"

# Create a mock Polars DataFrame and save it as a Delta. It should have columns date (in yyyy-mm-dd format), country, and value.
df = pl.DataFrame({
    "date": ["2023-01-01", "2023-01-02", "2023-01-03"],
    "country": ["US", "CA", "MX"],
    "value": [100, 200, 300],
})


In [36]:
minio_storage_options = {
    "aws_access_key_id": "riccardo",
    "aws_secret_access_key": "TKT7wkf_xgh@muj1fxv",
    "aws_default_region": "ita-north",
    "aws_endpoint": "https://minio.intella.internal:9091/",
    "aws_allow_http": "true",
    "": "",
}

pl.scan_delta(
    "s3://oneweb-prd/data/oneweb/oneweb/sat/raw", storage_options=minio_storage_options
).head(5).collect()


OSError: Generic S3 error
          [31m↳[0m Error performing GET https://minio.intella.internal:9091/oneweb-prd/data/oneweb/oneweb/sat/raw/_delta_log/_last_checkpoint in 4.833226083s, after 10 retries, max_retries
           [31m↳[0m 10, retry_timeout
            [31m↳[0m 180s  - HTTP error
             [31m↳[0m error sending request
              [31m↳[0m client error (Connect)
               [31m↳[0m invalid peer certificate
                [31m↳[0m UnknownIssuer


25/09/19 18:43:48 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 209178 ms exceeds timeout 120000 ms
25/09/19 18:43:48 WARN SparkContext: Killing executors is not supported by current scheduler.
25/09/19 18:43:50 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$

In [2]:
# Write Delta using delta-rs write_deltalake function
write_deltalake(
    table_or_uri=delta_path,
    data=df.to_arrow(),
    mode="overwrite",
    name="test_delta",
    partition_by=["date"],
    schema_mode="merge",
    storage_options=settings.S3_STORAGE_OPTIONS,
)


In [3]:
DeltaTable(
    "s3://warehouse/default/test_delta",
    storage_options={
        "AWS_ACCESS_KEY_ID": "minioadmin",
        "AWS_SECRET_ACCESS_KEY": "miniopassword",
        "AWS_ENDPOINT_URL": "http://minio:9000",
        "AWS_REGION": "eu-central-1",
        "AWS_ALLOW_HTTP": "true",
        "aws_conditional_put": "etag",
    },
).to_pandas()

Unnamed: 0,date,country,value
0,2023-01-02,CA,200
1,2023-01-03,MX,300
2,2023-01-01,US,100


In [4]:
# Write Delta using delta-rs write_deltalake function
write_deltalake(
    table_or_uri=delta_path,
    data=df.to_arrow(),
    mode="overwrite",
    name="test_delta",
    partition_by=["date"],
    schema_mode="merge",
    storage_options=settings.S3_STORAGE_OPTIONS,
)


One can also use `write_delta` predicate from Polars

In [5]:
df.write_delta(
    target=delta_path,
    mode="overwrite",
    delta_write_options={
        "partition_by": ["date"],
        "schema_mode": "merge",
        "name": "test_delta",
    },
    storage_options=settings.S3_STORAGE_OPTIONS,
)

## Read Delta

We can read delta using the `DeltaTable` class, and then converting to Polars through Arrow (but that will load the whole table with no possibility of filtering data), or using `scan_delta` and filter before materializing it.

In case the Delta is located in a remote storage, we can pass `storage_options` parameter (to both Polars and DeltaTable).

In [6]:
dt = DeltaTable(delta_path, storage_options=settings.S3_STORAGE_OPTIONS)
pl.from_arrow(dt.to_pyarrow_table()).select("date", "country", "value").sort("date")

date,country,value
str,str,i64
"""2023-01-01""","""US""",100
"""2023-01-02""","""CA""",200
"""2023-01-03""","""MX""",300


In [7]:
pl.scan_delta(delta_path, storage_options=settings.S3_STORAGE_OPTIONS).collect().select(
    "date", "country", "value"
).sort("date")

date,country,value
str,str,i64
"""2023-01-01""","""US""",100
"""2023-01-02""","""CA""",200
"""2023-01-03""","""MX""",300


## Read Delta using DuckDB

In [8]:
import duckdb

# Query the Delta table using DuckDB
duckdb.sql(f"""
CREATE SECRET if not exists s3_secret (
    TYPE S3,
    KEY_ID '{settings.AWS_ACCESS_KEY_ID}',
    SECRET '{settings.AWS_SECRET_ACCESS_KEY}',
    ENDPOINT 'minio:9000',
    REGION '{settings.AWS_DEFAULT_REGION}',
    URL_STYLE 'path',
    USE_SSL false
);
            """)
duckdb.sql(f"""
SELECT date, country, value FROM delta_scan('{delta_path}' ORDER BY date)
           """)


┌────────────┬─────────┬───────┐
│    date    │ country │ value │
│  varchar   │ varchar │ int64 │
├────────────┼─────────┼───────┤
│ 2023-01-03 │ MX      │   300 │
│ 2023-01-01 │ US      │   100 │
│ 2023-01-02 │ CA      │   200 │
└────────────┴─────────┴───────┘

## Upsert

Create another DataFrame with a 2 rows, one that will be appended and one that will be overwritten (considering `date` as join column).

In [9]:
upsert_df = pl.DataFrame({
    "date": ["2023-01-01", "2023-01-04"],
    "country": ["US", "IT"],
    "value": [150, 250],
})
upsert_df

date,country,value
str,str,i64
"""2023-01-01""","""US""",150
"""2023-01-04""","""IT""",250


In [10]:
# Upsert the new data into the Delta table
dt.merge(
    source=upsert_df.to_arrow(),
    source_alias="source",
    target_alias="target",
    predicate="source.date = target.date",
    merge_schema=True,
).when_matched_update_all().when_not_matched_insert_all().execute()


[90m[[0m2025-09-19T15:15:51Z [33mWARN [0m datafusion_datasource_parquet::source[90m][0m The SchemaAdapter API will be removed from ParquetSource in a future release. Use PhysicalExprAdapterFactory API instead. See https://github.com/apache/datafusion/issues/16800 for discussion and https://datafusion.apache.org/library-user-guide/upgrading.html#datafusion-49-0-0 for upgrade instructions.
[90m[[0m2025-09-19T15:15:51Z [33mWARN [0m datafusion_datasource_parquet::source[90m][0m The SchemaAdapter API will be removed from ParquetSource in a future release. Use PhysicalExprAdapterFactory API instead. See https://github.com/apache/datafusion/issues/16800 for discussion and https://datafusion.apache.org/library-user-guide/upgrading.html#datafusion-49-0-0 for upgrade instructions.
[90m[[0m2025-09-19T15:15:51Z [33mWARN [0m datafusion_datasource_parquet::source[90m][0m The SchemaAdapter API will be removed from ParquetSource in a future release. Use PhysicalExprAdapterFactory API

{'num_source_rows': 2,
 'num_target_rows_inserted': 1,
 'num_target_rows_updated': 1,
 'num_target_rows_deleted': 0,
 'num_target_rows_copied': 0,
 'num_output_rows': 2,
 'num_target_files_scanned': 3,
 'num_target_files_skipped_during_scan': 0,
 'num_target_files_added': 2,
 'num_target_files_removed': 1,
 'execution_time_ms': 31,
 'scan_time_ms': 64,
 'rewrite_time_ms': 30}

In [11]:
# Verify the upsert
pl.scan_delta(delta_path, storage_options=settings.S3_STORAGE_OPTIONS).collect().select(
    "date", "country", "value"
).sort("date")


date,country,value
str,str,i64
"""2023-01-01""","""US""",150
"""2023-01-02""","""CA""",200
"""2023-01-03""","""MX""",300
"""2023-01-04""","""IT""",250


## Read Delta History

In [12]:
pl.from_dicts(dt.history()).sort("timestamp")

timestamp,operation,operationParameters,readVersion,engineInfo,clientVersion,operationMetrics,version
i64,str,struct[9],i64,str,str,struct[28],i64
1758294163652,"""WRITE""","{null,null,null,null,""Overwrite"",""[""date""]"",null,null,null}",,"""delta-rs:py-1.1.4""","""delta-rs.py-1.1.4""","{57,null,null,null,null,null,null,null,null,null,null,null,null,3,3,0,0,null,null,null,null,null,null,null,null,null,null,null}",0
1758294328403,"""WRITE""","{null,null,null,null,""Overwrite"",""[""date""]"",null,null,null}",,"""delta-rs:py-1.1.4""","""delta-rs.py-1.1.4""","{128,null,null,null,null,null,null,null,null,null,null,null,null,3,3,0,3,null,null,null,null,null,null,null,null,null,null,null}",1
1758294347120,"""WRITE""","{null,null,null,null,""Overwrite"",""[""date""]"",null,null,null}",,"""delta-rs:py-1.1.4""","""delta-rs.py-1.1.4""","{38,null,null,null,null,null,null,null,null,null,null,null,null,3,3,0,3,null,null,null,null,null,null,null,null,null,null,null}",2
1758294728558,"""WRITE""","{null,null,null,null,""Overwrite"",""[""date""]"",null,null,null}",,"""delta-rs:py-1.1.4""","""delta-rs.py-1.1.4""","{28,null,null,null,null,null,null,null,null,null,null,null,null,3,3,0,3,null,null,null,null,null,null,null,null,null,null,null}",3
1758294738597,"""WRITE""","{null,null,null,null,""Overwrite"",""[""date""]"",null,null,null}",,"""delta-rs:py-1.1.4""","""delta-rs.py-1.1.4""","{20,null,null,null,null,null,null,null,null,null,null,null,null,3,3,0,3,null,null,null,null,null,null,null,null,null,null,null}",4
…,…,…,…,…,…,…,…
1758294798753,"""OPTIMIZE""","{null,null,null,null,null,null,""104857600"",""[]"",null}",8,"""delta-rs:py-1.1.4""","""delta-rs.py-1.1.4""","{null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,""{""avg"":827.0,""max"":827,""min"":827,""totalFiles"":3,""totalSize"":2481}"",""{""avg"":798.0,""max"":798,""min"":798,""totalFiles"":3,""totalSize"":2394}"",3,3,3,0,true,3,0,null,null}",9
1758294950379,"""WRITE""","{null,null,null,null,""Overwrite"",""[""date""]"",null,null,null}",,"""delta-rs:py-1.1.4""","""delta-rs.py-1.1.4""","{22,null,null,null,null,null,null,null,null,null,null,null,null,3,3,0,3,null,null,null,null,null,null,null,null,null,null,null}",10
1758294950776,"""WRITE""","{null,null,null,null,""Overwrite"",""[""date""]"",null,null,null}",,"""delta-rs:py-1.1.4""","""delta-rs.py-1.1.4""","{18,null,null,null,null,null,null,null,null,null,null,null,null,3,3,0,3,null,null,null,null,null,null,null,null,null,null,null}",11
1758294951082,"""WRITE""","{null,null,null,null,""Overwrite"",""[""date""]"",null,null,null}",,"""delta-rs:py-1.1.4""","""delta-rs.py-1.1.4""","{16,null,null,null,null,null,null,null,null,null,null,null,null,3,3,0,3,null,null,null,null,null,null,null,null,null,null,null}",12


## Perform Time Travel, Vacuum and Optimize

Delta has some nice features:

- **Time travel**: Restore old version
- **Vacuum**: Remove data not referenced by available table version
- **Optimize**: Compact smaller files together

In [13]:
# Load initial version from Polars
pl.scan_delta(
    delta_path, version=0, storage_options=settings.S3_STORAGE_OPTIONS
).collect().select("date", "country", "value").sort("date")

date,country,value
str,str,i64
"""2023-01-01""","""US""",100
"""2023-01-02""","""CA""",200
"""2023-01-03""","""MX""",300


In [14]:
# Restore initial version
dt.restore(target=0)
pl.scan_delta(delta_path, storage_options=settings.S3_STORAGE_OPTIONS).collect().select(
    "date", "country", "value"
).sort("date")

date,country,value
str,str,i64
"""2023-01-01""","""US""",100
"""2023-01-02""","""CA""",200
"""2023-01-03""","""MX""",300


However, restoring a Delta causes DuckDB to break on read.

In [15]:
import duckdb

# Query the Delta table using DuckDB
duckdb.sql(f"""
SELECT * FROM delta_scan('{delta_path}')
           """)


┌────────────┬─────────┬───────┐
│    date    │ country │ value │
│  varchar   │ varchar │ int64 │
├────────────┼─────────┼───────┤
│ 2023-01-03 │ MX      │   300 │
│ 2023-01-02 │ CA      │   200 │
│ 2023-01-01 │ US      │   100 │
└────────────┴─────────┴───────┘

In [16]:
# Restore latest version
print(dt.restore(target=1))
# Verify the restored version
pl.scan_delta(delta_path, storage_options=settings.S3_STORAGE_OPTIONS).collect().select(
    "date", "country", "value"
).sort("date")

{'numRemovedFile': 3, 'numRestoredFile': 3}


date,country,value
str,str,i64
"""2023-01-01""","""US""",100
"""2023-01-02""","""CA""",200
"""2023-01-03""","""MX""",300


The following command will remove all files that are not referenced by Delta versions in history. It will output a list of deleted files, that are not restorable.

A `retention_hours` period can be specified, along with `dry_run`. `enforce_retention_duration` needs to be set at False in case `retention_hours` is lower than the Delta default one, which is 168 hours in case not specified differently at Delta creation with `configs` param.

In [17]:
# Vacuum Delta
dt.vacuum(retention_hours=0, dry_run=True, enforce_retention_duration=False)

['date=2023-01-01/part-00001-0365cfce-dd8b-48cd-a330-b8cb390b3c50-c000.snappy.parquet',
 'date=2023-01-01/part-00001-14813245-2dda-4f09-b05d-b8b03ebc0664-c000.snappy.parquet',
 'date=2023-01-01/part-00001-1dc0180c-8382-463c-81b6-06c26fcd1bd6-c000.zstd.parquet',
 'date=2023-01-01/part-00001-253262a2-0378-4231-b770-6c890b540721-c000.snappy.parquet',
 'date=2023-01-01/part-00001-4d5ea80d-0540-43e7-90a6-d784b85dc3d9-c000.snappy.parquet',
 'date=2023-01-01/part-00001-7b952d61-64f6-4529-a07f-8a563d26f390-c000.snappy.parquet',
 'date=2023-01-01/part-00001-7f587bad-cb1d-4c92-be86-aa3372c57716-c000.snappy.parquet',
 'date=2023-01-01/part-00001-802b3b38-af95-441f-ad85-bc30f5a14f16-c000.snappy.parquet',
 'date=2023-01-01/part-00001-95c265b6-84ff-42ae-a6a6-4ff12a110d9d-c000.snappy.parquet',
 'date=2023-01-01/part-00001-b178a23e-4337-4de8-8aea-9f8829a5d97c-c000.snappy.parquet',
 'date=2023-01-01/part-00001-ec152e65-8c82-4207-bb3a-47d054f4b695-c000.snappy.parquet',
 'date=2023-01-02/part-00001-03777

*Optimize* will compact smaller files (think about small insertion) together, so that queries can be speed up by looking at less files.

In [18]:
dt.optimize.compact()

{'numFilesAdded': 0,
 'numFilesRemoved': 0,
 'filesAdded': '{"avg":0.0,"max":0,"min":0,"totalFiles":0,"totalSize":0}',
 'filesRemoved': '{"avg":0.0,"max":0,"min":0,"totalFiles":0,"totalSize":0}',
 'partitionsOptimized': 0,
 'numBatches': 0,
 'totalConsideredFiles': 3,
 'totalFilesSkipped': 3,
 'preserveInsertionOrder': True}

## Z-Order

Z-Order is a way to reorganize data in storage in order to optimize queries. It allows for more data-skipping by colocating relevant files together. Think about it a smart sorting of files based on one or multiple columns

In [19]:
dt.optimize.z_order(columns=["country"])

[90m[[0m2025-09-19T15:15:53Z [33mWARN [0m datafusion_datasource_parquet::source[90m][0m The SchemaAdapter API will be removed from ParquetSource in a future release. Use PhysicalExprAdapterFactory API instead. See https://github.com/apache/datafusion/issues/16800 for discussion and https://datafusion.apache.org/library-user-guide/upgrading.html#datafusion-49-0-0 for upgrade instructions.
[90m[[0m2025-09-19T15:15:53Z [33mWARN [0m datafusion_datasource_parquet::source[90m][0m The SchemaAdapter API will be removed from ParquetSource in a future release. Use PhysicalExprAdapterFactory API instead. See https://github.com/apache/datafusion/issues/16800 for discussion and https://datafusion.apache.org/library-user-guide/upgrading.html#datafusion-49-0-0 for upgrade instructions.
[90m[[0m2025-09-19T15:15:53Z [33mWARN [0m datafusion_datasource_parquet::source[90m][0m The SchemaAdapter API will be removed from ParquetSource in a future release. Use PhysicalExprAdapterFactory API

{'numFilesAdded': 3,
 'numFilesRemoved': 3,
 'filesAdded': '{"avg":827.0,"max":827,"min":827,"totalFiles":3,"totalSize":2481}',
 'filesRemoved': '{"avg":798.0,"max":798,"min":798,"totalFiles":3,"totalSize":2394}',
 'partitionsOptimized': 0,
 'numBatches': 3,
 'totalConsideredFiles': 3,
 'totalFilesSkipped': 0,
 'preserveInsertionOrder': True}

## Unity Catalog experiments

In [20]:
import polars as pl

c = pl.Catalog(workspace_url="http://localhost:8080", require_https=False)
c.list_catalogs()

[CatalogInfo(name='unity', comment='Main catalog', properties={}, options={}, storage_location=None, created_at=datetime.datetime(2024, 7, 17, 18, 40, 5, 334000, tzinfo=datetime.timezone.utc), created_by=None, updated_at=None, updated_by=None)]

In [21]:
c.list_tables(catalog_name="unity", namespace="default")

[TableInfo(name='marksheet', comment='Managed table', table_id='c389adfa-5c8f-497b-8f70-26c2cca4976d', table_type='MANAGED', storage_location='file:///home/unitycatalog/etc/data/managed/unity/default/tables/marksheet/', data_source_format='DELTA', columns=[ColumnInfo(name='id', type_name='INT', type_text='int', type_json='{"name":"id","type":"integer","nullable":false,"metadata":{}}', position=0, comment='ID primary key', partition_index=None), ColumnInfo(name='name', type_name='STRING', type_text='string', type_json='{"name":"name","type":"string","nullable":false,"metadata":{}}', position=1, comment='Name of the entity', partition_index=None), ColumnInfo(name='marks', type_name='INT', type_text='int', type_json='{"name":"marks","type":"integer","nullable":true,"metadata":{}}', position=2, comment='Marks of the entity', partition_index=None)], properties={'key1': 'value1', 'key2': 'value2'}, created_at=datetime.datetime(2024, 7, 17, 18, 40, 5, 595000, tzinfo=datetime.timezone.utc), cr

In [22]:
# mock dataframe
df = pl.DataFrame({
    "date": ["2023-01-01", "2023-01-02", "2023-01-03"],
    "country": ["US", "CA", "MX"],
    "value": [100, 200, 300],
})

[TableInfo(name='marksheet', comment='Managed table', table_id='c389adfa-5c8f-497b-8f70-26c2cca4976d', table_type='MANAGED', storage_location='file:///home/unitycatalog/etc/data/managed/unity/default/tables/marksheet/', data_source_format='DELTA', columns=[ColumnInfo(name='id', type_name='INT', type_text='int', type_json='{"name":"id","type":"integer","nullable":false,"metadata":{}}', position=0, comment='ID primary key', partition_index=None), ColumnInfo(name='name', type_name='STRING', type_text='string', type_json='{"name":"name","type":"string","nullable":false,"metadata":{}}', position=1, comment='Name of the entity', partition_index=None), ColumnInfo(name='marks', type_name='INT', type_text='int', type_json='{"name":"marks","type":"integer","nullable":true,"metadata":{}}', position=2, comment='Marks of the entity', partition_index=None)], properties={'key2': 'value2', 'key1': 'value1'}, created_at=datetime.datetime(2024, 7, 17, 18, 40, 5, 595000, tzinfo=datetime.timezone.utc), cr

In [27]:
delta_path = "s3://warehouse/delta_unity_catalog"

tables = [
    table.name for table in c.list_tables(catalog_name="unity", namespace="default")
]
if "test_table" in tables:
    c.delete_table(catalog_name="unity", namespace="default", table_name="test_table")
c.create_table(
    catalog_name="unity",
    namespace="default",
    table_name="test_table",
    schema=df.schema,
    table_type="EXTERNAL",
    data_source_format="DELTA",
    storage_root=delta_path,
)

TableInfo(name='test_table', comment=None, table_id='d357a36c-cb77-41e9-b944-48a9e6cf46e8', table_type='EXTERNAL', storage_location='s3://warehouse/delta_unity_catalog', data_source_format='DELTA', columns=[ColumnInfo(name='date', type_name='STRING', type_text='string', type_json='{"name":"date","type":"string","nullable":true,"metadata":{}}', position=0, comment=None, partition_index=None), ColumnInfo(name='country', type_name='STRING', type_text='string', type_json='{"name":"country","type":"string","nullable":true,"metadata":{}}', position=1, comment=None, partition_index=None), ColumnInfo(name='value', type_name='LONG', type_text='bigint', type_json='{"name":"value","type":"long","nullable":true,"metadata":{}}', position=2, comment=None, partition_index=None)], properties={}, created_at=datetime.datetime(2025, 9, 19, 15, 18, 2, 377000, tzinfo=datetime.timezone.utc), created_by=None, updated_at=datetime.datetime(2025, 9, 19, 15, 18, 2, 377000, tzinfo=datetime.timezone.utc), updated_

In [28]:
c.write_table(
    df=df,
    catalog_name="unity",
    namespace="default",
    delta_mode="overwrite",
    table_name="test_table",
    storage_options=settings.S3_STORAGE_OPTIONS,
)

In [29]:
c.scan_table(
    catalog_name="unity",
    namespace="default",
    table_name="test_table",
    storage_options=settings.S3_STORAGE_OPTIONS,
).collect()

date,country,value
str,str,i64
"""2023-01-01""","""US""",100
"""2023-01-02""","""CA""",200
"""2023-01-03""","""MX""",300


In [30]:
import duckdb

duckdb.sql("""
INSTALL uc_catalog;
INSTALL delta;
LOAD delta;
LOAD uc_catalog;
CREATE OR REPLACE SECRET uc_secret (
	TYPE UC,
	ENDPOINT 'http://127.0.0.1:8080',
    TOKEN 'not used',
    AWS_REGION 'eu-central-1'

);
ATTACH IF NOT EXISTS 'unity' AS test_catalog (TYPE UC_CATALOG, SECRET uc_secret);
SHOW ALL TABLES;
""")

┌──────────┬─────────┬─────────┬──────────────┬──────────────┬───────────┐
│ database │ schema  │  name   │ column_names │ column_types │ temporary │
│ varchar  │ varchar │ varchar │  varchar[]   │  varchar[]   │  boolean  │
├──────────┴─────────┴─────────┴──────────────┴──────────────┴───────────┤
│                                 0 rows                                 │
└────────────────────────────────────────────────────────────────────────┘