# Delta experiments

## Import libraries

In [1]:
import polars as pl
from deltalake import DeltaTable, write_deltalake

from poor_man_lakehouse.config import settings

delta_path = "s3://warehouse/default/test_delta"

# Create a mock Polars DataFrame and save it as a Delta. It should have columns date (in yyyy-mm-dd format), country, and value.
df = pl.DataFrame({
    "date": ["2023-01-01", "2023-01-02", "2023-01-03"],
    "country": ["US", "CA", "MX"],
    "value": [100, 200, 300],
})


In [2]:
# Write Delta using delta-rs write_deltalake function
write_deltalake(
    table_or_uri=delta_path,
    data=df.to_arrow(),
    mode="overwrite",
    name="test_delta",
    partition_by=["date"],
    schema_mode="merge",
    storage_options=settings.S3_STORAGE_OPTIONS,
)


In [3]:
DeltaTable(
    "s3://warehouse/default/test_delta",
    storage_options={
        "AWS_ACCESS_KEY_ID": "minioadmin",
        "AWS_SECRET_ACCESS_KEY": "miniopassword",
        "AWS_ENDPOINT_URL": "http://localhost:9000",
        "AWS_REGION": "eu-central-1",
        "AWS_ALLOW_HTTP": "true",
        "aws_conditional_put": "etag",
    },
).to_pandas()

Unnamed: 0,date,country,value
0,2023-01-02,CA,200
1,2023-01-01,US,100
2,2023-01-03,MX,300


One can also use `write_delta` predicate from Polars

In [5]:
df.write_delta(
    target=delta_path,
    mode="overwrite",
    delta_write_options={
        "partition_by": ["date"],
        "schema_mode": "merge",
        "name": "test_delta",
    },
    storage_options=settings.S3_STORAGE_OPTIONS,
)

## Read Delta

We can read delta using the `DeltaTable` class, and then converting to Polars through Arrow (but that will load the whole table with no possibility of filtering data), or using `scan_delta` and filter before materializing it.

In case the Delta is located in a remote storage, we can pass `storage_options` parameter (to both Polars and DeltaTable).

In [6]:
dt = DeltaTable(delta_path, storage_options=settings.S3_STORAGE_OPTIONS)
pl.from_arrow(dt.to_pyarrow_table()).select("date", "country", "value").sort("date")

date,country,value
str,str,i64
"""2023-01-01""","""US""",100
"""2023-01-02""","""CA""",200
"""2023-01-03""","""MX""",300


In [8]:
pl.scan_delta(delta_path, storage_options=settings.S3_STORAGE_OPTIONS).collect().select(
    "date", "country", "value"
).sort("date")

date,country,value
str,str,i64
"""2023-01-01""","""US""",100
"""2023-01-02""","""CA""",200
"""2023-01-03""","""MX""",300


## Read Delta using DuckDB

In [9]:
import duckdb

# Query the Delta table using DuckDB
duckdb.sql(f"""
CREATE SECRET if not exists s3_secret (
    TYPE S3,
    KEY_ID '{settings.AWS_ACCESS_KEY_ID}',
    SECRET '{settings.AWS_SECRET_ACCESS_KEY}',
    ENDPOINT 'minio:9000',
    REGION '{settings.AWS_DEFAULT_REGION}',
    URL_STYLE 'path',
    USE_SSL false
);
            """)
duckdb.sql(f"""
SELECT date, country, value FROM delta_scan('{delta_path}' ORDER BY date)
           """)


┌────────────┬─────────┬───────┐
│    date    │ country │ value │
│  varchar   │ varchar │ int64 │
├────────────┼─────────┼───────┤
│ 2023-01-03 │ MX      │   300 │
│ 2023-01-02 │ CA      │   200 │
│ 2023-01-01 │ US      │   100 │
└────────────┴─────────┴───────┘

## Upsert

Create another DataFrame with a 2 rows, one that will be appended and one that will be overwritten (considering `date` as join column).

In [10]:
upsert_df = pl.DataFrame({
    "date": ["2023-01-01", "2023-01-04"],
    "country": ["US", "IT"],
    "value": [150, 250],
})
upsert_df

date,country,value
str,str,i64
"""2023-01-01""","""US""",150
"""2023-01-04""","""IT""",250


In [11]:
# Upsert the new data into the Delta table
dt.merge(
    source=upsert_df.to_arrow(),
    source_alias="source",
    target_alias="target",
    predicate="source.date = target.date",
    merge_schema=True,
).when_matched_update_all().when_not_matched_insert_all().execute()


{'num_source_rows': 2,
 'num_target_rows_inserted': 1,
 'num_target_rows_updated': 1,
 'num_target_rows_deleted': 0,
 'num_target_rows_copied': 0,
 'num_output_rows': 2,
 'num_target_files_scanned': 3,
 'num_target_files_skipped_during_scan': 0,
 'num_target_files_added': 2,
 'num_target_files_removed': 1,
 'execution_time_ms': 30,
 'scan_time_ms': 17,
 'rewrite_time_ms': 0}

In [12]:
# Verify the upsert
pl.scan_delta(delta_path, storage_options=settings.S3_STORAGE_OPTIONS).collect().select(
    "date", "country", "value"
).sort("date")


date,country,value
str,str,i64
"""2023-01-01""","""US""",150
"""2023-01-02""","""CA""",200
"""2023-01-03""","""MX""",300
"""2023-01-04""","""IT""",250


## Read Delta History

In [13]:
pl.from_dicts(dt.history()).sort("timestamp")

timestamp,operation,operationParameters,readVersion,engineInfo,operationMetrics,clientVersion,version
i64,str,struct[6],i64,str,struct[17],str,i64
1766075904979,"""WRITE""","{null,null,null,null,""[""date""]"",""Overwrite""}",,"""delta-rs:py-1.2.1""","{18,null,null,null,null,null,null,null,null,null,null,null,null,3,3,0,0}","""delta-rs.py-1.2.1""",0
1766075923970,"""WRITE""","{null,null,null,null,""[""date""]"",""Overwrite""}",,"""delta-rs:py-1.2.1""","{13,null,null,null,null,null,null,null,null,null,null,null,null,3,3,0,3}","""delta-rs.py-1.2.1""",1
1766075925952,"""WRITE""","{null,null,null,null,""[""date""]"",""Overwrite""}",,"""delta-rs:py-1.2.1""","{6,null,null,null,null,null,null,null,null,null,null,null,null,3,3,0,3}","""delta-rs.py-1.2.1""",2
1766075933420,"""MERGE""","{""[{""actionType"":""update""}]"",""source.date = target.date"",""[{""actionType"":""insert""}]"",""[]"",null,null}",2.0,"""delta-rs:py-1.2.1""","{30,2,2,2,1,3,0,0,0,1,1,0,17,null,null,null,null}","""delta-rs.py-1.2.1""",3


## Perform Time Travel, Vacuum and Optimize

Delta has some nice features:

- **Time travel**: Restore old version
- **Vacuum**: Remove data not referenced by available table version
- **Optimize**: Compact smaller files together

In [14]:
# Load initial version from Polars
pl.scan_delta(
    delta_path, version=0, storage_options=settings.S3_STORAGE_OPTIONS
).collect().select("date", "country", "value").sort("date")

date,country,value
str,str,i64
"""2023-01-01""","""US""",100
"""2023-01-02""","""CA""",200
"""2023-01-03""","""MX""",300


In [7]:
# Restore initial version
dt.restore(target=0)
pl.scan_delta(delta_path, storage_options=settings.S3_STORAGE_OPTIONS).collect().select(
    "date", "country", "value"
).sort("date")

date,country,value
str,str,i64
"""2023-01-01""","""US""",100
"""2023-01-02""","""CA""",200
"""2023-01-03""","""MX""",300


However, restoring a Delta causes DuckDB to break on read.

In [None]:
import duckdb

# Query the Delta table using DuckDB
duckdb.sql(f"""
SELECT * FROM delta_scan('{delta_path}')
           """)


In [8]:
# Restore latest version
print(dt.restore(target=1))
# Verify the restored version
pl.scan_delta(delta_path, storage_options=settings.S3_STORAGE_OPTIONS).collect().select(
    "date", "country", "value"
).sort("date")

{'numRemovedFile': 3, 'numRestoredFile': 3}


date,country,value
str,str,i64
"""2023-01-01""","""US""",100
"""2023-01-02""","""CA""",200
"""2023-01-03""","""MX""",300


The following command will remove all files that are not referenced by Delta versions in history. It will output a list of deleted files, that are not restorable.

A `retention_hours` period can be specified, along with `dry_run`. `enforce_retention_duration` needs to be set at False in case `retention_hours` is lower than the Delta default one, which is 168 hours in case not specified differently at Delta creation with `configs` param.

In [9]:
# Vacuum Delta
dt.vacuum(retention_hours=0, dry_run=True, enforce_retention_duration=False)

['date=2023-01-01/part-00000-25f438a1-2e03-49c9-8edb-e80dab799c96-c000.snappy.parquet',
 'date=2023-01-01/part-00000-9ab61169-691d-4794-a2b2-3380b36a1139-c000.snappy.parquet',
 'date=2023-01-02/part-00000-d43514e0-71df-437b-87e9-0540a632899d-c000.snappy.parquet',
 'date=2023-01-02/part-00000-ed9d7412-ecdc-4fb6-9130-a0ad0072b441-c000.snappy.parquet',
 'date=2023-01-03/part-00000-cafde99e-d9ea-4ed9-9fb4-eaf9982c5ae6-c000.snappy.parquet',
 'date=2023-01-03/part-00000-d284464b-f3a9-4d2f-a537-5335764213be-c000.snappy.parquet']

*Optimize* will compact smaller files (think about small insertion) together, so that queries can be speed up by looking at less files.

In [10]:
dt.optimize.compact()

{'numFilesAdded': 0,
 'numFilesRemoved': 0,
 'filesAdded': '{"avg":0.0,"max":0,"min":0,"totalFiles":0,"totalSize":0}',
 'filesRemoved': '{"avg":0.0,"max":0,"min":0,"totalFiles":0,"totalSize":0}',
 'partitionsOptimized': 0,
 'numBatches': 0,
 'totalConsideredFiles': 3,
 'totalFilesSkipped': 3,
 'preserveInsertionOrder': True}

## Z-Order

Z-Order is a way to reorganize data in storage in order to optimize queries. It allows for more data-skipping by colocating relevant files together. Think about it a smart sorting of files based on one or multiple columns

In [11]:
dt.optimize.z_order(columns=["country"])

{'numFilesAdded': 3,
 'numFilesRemoved': 3,
 'filesAdded': '{"avg":785.0,"max":785,"min":785,"totalFiles":3,"totalSize":2355}',
 'filesRemoved': '{"avg":755.0,"max":755,"min":755,"totalFiles":3,"totalSize":2265}',
 'partitionsOptimized': 0,
 'numBatches': 3,
 'totalConsideredFiles': 3,
 'totalFilesSkipped': 0,
 'preserveInsertionOrder': True}

## Unity Catalog experiments

In [30]:
import polars as pl

c = pl.Catalog(workspace_url="http://localhost:8080", require_https=False)
c.list_catalogs()

[CatalogInfo(name='unity', comment='Main catalog', properties={}, options={}, storage_location=None, created_at=datetime.datetime(2024, 7, 17, 18, 40, 5, 334000, tzinfo=datetime.timezone.utc), created_by=None, updated_at=None, updated_by=None)]

In [13]:
c.list_tables(catalog_name="unity", namespace="default")

[TableInfo(name='marksheet', comment='Managed table', table_id='c389adfa-5c8f-497b-8f70-26c2cca4976d', table_type='MANAGED', storage_location='file:///home/unitycatalog/etc/data/managed/unity/default/tables/marksheet', data_source_format='DELTA', columns=[ColumnInfo(name='id', type_name='INT', type_text='int', type_json='{"name":"id","type":"integer","nullable":false,"metadata":{}}', position=0, comment='ID primary key', partition_index=None), ColumnInfo(name='name', type_name='STRING', type_text='string', type_json='{"name":"name","type":"string","nullable":false,"metadata":{}}', position=1, comment='Name of the entity', partition_index=None), ColumnInfo(name='marks', type_name='INT', type_text='int', type_json='{"name":"marks","type":"integer","nullable":true,"metadata":{}}', position=2, comment='Marks of the entity', partition_index=None)], properties={'key2': 'value2', 'key1': 'value1'}, created_at=datetime.datetime(2024, 7, 17, 18, 40, 5, 595000, tzinfo=datetime.timezone.utc), cre

In [20]:
# mock dataframe
df = pl.DataFrame({
    "date": ["2023-01-01", "2023-01-02", "2023-01-03"],
    "country": ["US", "CA", "MX"],
    "value": [100, 200, 300],
})

In [31]:
delta_path = "s3://warehouse/delta_unity_catalog"

tables = [
    table.name for table in c.list_tables(catalog_name="unity", namespace="default")
]
if "test_table" in tables:
    print("Deleting existing table 'test_table'")
    c.delete_table(catalog_name="unity", namespace="default", table_name="test_table")
c.create_table(
    catalog_name="unity",
    namespace="default",
    table_name="test_table",
    schema=df.schema,
    table_type="EXTERNAL",
    data_source_format="DELTA",
    storage_root=delta_path,
)

Deleting existing table 'test_table'


TableInfo(name='test_table', comment=None, table_id='a9dc17e3-feda-4221-a00a-fcd6a90ff56b', table_type='EXTERNAL', storage_location='s3://warehouse/delta_unity_catalog', data_source_format='DELTA', columns=[ColumnInfo(name='date', type_name='STRING', type_text='string', type_json='{"name":"date","type":"string","nullable":true,"metadata":{}}', position=0, comment=None, partition_index=None), ColumnInfo(name='country', type_name='STRING', type_text='string', type_json='{"name":"country","type":"string","nullable":true,"metadata":{}}', position=1, comment=None, partition_index=None), ColumnInfo(name='value', type_name='LONG', type_text='bigint', type_json='{"name":"value","type":"long","nullable":true,"metadata":{}}', position=2, comment=None, partition_index=None)], properties={}, created_at=datetime.datetime(2026, 1, 30, 15, 49, 7, 236000, tzinfo=datetime.timezone.utc), created_by=None, updated_at=datetime.datetime(2026, 1, 30, 15, 49, 7, 236000, tzinfo=datetime.timezone.utc), updated_

In [32]:
c.write_table(
    df=df,
    catalog_name="unity",
    namespace="default",
    delta_mode="overwrite",
    table_name="test_table",
    storage_options=settings.S3_STORAGE_OPTIONS,
)

In [33]:
c.scan_table(
    catalog_name="unity",
    namespace="default",
    table_name="test_table",
    storage_options=settings.S3_STORAGE_OPTIONS,
).collect()

date,country,value
str,str,i64
"""2023-01-01""","""US""",100
"""2023-01-02""","""CA""",200
"""2023-01-03""","""MX""",300


In [29]:
import duckdb

duckdb.sql("""
INSTALL uc_catalog;
INSTALL delta;
LOAD delta;
LOAD uc_catalog;
CREATE OR REPLACE SECRET uc_secret (
	TYPE UC,
	ENDPOINT 'http://localhost:8080',
    TOKEN 'not used',
    AWS_REGION 'eu-central-1'

);
ATTACH OR REPLACE 'unity' AS unity (TYPE UC_CATALOG, SECRET uc_secret);
SHOW ALL TABLES;
""")

IOException: IO Error: Invalid field found while parsing field: type_precision