# Delta experiments

## Import libraries

In [None]:
import polars as pl
from deltalake import DeltaTable, write_deltalake
from poor_man_lakehouse.config import settings

# Location of the Delta table that the write/read cells operate on.
delta_path = "s3://warehouse/default/test_delta"

# Three-row sample frame: `date` as yyyy-mm-dd strings, plus `country` and `value`.
df = pl.DataFrame(
    data={
        "date": ["2023-01-01", "2023-01-02", "2023-01-03"],
        "country": ["US", "CA", "MX"],
        "value": [100, 200, 300],
    },
)


In [None]:
# Overwrite the table at `delta_path` with `df`, partitioned by `date`,
# using the delta-rs `write_deltalake` function.
write_deltalake(
    delta_path,
    df.to_arrow(),
    name="test_delta",
    mode="overwrite",
    schema_mode="merge",
    partition_by=["date"],
    storage_options=settings.S3_STORAGE_OPTIONS,
)


In [None]:
# Read the table back into pandas. Reuse `delta_path` and the shared
# `settings.S3_STORAGE_OPTIONS` (as the write cells do) instead of
# hard-coding the table URI and the S3 credentials in the notebook.
DeltaTable(
    delta_path,
    storage_options=settings.S3_STORAGE_OPTIONS,
).to_pandas()

In [None]:
# Write the same frame once more with the delta-rs `write_deltalake` function.
write_deltalake(
    data=df.to_arrow(),
    table_or_uri=delta_path,
    storage_options=settings.S3_STORAGE_OPTIONS,
    partition_by=["date"],
    schema_mode="merge",
    mode="overwrite",
    name="test_delta",
)


One can also use the `write_delta` method from Polars.

In [None]:
# Same overwrite through Polars' `DataFrame.write_delta`; the extra writer
# options are passed via `delta_write_options`.
df.write_delta(
    delta_path,
    mode="overwrite",
    storage_options=settings.S3_STORAGE_OPTIONS,
    delta_write_options={
        "name": "test_delta",
        "schema_mode": "merge",
        "partition_by": ["date"],
    },
)

## Read Delta

We can read a Delta table using the `DeltaTable` class and then convert it to Polars through Arrow (but that will load the whole table, with no possibility of filtering data), or use `scan_delta` and filter before materializing it.

If the Delta table is located in remote storage, we can pass the `storage_options` parameter (to both Polars and `DeltaTable`).

In [None]:
# Open the table with delta-rs, then go through Arrow to get a Polars frame.
dt = DeltaTable(
    delta_path,
    storage_options=settings.S3_STORAGE_OPTIONS,
)
(
    pl.from_arrow(dt.to_pyarrow_table())
    .select("date", "country", "value")
    .sort("date")
)

In [None]:
# Keep the projection and the sort on the lazy frame and collect last, so the
# whole query is planned before any data is materialized.
(
    pl.scan_delta(delta_path, storage_options=settings.S3_STORAGE_OPTIONS)
    .select("date", "country", "value")
    .sort("date")
    .collect()
)

## Read Delta using DuckDB

In [None]:
import duckdb

# Register the S3 credentials and endpoint as a DuckDB secret; the statement
# uses "if not exists", so re-running the cell does not fail on an existing secret.
duckdb.sql(f"""
CREATE SECRET if not exists s3_secret (
    TYPE S3,
    KEY_ID '{settings.AWS_ACCESS_KEY_ID}',
    SECRET '{settings.AWS_SECRET_ACCESS_KEY}',
    ENDPOINT 'minio:9000',
    REGION '{settings.AWS_DEFAULT_REGION}',
    URL_STYLE 'path',
    USE_SSL false
);
            """)
# Query the Delta table using DuckDB. ORDER BY belongs to the SELECT, so it
# must come after the closing parenthesis of delta_scan(...).
duckdb.sql(f"""
SELECT date, country, value FROM delta_scan('{delta_path}') ORDER BY date
           """)


## Upsert

Create another DataFrame with 2 rows: one that will be appended and one that will overwrite an existing row (using `date` as the join column).

In [None]:
# Two rows keyed by `date`: 2023-01-01 is also present in `df`, 2023-01-04 is not.
upsert_df = pl.DataFrame(
    data={
        "date": ["2023-01-01", "2023-01-04"],
        "country": ["US", "IT"],
        "value": [150, 250],
    },
)
upsert_df

In [None]:
# Merge `upsert_df` into the table on `date`: matched rows are updated,
# unmatched source rows are inserted.
(
    dt.merge(
        source=upsert_df.to_arrow(),
        predicate="source.date = target.date",
        source_alias="source",
        target_alias="target",
        merge_schema=True,
    )
    .when_matched_update_all()
    .when_not_matched_insert_all()
    .execute()
)


In [None]:
# Read the table back to check the result of the upsert.
(
    pl.scan_delta(delta_path, storage_options=settings.S3_STORAGE_OPTIONS)
    .collect()
    .select("date", "country", "value")
    .sort("date")
)


## Read Delta History

In [None]:
# Table history as a Polars frame, sorted by `timestamp`.
(
    pl.from_dicts(dt.history())
    .sort("timestamp")
)

## Perform Time Travel, Vacuum and Optimize

Delta has some nice features:

- **Time travel**: Restore old version
- **Vacuum**: Remove data not referenced by available table version
- **Optimize**: Compact smaller files together

In [None]:
# Time travel: read version 0 of the table through Polars.
(
    pl.scan_delta(
        delta_path,
        version=0,
        storage_options=settings.S3_STORAGE_OPTIONS,
    )
    .collect()
    .select("date", "country", "value")
    .sort("date")
)

In [None]:
# Restore the table to version 0, then read it back to check its contents.
dt.restore(target=0)
(
    pl.scan_delta(delta_path, storage_options=settings.S3_STORAGE_OPTIONS)
    .collect()
    .select("date", "country", "value")
    .sort("date")
)

However, restoring a Delta causes DuckDB to break on read.

In [None]:
import duckdb

# Query the restored Delta table using DuckDB's `delta_scan`, selecting every column.
duckdb.sql(f"""
SELECT * FROM delta_scan('{delta_path}')
           """)


In [None]:
# Restore version 1 and print what `restore` returns.
# NOTE(review): the original comment called this the "latest" version; confirm
# that version 1 is the intended target.
print(dt.restore(target=1))
# Read the table back to check its contents after the restore.
(
    pl.scan_delta(delta_path, storage_options=settings.S3_STORAGE_OPTIONS)
    .collect()
    .select("date", "country", "value")
    .sort("date")
)

The following command removes all files that are not referenced by the Delta versions in history. It outputs the list of deleted files, which cannot be restored.

A `retention_hours` period can be specified, along with `dry_run`. `enforce_retention_duration` needs to be set to `False` when `retention_hours` is lower than the Delta default, which is 168 hours unless specified differently at Delta creation with the `configs` param.

In [None]:
# Dry-run vacuum with a zero-hour retention; retention enforcement is disabled.
dt.vacuum(
    retention_hours=0,
    enforce_retention_duration=False,
    dry_run=True,
)

*Optimize* compacts smaller files (think of many small insertions) together, so that queries can be sped up by looking at fewer files.

In [None]:
# Run the compaction step of the table optimizer on `dt`.
dt.optimize.compact()

## Z-Order

Z-Order is a way to reorganize data in storage in order to optimize queries. It allows for more data skipping by colocating related data together. Think of it as a smart sorting of files based on one or multiple columns.

In [None]:
# Z-order the table on the `country` column.
dt.optimize.z_order(["country"])

## Unity Catalog experiments

In [None]:
import polars as pl

# Catalog client pointed at http://localhost:8080, with the HTTPS requirement turned off.
c = pl.Catalog(
    "http://localhost:8080",
    require_https=False,
)
c.list_catalogs()

In [None]:
# List the tables registered under the `unity` catalog, `default` namespace.
c.list_tables(
    namespace="default",
    catalog_name="unity",
)

In [None]:
# Sample frame with `date`, `country` and `value` columns.
df = pl.DataFrame(
    data={
        "date": ["2023-01-01", "2023-01-02", "2023-01-03"],
        "country": ["US", "CA", "MX"],
        "value": [100, 200, 300],
    },
)

In [None]:
# Storage root for the external table registered below.
delta_path = "s3://warehouse/delta_unity_catalog"

# Names of the tables currently listed under unity.default.
tables = [
    entry.name
    for entry in c.list_tables(namespace="default", catalog_name="unity")
]

# Delete an existing `test_table` before creating it again.
if "test_table" in tables:
    c.delete_table(table_name="test_table", namespace="default", catalog_name="unity")

c.create_table(
    table_name="test_table",
    namespace="default",
    catalog_name="unity",
    schema=df.schema,
    storage_root=delta_path,
    data_source_format="DELTA",
    table_type="EXTERNAL",
)

In [None]:
# Overwrite unity.default.test_table with `df` through the catalog client.
c.write_table(
    df=df,
    table_name="test_table",
    namespace="default",
    catalog_name="unity",
    delta_mode="overwrite",
    storage_options=settings.S3_STORAGE_OPTIONS,
)

In [None]:
# Scan unity.default.test_table through the catalog and materialize it.
(
    c.scan_table(
        table_name="test_table",
        namespace="default",
        catalog_name="unity",
        storage_options=settings.S3_STORAGE_OPTIONS,
    )
    .collect()
)

In [None]:
import duckdb

# Install and load the `uc_catalog` and `delta` DuckDB extensions, create (or
# replace) a UC secret pointing at http://127.0.0.1:8080, attach the `unity`
# catalog as `test_catalog`, and list all tables.
duckdb.sql("""
INSTALL uc_catalog;
INSTALL delta;
LOAD delta;
LOAD uc_catalog;
CREATE OR REPLACE SECRET uc_secret (
	TYPE UC,
	ENDPOINT 'http://127.0.0.1:8080',
    TOKEN 'not used',
    AWS_REGION 'eu-central-1'

);
ATTACH IF NOT EXISTS 'unity' AS test_catalog (TYPE UC_CATALOG, SECRET uc_secret);
SHOW ALL TABLES;
""")