### Notes

The Iceberg extension in DuckDB still seems to need work: its `iceberg_scan` function seems to look for files like "version-hint.txt", which PyIceberg does not generate.
For now, it is best to stick with PyIceberg table objects as the source for DuckDB to read from.

In [74]:
import os
import uuid
from datetime import date

import duckdb
import numpy as np
import polars as pl
from pyiceberg.catalog.sql import SqlCatalog

In [71]:
# Shared DuckDB connection for this notebook, backed by an in-memory database.
cn = duckdb.connect(database=":memory:")

In [72]:
# Local Iceberg catalog: catalog metadata is kept in a SQLite file inside the
# warehouse directory, and the same directory is used as the warehouse location.
warehouse_path = "./icehouse"

# SQLite cannot create its database file inside a directory that does not
# exist, so make sure the warehouse directory is there before connecting.
os.makedirs(warehouse_path, exist_ok=True)

catalog = SqlCatalog(
    "default",
    **{
        "uri": f"sqlite:///{warehouse_path}/icyhot.db",
        "warehouse": f"file://{warehouse_path}",
    },
)

In [73]:
# The catalog is persisted on disk, so the namespace may already exist from a
# previous run. Only create it when it is missing, so re-running this cell
# does not fail on an already-existing namespace.
if ("dummy_data",) not in catalog.list_namespaces():
    catalog.create_namespace("dummy_data")

### Generate Datasets in DuckDB and Polars

In [76]:
rows = 5000

# DuckDB side: one row per generated id (1..rows), each with a UUID key,
# the current date, and a random value scaled by 100 and rounded to 2 decimals.
sql = f"""
    select t.row_id, uuid() as txn_key, current_date as rpt_dt
        ,round(random() * 100,2) as some_val
    from generate_series(1,{rows}) t(row_id)
"""
duck_df = cn.execute(sql).arrow()

# Polars side: the same four column names, built one column at a time.
# Note the differences from the DuckDB data: row_id starts at 0 here, and
# some_val is floored to a whole number and stored as an integer.
row_ids = pl.arange(0, rows, eager=True)
report_dates = pl.Series([date.today()] * rows)
whole_vals = pl.Series(np.floor(np.random.rand(rows) * 100).astype(int))
txn_keys = pl.Series([str(uuid.uuid4()) for _ in range(rows)])

polars_frame = pl.DataFrame(
    {
        'row_id': row_ids,
        'rpt_dt': report_dates,
        'some_val': whole_vals,
        'txn_key': txn_keys,
    }
)

# Hand the data over as an Arrow table, matching what DuckDB produced above.
polars_df = polars_frame.to_arrow()

In [77]:
# Register one empty Iceberg table per dataset in the "dummy_data" namespace,
# taking each table's schema straight from the corresponding Arrow table.
table_ducks = catalog.create_table(
    identifier="dummy_data.duckdb_data",
    schema=duck_df.schema,
)
table_polars = catalog.create_table(
    identifier="dummy_data.polars_data",
    schema=polars_df.schema,
)

In [78]:
# Write each Arrow table into its matching Iceberg table.
for iceberg_table, arrow_data in (
    (table_ducks, duck_df),
    (table_polars, polars_df),
):
    iceberg_table.append(arrow_data)

In [82]:
# Sanity check: scan each Iceberg table back into Arrow and report its row count.
for template, iceberg_table in (
    ('duckdb table has {0} rows', table_ducks),
    ('polars table has {0} rows', table_polars),
):
    print(template.format(iceberg_table.scan().to_arrow().num_rows))

duckdb table has 5000 rows
polars table has 5000 rows


In [94]:
#read back in duckdb
# Register the Iceberg scan as "duck_back" on the existing connection instead
# of letting to_duckdb() open a second connection and rebinding `cn` to it,
# which would orphan the connection created at the top of the notebook.
table_ducks.scan().to_duckdb(table_name="duck_back", connection=cn)

# Row count and total of some_val, computed by DuckDB over the registered scan.
cn.sql("select count(row_id) as tot_rows, sum(some_val) as agg_val from duck_back").show()

┌──────────┬───────────────────┐
│ tot_rows │      agg_val      │
│  int64   │      double       │
├──────────┼───────────────────┤
│     5000 │ 250860.8700000002 │
└──────────┴───────────────────┘



In [85]:
# Read the polars-written Iceberg table back through polars and materialise it.
df = pl.scan_iceberg(table_polars).collect()

# Same summary as the DuckDB read-back: row count and total of some_val.
tot_rows_expr = pl.col('row_id').count().alias("tot_rows")
agg_val_expr = pl.col('some_val').sum().alias('agg_val')

df.select(tot_rows_expr, agg_val_expr)


tot_rows,agg_val
u32,i64
5000,246610
