In [3]:
import duckdb
# Write a one-row result (x=1, y=2, z='bob') to data.parquet with DuckDB's COPY ... TO.
# execute() hands back the connection object, which is the repr the notebook echoes below.
duckdb.execute("copy (select 1 as x, 2 as y, 'bob' as z) to 'data.parquet'")

<duckdb.duckdb.DuckDBPyConnection at 0x1061a26f0>

In [5]:
from datafusion import SessionContext
import pyarrow as pa
import pyarrow.dataset as ds

In [6]:
# DataFusion session shared by the rest of the notebook: later cells register
# tables on it and run SQL through it.
ctx = SessionContext()

In [9]:
# Expose data.parquet to SQL under the table name "data2".
# Registering a name that is already taken fails, so drop any earlier
# registration first; this keeps the cell safe to re-run on a live kernel.
# (Deregistering a name that is not registered is a no-op.)
ctx.deregister_table("data2")
ctx.register_parquet("data2", "data.parquet")

In [10]:
# Read the parquet-backed table back through SQL; show() prints the rows as a text table.
ctx.sql("select * from data2").show()

DataFrame()
+---+---+-----+
| x | y | z   |
+---+---+-----+
| 1 | 2 | bob |
+---+---+-----+


In [None]:
# Small in-memory sample: four people, one row per id.
dummy_data = {
    "id": [1, 2, 3, 4],
    "name": ["Alice", "Bob", "Charlie", "Diana"],
    "age": [25, 30, 35, 40],
}

# Create a PyArrow table
arrow_table = pa.Table.from_pydict(dummy_data)

# from_arrow() registers the data under the given table name, and a second
# registration of the same name fails. Drop any previous registration first so
# this cell can be re-run without restarting the kernel (deregistering a name
# that is not registered is a no-op).
ctx.deregister_table("dummy_data1")
ctx.from_arrow(arrow_table, "dummy_data1")


In [19]:
# Query the Arrow-backed table by its registered name; leaving the DataFrame as
# the cell's last expression lets the notebook render it.
ctx.sql("select * from dummy_data1")

id,name,age
1,Alice,25
2,Bob,30
3,Charlie,35
4,Diana,40


In [16]:
# Self-join dummy_data1 on id and sum the joined rows' age per id.
# ids are unique in the sample data, so each id matches only itself and
# age_tot equals that row's own age. There is no ORDER BY, so the row order
# of the result is whatever the engine produces.
sql = """
    select s.id, sum(t.age) as age_tot
    from dummy_data1 as s
        inner join dummy_data1 as t
            using(id)
    group by all
"""
ctx.sql(sql)

id,age_tot
3,35
2,30
4,40
1,25


In [23]:
from pyiceberg.catalog.sql import SqlCatalog
# Local Iceberg catalog named "default": catalog metadata lives in an in-memory
# SQLite database (the `uri` property), table files go under `warehouse_path`
# (the `warehouse` property).
warehouse_path = "./warehouse"
catalog = SqlCatalog(
    "default",
    uri="sqlite:///:memory:",
    warehouse=f"file://{warehouse_path}",
)

In [24]:
# Namespace that holds the notebook's Iceberg tables.
namespace = "test_ts"
# Create it only when missing: a plain create_namespace() raises if the
# namespace already exists, which breaks re-running this cell against the
# same catalog object.
catalog.create_namespace_if_not_exists(namespace)

In [25]:
# Number of synthetic rows to generate.
rows = 5000

# Build the synthetic data in DuckDB: one row per row_id in 1..rows, each with
# a uuid() key, current_date as the report date, and random() * 100 rounded to
# 2 decimals. `rows` is interpolated directly into the SQL text.
# NOTE(review): uuid(), random() and current_date are not seeded/pinned, so the
# values presumably differ between runs — confirm if reproducibility matters.
sql = f"""
    select t.row_id, uuid() as txn_key, current_date as rpt_dt
        ,round(random() * 100,2) as some_val
    from generate_series(1,{rows}) t(row_id)
"""

# Fetch the result in Arrow form; later cells use duck_df.schema and append
# duck_df to the Iceberg table.
duck_df = duckdb.execute(sql).arrow()


In [27]:
# Create the Iceberg table from the Arrow schema of the DuckDB result,
# then append the generated rows to it.
table_ducks = catalog.create_table(
    f"{namespace}.duckdb_data",
    schema=duck_df.schema,
)
table_ducks.append(duck_df)


In [32]:
# Load the Iceberg table back from the catalog and materialise a full scan
# (no filter or projection) as Arrow data.
ice_table1 = (
    catalog
    .load_table(f"{namespace}.duckdb_data")
    .scan()
    .to_arrow()
)

In [33]:
# Hand the Iceberg scan result to DataFusion under the table name "duck_data".
# from_arrow() registers that name and a duplicate registration fails, so drop
# any earlier one first to keep the cell re-runnable (deregistering a name
# that is not registered is a no-op). The returned DataFrame stays the cell's
# last expression so the notebook still renders a preview.
ctx.deregister_table("duck_data")
ctx.from_arrow(ice_table1, "duck_data")

row_id,txn_key,rpt_dt,some_val
1,92d9a4da-929e-40bd-9e43-0bb769679464,2025-01-04,52.66
2,836b4ec5-be30-46ee-aa2e-56e014b930af,2025-01-04,21.97
3,49d1820e-a843-490b-95c3-8b8bf6d34445,2025-01-04,71.74
4,7abf44b2-1320-48c3-b70c-a08c2ff5be39,2025-01-04,60.26
5,e2294a4a-48ac-462f-8908-a8f90432afa8,2025-01-04,38.09
6,1c197542-23c6-48f2-a7ef-af8899e75a54,2025-01-04,18.2
7,6e39f02c-73a1-4a5a-8e2c-6c28d6f292b9,2025-01-04,97.73
8,c907bdf2-1c05-4adb-83db-db92ab519459,2025-01-04,72.29
9,7b94639b-8112-481b-8a67-0d9f7241d4d9,2025-01-04,98.63
10,a7f17e69-a9cd-4192-82e4-264b5b8b5593,2025-01-04,24.35


In [34]:
# Per report date: row count and total of some_val over the registered duck_data table.
# NOTE(review): the alias `rec_cntm` looks like a typo for `rec_cnt` — confirm
# before renaming, since the column name appears in the displayed result.
ctx.sql("select rpt_dt, count(*) as rec_cntm, sum(some_val) as tot_val from duck_data group by all")

rpt_dt,rec_cntm,tot_val
2025-01-04,5000,249318.5700000003
