# Notebook for Ibis experiments

Please note that this notebook requires the table *prova*, which can be created inside the *pyspark_experiments.ipynb* notebook.

In [1]:
import ibis
import polars as pl

from poor_man_lakehouse.dremio_connector.builder import DremioConnection
from poor_man_lakehouse.ibis_connector.builder import IbisConnection
from poor_man_lakehouse.spark_connector.builder import retrieve_current_spark_session

# Obtain the Spark session through the project helper; the recorded output of
# this cell shows the Spark/Ivy start-up logs being emitted.
spark = retrieve_current_spark_session()

# Ibis interactive mode: expressions are evaluated when they are displayed
# instead of only showing the unevaluated expression.
ibis.options.interactive = True

# NOTE(review): `d` is not referenced by any cell visible in this notebook;
# kept as-is because constructing the connection may have side effects - TODO confirm.
d = DremioConnection()
# Shared connection helper: later cells call get_connection / read_table /
# sql / list_tables on it with an engine name.
conn = IbisConnection()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
26/02/07 19:24:57 WARN Utils: Your hostname, MBP-di-Graziano.homenet.telecomitalia.it, resolves to a loopback address: 127.0.0.1; using 192.168.1.81 instead (on interface en0)
26/02/07 19:24:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/Users/graziano/apache-spark/4.0.1/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/graziano/.ivy2.5.2/cache
The jars for the packages stored in: /Users/graziano/.ivy2.5.2/jars
io.delta#delta-spark_2.13 added as a dependency
org.apache.iceberg#iceberg-spark-runtime-4.0_2.13 added as a dependency
org.apache.iceberg#iceberg-aws-bundle added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
org.postgresql#postgresql added as a dependency
org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.13 added as a dependency
io.unitycatalog#uni

In [3]:
# Inspect what the DuckDB backend can see: catalogs, databases, and the
# tables registered under the "default" database.
duckdb_conn = conn.get_connection("duckdb")

duckdb_catalogs = duckdb_conn.list_catalogs()
print(duckdb_catalogs)

duckdb_databases = duckdb_conn.list_databases()
print(duckdb_databases)

duckdb_default_tables = duckdb_conn.list_tables(database="default")
print(duckdb_default_tables)

['lakekeeper', 'memory', 'system', 'temp']
['default', 'information_schema', 'main', 'pg_catalog']
[]


In [4]:
# Build a small five-row Polars frame to use as the demo dataset
sample_data = pl.DataFrame(
    {
        "id": [1, 2, 3, 4, 5],
        "name": ["Alice", "Bob", "Charlie", "David", "Eve"],
        "value": [100, 200, 150, 300, 250],
    }
)

# The recorded log shows this call initializing the PySpark connection with
# the Lakekeeper catalog. NOTE(review): `spark_conn` itself is not used by
# any cell visible here.
spark_conn = conn.get_connection("pyspark")

# Spark receives the data as a pandas frame converted from Polars
sample_pdf = sample_data.to_pandas()
spark_df = spark.createDataFrame(sample_pdf)

# Overwrite (or create) the Iceberg table `default.prova` with the sample rows
iceberg_writer = spark_df.write.format("iceberg").mode("overwrite")
iceberg_writer.saveAsTable("default.prova")

[32m2026-02-07 19:25:14.834[0m | [1mINFO    [0m | [36mpoor_man_lakehouse.ibis_connector.builder[0m:[36m_pyspark_connection[0m:[36m89[0m - [1mInitializing PySpark connection with Lakekeeper catalog...[0m
                                                                                

In [6]:
# Read `default.prova` through each listed engine and print the materialized rows
READ_TABLE_ENGINES = ("pyspark", "duckdb", "polars")

for engine in READ_TABLE_ENGINES:
    print(f"Testing read_table for {engine}...")
    table_expr = conn.read_table("default", "prova", engine)
    print(table_expr.execute())

Testing read_table for pyspark...
   id     name  value
0   1    Alice    100
1   2      Bob    200
2   3  Charlie    150
3   4    David    300
4   5      Eve    250
Testing read_table for duckdb...
   id     name  value
0   1    Alice    100
1   2      Bob    200
2   3  Charlie    150
3   4    David    300
4   5      Eve    250
Testing read_table for polars...
   id     name  value
0   1    Alice    100
1   2      Bob    200
2   3  Charlie    150
3   4    David    300
4   5      Eve    250


In [5]:
# Run one fully qualified SQL query through the pyspark and duckdb engines
PROVA_QUERY = "select * from lakekeeper.default.prova"

for engine in ("pyspark", "duckdb"):
    print(f"Testing connection for {engine}...")
    sql_result = conn.sql(PROVA_QUERY, engine)
    print(sql_result.execute())

Testing connection for pyspark...
   id     name  value
0   1    Alice    100
1   2      Bob    200
2   3  Charlie    150
3   4    David    300
4   5      Eve    250
Testing connection for duckdb...
   id     name  value
0   1    Alice    100
1   2      Bob    200
2   3  Charlie    150
3   4    David    300
4   5      Eve    250


In [21]:
# Apply the same Ibis expression (filter on `value`, project `name`) on every
# engine to check that the expression API behaves identically across them.
IBIS_SYNTAX_ENGINES = ("pyspark", "polars", "duckdb")

for engine in IBIS_SYNTAX_ENGINES:
    print(f"Testing read_table and Ibis syntax for {engine}...")
    t = conn.read_table("default", "prova", engine)

    # Boolean column expression: rows whose value exceeds 150
    expr = t["value"] > 150  # pyright: ignore[reportOperatorIssue]

    filtered_rows = t.filter(expr)
    # Resolve the column label through the column expression itself
    name_label = t.name.get_name()
    print(filtered_rows.select(name_label).execute())


Testing read_table and Ibis syntax for pyspark...
    name
0    Bob
1  David
2    Eve
Testing read_table and Ibis syntax for polars...
    name
0    Bob
1  David
2    Eve
Testing read_table and Ibis syntax for duckdb...
    name
0    Bob
1  David
2    Eve


In [24]:
# List the tables each engine reports. In the recorded output the polars
# engine returns a qualified name ('default.prova') while pyspark and duckdb
# return the bare table name ('prova').
LIST_TABLES_ENGINES = ("pyspark", "polars", "duckdb")

for engine in LIST_TABLES_ENGINES:
    print(f"Testing list tables for {engine}...")
    table_names = conn.list_tables(engine)
    print(table_names)

Testing list tables for pyspark...
['prova']
Testing list tables for polars...
['default.prova']
Testing list tables for duckdb...
['prova']
