In [None]:
#https://duckdb.org/community_extensions/extensions/bigquery.html

In [None]:
%pip install duckdb

#### Testing Using DuckDB To Query Google BigQuery
- works fine when dealing with a single project for read/write bq tables
- tried the biglake iceberg tables and you had to use the bigquery_execute function which is effectively a passthrough
- did not like when i tried to co-mingle a duckdb query to load to a bigquery external table

In [1]:
import duckdb

cn = duckdb.connect()

cn.execute("""
    INSTALL bigquery FROM community;
    LOAD bigquery;
""")

<duckdb.duckdb.DuckDBPyConnection at 0x1042e9f30>

In [2]:
import os
bq_project = os.getenv("GOOGLE_CLOUD_PROJECT")
cn.execute(f"ATTACH 'project={bq_project}' as bq (TYPE bigquery)")

<duckdb.duckdb.DuckDBPyConnection at 0x1042e9f30>

In [5]:
sql = """
SELECT *
FROM bq.test_ds.states_flat
LIMIT 3
"""

cn.sql(sql).show()

┌─────────────────────────────┬───────────────┬────────────┐
│            Path             │     Value     │  filename  │
│           varchar           │    varchar    │  varchar   │
├─────────────────────────────┼───────────────┼────────────┤
│ states/state/name/@nickname │ Granola State │ California │
│ states/state/name/#text     │ California    │ California │
│ states/state/population     │ 39538223      │ California │
└─────────────────────────────┴───────────────┴────────────┘



In [None]:
#create a standard table
row_cnt = 5_000
sql = f"""
    CREATE OR REPLACE TABLE bq.test_ds.duck_table
    as
    select t.row_id, uuid() as txn_key, current_date as rpt_dt
        ,round(random() * 100,2) as some_val
    from generate_series(1,{row_cnt}) t(row_id)
"""

cn.execute(sql)

<duckdb.duckdb.DuckDBPyConnection at 0x1056a97b0>

In [5]:
#create an iceberg table
row_cnt = 5_000
gcs_bucket = os.getenv("GCS_BUCKET")
sql = f"""
CALL bigquery_execute('bq','
CREATE OR REPLACE TABLE test_ds.iceyhot1 (
    row_id INT, txn_key STRING, rpt_dt DATE, some_val FLOAT64
)
WITH CONNECTION us.test_cn_matt
OPTIONS (
    file_format = ''PARQUET'',
    table_format = ''ICEBERG'',
    storage_uri = ''gs://{gcs_bucket}/icehouse1/iceyhot1''
)
')
"""
#print(sql)
cn.execute(sql)

<duckdb.duckdb.DuckDBPyConnection at 0x1042e9f30>

In [4]:
row_cnt = 5_000
sql = f"""
INSERT INTO bq.test_ds.duck_table (row_id, txn_key, rpt_dt, some_val)
select t.row_id, uuid() as txn_key, current_date as rpt_dt
    ,round(random() * 100,2) as some_val
from generate_series(1,{row_cnt}) t(row_id)
"""

cn.execute(sql)

: 

In [4]:
cn.sql("Select * from bq.test_ds.icyhot1 limit 5")

CatalogException: Catalog Error: Table with name icyhot1 does not exist!
Did you mean "iceyhot1"?

In [None]:
### duckdb extension doesnt handle bigquery external tables well
## might want to result to the standard google bq api and use a pyarrow dataframe to transfer over