### Overview: testing S3 Tables with DuckDB via the ATTACH statement
- can create and drop tables
- can insert recs
- cannot delete or update recs (insert does work, see above)

In [1]:
import os

from setup_env import set_aws_creds

# set_aws_creds comes from the local setup_env module (not shown here);
# presumably it exports the AWS credentials / account settings into the
# process environment — confirm against that module.
set_aws_creds()

# The account number is interpolated into the S3 Tables ARN and the Glue
# warehouse id in later cells. os.getenv returns None when the variable is
# missing, which would silently produce identifiers containing "None", so
# stop here with a clear message instead.
aws_acct_id = os.getenv("aws_acct_nbr")
if not aws_acct_id:
    raise RuntimeError(
        "environment variable 'aws_acct_nbr' is not set; "
        "it is required to build the S3 Tables ARN and Glue warehouse id"
    )

In [13]:
import duckdb


# Open a DuckDB connection with default settings (no database path given) and
# make the iceberg extension available on it.
cn = duckdb.connect()
cn.execute("install iceberg; load iceberg;")

# Register an S3 secret named aws_sec that uses the credential_chain provider.
# Presumably this picks up the credentials exported by set_aws_creds() — confirm.
cn.execute("create or replace secret aws_sec (type s3, provider credential_chain)")

# Attach the S3 Tables bucket (addressed by its ARN, built from aws_acct_id)
# as an Iceberg catalog named s3cat, using the s3_tables endpoint type.
cn.execute(f"""
   attach or replace 'arn:aws:s3tables:us-east-1:{aws_acct_id}:bucket/icehouse-tbl-bucket1' as s3cat (
       type iceberg,
       endpoint_type s3_tables
    )
""")

# Smoke test: create a one-row table in namespace ns1 via CTAS.
# NOTE(review): this is a plain `create table` (no `if not exists`) and no
# visible cell drops test_for_athena, so re-running the cell will presumably
# fail once the table exists — confirm.
cn.execute("create table s3cat.ns1.test_for_athena as select 1 as x, 'a' as y")

<_duckdb.DuckDBPyConnection at 0x10cd99070>

In [8]:
# Attach the same table bucket a second way: through the Glue endpoint
# (endpoint_type glue), addressed as '<account>:s3tablescatalog/<bucket>',
# under the catalog name s3cat_sm.
# NOTE(review): this cell calls cn.sql(...) while the other DDL cells call
# cn.execute(...).
cn.sql(f"""
  attach or replace '{aws_acct_id}:s3tablescatalog/icehouse-tbl-bucket1' as s3cat_sm (
        type iceberg,
        endpoint_type glue
    )
""")

In [None]:
#athena can't read it this way either...
# Create a one-row table via CTAS through the Glue-endpoint catalog (s3cat_sm).
# NOTE(review): the saved output under this cell is a parser error for a
# different statement (tbl6 ... using (iceberg)), i.e. it does not belong to
# the SQL shown here.
cn.execute("""
    create table s3cat_sm.ns1.tbl5 as
           select 1 as x, 'a' as y       
""")

ParserException: Parser Error: syntax error at or near "using"

LINE 2:     create table s3cat_sm.ns1.tbl6 using (iceberg) as
                                           ^

In [12]:
# Clean-up: drop the tbl5 test table created through the Glue-endpoint catalog (s3cat_sm).
cn.execute("drop table s3cat_sm.ns1.tbl5")

<_duckdb.DuckDBPyConnection at 0x10bd785f0>

In [None]:
#create a table (athena shows the schema but can't read any of the tables - it's an interesting error; might want to show screenshot)
# `if not exists` makes this cell safe to re-run; the table has two columns,
# id (bigint) and data (string), and lives in the S3 Tables catalog s3cat.
cn.execute("""
    create table if not exists s3cat.ns1.tbl1 (
        id bigint,
        data string
    )
""")

<_duckdb.DuckDBPyConnection at 0x121a13f30>

In [18]:
# Insert three (id, data) rows; inserts succeed through this catalog, unlike
# the update/delete attempts in the cells that follow.
cn.execute("insert into s3cat.ns1.tbl1 values (1, 'a'), (2, 'b'), (3, 'c')")

<_duckdb.DuckDBPyConnection at 0x121a13f30>

In [None]:
#update a record (no dice)
# This statement is expected to fail (see the overview notes), so catch the
# DuckDB error and show it: the failure is the demonstration, and an uncaught
# exception here would stop a "Restart & Run All" before the later cells.
try:
    cn.execute("update s3cat.ns1.tbl1 set data='z' where id=2")
except duckdb.Error as exc:
    print(f"update failed: {type(exc).__name__}: {exc}")

In [None]:
#delete a record (also no dice)
# This statement is expected to fail (see the overview notes), so catch the
# DuckDB error and show it: the failure is the demonstration, and an uncaught
# exception here would stop a "Restart & Run All" before the later cells.
try:
    cn.execute("delete from s3cat.ns1.tbl1 where id=3")
except duckdb.Error as exc:
    print(f"delete failed: {type(exc).__name__}: {exc}")

In [19]:
#read
# Query the table back through the attached catalog; leaving cn.sql(...) as the
# last expression lets the notebook render the result.
cn.sql("select * from s3cat.ns1.tbl1")

┌───────┬─────────┐
│  id   │  data   │
│ int64 │ varchar │
├───────┼─────────┤
│     1 │ a       │
│     2 │ b       │
│     3 │ c       │
└───────┴─────────┘

In [20]:
# Clean-up: drop the test table from the S3 Tables catalog.
# NOTE(review): a later cell loads "ns1.tbl1" through pyiceberg, so on a
# top-to-bottom run that table has to be re-created by something not shown
# in this notebook — confirm.
cn.execute("drop table s3cat.ns1.tbl1")

<_duckdb.DuckDBPyConnection at 0x121a13f30>

### let's do it with pyiceberg then

- to get this to work, had to enable Lake Formation and add the IAM role as a Lake Formation admin
- then had to explicitly give the id the ability to read all tables in the namespace
- what's interesting is that duckdb didn't require any Lake Formation stuff; pyiceberg is using LF but duckdb is somehow backdooring its way

In [2]:
from pyiceberg.catalog import load_catalog

# Properties for reaching the table bucket through the Glue Iceberg REST
# endpoint: a REST catalog with SigV4 request signing for the "glue" service
# in us-east-1, with the warehouse id built from the AWS account number.
glue_rest_props = {
    "type": "rest",
    "uri": "https://glue.us-east-1.amazonaws.com/iceberg",
    "warehouse": f"{aws_acct_id}:s3tablescatalog/icehouse-tbl-bucket1",
    "rest.sigv4-enabled": "true",
    "rest.signing-name": "glue",
    "rest.signing-region": "us-east-1",
}

# "glue" is only the local name given to this catalog configuration.
catalog = load_catalog("glue", **glue_rest_props)


In [None]:
#read via pyiceberg (so pyiceberg can read it; athena cannot)
# NOTE(review): the DuckDB section drops s3cat.ns1.tbl1 before this point, and
# the saved output of this cell shows id as int32 whereas the DuckDB-created
# table declared id as bigint — so this tbl1 was presumably created outside
# the visible cells. Confirm, otherwise this cell fails on a fresh run.
table = catalog.load_table("ns1.tbl1")
# Scan the Iceberg table to Arrow and expose the result to DuckDB under the
# name "tbl1" so it can be queried with SQL.
cn.register("tbl1", table.scan().to_arrow() )
cn.sql("select * from tbl1")


┌───────┬─────────┐
│  id   │  data   │
│ int32 │ varchar │
├───────┼─────────┤
│     1 │ a       │
│     2 │ b       │
│     3 │ c       │
└───────┴─────────┘

In [10]:
def generate_test_data(cn, num_rows=100):
    """Create (or replace) the DuckDB view ``v_data_gen`` that yields synthetic rows.

    The view has one row per ``row_id`` in ``1..num_rows`` with the columns
    ``row_id``, ``txn_key`` (a uuid cast to varchar), ``rpt_dt``
    (``current_date``) and ``some_val`` (``random() * 100`` rounded to two
    decimals).

    Because this is a view and not a table, ``uuid()`` and ``random()`` are
    evaluated again each time the view is queried, so two selects return
    different ``txn_key`` / ``some_val`` values for the same ``row_id``.

    Args:
        cn: DuckDB connection (any object exposing ``execute(sql)``).
        num_rows: Number of rows the view should generate; must be
            convertible with ``int()``.

    Returns:
        ``True`` once the view has been created.

    Raises:
        ValueError, TypeError: If ``num_rows`` cannot be converted to an int.
    """
    # num_rows is spliced into the SQL text, so force it to a plain integer
    # first; anything that is not integer-like is rejected by int() here
    # instead of reaching the database as raw SQL.
    row_count = int(num_rows)

    cn.execute(f"""
        CREATE OR REPLACE VIEW v_data_gen AS
        SELECT 
            t.row_id, 
            uuid()::varchar as txn_key,  -- Cast to varchar to avoid binary UUID issues
            current_date as rpt_dt,
            round(random() * 100, 2) as some_val
        FROM generate_series(1, {row_count}) t(row_id)
    """)

    return True

In [11]:
# (Re)build the v_data_gen view with ten generated rows.
generate_test_data(cn, num_rows=10)

True

In [None]:
#create a new table via pyiceberg
# NOTE(review): despite the name, `df` is the object returned by cn.execute
# (a DuckDB connection/result handle, as the other cells' outputs show), not a
# DataFrame.
df = cn.execute("select * from v_data_gen")
# Pull the whole result into memory in Arrow form (presumably a pyarrow.Table — confirm).
ar_tbl = df.arrow().read_all()

# Create ns1.tbl2 with the Arrow schema, then append the generated rows.
# NOTE(review): create_table will presumably fail if ns1.tbl2 already exists,
# and append adds the rows again on every run — this cell is not idempotent.
ice_tbl = catalog.create_table("ns1.tbl2", schema=ar_tbl.schema)
ice_tbl.append(ar_tbl)

In [14]:
#upsert
# Take five rows from the generator view and upsert them into ice_tbl
# (created as ns1.tbl2 in the cell above), matching on row_id.
# NOTE(review): the saved result (5 updated, 0 inserted) is presumably because
# v_data_gen is a view, so uuid()/random() produce fresh txn_key/some_val
# values on each query while the returned row_ids already exist — confirm.
df2 = cn.execute("""select * from v_data_gen limit 5""").arrow().read_all()
ice_tbl.upsert(df2, join_cols=["row_id"])

UpsertResult(rows_updated=5, rows_inserted=0)