In [30]:
from pyspark.sql import SparkSession
spark = SparkSession.builder\
    .appName('local')\
    .getOrCreate()

In [49]:
spark.catalog.listCatalogs()

[CatalogMetadata(name='iceberg', description=None),
 CatalogMetadata(name='spark_catalog', description=None)]

In [50]:
# List all configs
spark.sparkContext.getConf().getAll()

# Or filter by catalog configs
[k for k in spark.sparkContext.getConf().getAll() if k[0].startswith("spark.sql.catalog.")]


[('spark.sql.catalog.spark_catalog',
  'org.apache.iceberg.spark.SparkSessionCatalog'),
 ('spark.sql.catalog.iceberg.uri', 'thrift://hive-metastore:9083'),
 ('spark.sql.catalog.iceberg.s3.endpoint', 'http://minio:9000'),
 ('spark.sql.catalog.iceberg.s3.path-style-access', 'true'),
 ('spark.sql.catalog.iceberg.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO'),
 ('spark.sql.catalog.iceberg.warehouse', 's3a:/iceberg'),
 ('spark.sql.catalog.iceberg', 'org.apache.iceberg.spark.SparkCatalog'),
 ('spark.sql.catalog.iceberg.type', 'hive'),
 ('spark.sql.catalog.spark_catalog.type', 'hive')]

In [51]:
spark.sql("SHOW CATALOGS").show()
spark.sql("SHOW DATABASES IN iceberg").show()

+-------------+
|      catalog|
+-------------+
|      iceberg|
|spark_catalog|
+-------------+

+---------+
|namespace|
+---------+
|  default|
|      raw|
+---------+



In [31]:
# List all tables in the namespace
tables = spark.catalog.listTables("iceberg.raw")

# Drop each table
for t in tables:
    spark.sql(f"DROP TABLE IF EXISTS iceberg.raw.{t.name}")

# Now drop the namespace (database)
spark.sql("DROP DATABASE IF EXISTS iceberg.raw")

DataFrame[]

In [32]:
spark.sql("DROP DATABASE IF EXISTS iceberg.raw CASCADE")

DataFrame[]

In [33]:
spark.sql("""
    CREATE DATABASE IF NOT EXISTS iceberg.raw COMMENT '' LOCATION 's3a://iceberg/raw/'
""")

DataFrame[]

In [35]:
spark.sql("""
    CREATE TABLE IF NOT EXISTS iceberg.raw.sample (
        id bigint,
        data string,
        category string,
        ts timestamp)
    USING iceberg
    PARTITIONED BY (bucket(16, id), days(ts), category)
""")

DataFrame[]

In [46]:
# spark.sql("DESCRIBE NAMESPACE iceberg.raw")
# spark.sql("DESCRIBE DATABASE iceberg.raw").show(truncate=False)
spark.sql("DESCRIBE FORMATTED iceberg.raw.sample").show(truncate=False)
# spark.table("iceberg.raw.sample").inputFiles()

+----------------------------+-------------------------------------------------+-------+
|col_name                    |data_type                                        |comment|
+----------------------------+-------------------------------------------------+-------+
|id                          |bigint                                           |null   |
|data                        |string                                           |null   |
|category                    |string                                           |null   |
|ts                          |timestamp                                        |null   |
|                            |                                                 |       |
|# Partitioning              |                                                 |       |
|Part 0                      |bucket(16, id)                                   |       |
|Part 1                      |days(ts)                                         |       |
|Part 2              

In [10]:
spark.sql("""
    INSERT INTO iceberg.raw.sample VALUES
        (1, 'a', 'cat1', TIMESTAMP '2023-01-01 10:00:00'),
        (2, 'b', 'cat2', TIMESTAMP '2023-01-02 12:00:00'),
        (3, 'c', 'cat1', TIMESTAMP '2023-01-03 14:30:00'),
        (4, 'd', 'cat3', TIMESTAMP '2023-01-04 09:15:00'),
        (5, 'e', 'cat2', TIMESTAMP '2023-01-05 16:45:00')
""")

DataFrame[]

In [21]:
spark.sql("SELECT * FROM raw.sample").show()

+---+----+--------+-------------------+
| id|data|category|                 ts|
+---+----+--------+-------------------+
|  3|   c|    cat1|2023-01-03 14:30:00|
|  1|   a|    cat1|2023-01-01 10:00:00|
|  2|   b|    cat2|2023-01-02 12:00:00|
|  4|   d|    cat3|2023-01-04 09:15:00|
|  3|   c|    cat1|2023-01-03 14:30:00|
|  5|   e|    cat2|2023-01-05 16:45:00|
|  1|   a|    cat1|2023-01-01 10:00:00|
|  2|   b|    cat2|2023-01-02 12:00:00|
|  4|   d|    cat3|2023-01-04 09:15:00|
|  5|   e|    cat2|2023-01-05 16:45:00|
+---+----+--------+-------------------+



In [22]:
spark.sql("SELECT * FROM iceberg.raw.sample").show()

+---+----+--------+-------------------+
| id|data|category|                 ts|
+---+----+--------+-------------------+
|  3|   c|    cat1|2023-01-03 14:30:00|
|  3|   c|    cat1|2023-01-03 14:30:00|
|  1|   a|    cat1|2023-01-01 10:00:00|
|  1|   a|    cat1|2023-01-01 10:00:00|
|  2|   b|    cat2|2023-01-02 12:00:00|
|  2|   b|    cat2|2023-01-02 12:00:00|
|  4|   d|    cat3|2023-01-04 09:15:00|
|  4|   d|    cat3|2023-01-04 09:15:00|
|  5|   e|    cat2|2023-01-05 16:45:00|
|  5|   e|    cat2|2023-01-05 16:45:00|
+---+----+--------+-------------------+

