## Introduction to Iceberg Architecture

#### Launching a Spark Session with Iceberg

In [1]:
import cml.data_v1 as cmldata

# Name of the CML data connection the Spark session is built from.
CONNECTION_NAME = "go01-aw-dl"

# Resolve the connection first, then ask it for a Spark session.
conn = cmldata.get_connection(CONNECTION_NAME)
spark = conn.get_spark_session()

# Smoke test: run a trivial statement through the new session and print the result.
EXAMPLE_SQL_QUERY = "show databases"
databases_df = spark.sql(EXAMPLE_SQL_QUERY)
databases_df.show()

Setting spark.hadoop.yarn.resourcemanager.principal to pauldefusco
Hive Session ID = c49f2e85-357b-4403-8d9c-63f646cc2604


+--------------------+
|           namespace|
+--------------------+
|         01_car_data|
|           01_car_dw|
|      adash_car_data|
|             airline|
|          airline_dw|
|            airlines|
|        airlines_csv|
|       airlines_csv1|
|   airlines_csv_vish|
|    airlines_iceberg|
|   airlines_iceberg1|
|airlines_iceberg_...|
|      airlines_mjain|
|          airquality|
|          atlas_demo|
|            bankdemo|
|              bhagan|
|             cdedemo|
|        cdp_overview|
|        cgsifacebook|
+--------------------+
only showing top 20 rows



In [2]:
# List every (key, value) pair of the running Spark configuration; the Iceberg
# catalog and SQL-extension settings are among the entries returned.
spark.sparkContext.getConf().getAll()

[('spark.eventLog.enabled', 'true'),
 ('spark.repl.local.jars',
  'file:///runtime-addons/spark320-17-hf1-6xa3lk/opt/spark/optional-lib/iceberg-spark-runtime-3.2_2.12-0.14.1.1.17.7215.0-31.jar'),
 ('spark.kubernetes.executor.podNamePrefix', 'cdsw-uonfwybcz7ux1xd0'),
 ('spark.network.crypto.enabled', 'true'),
 ('spark.sql.hive.hwc.execution.mode', 'spark'),
 ('spark.kerberos.renewal.credentials', 'ccache'),
 ('spark.sql.catalog.spark_catalog',
  'org.apache.iceberg.spark.SparkSessionCatalog'),
 ('spark.dynamicAllocation.maxExecutors', '49'),
 ('spark.eventLog.dir', 'file:///sparkeventlogs'),
 ('spark.hadoop.yarn.resourcemanager.principal', 'pauldefusco'),
 ('spark.kubernetes.driver.annotation.cluster-autoscaler.kubernetes.io/safe-to-evict',
  'false'),
 ('spark.ui.port', '20049'),
 ('spark.yarn.access.hadoopFileSystems',
  's3a://go01-demo/warehouse/tablespace/external/hive'),
 ('spark.sql.extensions',
  'com.qubole.spark.hiveacid.HiveAcidAutoConvertExtension,org.apache.iceberg.spark.ex

### Iceberg Architecture

![alt text](../img/iceberg-metadata.png)

#### Iceberg Catalog

Iceberg comes with catalogs that enable SQL commands to manage tables and load them by name. Catalogs are configured using properties under spark.sql.catalog.(catalog_name).

In [3]:
# Show the catalog and namespace (database) that are current for this session,
# before this notebook has issued any USE statement.
spark.sql("SHOW CURRENT NAMESPACE").show()

+-------------+---------+
|      catalog|namespace|
+-------------+---------+
|spark_catalog|  default|
+-------------+---------+



In [4]:
# Create the demo database inside spark_catalog if it does not exist yet,
# then make it the session's current namespace.
spark.sql("CREATE DATABASE IF NOT EXISTS spark_catalog.lakehouse_catalog")
spark.sql("USE spark_catalog.lakehouse_catalog")

DataFrame[]

In [5]:
# Show the current catalog and namespace again to confirm the USE statement took effect.
spark.sql("SHOW CURRENT NAMESPACE").show()

+-------------+-----------------+
|      catalog|        namespace|
+-------------+-----------------+
|spark_catalog|lakehouse_catalog|
+-------------+-----------------+



#### Create an Iceberg Table with Spark SQL

In [6]:
# Remove any previous copy of the demo table before recreating it.
# The name is qualified with its database so this cell does not depend on the
# session's current namespace, matching the other statements in this notebook.
spark.sql("DROP TABLE IF EXISTS lakehouse_catalog.customers_table")

                                                                                

DataFrame[]

In [7]:
# Create the demo table in Iceberg format, partitioned by the hour of `dob`.
# The name is qualified with its database (as in the later cells) so the
# statement does not rely on the session's current namespace; the SQL is split
# across adjacent string literals purely for readability.
spark.sql(
    "CREATE TABLE IF NOT EXISTS lakehouse_catalog.customers_table "
    "(id BIGINT, state STRING, country STRING, dob TIMESTAMP) "
    "USING iceberg "
    "PARTITIONED BY (hours(dob))"
)

DataFrame[]

#### Verify that a Metadata JSON file has been created under the Metadata directory

In [8]:
# s3a URI of the metadata directory of lakehouse_catalog.customers_table.
# NOTE(review): no later cell visible here reads this variable; the boto3
# listings spell the bucket name and key prefix out again.
metadata_path = "s3a://go01-demo/warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata"

In [9]:
import boto3

s3 = boto3.resource('s3')
my_bucket = s3.Bucket("go01-demo")

# Print every object under the table's metadata prefix and remember the last
# key that is a table-metadata JSON file. Other objects that can live under the
# same prefix (Avro manifests and manifest lists) are printed but never
# selected, because the next cell parses `metadata_file` as JSON.
metadata_file = None
for object_summary in my_bucket.objects.filter(Prefix="warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata"):
    print(object_summary.key)
    if object_summary.key.endswith(".metadata.json"):
        metadata_file = object_summary.key

# Fail here with a clear message instead of a NameError in a later cell.
if metadata_file is None:
    raise FileNotFoundError(
        "No *.metadata.json object found under the customers_table metadata prefix"
    )

warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/00000-29b2d826-3800-4e48-a79b-78faf446c956.metadata.json


In [10]:
import pandas as pd
# multiline=true makes Spark parse the file as one JSON document spanning several
# lines instead of one JSON object per line; the result is converted to pandas for display.
# NOTE(review): `pd` is not referenced by name in the cells visible here.
spark.read.option("multiline","true").json("s3a://go01-demo/" + metadata_file).toPandas()

Unnamed: 0,current-schema-id,current-snapshot-id,default-sort-order-id,default-spec-id,format-version,last-column-id,last-partition-id,last-updated-ms,location,metadata-log,partition-spec,partition-specs,properties,schema,schemas,snapshot-log,snapshots,sort-orders,table-uuid
0,0,-1,0,0,1,4,1000,1682634884957,s3a://go01-demo/warehouse/tablespace/external/...,[],"[(1000, dob_hour, 4, hour)]","[([Row(field-id=1000, name='dob_hour', source-...","(pauldefusco,)","([(1, id, False, long), (2, state, False, stri...","[([Row(id=1, name='id', required=False, type='...",[],[],"[([], 0)]",40c6c008-a95c-4471-bc69-4c2a805ae6ed


![alt text](../img/s3_metadata.png)

#### Notice that no snapshots or other files have been created as data has not yet been inserted.

In [11]:
# Query the table's `history` metadata table; no rows are expected yet because nothing has been inserted.
spark.sql("SELECT * FROM lakehouse_catalog.customers_table.history").show()

+---------------+-----------+---------+-------------------+
|made_current_at|snapshot_id|parent_id|is_current_ancestor|
+---------------+-----------+---------+-------------------+
+---------------+-----------+---------+-------------------+



In [12]:
# NOTE(review): left disabled - presumably because the `metadata_log_entries` metadata table
# is not available in the Iceberg 0.14.x runtime this session loads; confirm, then re-enable or delete this cell.
#spark.sql("SELECT * FROM lakehouse_catalog.customers_table.metadata_log_entries").show()

In [13]:
# Query the `snapshots` metadata table; it is expected to be empty until the first insert commits a snapshot.
spark.sql("SELECT * FROM lakehouse_catalog.customers_table.snapshots;").show()

+------------+-----------+---------+---------+-------------+-------+
|committed_at|snapshot_id|parent_id|operation|manifest_list|summary|
+------------+-----------+---------+---------+-------------+-------+
+------------+-----------+---------+---------+-------------+-------+



In [14]:
# Query the `files` metadata table (one row per data file, with path, format, partition and column stats).
spark.sql("SELECT * FROM lakehouse_catalog.customers_table.files;").show()

+-------+---------+-----------+-------+---------+------------+------------------+------------+------------+-----------------+----------------+------------+------------+------------+-------------+------------+-------------+
|content|file_path|file_format|spec_id|partition|record_count|file_size_in_bytes|column_sizes|value_counts|null_value_counts|nan_value_counts|lower_bounds|upper_bounds|key_metadata|split_offsets|equality_ids|sort_order_id|
+-------+---------+-----------+-------+---------+------------+------------------+------------+------------+-----------------+----------------+------------+------------+------------+-------------+------------+-------------+
+-------+---------+-----------+-------+---------+------------+------------------+------------+------------+-----------------+----------------+------------+------------+------------+-------------+------------+-------------+



In [15]:
# Query the `manifests` metadata table (one row per manifest file, with added/existing/deleted file counts).
spark.sql("SELECT * FROM lakehouse_catalog.customers_table.manifests;").show()

+-------+----+------+-----------------+-----------------+----------------------+-------------------------+------------------------+------------------------+---------------------------+--------------------------+-------------------+
|content|path|length|partition_spec_id|added_snapshot_id|added_data_files_count|existing_data_files_count|deleted_data_files_count|added_delete_files_count|existing_delete_files_count|deleted_delete_files_count|partition_summaries|
+-------+----+------+-----------------+-----------------+----------------------+-------------------------+------------------------+------------------------+---------------------------+--------------------------+-------------------+
+-------+----+------+-----------------+-----------------+----------------------+-------------------------+------------------------+------------------------+---------------------------+--------------------------+-------------------+



In [16]:
# Query the `all_data_files` metadata table; it has the same columns as `files`.
# NOTE(review): per the Iceberg docs this covers data files referenced by any snapshot, not only the current one - confirm for this runtime.
spark.sql("SELECT * FROM lakehouse_catalog.customers_table.all_data_files;").show()

+-------+---------+-----------+-------+---------+------------+------------------+------------+------------+-----------------+----------------+------------+------------+------------+-------------+------------+-------------+
|content|file_path|file_format|spec_id|partition|record_count|file_size_in_bytes|column_sizes|value_counts|null_value_counts|nan_value_counts|lower_bounds|upper_bounds|key_metadata|split_offsets|equality_ids|sort_order_id|
+-------+---------+-----------+-------+---------+------------+------------------+------------+------------+-----------------+----------------+------------+------------+------------+-------------+------------+-------------+
+-------+---------+-----------+-------+---------+------------+------------------+------------+------------+-----------------+----------------+------------+------------+------------+-------------+------------+-------------+



In [17]:
# Query the `all_manifests` metadata table; compared with `manifests` it carries an extra reference_snapshot_id column.
spark.sql("SELECT * FROM lakehouse_catalog.customers_table.all_manifests;").show()

+-------+----+------+-----------------+-----------------+----------------------+-------------------------+------------------------+------------------------+---------------------------+--------------------------+-------------------+---------------------+
|content|path|length|partition_spec_id|added_snapshot_id|added_data_files_count|existing_data_files_count|deleted_data_files_count|added_delete_files_count|existing_delete_files_count|deleted_delete_files_count|partition_summaries|reference_snapshot_id|
+-------+----+------+-----------------+-----------------+----------------------+-------------------------+------------------------+------------------------+---------------------------+--------------------------+-------------------+---------------------+
+-------+----+------+-----------------+-----------------+----------------------+-------------------------+------------------------+------------------------+---------------------------+--------------------------+-------------------+-------

### Table Insert

In [18]:
from pyspark.sql.functions import date_format

In [19]:
# Insert a single row into the table. `date_format` appears inside the SQL text, so it is
# evaluated by Spark SQL as part of the statement rather than called from Python.
spark.sql("INSERT INTO lakehouse_catalog.customers_table VALUES (1, 'CA', 'USA', cast(date_format('2000-01-01 00:00:00', 'yyyy-MM-dd HH:mm:ss') as timestamp))")

                                                                                

DataFrame[]

#### Validate that data has been added to the data folder

In [20]:
# Join each `history` entry to its row in `snapshots` to show, per snapshot, when it
# became current, which operation produced it and the Spark application id from its summary.
# The fragments are adjacent string literals, so Python concatenates them into a single
# one-line SQL statement; the leading spaces only separate SQL tokens.
QUERY = (
    "select h.made_current_at,"
    "            s.operation,"
    "            h.snapshot_id,"
    "            h.is_current_ancestor,"
    "            s.summary['spark.app.id']"
    "        from lakehouse_catalog.customers_table.history h"
    "        join lakehouse_catalog.customers_table.snapshots s"
    "            on h.snapshot_id = s.snapshot_id"
    "            order by made_current_at;"
)

In [21]:
# Run the history/snapshots join defined in QUERY and display the result as a pandas DataFrame.
spark.sql(QUERY).toPandas()

                                                                                

Unnamed: 0,made_current_at,operation,snapshot_id,is_current_ancestor,summary[spark.app.id]
0,2023-04-27 22:34:53.810,append,6887654183493915966,True,spark-application-1682634844485


![alt text](../img/s3_data_1.png)

![alt text](../img/s3_data_2.png)

#### Show all of the table’s data files and each file’s metadata.

#### Notice there are now two json files and two avro files. 

The second json file reflects the new table version after the insert. Now, a new table read operation will point to this new file and ignore the previous one.

The avro file with the "snap" prefix is the manifest list. The other avro file created is the corresponding manifest file.

In [22]:
# List the table's metadata directory again now that an insert has been committed.
s3 = boto3.resource('s3')
my_bucket = s3.Bucket("go01-demo")

# Collect every object key under the metadata prefix, in listing order.
metadata_objects = my_bucket.objects.filter(
    Prefix="warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata"
)
metadata_file_list = [metadata_object.key for metadata_object in metadata_objects]

# Echo each key followed by a blank line; the cells below pick files from this list by position.
for metadata_key in metadata_file_list:
    print(metadata_key + "\n")

warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/00000-29b2d826-3800-4e48-a79b-78faf446c956.metadata.json

warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/00001-523fe7c1-2e41-4466-a883-a91e5e910729.metadata.json

warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/7a1a260d-99b5-4e22-b1d3-6da99f76c5f2-m0.avro

warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/snap-6887654183493915966-1-7a1a260d-99b5-4e22-b1d3-6da99f76c5f2.avro



In [23]:
# Display the collected keys as a Python list; the following cells index into it by position.
metadata_file_list

['warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/00000-29b2d826-3800-4e48-a79b-78faf446c956.metadata.json',
 'warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/00001-523fe7c1-2e41-4466-a883-a91e5e910729.metadata.json',
 'warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/7a1a260d-99b5-4e22-b1d3-6da99f76c5f2-m0.avro',
 'warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/snap-6887654183493915966-1-7a1a260d-99b5-4e22-b1d3-6da99f76c5f2.avro']

Showing Metadata Files (JSON)

In [24]:
import pandas as pd

# First key of the listing: the 00000 table-metadata JSON file.
initial_metadata_key = metadata_file_list[0]
print("Showing " + initial_metadata_key)

# multiline=true: parse the file as one JSON document rather than one object per line.
initial_metadata_df = spark.read.option("multiline","true").json("s3a://go01-demo/" + initial_metadata_key)
initial_metadata_df.toPandas()

Showing warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/00000-29b2d826-3800-4e48-a79b-78faf446c956.metadata.json


                                                                                

Unnamed: 0,current-schema-id,current-snapshot-id,default-sort-order-id,default-spec-id,format-version,last-column-id,last-partition-id,last-updated-ms,location,metadata-log,partition-spec,partition-specs,properties,schema,schemas,snapshot-log,snapshots,sort-orders,table-uuid
0,0,-1,0,0,1,4,1000,1682634884957,s3a://go01-demo/warehouse/tablespace/external/...,[],"[(1000, dob_hour, 4, hour)]","[([Row(field-id=1000, name='dob_hour', source-...","(pauldefusco,)","([(1, id, False, long), (2, state, False, stri...","[([Row(id=1, name='id', required=False, type='...",[],[],"[([], 0)]",40c6c008-a95c-4471-bc69-4c2a805ae6ed


In [25]:
# Second key of the listing: the 00001 table-metadata JSON file written after the insert.
post_insert_metadata_key = metadata_file_list[1]
print("Showing " + post_insert_metadata_key)

post_insert_metadata_df = spark.read.option("multiline","true").json("s3a://go01-demo/" + post_insert_metadata_key)
post_insert_metadata_df.toPandas()

Showing warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/00001-523fe7c1-2e41-4466-a883-a91e5e910729.metadata.json


Unnamed: 0,current-schema-id,current-snapshot-id,default-sort-order-id,default-spec-id,format-version,last-column-id,last-partition-id,last-updated-ms,location,metadata-log,partition-spec,partition-specs,properties,refs,schema,schemas,snapshot-log,snapshots,sort-orders,table-uuid
0,0,6887654183493915966,0,0,1,4,1000,1682634893810,s3a://go01-demo/warehouse/tablespace/external/...,[(s3a://go01-demo/warehouse/tablespace/externa...,"[(1000, dob_hour, 4, hour)]","[([Row(field-id=1000, name='dob_hour', source-...","(pauldefusco,)","((6887654183493915966, branch),)","([(1, id, False, long), (2, state, False, stri...","[([Row(id=1, name='id', required=False, type='...","[(6887654183493915966, 1682634893810)]",[(s3a://go01-demo/warehouse/tablespace/externa...,"[([], 0)]",40c6c008-a95c-4471-bc69-4c2a805ae6ed


Showing Manifest Lists (Avro - file names prefixed by "snap-")

In [26]:
# Fourth key of the listing: the manifest list (the "snap-" Avro file) of the insert snapshot.
manifest_list_key = metadata_file_list[3]
print("Showing " + manifest_list_key)

manifest_list_df = spark.read.format("avro").load("s3a://go01-demo/" + manifest_list_key)
manifest_list_df.toPandas()

Showing warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/snap-6887654183493915966-1-7a1a260d-99b5-4e22-b1d3-6da99f76c5f2.avro


                                                                                

Unnamed: 0,manifest_path,manifest_length,partition_spec_id,added_snapshot_id,added_data_files_count,existing_data_files_count,deleted_data_files_count,partitions,added_rows_count,existing_rows_count,deleted_rows_count
0,s3a://go01-demo/warehouse/tablespace/external/...,6192,0,6887654183493915966,1,0,0,"[(False, False, [56, 3, 4, 0], [56, 3, 4, 0])]",1,0,0


Showing Manifest Files (Avro), i.e. the list of data files tracked for a snapshot, each entry carrying a status and the snapshot ID that added or removed it

In [27]:
# Third key of the listing: the manifest file (the "-m0.avro" file) written by the insert.
manifest_key = metadata_file_list[2]
print("Showing " + manifest_key)

manifest_df = spark.read.format("avro").load("s3a://go01-demo/" + manifest_key)
manifest_df.toPandas()

Showing warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/7a1a260d-99b5-4e22-b1d3-6da99f76c5f2-m0.avro


                                                                                

Unnamed: 0,status,snapshot_id,data_file
0,1,6887654183493915966,(s3a://go01-demo/warehouse/tablespace/external...


### Table Update

Create a staging table

In [28]:
# Remove any previous copy of the staging table so the CREATE below starts from scratch.
spark.sql("DROP TABLE IF EXISTS lakehouse_catalog.staging")

DataFrame[]

In [29]:
# Create the staging table with the same columns, format (Iceberg) and hourly `dob`
# partitioning as customers_table. The trailing backslashes continue the string literal,
# so the statement reaches Spark as a single line.
spark.sql("CREATE TABLE IF NOT EXISTS lakehouse_catalog.staging\
            (id BIGINT, state STRING, country STRING, dob TIMESTAMP)\
            USING iceberg\
            PARTITIONED BY ( hours(dob))")

DataFrame[]

In [30]:
# Load nine rows into the staging table in one INSERT statement.
# Row id 1 overlaps with the row already in customers_table (state 'ID' here vs 'CA' there).
# NOTE(review): id 3 appears twice (states 'AZ' and 'UT') - confirm the duplicate is intentional.
spark.sql("INSERT INTO lakehouse_catalog.staging\
            VALUES (1, 'ID', 'USA', cast(date_format('2000-01-01 00:00:00', 'yyyy-MM-dd HH:mm:ss') as timestamp)),\
            (2, 'CA', 'USA', cast(date_format('2000-01-03 00:10:00', 'yyyy-MM-dd HH:mm:ss') as timestamp)),\
            (3, 'AZ', 'USA', cast(date_format('2000-01-04 00:01:00', 'yyyy-MM-dd HH:mm:ss') as timestamp)),\
            (4, 'NV', 'USA', cast(date_format('2000-01-02 00:02:00', 'yyyy-MM-dd HH:mm:ss') as timestamp)),\
            (5, 'OR', 'USA', cast(date_format('2000-01-03 00:03:00', 'yyyy-MM-dd HH:mm:ss') as timestamp)),\
            (10, 'WA', 'USA', cast(date_format('2000-01-04 00:03:00', 'yyyy-MM-dd HH:mm:ss') as timestamp)),\
            (3, 'UT', 'USA', cast(date_format('2000-01-01 00:04:00', 'yyyy-MM-dd HH:mm:ss') as timestamp)),\
            (11, 'CO', 'USA', cast(date_format('2000-01-01 00:03:00', 'yyyy-MM-dd HH:mm:ss') as timestamp)),\
            (6, 'CO', 'USA', cast(date_format('2000-01-01 00:03:00', 'yyyy-MM-dd HH:mm:ss') as timestamp))")

                                                                                

DataFrame[]

Merge Into Customers Table

In [31]:
# Upsert the staging rows into customers_table, matching on id:
#   - matched ids only have their `state` column overwritten with the staging value;
#   - unmatched staging rows are inserted with all of their columns.
spark.sql("MERGE INTO lakehouse_catalog.customers_table\
            USING (SELECT * FROM lakehouse_catalog.staging) a\
            ON customers_table.id = a.id\
            WHEN MATCHED THEN UPDATE SET customers_table.state = a.state\
            WHEN NOT MATCHED THEN INSERT *")

                                                                                

DataFrame[]

In [32]:
# Re-read the `snapshots` metadata table after the MERGE and display it as a pandas DataFrame.
spark.sql("SELECT * FROM lakehouse_catalog.customers_table.snapshots;").toPandas()

Unnamed: 0,committed_at,snapshot_id,parent_id,operation,manifest_list,summary
0,2023-04-27 22:34:53.810,6887654183493915966,,append,s3a://go01-demo/warehouse/tablespace/external/...,{'spark.app.id': 'spark-application-1682634844...
1,2023-04-27 22:35:23.955,4263374782247318559,6.887654e+18,overwrite,s3a://go01-demo/warehouse/tablespace/external/...,"{'added-data-files': '4', 'total-equality-dele..."


In [33]:
# Re-read the `manifests` metadata table after the MERGE and display it as a pandas DataFrame.
spark.sql("SELECT * FROM lakehouse_catalog.customers_table.manifests;").toPandas()

Unnamed: 0,content,path,length,partition_spec_id,added_snapshot_id,added_data_files_count,existing_data_files_count,deleted_data_files_count,added_delete_files_count,existing_delete_files_count,deleted_delete_files_count,partition_summaries
0,0,s3a://go01-demo/warehouse/tablespace/external/...,6373,0,4263374782247318559,4,0,0,0,0,0,"[(False, False, 2000-01-01-00, 2000-01-04-00)]"
1,0,s3a://go01-demo/warehouse/tablespace/external/...,6193,0,4263374782247318559,0,0,1,0,0,0,"[(False, False, 2000-01-01-00, 2000-01-01-00)]"


In [34]:
# Re-read the `all_data_files` metadata table after the MERGE and display it as a pandas DataFrame.
spark.sql("SELECT * FROM lakehouse_catalog.customers_table.all_data_files;").toPandas()

                                                                                

Unnamed: 0,content,file_path,file_format,spec_id,partition,record_count,file_size_in_bytes,column_sizes,value_counts,null_value_counts,nan_value_counts,lower_bounds,upper_bounds,key_metadata,split_offsets,equality_ids,sort_order_id
0,0,s3a://go01-demo/warehouse/tablespace/external/...,PARQUET,0,"(262968,)",4,1277,"{1: 57, 2: 74, 3: 61, 4: 80}","{1: 4, 2: 4, 3: 4, 4: 4}","{1: 0, 2: 0, 3: 0, 4: 0}",{},"{1: [1, 0, 0, 0, 0, 0, 0, 0], 2: [67, 79], 3: ...","{1: [11, 0, 0, 0, 0, 0, 0, 0], 2: [85, 84], 3:...",,[4],,0
1,0,s3a://go01-demo/warehouse/tablespace/external/...,PARQUET,0,"(262992,)",1,1130,"{1: 39, 2: 37, 3: 38, 4: 39}","{1: 1, 2: 1, 3: 1, 4: 1}","{1: 0, 2: 0, 3: 0, 4: 0}",{},"{1: [4, 0, 0, 0, 0, 0, 0, 0], 2: [78, 86], 3: ...","{1: [4, 0, 0, 0, 0, 0, 0, 0], 2: [78, 86], 3: ...",,[4],,0
2,0,s3a://go01-demo/warehouse/tablespace/external/...,PARQUET,0,"(263016,)",2,1176,"{1: 46, 2: 43, 3: 61, 4: 47}","{1: 2, 2: 2, 3: 2, 4: 2}","{1: 0, 2: 0, 3: 0, 4: 0}",{},"{1: [2, 0, 0, 0, 0, 0, 0, 0], 2: [67, 65], 3: ...","{1: [5, 0, 0, 0, 0, 0, 0, 0], 2: [79, 82], 3: ...",,[4],,0
3,0,s3a://go01-demo/warehouse/tablespace/external/...,PARQUET,0,"(263040,)",2,1177,"{1: 47, 2: 43, 3: 61, 4: 47}","{1: 2, 2: 2, 3: 2, 4: 2}","{1: 0, 2: 0, 3: 0, 4: 0}",{},"{1: [3, 0, 0, 0, 0, 0, 0, 0], 2: [65, 90], 3: ...","{1: [10, 0, 0, 0, 0, 0, 0, 0], 2: [87, 65], 3:...",,[4],,0
4,0,s3a://go01-demo/warehouse/tablespace/external/...,PARQUET,0,"(262968,)",1,1106,"{1: 33, 2: 31, 3: 32, 4: 39}","{1: 1, 2: 1, 3: 1, 4: 1}","{1: 0, 2: 0, 3: 0, 4: 0}",{},"{1: [1, 0, 0, 0, 0, 0, 0, 0], 2: [67, 65], 3: ...","{1: [1, 0, 0, 0, 0, 0, 0, 0], 2: [67, 65], 3: ...",,[4],,0


#### There is a new metadata file (json) prefixed by 00002.

#### There is a new manifest list file (avro) prefixed by "snap"

#### There is a new manifest file (avro)

In [35]:
# List the table's metadata directory once more, this time after the MERGE.
s3 = boto3.resource('s3')
my_bucket = s3.Bucket("go01-demo")

# Gather the object keys under the metadata prefix first, then print them.
listed_objects = my_bucket.objects.filter(
    Prefix="warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata"
)
metadata_file_list = [listed_object.key for listed_object in listed_objects]

# One key per line with a blank line in between; later cells index this list by position.
for listed_key in metadata_file_list:
    print(listed_key + "\n")

warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/00000-29b2d826-3800-4e48-a79b-78faf446c956.metadata.json

warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/00001-523fe7c1-2e41-4466-a883-a91e5e910729.metadata.json

warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/00002-bcc08b11-bbad-4a48-9eb3-dfcf8226221d.metadata.json

warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/7a1a260d-99b5-4e22-b1d3-6da99f76c5f2-m0.avro

warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/c9cd5045-5a65-4a8a-9eab-27799c1f48a2-m0.avro

warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/c9cd5045-5a65-4a8a-9eab-27799c1f48a2-m1.avro

warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/snap-4263374782247318559-1-c9cd5045-5a65-4a8a-9eab-27799c1f48a2.avro

warehouse/tablespace/external/hive/lakehouse_catalo

Showing Metadata Files (JSON)

In [36]:
# Position 0 of the refreshed listing: the 00000 table-metadata JSON file.
metadata_v0_key = metadata_file_list[0]
print("Showing " + metadata_v0_key)

metadata_v0_df = spark.read.option("multiline","true").json("s3a://go01-demo/" + metadata_v0_key)
metadata_v0_df.toPandas()

Showing warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/00000-29b2d826-3800-4e48-a79b-78faf446c956.metadata.json


Unnamed: 0,current-schema-id,current-snapshot-id,default-sort-order-id,default-spec-id,format-version,last-column-id,last-partition-id,last-updated-ms,location,metadata-log,partition-spec,partition-specs,properties,schema,schemas,snapshot-log,snapshots,sort-orders,table-uuid
0,0,-1,0,0,1,4,1000,1682634884957,s3a://go01-demo/warehouse/tablespace/external/...,[],"[(1000, dob_hour, 4, hour)]","[([Row(field-id=1000, name='dob_hour', source-...","(pauldefusco,)","([(1, id, False, long), (2, state, False, stri...","[([Row(id=1, name='id', required=False, type='...",[],[],"[([], 0)]",40c6c008-a95c-4471-bc69-4c2a805ae6ed


In [37]:
# Position 1 of the refreshed listing: the 00001 table-metadata JSON file.
metadata_v1_key = metadata_file_list[1]
print("Showing " + metadata_v1_key)

metadata_v1_df = spark.read.option("multiline","true").json("s3a://go01-demo/" + metadata_v1_key)
metadata_v1_df.toPandas()

Showing warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/00001-523fe7c1-2e41-4466-a883-a91e5e910729.metadata.json


Unnamed: 0,current-schema-id,current-snapshot-id,default-sort-order-id,default-spec-id,format-version,last-column-id,last-partition-id,last-updated-ms,location,metadata-log,partition-spec,partition-specs,properties,refs,schema,schemas,snapshot-log,snapshots,sort-orders,table-uuid
0,0,6887654183493915966,0,0,1,4,1000,1682634893810,s3a://go01-demo/warehouse/tablespace/external/...,[(s3a://go01-demo/warehouse/tablespace/externa...,"[(1000, dob_hour, 4, hour)]","[([Row(field-id=1000, name='dob_hour', source-...","(pauldefusco,)","((6887654183493915966, branch),)","([(1, id, False, long), (2, state, False, stri...","[([Row(id=1, name='id', required=False, type='...","[(6887654183493915966, 1682634893810)]",[(s3a://go01-demo/warehouse/tablespace/externa...,"[([], 0)]",40c6c008-a95c-4471-bc69-4c2a805ae6ed


In [38]:
# Position 2 of the refreshed listing: the 00002 table-metadata JSON file written by the MERGE.
metadata_v2_key = metadata_file_list[2]
print("Showing " + metadata_v2_key)

metadata_v2_df = spark.read.option("multiline","true").json("s3a://go01-demo/" + metadata_v2_key)
metadata_v2_df.toPandas()

Showing warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/00002-bcc08b11-bbad-4a48-9eb3-dfcf8226221d.metadata.json


Unnamed: 0,current-schema-id,current-snapshot-id,default-sort-order-id,default-spec-id,format-version,last-column-id,last-partition-id,last-updated-ms,location,metadata-log,partition-spec,partition-specs,properties,refs,schema,schemas,snapshot-log,snapshots,sort-orders,table-uuid
0,0,4263374782247318559,0,0,1,4,1000,1682634923955,s3a://go01-demo/warehouse/tablespace/external/...,[(s3a://go01-demo/warehouse/tablespace/externa...,"[(1000, dob_hour, 4, hour)]","[([Row(field-id=1000, name='dob_hour', source-...","(pauldefusco,)","((4263374782247318559, branch),)","([(1, id, False, long), (2, state, False, stri...","[([Row(id=1, name='id', required=False, type='...","[(6887654183493915966, 1682634893810), (426337...",[(s3a://go01-demo/warehouse/tablespace/externa...,"[([], 0)]",40c6c008-a95c-4471-bc69-4c2a805ae6ed


Showing Manifest Lists (Avro - file names prefixed by "snap-")

In [39]:
# Position 6 of the refreshed listing: a manifest list ("snap-" Avro file).
manifest_list_key = metadata_file_list[6]
print("Showing " + manifest_list_key)

manifest_list_df = spark.read.format("avro").load("s3a://go01-demo/" + manifest_list_key)
manifest_list_df.toPandas()

Showing warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/snap-4263374782247318559-1-c9cd5045-5a65-4a8a-9eab-27799c1f48a2.avro


Unnamed: 0,manifest_path,manifest_length,partition_spec_id,added_snapshot_id,added_data_files_count,existing_data_files_count,deleted_data_files_count,partitions,added_rows_count,existing_rows_count,deleted_rows_count
0,s3a://go01-demo/warehouse/tablespace/external/...,6373,0,4263374782247318559,4,0,0,"[(False, False, [56, 3, 4, 0], [128, 3, 4, 0])]",9,0,0
1,s3a://go01-demo/warehouse/tablespace/external/...,6193,0,4263374782247318559,0,0,1,"[(False, False, [56, 3, 4, 0], [56, 3, 4, 0])]",0,0,1


In [40]:
# Position 7 of the refreshed listing: the other manifest list ("snap-" Avro file).
other_manifest_list_key = metadata_file_list[7]
print("Showing " + other_manifest_list_key)

other_manifest_list_df = spark.read.format("avro").load("s3a://go01-demo/" + other_manifest_list_key)
other_manifest_list_df.toPandas()

Showing warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/snap-6887654183493915966-1-7a1a260d-99b5-4e22-b1d3-6da99f76c5f2.avro


Unnamed: 0,manifest_path,manifest_length,partition_spec_id,added_snapshot_id,added_data_files_count,existing_data_files_count,deleted_data_files_count,partitions,added_rows_count,existing_rows_count,deleted_rows_count
0,s3a://go01-demo/warehouse/tablespace/external/...,6192,0,6887654183493915966,1,0,0,"[(False, False, [56, 3, 4, 0], [56, 3, 4, 0])]",1,0,0


Showing Manifest Files (Avro), i.e. the list of data files tracked for a snapshot, each entry carrying a status and the snapshot ID that added or removed it

In [41]:
# Position 3 of the refreshed listing: a manifest file ("-m0.avro").
manifest_key = metadata_file_list[3]
print("Showing " + manifest_key)

manifest_df = spark.read.format("avro").load("s3a://go01-demo/" + manifest_key)
manifest_df.toPandas()

Showing warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/7a1a260d-99b5-4e22-b1d3-6da99f76c5f2-m0.avro


Unnamed: 0,status,snapshot_id,data_file
0,1,6887654183493915966,(s3a://go01-demo/warehouse/tablespace/external...


In [42]:
# Position 4 of the refreshed listing: a manifest file ("-m0.avro").
second_manifest_key = metadata_file_list[4]
print("Showing " + second_manifest_key)

second_manifest_df = spark.read.format("avro").load("s3a://go01-demo/" + second_manifest_key)
second_manifest_df.toPandas()

Showing warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/c9cd5045-5a65-4a8a-9eab-27799c1f48a2-m0.avro


Unnamed: 0,status,snapshot_id,data_file
0,2,4263374782247318559,(s3a://go01-demo/warehouse/tablespace/external...


In [43]:
# Position 5 of the refreshed listing: a manifest file ("-m1.avro").
third_manifest_key = metadata_file_list[5]
print("Showing " + third_manifest_key)

third_manifest_df = spark.read.format("avro").load("s3a://go01-demo/" + third_manifest_key)
third_manifest_df.toPandas()

Showing warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/c9cd5045-5a65-4a8a-9eab-27799c1f48a2-m1.avro


Unnamed: 0,status,snapshot_id,data_file
0,1,4263374782247318559,(s3a://go01-demo/warehouse/tablespace/external...
1,1,4263374782247318559,(s3a://go01-demo/warehouse/tablespace/external...
2,1,4263374782247318559,(s3a://go01-demo/warehouse/tablespace/external...
3,1,4263374782247318559,(s3a://go01-demo/warehouse/tablespace/external...


### Time Travel 

In [44]:
# Load the table's `snapshots` metadata table into a DataFrame for the time-travel lookup that follows.
snapshots_df = spark.sql("SELECT * FROM lakehouse_catalog.customers_table.snapshots;")

In [45]:
# Pick the oldest snapshot explicitly. Without an ORDER BY Spark does not guarantee
# row order, so "the first row" is only the first snapshot if we sort by commit time.
first_snapshot_row = (
    snapshots_df
    .orderBy("committed_at")
    .select("snapshot_id")
    .first()
)

# first() returns None on an empty result; fail with a readable message in that case.
if first_snapshot_row is None:
    raise ValueError("lakehouse_catalog.customers_table has no snapshots to time travel to")

first_snapshot = first_snapshot_row[0]

#### Validate that the output dataframe only includes one row

In [46]:
# Read the table as of the snapshot chosen above by passing its id as the
# "snapshot-id" read option, then display the rows as a pandas DataFrame.
time_travel_reader = (
    spark.read
    .format("iceberg")
    .option("snapshot-id", first_snapshot)
)
time_travel_reader.load("lakehouse_catalog.customers_table").toPandas()

Unnamed: 0,id,state,country,dob
0,1,CA,USA,2000-01-01


### Drop the Table

In [47]:
# Drop the staging table that served as the MERGE source; this statement does not touch customers_table.
spark.sql("DROP TABLE IF EXISTS lakehouse_catalog.staging")

                                                                                

DataFrame[]

Validate that the staging table's metadata folder is now empty but its data folder still retains parquet files.

![alt text](../img/s3_droptable_1.png)

![alt text](../img/s3_droptable_2.png)

![alt text](../img/s3_droptable_3.png)

In [54]:
# Set the table property 'format-version' to '2' on customers_table.
# The trailing backslash continues the string literal, so Spark receives a single-line statement.
spark.sql("ALTER TABLE lakehouse_catalog.customers_table\
            SET TBLPROPERTIES ('format-version' = '2')")

DataFrame[]

In [55]:
# List the table's metadata directory after the ALTER TABLE statement.
s3 = boto3.resource('s3')
my_bucket = s3.Bucket("go01-demo")

# Materialise the keys under the metadata prefix before printing them.
current_objects = my_bucket.objects.filter(
    Prefix="warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata"
)
metadata_file_list = [current_object.key for current_object in current_objects]

# Print each key followed by a blank line; the next cell selects a file by position.
for current_key in metadata_file_list:
    print(current_key + "\n")

warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/00000-29b2d826-3800-4e48-a79b-78faf446c956.metadata.json

warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/00001-523fe7c1-2e41-4466-a883-a91e5e910729.metadata.json

warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/00002-bcc08b11-bbad-4a48-9eb3-dfcf8226221d.metadata.json

warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/00003-24aae4c7-871b-4ad7-b545-123ba8507b6c.metadata.json

warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/7a1a260d-99b5-4e22-b1d3-6da99f76c5f2-m0.avro

warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/c9cd5045-5a65-4a8a-9eab-27799c1f48a2-m0.avro

warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/c9cd5045-5a65-4a8a-9eab-27799c1f48a2-m1.avro

warehouse/tablespace/external/hive/lakehouse_catalog.db/custome

In [56]:
# Position 3 of the latest listing: the 00003 table-metadata JSON file written by the ALTER TABLE.
metadata_v3_key = metadata_file_list[3]
print("Showing " + metadata_v3_key)

metadata_v3_df = spark.read.option("multiline","true").json("s3a://go01-demo/" + metadata_v3_key)
metadata_v3_df.toPandas()

Showing warehouse/tablespace/external/hive/lakehouse_catalog.db/customers_table/metadata/00003-24aae4c7-871b-4ad7-b545-123ba8507b6c.metadata.json


                                                                                

Unnamed: 0,current-schema-id,current-snapshot-id,default-sort-order-id,default-spec-id,format-version,last-column-id,last-partition-id,last-sequence-number,last-updated-ms,location,metadata-log,partition-specs,properties,refs,schemas,snapshot-log,snapshots,sort-orders,table-uuid
0,0,4263374782247318559,0,0,2,4,1000,0,1682634971749,s3a://go01-demo/warehouse/tablespace/external/...,[(s3a://go01-demo/warehouse/tablespace/externa...,"[([Row(field-id=1000, name='dob_hour', source-...","(pauldefusco, merge-on-read, merge-on-read, me...","((4263374782247318559, branch),)","[([Row(id=1, name='id', required=False, type='...","[(6887654183493915966, 1682634893810), (426337...",[(s3a://go01-demo/warehouse/tablespace/externa...,"[([], 0)]",40c6c008-a95c-4471-bc69-4c2a805ae6ed
