In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Build (or reuse) a Spark session on the local master, configured for Iceberg.
spark = (
    SparkSession.builder
    .master('local[*]')
    # Package coordinate of the Iceberg runtime for Spark 3 (version 0.12.1).
    .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark3-runtime:0.12.1")
    # Register Iceberg's SQL extensions with the session.
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    # Use Iceberg's SparkSessionCatalog as the implementation of `spark_catalog`,
    # with catalog type "hive".
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")
    .config("spark.sql.catalog.spark_catalog.type", "hive")
    .getOrCreate()
)

In [3]:
  # Using a local Spark Catalog

# Make sure the demo database exists, then switch the session to it.
for setup_statement in (
    "CREATE DATABASE IF NOT EXISTS spark_catalog.testdb ",
    "USE spark_catalog.testdb",
):
    spark.sql(setup_statement)

# Print the catalog/namespace that is now active.
spark.sql("SHOW CURRENT NAMESPACE").show()

# Optional reset, left disabled on purpose:
#spark.sql("DROP TABLE testtable")

+-------------+---------+
|      catalog|namespace|
+-------------+---------+
|spark_catalog|   testdb|
+-------------+---------+



In [4]:
# Create the two-column Iceberg demo table unless it already exists.
spark.sql(
    "CREATE TABLE IF NOT EXISTS testtable (id bigint, data string) USING iceberg"
)

DataFrame[]

In [5]:
# Show up to 20 untruncated rows of the table's `history` metadata table.
(
    spark.read
    .format("iceberg")
    .load("spark_catalog.testdb.testtable.history")
    .show(n=20, truncate=False)
)

+-----------------------+-------------------+-------------------+-------------------+
|made_current_at        |snapshot_id        |parent_id          |is_current_ancestor|
+-----------------------+-------------------+-------------------+-------------------+
|2021-11-19 10:08:15.886|3492322305412256778|null               |true               |
|2021-11-19 10:21:55.566|107550928418073495 |3492322305412256778|true               |
|2021-11-19 10:28:30.292|1545408577715836429|107550928418073495 |true               |
|2021-11-19 10:28:33.014|2649908378426818566|1545408577715836429|true               |
|2021-11-19 10:34:33.135|3533974690975069464|2649908378426818566|true               |
+-----------------------+-------------------+-------------------+-------------------+



In [6]:
# Show up to 20 untruncated rows of the table's `snapshots` metadata table.
(
    spark.read
    .format("iceberg")
    .load("spark_catalog.testdb.testtable.snapshots")
    .show(n=20, truncate=False)
)

+-----------------------+-------------------+-------------------+---------+----------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|committed_at           |snapshot_id        |parent_id          |operation|manifest_list                                                                                                                                       |summary                                                                                                                                                                                                                                                               

In [7]:
# Show up to 20 untruncated rows of the table's `files` metadata table.
(
    spark.read
    .format("iceberg")
    .load("spark_catalog.testdb.testtable.files")
    .show(n=20, truncate=False)
)

+-------+---------------------------------------------------------------------------------------------------------------------------------------+-----------+------------+------------------+------------------+----------------+-----------------+----------------+-----------------------+-----------------------+------------+-------------+------------+
|content|file_path                                                                                                                              |file_format|record_count|file_size_in_bytes|column_sizes      |value_counts    |null_value_counts|nan_value_counts|lower_bounds           |upper_bounds           |key_metadata|split_offsets|equality_ids|
+-------+---------------------------------------------------------------------------------------------------------------------------------------+-----------+------------+------------------+------------------+----------------+-----------------+----------------+-----------------------+------------------

In [8]:
# Append three rows (x, y, z) to the Iceberg table via SQL.
# NOTE: every execution of this cell appends the rows again.
spark.sql(
    "INSERT INTO testtable VALUES (1, 'x'), (2, 'y'), (3, 'z')"
)

DataFrame[]

In [9]:
# Read the whole table back with plain SQL and print it (default row limit of show()).
(
    spark
    .sql("SELECT * FROM testtable")
    .show()
)

+---+----+
| id|data|
+---+----+
|  1|   a|
|  2|   b|
|  3|   c|
|  1|   x|
|  2|   y|
|  3|   z|
|  1|   a|
|  2|   b|
|  3|   c|
|  1|   a|
|  2|   b|
|  3|   c|
|  1|   a|
|  2|   b|
|  3|   c|
|  1|   x|
|  2|   y|
|  3|   z|
+---+----+



In [10]:
# Same read through the DataFrame API: load the current table contents
# by fully qualified name and print up to 100 rows.
df = spark.table(
    "spark_catalog.testdb.testtable"
)
df.show(n=100)

+---+----+
| id|data|
+---+----+
|  1|   a|
|  2|   b|
|  3|   c|
|  1|   x|
|  2|   y|
|  3|   z|
|  1|   a|
|  2|   b|
|  3|   c|
|  1|   a|
|  2|   b|
|  3|   c|
|  1|   a|
|  2|   b|
|  3|   c|
|  1|   x|
|  2|   y|
|  3|   z|
+---+----+



In [49]:
from datetime import datetime

# Capture the current wall-clock time ...
now = datetime.now()

# ... and express it as a POSIX timestamp (float, in seconds).
timestamp = now.timestamp()
print("timestamp =", timestamp)

timestamp = 1637370685.641575


In [66]:
# Point-in-time read: load the table with the "as-of-timestamp" option.
# `timestamp` must already be assigned by an earlier cell; it is in seconds,
# so it is scaled by 1000 and truncated to an integer (milliseconds) here.
df = (
    spark.read
    .format("iceberg")
    .option("as-of-timestamp", int(timestamp * 1000))
    .load("spark_catalog.testdb.testtable")
)
df.show(n=100)

+---+----+
| id|data|
+---+----+
|  1|   a|
|  2|   b|
|  3|   c|
|  1|   d|
|  2|   e|
|  3|   f|
|  1|   x|
|  2|   y|
|  3|   z|
|  1|   a|
|  2|   b|
|  3|   c|
|  1|   a|
|  2|   b|
|  3|   c|
|  1|   a|
|  2|   b|
|  3|   c|
|  1|   x|
|  2|   y|
|  3|   z|
+---+----+



In [67]:
# Append three more rows (d, e, f) to the Iceberg table via SQL.
# NOTE: every execution of this cell appends the rows again.
spark.sql(
    "INSERT INTO testtable VALUES (1, 'd'), (2, 'e'), (3, 'f')"
)

DataFrame[]