In [1]:
import os
import sys

# Put the sibling "src" directory (resolved against the current working
# directory) at the front of the import path so modules under it can be imported.
sys.path.insert(0, os.path.abspath(os.path.join("..", "src")))

### Import Library and Create Factory Instance

In [13]:
from keepice_lakehouse import IcebergManagerFactory

# Instantiate the factory and request its "spark_iceberg" manager; the
# following cells issue all catalog operations through `spark_manager`.
factory = IcebergManagerFactory()
spark_manager = factory.get_manager("spark_iceberg")

/home/iceberg/notebooks/keepice/keepice-lakehouse-library/src/keepice_lakehouse/config
/home/iceberg/notebooks/keepice/keepice-lakehouse-library/src/config
/home/iceberg/notebooks/keepice/keepice-lakehouse-library/config


### Create Database

In [2]:
# Create the "test" database that the later cells put the table into.
spark_manager.create_database(database_name="test")

### Create Table

In [14]:
# Column name -> SQL type name for the table created from this mapping.
# Insertion order is the column order; every key is a valid Python identifier,
# which is what allows the keyword form used here.
schema_dict = dict(
    VendorID="bigint",
    tpep_pickup_datetime="timestamp",
    tpep_dropoff_datetime="timestamp",
    passenger_count="double",
    trip_distance="double",
    RatecodeID="double",
    store_and_fwd_flag="string",
    PULocationID="bigint",
    DOLocationID="bigint",
    payment_type="bigint",
    fare_amount="double",
    extra="double",
    mta_tax="double",
    tip_amount="double",
    tolls_amount="double",
    improvement_surcharge="double",
    total_amount="double",
    congestion_surcharge="double",
    airport_fee="double",
)
# Create test.taxi_test_table from `schema_dict` at an explicit S3 location,
# passing the days(tpep_pickup_datetime) expression as the partition column.
spark_manager.create_table(
    database_name="test",
    table_name="taxi_test_table",
    columns=schema_dict,
    s3_folder_location="s3://warehouse/test/taxi-test-table",
    partition_column="days(tpep_pickup_datetime)",
)

### List Databases and Tables

In [7]:
# Show all databases first, then the tables registered in "test".
spark_manager.list_databases().show()
spark_manager.list_tables(database_name="test").show()

+---------+
|namespace|
+---------+
|     test|
+---------+

+---------+---------------+-----------+
|namespace|      tableName|isTemporary|
+---------+---------------+-----------+
|     test|taxi_test_table|      false|
+---------+---------------+-----------+



### Get Table DDL

In [11]:
# Fetch the table's DDL and print the statement text.
# The value is read straight from the collected rows (first row, first and only
# selected column) instead of detouring through the RDD API; an empty result
# still raises IndexError, exactly as before.
# NOTE(review): assumes get_table_ddl returns a Spark DataFrame with a
# `createtab_stmt` column, as the original code already relied on.
df = spark_manager.get_table_ddl(database_name="test", table_name="taxi_test_table")
ddl = df.select("createtab_stmt").collect()[0][0]
print(ddl)

CREATE TABLE demo.test.taxi_test_table (
  VendorID BIGINT,
  tpep_pickup_datetime TIMESTAMP,
  tpep_dropoff_datetime TIMESTAMP,
  passenger_count DOUBLE,
  trip_distance DOUBLE,
  RatecodeID DOUBLE,
  store_and_fwd_flag STRING,
  PULocationID BIGINT,
  DOLocationID BIGINT,
  payment_type BIGINT,
  fare_amount DOUBLE,
  extra DOUBLE,
  mta_tax DOUBLE,
  tip_amount DOUBLE,
  tolls_amount DOUBLE,
  improvement_surcharge DOUBLE,
  total_amount DOUBLE,
  congestion_surcharge DOUBLE,
  airport_fee DOUBLE)
USING iceberg
PARTITIONED BY (days(tpep_pickup_datetime))
LOCATION 's3://warehouse/test/taxi-test-table'
TBLPROPERTIES (
  'current-snapshot-id' = 'none',
  'format' = 'iceberg/parquet',
  'format-version' = '2',
  'write.parquet.compression-codec' = 'zstd')

