In [None]:
# Component versions; together they form the iceberg-spark-runtime artifact
# coordinate passed to spark.jars.packages when the session is built.
spark_version, scala_version, iceberg_version = "3.5", "2.12", "1.7.0"

from pyspark.sql import SparkSession
from pyspark.sql.functions import current_date, rand, floor, expr

# Name the Iceberg catalog is registered under, and the directory backing it.
catalog_name = "iceberg"
warehouse_path = "./icehouse"

spark = (
    SparkSession.builder
    .appName("local_iceberg_example")
    # Load Iceberg's Spark SQL extensions.
    .config(
        "spark.sql.extensions",
        "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    )
    # Register a "hadoop"-type Iceberg catalog whose warehouse is warehouse_path.
    .config(
        f"spark.sql.catalog.{catalog_name}",
        "org.apache.iceberg.spark.SparkCatalog",
    )
    .config(f"spark.sql.catalog.{catalog_name}.type", "hadoop")
    .config(f"spark.sql.catalog.{catalog_name}.warehouse", warehouse_path)
    # Request the Iceberg runtime jar matching the Spark/Scala versions in use.
    .config(
        "spark.jars.packages",
        f"org.apache.iceberg:iceberg-spark-runtime-{spark_version}_{scala_version}:{iceberg_version}",
    )
    # Pin the driver to the loopback interface.
    .config("spark.driver.host", "localhost")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .getOrCreate()
)

In [8]:
# Namespace segment of the fully qualified name (catalog.namespace.table) of
# each Iceberg table written further down.
namespace = "test_ns"

In [1]:
import data_generator as dg

In [2]:
# Row count handed to the order-header generator. Kept small for quick local
# runs; switch to 10_000_000 for the large-scale data set.
row_cnt = 100_000

In [3]:
# Generate the dummy order (header) data; row_cnt is the first argument.
# NOTE(review): the meaning of the second argument (1) is defined in
# data_generator — confirm.
df = dg.generate_dummy_order_data(row_cnt, 1)

In [4]:
# Derive the dummy order-detail data from the header data in df.
# NOTE(review): the second argument (5) is presumably a per-order detail
# setting — confirm against data_generator.
dfo = dg.generate_dummy_order_details(df, 5)

In [None]:
# Persist the header and detail data as CSV. With these arguments the recorded
# run wrote ./raw_data/ord_hdr/order_header_v1.csv and
# ./raw_data/ord_dtl/order_detail_v1.csv.
dg.save_csv_files(df, dfo, ".", "v1")

Saving header file to ./raw_data/ord_hdr/order_header_v1.csv
Saving detail file to ./raw_data/ord_dtl/order_detail_v1.csv


In [9]:
# Load every file in the header / detail raw-data directories, using the first
# line of each file as the column names.
# NOTE(review): no schema or inferSchema is supplied, so the CSV reader types
# every column as string — confirm that is acceptable for the tables built
# from these frames.
dfh = spark.read.option("header", True).csv("./raw_data/ord_hdr/*")
dfd = spark.read.option("header", True).csv("./raw_data/ord_dtl/*")

In [None]:
# Create (or replace) the unpartitioned Iceberg table from the header frame.
table_name = "ord_hdr_no_partition"
(
    dfh
    .writeTo(f"{catalog_name}.{namespace}.{table_name}")
    .using("iceberg")
    .createOrReplace()
)

In [None]:
# Create (or replace) the unpartitioned Iceberg table from the detail frame.
table_name = "ord_dtl_no_partition"
(
    dfd
    .writeTo(f"{catalog_name}.{namespace}.{table_name}")
    .using("iceberg")
    .createOrReplace()
)

In [None]:
# Expose the header frame to SQL under the view name "hdr".
dfh.createOrReplaceTempView("hdr")
table_name = "ord_hdr_partitioned"

# CTAS: copy every row of the view into an Iceberg table partitioned by
# order_date.
ctas_sql = f"""
        CREATE OR REPLACE TABLE {catalog_name}.{namespace}.{table_name}
        USING ICEBERG
        PARTITIONED BY (order_date)
        AS SELECT *
        FROM hdr
          """
spark.sql(ctas_sql)


- Generate two 100M-row Iceberg tables without partitions — done
- Generate the same dataset with partitions — done
- Test performance — to do

##### Steps:
1. Generate the raw data
2. Generate the non-partitioned Iceberg tables
3. Generate the partitioned Iceberg tables
4. Test the performance of join queries