In [1]:
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession, DataFrame
import pandas as pd

# Base URL of the Lakekeeper Iceberg REST catalog.
# This CATALOG_URL works for the "docker compose" testing and development environment.
# Change 'lakekeeper' if you are not running on "docker compose" (e.g. use 'localhost' if Lakekeeper is running locally).
CATALOG_URL = "http://lakekeeper:8181/catalog"
# Warehouse name handed to the catalog through the `spark.sql.catalog.lakekeeper.warehouse` setting.
WAREHOUSE = "demo"

# Installed PySpark version; only its "major.minor" prefix is needed to pick the matching
# iceberg-spark-runtime artifact in `spark.jars.packages`.
SPARK_VERSION = pyspark.__version__
SPARK_MINOR_VERSION = '.'.join(SPARK_VERSION.split('.')[:2])
# Iceberg release used for both the Spark runtime jar and the AWS bundle jar.
ICEBERG_VERSION = "1.10.0"

# Connect with Spark

In [2]:
# Spark settings that register Lakekeeper as an Iceberg REST catalog named "lakekeeper"
# and make it the default catalog of the session.
config = {
    # Catalog implementation and how to reach it. These keys are plain strings: they have
    # no placeholders, so they do not need an f-string prefix.
    "spark.sql.catalog.lakekeeper": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.lakekeeper.type": "rest",
    "spark.sql.catalog.lakekeeper.uri": CATALOG_URL,
    "spark.sql.catalog.lakekeeper.warehouse": WAREHOUSE,
    "spark.sql.catalog.lakekeeper.io-impl": "org.apache.iceberg.aws.s3.S3FileIO",
    "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    "spark.sql.defaultCatalog": "lakekeeper",
    # Comma-separated Maven coordinates: the Iceberg Spark runtime for this Spark minor
    # version (Scala 2.13 build) and the Iceberg AWS bundle, both at ICEBERG_VERSION.
    "spark.jars.packages": (
        f"org.apache.iceberg:iceberg-spark-runtime-{SPARK_MINOR_VERSION}_2.13:{ICEBERG_VERSION},"
        f"org.apache.iceberg:iceberg-aws-bundle:{ICEBERG_VERSION}"
    ),
}

In [None]:
# TODO: tune spark.sql.shuffle.partitions (it needs to be smaller for streaming).
# To judge the effect, look at the files written to the parquet output dir.

In [3]:
# Assemble the SparkConf in one chain: app name, 'local' master, and every entry of `config`.
spark_config = (
    SparkConf()
    .setAppName("Iceberg-REST")
    .setMaster('local')
    .setAll(list(config.items()))
)

# Start the session with that configuration (getOrCreate hands back an existing session if there is one).
spark = SparkSession.builder.config(conf=spark_config).getOrCreate()

# Make the Lakekeeper catalog the active one for the statements that follow.
spark.sql("USE lakekeeper")

DataFrame[]

In [5]:
# Dump the effective Spark configuration as (key, value) pairs to confirm the settings above were picked up.
spark.sparkContext.getConf().getAll()

[('spark.jars',
  'file:///home/jovyan/.ivy2.5.2/jars/org.apache.iceberg_iceberg-spark-runtime-4.0_2.13-1.10.0.jar,file:///home/jovyan/.ivy2.5.2/jars/org.apache.iceberg_iceberg-aws-bundle-1.10.0.jar'),
 ('spark.driver.host', '0687aa5034d0'),
 ('spark.hadoop.fs.s3a.vectored.read.min.seek.size', '128K'),
 ('spark.master', 'local'),
 ('spark.executor.extraJavaOptions',
  '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-modules=jdk.incubator.vector --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL

In [8]:
# Cap the number of shuffle partitions: anything above 100 is brought down to 64.
current_shuffle_partitions = int(spark.conf.get("spark.sql.shuffle.partitions"))
if current_shuffle_partitions > 100:
    print("reducing shuffle partitions to 64")
    spark.conf.set("spark.sql.shuffle.partitions", "64")

reducing shuffle partitions to 64


In [10]:
# A favorite of mine: switch the Parquet compression codec to zstd so files are highly compressed.
# Less size on disk, less IO bandwidth!
spark.conf.set("spark.sql.parquet.compression.codec", "zstd")

In [11]:
from pyspark.sql.functions import col, to_date, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType, FloatType

# Explicit schema handed to the CSV reader for the raw e-commerce event files.
# The third StructField argument is `nullable`.
#
# `category_code` and `brand` are declared nullable because the raw data really does contain
# NULLs in both columns (see the `.show()` previews of the October and November frames).
# NOTE(review): the remaining columns keep their original non-nullable declaration; Spark's
# file-based readers are understood to relax user-supplied nullability anyway - confirm
# before relying on NOT NULL semantics downstream.
schema = StructType([
    # Kept as a string here; `write_parquet` parses it into a timestamp.
    StructField("event_time", StringType(), False),
    StructField("event_type", StringType(), False),
    StructField("product_id", IntegerType(), False),
    # LongType: the ids in the data (e.g. 2053013555631882655) do not fit into 32 bits.
    StructField("category_id", LongType(), False),
    StructField("category_code", StringType(), True),
    StructField("brand", StringType(), True),
    StructField("price", FloatType(), False),
    StructField("user_id", IntegerType(), False),
    StructField("user_session", StringType(), False),
])

In [12]:
from pathlib import Path

# Absolute path of the `datasets` directory one level above the current working directory.
# `absolute()` only anchors the path at the working directory; the `..` segment stays in the path.
dataset_dir = Path('..', 'datasets').absolute()

In [13]:
# Directory holding the raw e-commerce files (and, later, their Parquet conversion).
ecomm_raw_dir = dataset_dir / 'ecomm_raw'

In [14]:
# POSIX-style string paths of the two monthly CSV exports.
october_data, november_data = (
    (ecomm_raw_dir / csv_name).as_posix()
    for csv_name in ("2019-Oct.csv", "2019-Nov.csv")
)

## Create the E-commerce Data for ingestion to Iceberg
> We'll be using the `2019-Oct.csv` and `2019-Nov.csv` datasets, making some minor tweaks, and then using them to populate our base Iceberg tables.
> 
> Then we'll move on to Iceberg Streaming in Part 2.

In [15]:
# Read the October export: the first line is a header row, and columns follow the explicit `schema`.
ecomm_oct_df = spark.read.csv(october_data, schema=schema, header=True)

In [16]:
# Print a tabular preview of the October rows to eyeball the parsed columns.
ecomm_oct_df.show()

+--------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+
|          event_time|event_type|product_id|        category_id|       category_code|   brand|  price|  user_id|        user_session|
+--------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+
|2019-10-01 00:00:...|      view|  44600062|2103807459595387724|                NULL|shiseido|  35.79|541312140|72d76fde-8bb3-4e0...|
|2019-10-01 00:00:...|      view|   3900821|2053013552326770905|appliances.enviro...|    aqua|   33.2|554748717|9333dfbd-b87a-470...|
|2019-10-01 00:00:...|      view|  17200506|2053013559792632471|furniture.living_...|    NULL|  543.1|519107250|566511c2-e2e3-422...|
|2019-10-01 00:00:...|      view|   1307067|2053013558920217191|  computers.notebook|  lenovo| 251.74|550050854|7c90fc70-0e80-459...|
|2019-10-01 00:00:...|      view|   1004237|205301355563188265

In [18]:
# Number of events in the October CSV.
ecomm_oct_df.count()

42448764

In [20]:
# Read the November export: the first line is a header row, and columns follow the explicit `schema`.
ecomm_nov_df = spark.read.csv(november_data, schema=schema, header=True)

In [21]:
# Print a tabular preview of the November rows to eyeball the parsed columns.
ecomm_nov_df.show()

+--------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|          event_time|event_type|product_id|        category_id|       category_code|   brand| price|  user_id|        user_session|
+--------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|2019-11-01 00:00:...|      view|   1003461|2053013555631882655|electronics.smart...|  xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 00:00:...|      view|   5000088|2053013566100866035|appliances.sewing...|  janome|293.65|530496790|8e5f4f83-366c-4f7...|
|2019-11-01 00:00:...|      view|  17302664|2053013553853497655|                NULL|   creed| 28.31|561587266|755422e7-9040-477...|
|2019-11-01 00:00:...|      view|   3601530|2053013563810775923|appliances.kitche...|      lg|712.87|518085591|3bfb58cd-7892-48c...|
|2019-11-01 00:00:...|      view|   1004775|2053013555631882655|elect

In [22]:
# Number of events in the November CSV.
ecomm_nov_df.count()

67501979

### Create the Initial Parquet Tables 
> These will live outside of MinIO and the Iceberg Catalog

* We'll create a simple helper function to _aid_ in our journey. We'll call it `write_parquet`.
* Using the new _function_, we'll then need to convert the **csv**->**parquet**, so run the **two** cells under "convert the data". They will produce raw parquet records under the path `datasets/ecomm_raw/parquet/ecomm`.
* We will then be able to run a series of 'daily' transactions to **write** all of the records into our foundational **iceberg** table.

In [23]:
def write_parquet(df: DataFrame, destination: Path, sink_dir: str, mode: str = "append") -> None:
    """Write `df` as Parquet files under `<destination>/parquet/<sink_dir>`, partitioned by day.

    Before writing, the string column `event_time` is parsed into a timestamp and an
    `event_date` column is derived from it; `event_date` is the partition column.

    Args:
        df: DataFrame with a string `event_time` column in the form "yyyy-MM-dd HH:mm:ss z".
        destination: Base directory; the data lands in `<destination>/parquet/<sink_dir>`.
        sink_dir: Name of the directory below `parquet` to write into.
        mode: Save mode passed to the DataFrame writer. Defaults to "append", which is what
            this function has always done; pass e.g. "errorifexists" or "ignore" to protect
            an existing directory from a second import.

    Returns:
        None. The write happens as a side effect.
    """
    save_path = destination.joinpath('parquet', sink_dir)
    # modifications to the dataframe
    transformed = (
        df
            .withColumn("event_time", to_timestamp(col("event_time"), "yyyy-MM-dd HH:mm:ss z"))
            # event_date is computed from the already-parsed timestamp, so the order matters
            .withColumn("event_date", to_date(col("event_time")))
    )

    (
        transformed
            .write
            .format("parquet")
            .partitionBy("event_date")
            .mode(mode)
            .save(save_path.as_posix())
    )
    

### Convert the Data
We're on a mission to convert CSV to Parquet. Let's do that next.
> Note: This process may take a while if you're using the full dataset (~19GB)
> Note: The function "expects" that we'll be _appending_ to the **ecomm** directory.
> * If you want to modify the behavior of the function, give it a new argument called `mode`, and default that to "errorIfExists" or "ignore" - so you don't have to worry about deduplication or going nuclear and wiping out the entire ecomm directory!


In [24]:
# Note: ecomm_raw_dir is the Path instance pointing at the datasets/ecomm_raw directory.
# The conversion can take a few minutes locally, so turn up your favorite jams and let it ride.
# Safeguard: if the first October partition directory already exists, the import is skipped.
# write_parquet appends, so running the import more than once would write the same data again.

if (ecomm_raw_dir / 'parquet' / 'ecomm' / 'event_date=2019-10-01').is_dir():
    print("oh good. Saved you from having to import again")
else:
    write_parquet(ecomm_oct_df, ecomm_raw_dir, 'ecomm')

In [25]:
# Same safeguard for November: skip the conversion when its first partition directory is already there.
if (ecomm_raw_dir / 'parquet' / 'ecomm' / 'event_date=2019-11-01').is_dir():
    print("same goes for the november set. It exists, so we'll skip for now")
else:
    write_parquet(ecomm_nov_df, ecomm_raw_dir, 'ecomm')

## Read and Write Tables

In [26]:
from typing import Optional

# let's identify a helper function to check to see if a "table" exists
# this can be done a few different ways, but this one uses the 'underlying' spark.catalog
# and I like that better :)

def table_exists(table_name: str, namespace: Optional[str] = None) -> bool:
    """Tell whether a table called `table_name` is listed by `spark.catalog`.

    Args:
        table_name: Unqualified table name to look for.
        namespace: Namespace (database) to list. When omitted, `listTables()` is called
            without arguments, i.e. the session's current database is searched - the
            behaviour this helper had before the parameter existed.

    Returns:
        True if any listed table has exactly this name, otherwise False.
    """
    if namespace is None:
        tables = spark.catalog.listTables()
    else:
        tables = spark.catalog.listTables(namespace)
    return any(table.name == table_name for table in tables)

In [27]:
# Namespace that will hold the notebook's Iceberg tables; created only when missing.
catalog_namespace = 'icystreams'
create_namespace_sql = f"CREATE NAMESPACE IF NOT EXISTS {catalog_namespace}"
spark.sql(create_namespace_sql)

# List the namespaces to confirm it is there.
spark.sql("SHOW NAMESPACES").show()

+----------+
| namespace|
+----------+
|icystreams|
+----------+



In [31]:
# The same namespaces as seen through the spark.catalog API (includes catalog name and storage location).
spark.catalog.listDatabases()

[Database(name='icystreams', catalog='lakekeeper', description=None, locationUri='s3://examples/initial-warehouse/01998354-e7a9-7323-a3ca-5bda455c26f6')]

In [32]:
# Confirm that the session is pointed at the Lakekeeper catalog.
spark.catalog.currentCatalog()

'lakekeeper'

In [33]:
# NOTE(review): this repeats the listDatabases() call from two cells earlier; one of the two can likely go.
spark.catalog.listDatabases()

[Database(name='icystreams', catalog='lakekeeper', description=None, locationUri='s3://examples/initial-warehouse/01998354-e7a9-7323-a3ca-5bda455c26f6')]

In [34]:
# Make 'icystreams' the current database. The `table_exists` helper calls
# `spark.catalog.listTables()` without a database argument, so it depends on this setting.
spark.catalog.setCurrentDatabase('icystreams')

In [49]:
def write_iceberg(df: DataFrame, namespace: str, table_name: str, partition_col: str) -> None:
    """Append `df` to the Iceberg table `<namespace>.<table_name>`, creating it on first use.

    Args:
        df: Rows to write.
        namespace: Namespace (database) that holds the table.
        table_name: Unqualified table name.
        partition_col: Column handed to `partitionedBy` on the writer.

    Returns:
        None. The write happens as a side effect.
    """
    to_iceberg = (
        df
            .writeTo(f"{namespace}.{table_name}")
            .partitionedBy(partition_col)
    )
    # Look the table up in the namespace we are about to write to. Checking the session's
    # current database instead would pick the wrong branch whenever the two differ.
    already_exists = any(
        table.name == table_name
        for table in spark.catalog.listTables(namespace)
    )
    if already_exists:
        to_iceberg.append()
    else:
        to_iceberg.create()

In [95]:
iceberg_table_name = 'ecomm'

# Directory the two `write_parquet(..., ecomm_raw_dir, 'ecomm')` calls wrote to
# (`<destination>/parquet/<sink_dir>`). It is defined here because no earlier cell defines
# `parquet_dir`, so on a fresh kernel this cell would otherwise fail with a NameError.
parquet_dir = ecomm_raw_dir.joinpath('parquet', 'ecomm')

# One "daily transaction": load the selected event_date value(s) from the Parquet data and
# write them to the Iceberg table. Change the date(s) inside `isin(...)` and re-run to ingest more days.
# NOTE: write_iceberg appends when the table exists, so re-running with a date that was
# already ingested writes those rows a second time.
write_iceberg(
    (spark
        .read
        .format('parquet')
        .load(parquet_dir.as_posix())
        .where(col("event_date").isin(
            "2019-11-30"
        ))
    ),
    catalog_namespace,
    iceberg_table_name,
    'event_date'
)

In [96]:
# Total number of rows currently in the Iceberg table, rendered as a pandas DataFrame.
spark.sql(f"select count(*) as total from {catalog_namespace}.{iceberg_table_name}").toPandas()

Unnamed: 0,total
0,109950743


In [97]:
# Pull up to 15 events from the second half of November, newest event_date first,
# to spot-check what landed in the table.
recent_events_sql = f"""
select * from {catalog_namespace}.{iceberg_table_name} 
where event_date BETWEEN DATE("2019-11-15") AND DATE("2019-11-30")
ORDER BY event_date DESC
LIMIT 15
"""
spark.sql(recent_events_sql).toPandas()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_date
0,2019-11-30 09:44:10,view,1004158,2053013555631882655,electronics.smartphone,samsung,682.130005,575219697,d5a319f8-aca2-412c-a797-8e78ba09159f,2019-11-30
1,2019-11-30 09:44:10,cart,17300768,2053013553853497655,,,63.75,519636344,34466b6b-aa50-4571-b327-bcbc015d652e,2019-11-30
2,2019-11-30 09:44:10,view,12703494,2053013553559896355,,cordiant,41.189999,515671054,382a1e92-e118-4efb-aa4a-adeafd1d3936,2019-11-30
3,2019-11-30 09:44:10,view,5300252,2053013563173241677,,polaris,25.709999,579548460,2f88f349-f84d-4d1d-bd44-79f7dcb35b7f,2019-11-30
4,2019-11-30 09:44:10,view,1002544,2053013555631882655,electronics.smartphone,apple,460.5,513983510,69ba2f17-993f-4ddf-899d-41d09d1ecd98,2019-11-30
5,2019-11-30 09:44:10,view,1004833,2053013555631882655,electronics.smartphone,samsung,167.029999,573759994,170b6cd3-8554-45bd-9f74-afd844d6c0bb,2019-11-30
6,2019-11-30 09:44:10,view,1004743,2053013555631882655,electronics.smartphone,xiaomi,72.050003,516587653,da277c8c-6b18-4dfe-8bb4-1217ad6b90f1,2019-11-30
7,2019-11-30 09:44:10,cart,1005100,2053013555631882655,electronics.smartphone,samsung,131.770004,552718105,3f781d61-4880-49b6-87f1-c76b8a672781,2019-11-30
8,2019-11-30 09:44:10,purchase,12500508,2053013556277805513,,,37.040001,514037703,03838da1-e669-4ddd-929f-80ed83f7655c,2019-11-30
9,2019-11-30 09:44:10,view,18000951,2053013558525952589,,samsung,6.5,514704543,62f805df-084f-4495-96e5-b19b30383f7b,2019-11-30


In [161]:
# Same row count as above, this time printed by Spark itself instead of going through pandas.
spark.sql(f"select count(*) as total from {catalog_namespace}.{iceberg_table_name}").show()

+---------+
|    total|
+---------+
|109950743|
+---------+


In [8]:
# > Note: This is the nuclear option to scrap the table and all metadata and simply begin again!
#spark.sql(f"DROP TABLE {catalog_namespace}.{iceberg_table_name}")
#spark.sql(f"DROP NAMESPACE {catalog_namespace}")

DataFrame[]