In [136]:
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession, DataFrame
import pandas as pd

# This CATALOG_URL works for the "docker compose" testing and development environment.
# Change the 'lakekeeper' host if you are not running on "docker compose"
# (e.g. use 'localhost' if Lakekeeper is running locally).
CATALOG_URL = "http://lakekeeper:8181/catalog"
# Warehouse name handed to the catalog configuration.
WAREHOUSE = "demo"

# Version of the installed PySpark package.
SPARK_VERSION = pyspark.__version__
# First two dot-separated components of SPARK_VERSION; used to pick the
# matching iceberg-spark-runtime artifact.
SPARK_MINOR_VERSION = '.'.join(SPARK_VERSION.split('.')[:2])
# Iceberg release used for the runtime and AWS bundle jars.
ICEBERG_VERSION = "1.10.0"

# Connect with Spark

In [2]:
# Spark settings that register a catalog named "lakekeeper" (an Iceberg
# SparkCatalog of type "rest" pointing at CATALOG_URL) and make it the default.
# Plain string keys are used: none of them interpolate anything, so the
# f-prefix was unnecessary.
config = {
    "spark.sql.catalog.lakekeeper": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.lakekeeper.type": "rest",
    "spark.sql.catalog.lakekeeper.uri": CATALOG_URL,
    "spark.sql.catalog.lakekeeper.warehouse": WAREHOUSE,
    "spark.sql.catalog.lakekeeper.io-impl": "org.apache.iceberg.aws.s3.S3FileIO",
    "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    "spark.sql.defaultCatalog": "lakekeeper",
    # Comma-separated Maven coordinates: the Iceberg Spark runtime plus the AWS bundle.
    # NOTE(review): the Scala suffix "_2.13" is hard-coded; presumably it has to
    # match the Scala build of the installed PySpark - confirm.
    "spark.jars.packages": (
        f"org.apache.iceberg:iceberg-spark-runtime-{SPARK_MINOR_VERSION}_2.13:{ICEBERG_VERSION},"
        f"org.apache.iceberg:iceberg-aws-bundle:{ICEBERG_VERSION}"
    ),
}

In [None]:
# spark.sql.shuffle.partitions (need to be smaller for streaming)
# looking at the output of the parquet dir

In [3]:
# Build the SparkConf in one chain: local master, app name, then every entry of `config`.
spark_config = (
    SparkConf()
    .setMaster('local')
    .setAppName("Iceberg-REST")
    .setAll(list(config.items()))
)

# Reuse an existing session if there is one, otherwise start a new one with this conf.
spark = SparkSession.builder.config(conf=spark_config).getOrCreate()

spark.sql("USE lakekeeper")

DataFrame[]

In [11]:
# Set Spark's `spark.sql.parquet.compression.codec` option to zstd.
spark.conf.set("spark.sql.parquet.compression.codec", "zstd")

In [12]:
from pyspark.sql.functions import col, to_date, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType, FloatType
# Explicit schema for the raw e-commerce CSV files.
# `event_time` is read as a string here; it is parsed into a timestamp later,
# when the data is written to Parquet.
# `category_code` and `brand` are declared nullable because the data really
# does contain missing values in those columns (see the NULL / empty cells in
# the previews further down).
# NOTE(review): Spark's file-based readers are understood to relax a
# user-supplied schema to nullable anyway - confirm; declaring it here keeps
# the schema honest either way.
schema = StructType([
    StructField("event_time", StringType(), False),
    StructField("event_type", StringType(), False),
    StructField("product_id", IntegerType(), False),
    StructField("category_id", LongType(), False),
    StructField("category_code", StringType(), True),
    StructField("brand", StringType(), True),
    StructField("price", FloatType(), False),
    StructField("user_id", IntegerType(), False),
    StructField("user_session", StringType(), False),
])

In [36]:
from pathlib import Path
# Absolute path to the sibling `datasets` directory, relative to the current
# working directory (the `..` component is kept as-is, not collapsed).
dataset_dir = Path.cwd() / '..' / 'datasets'

In [39]:
# Directory holding the raw e-commerce files.
ecomm_raw_dir = dataset_dir / 'ecomm_raw'

In [129]:
# String paths (forward-slash form) to the two monthly CSV files.
october_data = (ecomm_raw_dir / "2019-Oct.csv").as_posix()
november_data = (ecomm_raw_dir / "2019-Nov.csv").as_posix()

## Create the E-commerce Data for ingestion to Iceberg
> We'll use the '2019-Oct.csv' and '2019-Nov.csv' datasets, make some minor tweaks, and then use them to populate our base Iceberg tables.
> 
> Then we'll move on to doing Iceberg Streaming in Part 2

In [46]:
# Read the October CSV with the explicit schema; the first line of the file is a header.
ecomm_oct_df = spark.read.csv(october_data, schema=schema, header=True)

In [None]:
# Preview the October DataFrame.
ecomm_oct_df.show()

In [130]:
# Number of rows read from the October CSV.
ecomm_oct_df.count()

42448764

In [131]:
# Read the November CSV with the explicit schema; the first line of the file is a header.
ecomm_nov_df = spark.read.csv(november_data, schema=schema, header=True)

In [132]:
# Preview the November DataFrame.
ecomm_nov_df.show()

+--------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|          event_time|event_type|product_id|        category_id|       category_code|   brand| price|  user_id|        user_session|
+--------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|2019-11-01 00:00:...|      view|   1003461|2053013555631882655|electronics.smart...|  xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 00:00:...|      view|   5000088|2053013566100866035|appliances.sewing...|  janome|293.65|530496790|8e5f4f83-366c-4f7...|
|2019-11-01 00:00:...|      view|  17302664|2053013553853497655|                NULL|   creed| 28.31|561587266|755422e7-9040-477...|
|2019-11-01 00:00:...|      view|   3601530|2053013563810775923|appliances.kitche...|      lg|712.87|518085591|3bfb58cd-7892-48c...|
|2019-11-01 00:00:...|      view|   1004775|2053013555631882655|elect

In [133]:
# Number of rows read from the November CSV.
ecomm_nov_df.count()

67501979

### Create the Initial Parquet Tables 
> These will live outside of MinIO and the Iceberg Catalog

* We'll create a simple helper function to _aid_ in our journey. We'll call it `write_parquet`.
* Using the new _function_, we'll then need to convert the **csv**->**parquet**, so run the **two** cells under "convert the data". They will produce raw parquet records under the path `datasets/ecomm_raw/parquet/ecomm`.
* We will then be able to run a series of 'daily' transactions to **write** all of the records into our foundational **iceberg** table.

In [87]:
def write_parquet(df: DataFrame, destination: Path, sink_dir: str, mode: str = "append") -> None:
    """Write ``df`` as date-partitioned Parquet under ``destination/parquet/sink_dir``.

    Before writing, the string column ``event_time`` is parsed into a timestamp
    using the pattern ``yyyy-MM-dd HH:mm:ss z``, and an ``event_date`` column is
    derived from it. The output is partitioned by ``event_date``.

    Args:
        df: DataFrame with a string ``event_time`` column.
        destination: Base directory; files land in ``destination/parquet/sink_dir``.
        sink_dir: Name of the dataset directory below ``parquet``.
        mode: Save mode passed to the DataFrame writer. Defaults to ``"append"``,
            which is what this function has always done. Pass for example
            ``"errorifexists"`` or ``"ignore"`` to avoid writing the same rows
            twice when a cell is re-run.

    Returns:
        None. The write is performed for its side effect only.
    """
    save_path = destination.joinpath('parquet', sink_dir)

    # Parse the timestamp first: event_date is computed from the parsed column.
    transformed = (
        df
            .withColumn("event_time", to_timestamp(col("event_time"), "yyyy-MM-dd HH:mm:ss z"))
            .withColumn("event_date", to_date(col("event_time")))
    )

    (
        transformed
            .write
            .format("parquet")
            .partitionBy("event_date")
            .mode(mode)
            .save(save_path.as_posix())
    )
    

### Convert the Data
We're on a mission to convert CSV to Parquet. Let's do that next.
> Note: This process may take a while if you're using the full dataset (~19GB)
> Note: The function "expects" that we'll be _appending_ to the **ecomm** directory.
> * If you want to modify the behavior of the function, give it a new argument called mode, and default that to "errorIfExists" or "ignore" - so you don't have to worry about deduplication or going nuclear and wiping out the entire ecomm directory!


In [88]:
# ecomm_raw_dir is the Path instance for the datasets/ecomm_raw directory.
# The conversion can take a few minutes locally, so put on your favorite jams.
# Safeguard: if the first October partition directory is already on disk we skip
# the write, because write_parquet appends and a second run would add the same
# rows again.

if ecomm_raw_dir.joinpath('parquet', 'ecomm', 'event_date=2019-10-01').is_dir():
    print("oh good. Saved you from having to import again")
else:
    write_parquet(ecomm_oct_df, ecomm_raw_dir, 'ecomm')

In [134]:
# Same safeguard for November: only convert when the first November partition
# directory is not on disk yet.
if ecomm_raw_dir.joinpath('parquet', 'ecomm', 'event_date=2019-11-01').is_dir():
    print("same goes for the november set. It exists, so we'll skip for now")
else:
    write_parquet(ecomm_nov_df, ecomm_raw_dir, 'ecomm')

## Read and Write Tables

In [113]:
# Helper to check whether a "table" exists.
# This can be done a few different ways; this one goes through the underlying
# spark.catalog listing.

def table_exists(table_name: str, namespace: "str | None" = None) -> bool:
    """Return True if ``spark.catalog`` lists a table named ``table_name``.

    Args:
        table_name: Bare table name (no namespace prefix) to look for.
        namespace: Namespace/database whose tables are listed. With the default
            ``None`` no explicit database is requested, which is the listing the
            argument-less ``spark.catalog.listTables()`` call produced before.
            Pass the namespace explicitly so the check does not depend on
            whatever the session's current database happens to be.

    Returns:
        True when a listed table has exactly this name, otherwise False.
    """
    return any(table.name == table_name for table in spark.catalog.listTables(namespace))

In [89]:
# Namespace that will hold the Iceberg tables created below.
catalog_namespace = 'icystreams'
# IF NOT EXISTS keeps this cell safe to run again once the namespace is there.
spark.sql(f"CREATE NAMESPACE IF NOT EXISTS {catalog_namespace}")
# List the namespaces to confirm it is present.
spark.sql("SHOW NAMESPACES").show()

+----------+
| namespace|
+----------+
|icystreams|
+----------+



In [10]:
# List the databases (namespaces) visible through spark.catalog.
spark.catalog.listDatabases()

[Database(name='icystreams', catalog='lakekeeper', description=None, locationUri='s3://examples/initial-warehouse/0199823a-b849-71e1-be35-c3e2095e828d')]

In [99]:
# Show which catalog the session is currently using.
spark.catalog.currentCatalog()

'lakekeeper'

In [101]:
# List the databases again (same call as the earlier cell).
spark.catalog.listDatabases()

[Database(name='icystreams', catalog='lakekeeper', description=None, locationUri='s3://examples/initial-warehouse/0199823a-b849-71e1-be35-c3e2095e828d')]

In [102]:
# Make the namespace created above the current database. The name comes from
# catalog_namespace rather than a repeated literal, so the two cannot drift apart.
spark.catalog.setCurrentDatabase(catalog_namespace)

In [127]:
# read the parquet, and write into Iceberg. We'll tackle this one day at a time, as it can help us create a lot of metadata. 

# Location of the date-partitioned Parquet dataset written earlier.
parquet_dir = ecomm_raw_dir / 'parquet' / 'ecomm'
iceberg_table_name = 'ecomm'
# Stage a single day (2019-11-01) for the Iceberg write.
for_iceberg = (
    spark.read
        .parquet(parquet_dir.as_posix())
        .filter(col("event_date").isin("2019-11-01"))
)

# for_iceberg.show(5)

In [128]:
# Target the namespaced Iceberg table. The table name comes from
# iceberg_table_name, so the existence check below and the write always refer
# to the same table instead of relying on a repeated 'ecomm' literal.
to_iceberg = (
    for_iceberg
        .writeTo(f"{catalog_namespace}.{iceberg_table_name}")
        .partitionedBy("event_date")
)

# NOTE(review): table_exists calls spark.catalog.listTables() without a database
# argument, so this check presumably relies on the current database being
# catalog_namespace - confirm.
if table_exists(iceberg_table_name):
    # The table is already there: append the new records.
    to_iceberg.append()
else:
    # Otherwise create the table from this DataFrame.
    to_iceberg.create()

In [160]:
# Stage the last four days of November from the Parquet dataset.
parquet_dir = ecomm_raw_dir.joinpath('parquet', 'ecomm')
iceberg_table_name = 'ecomm'
for_iceberg = (
    spark
        .read
        .format('parquet')
        .load(parquet_dir.as_posix())
        .where(col("event_date").isin(
            "2019-11-27", "2019-11-28", "2019-11-29", "2019-11-30"
        ))
)

# Write target built from iceberg_table_name so that it always matches the
# name used in the existence check below (previously a repeated 'ecomm' literal).
to_iceberg = (
    for_iceberg
        .writeTo(f"{catalog_namespace}.{iceberg_table_name}")
        .partitionedBy("event_date")
)

# NOTE(review): table_exists calls spark.catalog.listTables() without a database
# argument, so this check presumably relies on the current database being
# catalog_namespace - confirm.
if table_exists(iceberg_table_name):
    # The table is already there: append the new records.
    to_iceberg.append()
else:
    # Otherwise create the table from this DataFrame.
    to_iceberg.create()

In [142]:
# Pull up to 15 rows with an event_date in November 2019, latest event_date
# first, and convert them to a pandas DataFrame for display.
spark.sql(f"""
select * from {catalog_namespace}.{iceberg_table_name} 
where event_date BETWEEN DATE("2019-11-01") AND DATE("2019-11-30")
ORDER BY event_date DESC
LIMIT 15
""").toPandas()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_date
0,2019-11-03 01:28:51,view,1801690,2053013554415534427,electronics.video.tv,samsung,369.450012,536833043,d9fc4bb1-c017-4d9d-8200-e1ee200e0866,2019-11-03
1,2019-11-03 01:28:51,view,1307004,2053013558920217191,computers.notebook,lenovo,290.600006,518681862,333c3974-6628-452c-b371-f6a05ae1e6bd,2019-11-03
2,2019-11-03 01:28:51,view,1005263,2053013555631882655,electronics.smartphone,oppo,694.969971,516211022,6d6db1e3-1707-4854-bae6-de5781776a23,2019-11-03
3,2019-11-03 01:28:51,view,2501782,2053013564003713919,appliances.kitchen.oven,oursson,113.489998,540024244,62e5b638-5bcb-4e08-a3e3-04101a7cf491,2019-11-03
4,2019-11-03 01:28:51,view,12702916,2053013553559896355,,cordiant,36.810001,551287382,46e5c6a0-29b7-4e04-9667-99d5249f0f61,2019-11-03
5,2019-11-03 01:28:51,view,12301110,2053013556311359947,construction.tools.drill,dewalt,356.75,565245179,99c8bc6e-9806-4645-bf38-fec449c13f1a,2019-11-03
6,2019-11-03 01:28:51,view,26300189,2053013563584283495,,lucente,351.100006,566845330,4fe71c61-9853-41fa-b28e-1c96cb08a219,2019-11-03
7,2019-11-03 01:28:51,view,2600304,2053013563970159485,,hansa,290.859985,523357956,c155b429-6297-4865-af17-a8122d51ca07,2019-11-03
8,2019-11-03 01:28:51,view,2800396,2053013563835941749,appliances.kitchen.refrigerators,,183.889999,566572592,48d0d1cb-2dfc-471e-bbd0-7727f2e3e31a,2019-11-03
9,2019-11-03 01:28:52,view,30200002,2053013554449088861,,,167.289993,566959749,77d83e3b-9f15-4d41-aaad-4e62f55a8297,2019-11-03


In [161]:
# Total number of rows currently in the Iceberg table.
spark.sql(f"select count(*) as total from {catalog_namespace}.{iceberg_table_name}").show()

+---------+
|    total|
+---------+
|109950743|
+---------+



In [8]:
#spark.sql(f"DROP NAMESPACE icystreams")
#spark.sql("DROP TABLE my_namespace.my_table")

DataFrame[]