In [2]:

# Imports, then build a SparkSession configured for Delta Lake
import os
import pyspark
from delta import *

# Session builder for the "STREAMING_DWH" app: registers the Delta SQL
# extension and points the default catalog at Delta's catalog class.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("STREAMING_DWH")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# configure_spark_with_delta_pip is pulled in by the `from delta import *` above;
# it wraps the builder before the session is created (or reused).
spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [3]:
# Batch-read a single landed Product file to inspect its columns.
# show() only prints the rows and returns None, so the DataFrame is bound to
# test_df first and displayed in a separate call.
test_df = spark.read.json("data/landing/Product/1707844682350206.json")
test_df.show()

+-----------------+----+-----------------+--------------+-------------+----------------+--------------------+--------------+-----+----------+------+
|         category|cogs|contains_caffeine|contains_fruit|contains_nuts|contains_veggies|          event_time|          item|price|product_id|  size|
+-----------------+----+-----------------+--------------+-------------+----------------+--------------------+--------------+-----+----------+------+
|Classic Smoothies| 1.5|            false|          true|        false|           false|2024-02-13 17:18:...|Sunrise Sunset| 4.99|      CS01|24 oz.|
+-----------------+----+-----------------+--------------+-------------+----------------+--------------------+--------------+-----+----------+------+



In [4]:
# Let the streaming reader infer the schema instead of requiring an explicit one
spark.conf.set("spark.sql.streaming.schemaInference", True)

# Streaming DataFrame over the Product landing directory (JSON files),
# limited to one file per trigger via maxFilesPerTrigger
streaming_df = (
    spark.readStream
    .format("json")
    .option("maxFilesPerTrigger", 1)
    .load("data/landing/Product/")
)

# Print the inferred schema
streaming_df.printSchema()

root
 |-- category: string (nullable = true)
 |-- cogs: double (nullable = true)
 |-- contains_caffeine: boolean (nullable = true)
 |-- contains_fruit: boolean (nullable = true)
 |-- contains_nuts: boolean (nullable = true)
 |-- contains_veggies: boolean (nullable = true)
 |-- event_time: string (nullable = true)
 |-- item: string (nullable = true)
 |-- price: double (nullable = true)
 |-- product_id: string (nullable = true)
 |-- size: string (nullable = true)



In [8]:
# Allow streaming reads to infer their schema automatically.
# The same setting is applied in an earlier cell; it is repeated here,
# presumably so this cell can be run on its own.
spark.conf.set("spark.sql.streaming.schemaInference", True)

def bronze_ingestion_query(table_name):
    """Start a streaming query that copies a landing table into the bronze layer.

    JSON files under ``data/landing/<table_name>`` are read as a stream (one
    file per trigger) and appended to a Delta table at
    ``data/bronze/<table_name>``. The checkpoint is kept in a ``_checkpoint``
    folder inside the target path.

    Args:
        table_name: Name of the sub-directory shared by the landing and
            bronze locations.

    Returns:
        The object returned by ``writeStream...start()`` for the started query.
    """
    landing_dir = f"data/landing/{table_name}"
    bronze_dir = f"data/bronze/{table_name}"
    checkpoint_dir = bronze_dir + "/_checkpoint"

    # Source: JSON files from the landing directory, one file per trigger.
    landing_stream = (
        spark.readStream
        .format("json")
        .option("maxFilesPerTrigger", 1)
        .load(landing_dir)
    )

    # Sink: append-only Delta table in the bronze directory.
    bronze_writer = (
        landing_stream.writeStream
        .outputMode("append")
        .format("delta")
        .option("path", bronze_dir)
        .option("checkpointLocation", checkpoint_dir)
    )

    return bronze_writer.start()


# Start one bronze ingestion stream per landing table.
query1 = bronze_ingestion_query("Inventory")
query2 = bronze_ingestion_query("Product")
query3 = bronze_ingestion_query("Purchase")

# Blocks this cell until any one of the active streaming queries terminates.
spark.streams.awaitAnyTermination()


In [None]:
from delta.tables import *

# Batch read (spark.read, not readStream) of the bronze Product Delta table.
# NOTE(review): the name `streaming_df` is kept as-is, but this rebinds it
# to a batch DataFrame.
streaming_df = (
    spark.read
    .format("delta")
    .load("data/bronze/Product/")
)

# Preview the first five rows
streaming_df.show(5)

+--------------------+----+-----------------+--------------+-------------+----------------+--------------------+------------------+-----+----------+------+
|            category|cogs|contains_caffeine|contains_fruit|contains_nuts|contains_veggies|          event_time|              item|price|product_id|  size|
+--------------------+----+-----------------+--------------+-------------+----------------+--------------------+------------------+-----+----------+------+
|Superfoods Smoothies| 2.1|            false|          true|        false|           false|2024-02-13 17:18:...|Pomegranate Plunge| 5.99|      SF04|24 oz.|
|Supercharged Smoo...| 2.7|            false|         false|        false|           false|2024-02-13 18:05:...|   Peanut Paradise| 5.99|      SC02|24 oz.|
|Superfoods Smoothies| 2.1|            false|          true|        false|            true|2024-02-13 18:05:...|     Totally Green| 5.99|      SF02|24 oz.|
|Supercharged Smoo...| 2.7|            false|         false|    