In [8]:

# Import SparkSession
import os
import pyspark
from delta import *

# Session builder for the app "MyApp", with the two settings that register
# Delta Lake's SQL extension and make Delta the implementation behind the
# default `spark_catalog`.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("MyApp")
    .config(
        "spark.sql.extensions",
        "io.delta.sql.DeltaSparkSessionExtension",
    )
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

# `configure_spark_with_delta_pip` comes from the `delta` package; it returns
# a builder on which the session is created (or an existing one is reused).
# NOTE(review): presumably it adds the Delta Lake JARs matching the
# pip-installed delta-spark version - confirm against the Delta docs.
spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [9]:
# Load one landing file as a batch DataFrame to eyeball its shape.
# `DataFrame.show()` prints the rows and returns None, so it must not be
# chained onto the assignment - otherwise `test_df` would be None.
test_df = spark.read.json("data/landing/Product/1707844682350206.json")
test_df.show()

+-----------------+----+-----------------+--------------+-------------+----------------+--------------------+--------------+-----+----------+------+
|         category|cogs|contains_caffeine|contains_fruit|contains_nuts|contains_veggies|          event_time|          item|price|product_id|  size|
+-----------------+----+-----------------+--------------+-------------+----------------+--------------------+--------------+-----+----------+------+
|Classic Smoothies| 1.5|            false|          true|        false|           false|2024-02-13 17:18:...|Sunrise Sunset| 4.99|      CS01|24 oz.|
+-----------------+----+-----------------+--------------+-------------+----------------+--------------------+--------------+-----+----------+------+



In [10]:
# Turn on schema inference for streaming reads so that no explicit schema
# has to be passed to the JSON source below.
spark.conf.set("spark.sql.streaming.schemaInference", True)

# Streaming DataFrame over the JSON files in the landing directory.
# maxFilesPerTrigger=1 is passed to the source as a per-trigger file limit.
streaming_df = (
    spark.readStream
    .format("json")
    .option("maxFilesPerTrigger", 1)
    .load("data/landing/Product/")
)

# Print the schema of the streaming DataFrame.
streaming_df.printSchema()

root
 |-- category: string (nullable = true)
 |-- cogs: double (nullable = true)
 |-- contains_caffeine: boolean (nullable = true)
 |-- contains_fruit: boolean (nullable = true)
 |-- contains_nuts: boolean (nullable = true)
 |-- contains_veggies: boolean (nullable = true)
 |-- event_time: string (nullable = true)
 |-- item: string (nullable = true)
 |-- price: double (nullable = true)
 |-- product_id: string (nullable = true)
 |-- size: string (nullable = true)



In [13]:
# Bronze ingestion: stream the landing JSON files into a Delta table.

# Turn on schema inference for streaming reads so that no explicit schema
# has to be passed to the JSON source below.
spark.conf.set("spark.sql.streaming.schemaInference", True)

# Re-created here so this cell does not depend on an earlier cell having
# defined `streaming_df`.
streaming_df = (
    spark.readStream
    .format("json")
    .option("maxFilesPerTrigger", 1)
    .load("data/landing/Product/")
)

bronzePath = "data/bronze/Product"

# Append-only write to the Delta table at `bronzePath`; the checkpoint is
# kept in a `_checkpoint` subdirectory under the same path.
query = (
    streaming_df.writeStream
    .outputMode("append")
    .format("delta")
    .option("path", bronzePath)
    .option("checkpointLocation", bronzePath + "/_checkpoint")
    .queryName("queryBronzeIngestion")
    .start()
)

# `awaitTermination()` blocks until the query ends, so in a notebook the cell
# is normally ended with a kernel interrupt. The interrupt only breaks the
# Python-side wait; without an explicit stop the query would keep running in
# the Spark session. The `finally` stops it on every exit path, and the
# interrupt (or any failure) still propagates to the notebook.
try:
    query.awaitTermination()
finally:
    query.stop()


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/conda/envs/vscode_pyspark/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/vscode_pyspark/lib/python3.11/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/vscode_pyspark/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [19]:
from delta.tables import *

# Read the bronze Delta table back to check what was ingested.
# Note: despite its name, `streaming_df` is a regular batch DataFrame here
# (`spark.read`, not `spark.readStream`), which is why `.show()` works on it.
streaming_df = spark.read.format("delta").load("data/bronze/Product/")

# Display the first 5 rows.
streaming_df.show(5)

+--------------------+----+-----------------+--------------+-------------+----------------+--------------------+------------------+-----+----------+------+
|            category|cogs|contains_caffeine|contains_fruit|contains_nuts|contains_veggies|          event_time|              item|price|product_id|  size|
+--------------------+----+-----------------+--------------+-------------+----------------+--------------------+------------------+-----+----------+------+
|Superfoods Smoothies| 2.1|            false|          true|        false|           false|2024-02-13 17:18:...|Pomegranate Plunge| 5.99|      SF04|24 oz.|
|Supercharged Smoo...| 2.7|            false|         false|        false|           false|2024-02-13 18:05:...|   Peanut Paradise| 5.99|      SC02|24 oz.|
|Superfoods Smoothies| 2.1|            false|          true|        false|            true|2024-02-13 18:05:...|     Totally Green| 5.99|      SF02|24 oz.|
|Supercharged Smoo...| 2.7|            false|         false|    