In [0]:
# Make sure the landing directory for incoming streaming event files exists.
events_dir = "/Volumes/workspace/advecom/advecom_data/streaming/events"
dbutils.fs.mkdirs(events_dir)

True

In [0]:
# Read the October 2019 e-commerce CSV as a static DataFrame.
# The first line is treated as column names and column types are inferred
# from the data.
october_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/Volumes/workspace/advecom/advecom_data/2019-Oct.csv")
)

In [0]:
# List the streaming landing folder before any stream is started.
events_before_stream = dbutils.fs.ls(
    "/Volumes/workspace/advecom/advecom_data/streaming/events"
)
display(events_before_stream)

[]

In [0]:
# Create read stream
# The schema is reused from the static October DataFrame, so the streaming
# file source does not have to infer one. With header=True the reader skips
# the first line of every CSV file it picks up.
stream_df = (
    spark.readStream
    .schema(october_df.schema)
    .option("header", True)
    .csv("/Volumes/workspace/advecom/advecom_data/streaming/events/batch1")
)


# The batch writers below must emit a header row (header=True) so that the
# line skipped by the reader above is the header and not a data record.
# Without it, the first record of each written file is silently dropped.
# NOTE(review): limit(2000) most likely contains the same rows as limit(100),
# so the second batch probably re-sends the first 100 records - confirm
# whether that duplication is intended.

# # Create first batch instance - uncomment while running for first batch
# october_df.limit(100).write.mode("overwrite").option("header", True).csv(
# "/Volumes/workspace/advecom/advecom_data/streaming/events/batch1"
# )

# # Create second batch instance - uncomment while running for second batch
# october_df.limit(2000).write.mode("overwrite").option("header", True).csv(
# "/Volumes/workspace/advecom/advecom_data/streaming/events/batch1"
# )

In [0]:
# List the streaming landing folder again to see what it now contains.
events_listing = dbutils.fs.ls(
    "/Volumes/workspace/advecom/advecom_data/streaming/events"
)
display(events_listing)

path,name,size,modificationTime
dbfs:/Volumes/workspace/advecom/advecom_data/streaming/events/batch1/,batch1/,0,1771868520231


In [0]:
# Create write stream and save in delta format
# Rows from stream_df are appended to the Delta location below; progress is
# tracked in the checkpoint directory.
query = (
    stream_df.writeStream
    .format("delta")
    .outputMode("append")
    .option(
        "checkpointLocation",
        "/Volumes/workspace/advecom/advecom_data/streaming/checkpoint",
    )
    .trigger(availableNow=True)
    .start("/Volumes/workspace/advecom/advecom_data/streaming/delta")
)

# start() returns immediately. With the availableNow trigger the query stops
# on its own once the pending input is processed, so block here until it has
# finished; otherwise a following cell could read the Delta location before
# the data has been committed.
query.awaitTermination()

In [0]:
# Read back the Delta output that the streaming query wrote.
delta_output_path = "/Volumes/workspace/advecom/advecom_data/streaming/delta"
result_df = spark.read.format("delta").load(delta_output_path)

# Show the streamed rows.
display(result_df)

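# Note: Two data batches were streamed: the first with 100 records, the second with 2000 records.
# The count shows 2098 rather than 2100 because the stream reader uses header=True and therefore
# skips the first line of each batch file; when a batch file is written without a header row,
# that skipped line is a data record, so one record per batch is lost.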

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-10-01T00:00:00.000Z,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
2019-10-01T00:00:01.000Z,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8
2019-10-01T00:00:01.000Z,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
2019-10-01T00:00:04.000Z,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d
2019-10-01T00:00:05.000Z,view,1480613,2053013561092866779,computers.desktop,pulser,908.62,512742880,0d0d91c2-c9c2-4e81-90a5-86594dec0db9
2019-10-01T00:00:08.000Z,view,17300353,2053013553853497655,,creed,380.96,555447699,4fe811e9-91de-46da-90c3-bbd87ed3a65d
2019-10-01T00:00:08.000Z,view,31500053,2053013558031024687,,luminarc,41.16,550978835,6280d577-25c8-4147-99a7-abc6048498d6
2019-10-01T00:00:10.000Z,view,28719074,2053013565480109009,apparel.shoes.keds,baden,102.71,520571932,ac1cd4e5-a3ce-4224-a2d7-ff660a105880
2019-10-01T00:00:11.000Z,view,1004545,2053013555631882655,electronics.smartphone,huawei,566.01,537918940,406c46ed-90a4-4787-a43b-59a410c1a5fb
2019-10-01T00:00:11.000Z,view,2900536,2053013554776244595,appliances.kitchen.microwave,elenberg,51.46,555158050,b5bdd0b3-4ca2-4c55-939e-9ce44bb50abd
