In [14]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.1 pyspark-shell'
import pyspark;
sc = pyspark.SparkContext.getOrCreate();
from pyspark.sql import SparkSession;
spark = SparkSession(sc)

In [41]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, DoubleType
eventSchema = (StructType()
  .add('InvoiceNo', StringType())
  .add('StockCode', StringType())
  .add('Description', StringType())
  .add('Quantity', IntegerType())
  .add('InvoiceDate', StringType())
  .add('UnitPrice', DoubleType())
  .add('CustomerID', IntegerType())
  .add('Country', StringType())
)

In [71]:
kafka_df = (spark
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "retail-events")
    .option("startingOffsets", "latest")
    .option("failOnDataLoss", "false")
    .load()
)

In [72]:
query = kafka_df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") \
    .writeStream \
    .format("console") \
    .option("checkpointLocation", "/tmp/data-lake/") \
    .start()

22/05/21 22:34:23 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
22/05/21 22:34:23 WARN StreamingQueryManager: Stopping existing streaming query [id=9b93a04c-68a5-427f-93ac-eed220da2b88, runId=8ff96370-591d-4d13-8fda-8e4416c19b42], as a new run is being started.


-------------------------------------------
Batch: 9
-------------------------------------------
+---+--------------------+
|key|               value|
+---+--------------------+
|  0|{"InvoiceNo":4894...|
|  1|{"InvoiceNo":4894...|
|  2|{"InvoiceNo":4894...|
|  3|{"InvoiceNo":4894...|
|  4|{"InvoiceNo":4894...|
|  5|{"InvoiceNo":4894...|
|  6|{"InvoiceNo":4894...|
|  7|{"InvoiceNo":4894...|
|  8|{"InvoiceNo":4894...|
+---+--------------------+



In [24]:
query.stop()

In [25]:
query.stop()

In [73]:
from pyspark.sql.functions import col, from_json, to_date
retail_df = (kafka_df\
   .select(from_json(col("value").cast(StringType()), eventSchema).alias("message"), col("timestamp").alias("EventTime"))\
   .select("message.*", "EventTime")
)

In [80]:
base_path = "/tmp/data-lake/retail_events.parquet"
(retail_df
    .withColumn("EventDate", to_date(retail_df.EventTime))
    .writeStream
    .format('parquet')
    .outputMode("append")
    .trigger(once=True)
    .option('checkpointLocation', base_path + '/_checkpoint')
    .start(base_path)
)

22/05/21 22:37:04 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.StreamingQuery at 0x7f83efaac7f0>

In [81]:
df2 = spark.read.parquet("/tmp/data-lake/retail_events.parquet").show()

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+--------------------+----------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|           EventTime| EventDate|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+--------------------+----------+
|   489434|    85048|15CM CHRISTMAS GL...|      12|01/12/09 07:45|     6.95|     13085|United Kingdom|2022-05-21 22:36:...|2022-05-21|
|   489434|   79323P|  PINK CHERRY LIGHTS|      12|01/12/09 07:45|     6.75|     13085|United Kingdom|2022-05-21 22:36:...|2022-05-21|
|   489434|   79323W| WHITE CHERRY LIGHTS|      12|01/12/09 07:45|     6.75|     13085|United Kingdom|2022-05-21 22:36:...|2022-05-21|
|   489434|    22041|"RECORD FRAME 7""...|      48|01/12/09 07:45|      2.1|     13085|United Kingdom|2022-05-21 22:36:...|2022-05-21|
|   489434|    21232|STRAWBERRY CERAMI...|      24|01/1