In [21]:
# Create the Spark Session
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Streaming from Kafka") \
    .config("spark.streaming.stopGracefullyOnShutdown", True) \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0") \
    .config("spark.sql.shuffle.partitions", 4) \
    .master("spark://spark-master:7077") \
    .getOrCreate()

In [22]:
# Create the streaming_df to read from kafka
streaming_df = spark.readStream\
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka1:19092,kafka2:19093,kafka3:19094") \
    .option("subscribe", "events") \
    .option("startingOffsets", "earliest") \
    .load()

In [23]:
# JSON Schema
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, LongType

json_schema = StructType([ \
StructField('timestamp', IntegerType(), True), \
StructField('type', StringType(), True), \
StructField('appName', StringType(), True), \
StructField('appInstance', IntegerType(), True), \
StructField('appID', IntegerType(), True), \
StructField('probeID', StringType(), True), \
StructField('eventID', StringType(), True), \
StructField('correletionID', IntegerType(), True), \
StructField('locationID', StringType(), True), \
StructField('transactionStart', IntegerType(), True), \
StructField('transactionEnd', IntegerType(), True), \
StructField('transactionDuration', IntegerType(), True), \
StructField('clientIPAddress', StringType(), True), \
StructField('clientPort', IntegerType(), True), \
StructField('serverIPAddress', StringType(), True), \
StructField('serverPort', IntegerType(), True), \
StructField('ipProtocol', StringType(), True), \
StructField('category', StringType(), True), \
StructField('bytesFromClient', IntegerType(), True), \
StructField('bytesToClient', IntegerType(), True), \
StructField('bytesFromServer', IntegerType(), True), \
StructField('bytesToServer', IntegerType(), True), \
StructField('subscriberID', LongType(), True), \
StructField('applicationProtocol', StringType(), True), \
StructField('applicationName', StringType(), True), \
StructField('domain', StringType(), True), \
StructField('deviceType', StringType(), True), \
StructField('networkType', StringType(), True), \
StructField('contentType', StringType(), True), \
StructField('lostBytesClient', IntegerType(), True), \
StructField('lostBytesServer', IntegerType(), True), \
StructField('srttMsClient', IntegerType(), True), \
StructField('srttMsServer', IntegerType(), True), \
])

In [24]:
# Parse value from binay to string
json_df = streaming_df.selectExpr("cast(value as string) as value")

# Apply Schema to JSON value column and expand the value
from pyspark.sql.functions import from_json

json_expanded_df = json_df.withColumn("value", from_json(json_df["value"], json_schema)).select("value.*") 

In [25]:
eventQuery = json_expanded_df \
        .writeStream \
        .queryName("qevents1")\
        .format("memory")\
        .start()

IllegalArgumentException: Cannot start query with name qevents1 as a query with that name is already active in this SparkSession

In [20]:
raw = spark.sql("select * from qevents1")
raw.show()

+---------+----+-------+-----------+-----+-------+-------+-------------+----------+----------------+--------------+-------------------+---------------+----------+---------------+----------+----------+--------+---------------+-------------+---------------+-------------+------------+-------------------+---------------+------+----------+-----------+-----------+---------------+---------------+------------+------------+
|timestamp|type|appName|appInstance|appID|probeID|eventID|correletionID|locationID|transactionStart|transactionEnd|transactionDuration|clientIPAddress|clientPort|serverIPAddress|serverPort|ipProtocol|category|bytesFromClient|bytesToClient|bytesFromServer|bytesToServer|subscriberID|applicationProtocol|applicationName|domain|deviceType|networkType|contentType|lostBytesClient|lostBytesServer|srttMsClient|srttMsServer|
+---------+----+-------+-----------+-----+-------+-------+-------------+----------+----------------+--------------+-------------------+---------------+----------+