In [10]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, LongType

# Kafka connection settings used by the streaming reader below.
# The bootstrap list is a single comma-separated "host:port" string; it is
# assembled from one entry per broker so individual brokers are easy to edit.
KAFKA_BOOTSTRAP_SERVERS = ",".join(
    (
        "kafka1:19092",
        "kafka2:19093",
        "kafka3:19094",
    )
)
# Name of the topic the stream subscribes to.
KAFKA_TOPIC = "events"


# Schema of a single event record. Every field is declared nullable, and the
# field order here is the column order of any struct parsed with this schema.
#
# NOTE(review): IntegerType is a 32-bit type; if the timestamp/transaction*
# fields or the byte counters can exceed that range they would need LongType.
# Confirm against the producer before changing.
# NOTE(review): 'correletionID' is kept exactly as spelled; presumably it
# mirrors the key emitted by the producer - verify before renaming.
events_schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in (
        # Event identity
        ("timestamp", IntegerType),
        ("type", StringType),
        ("appName", StringType),
        ("appInstance", IntegerType),
        ("appID", IntegerType),
        ("probeID", StringType),
        ("eventID", StringType),
        ("correletionID", IntegerType),
        ("locationID", StringType),
        # Transaction timing
        ("transactionStart", IntegerType),
        ("transactionEnd", IntegerType),
        ("transactionDuration", IntegerType),
        # Endpoints
        ("clientIPAddress", StringType),
        ("clientPort", IntegerType),
        ("serverIPAddress", StringType),
        ("serverPort", IntegerType),
        ("ipProtocol", StringType),
        ("category", StringType),
        # Byte counters
        ("bytesFromClient", IntegerType),
        ("bytesToClient", IntegerType),
        ("bytesFromServer", IntegerType),
        ("bytesToServer", IntegerType),
        # Subscriber / application attributes
        ("subscriberID", LongType),
        ("applicationProtocol", StringType),
        ("applicationName", StringType),
        ("domain", StringType),
        ("deviceType", StringType),
        ("networkType", StringType),
        ("contentType", StringType),
        # Loss and round-trip metrics
        ("lostBytesClient", IntegerType),
        ("lostBytesServer", IntegerType),
        ("srttMsClient", IntegerType),
        ("srttMsServer", IntegerType),
    )
])


# Create (or reuse) the Spark session for this notebook, attached to the
# standalone master at spark-master:7077.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("stream-from-Kafka")
    .config("spark.streaming.stopGracefullyOnShutdown", True)
    # Kafka source connector, resolved as a package at session start-up.
    # NOTE(review): the artifact is pinned to Scala 2.12 / 3.3.0; presumably
    # this must match the cluster's Spark build - confirm.
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0")
    .config("spark.sql.shuffle.partitions", 4)
    .getOrCreate()
)

# Keep driver output readable: only WARN and above are logged.
spark.sparkContext.setLogLevel("WARN")

# Streaming read from Kafka: subscribe to KAFKA_TOPIC on the configured
# brokers and begin from the earliest offsets.
df = (
    spark.readStream
    .format("kafka")
    .options(
        **{
            "kafka.bootstrap.servers": KAFKA_BOOTSTRAP_SERVERS,
            "subscribe": KAFKA_TOPIC,
            "startingOffsets": "earliest",
        }
    )
    .load()
)

In [12]:
# Decode the Kafka `value` column as UTF-8, parse it as JSON with
# events_schema, flatten the parsed struct into top-level columns, and write
# the result to the console sink in append mode.
# Note: despite the `df_` prefix, this name is bound to whatever `.start()`
# returns (the running query handle), not to a DataFrame.
df_events_stream = (
    df.select(
        F.from_json(F.decode(F.col("value"), "utf-8"), events_schema).alias("value")
    )
    .select("value.*")
    .writeStream
    .format("console")
    .outputMode("append")
    .start()
)