In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, LongType
from pyspark.sql.functions import from_json,col

def get_spark_session(
    app_name="stream-from-Kafka",
    master="spark://spark-master:7077",
    kafka_package="org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0",
    shuffle_partitions=4,
):
    """Build (or reuse) the SparkSession used to stream events from Kafka.

    Called with no arguments this produces exactly the session the notebook
    has always used; the parameters exist so the same helper works against a
    different cluster or connector version without editing the function.

    Args:
        app_name: Application name passed to the session builder.
        master: Spark master URL to connect to.
        kafka_package: Maven coordinates passed as ``spark.jars.packages``.
            NOTE(review): the Scala suffix and version in the coordinates are
            presumably meant to match the cluster's Spark build -- confirm
            before changing.
        shuffle_partitions: Value for ``spark.sql.shuffle.partitions``.

    Returns:
        The SparkSession returned by ``getOrCreate()``. If a session already
        exists in this process it is returned instead of a new one.
        NOTE(review): in that case some of these settings (notably
        ``spark.jars.packages``) may not take effect -- restart the kernel
        after changing them.
    """
    return (
        SparkSession.builder
        .appName(app_name)
        .config("spark.streaming.stopGracefullyOnShutdown", True)
        .config("spark.jars.packages", kafka_package)
        .config("spark.sql.shuffle.partitions", shuffle_partitions)
        .master(master)
        .getOrCreate()
    )

# Schema applied to the JSON payload of each Kafka message.
# Every field is declared as a nullable string; the tuple order below is the
# field order of the resulting struct.
# NOTE(review): 'correletionID' looks like a misspelling of "correlationID",
# but it is a runtime JSON key -- confirm against the producer before renaming.
events_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        'timestamp',
        'type',
        'appName',
        'appInstance',
        'appID',
        'probeID',
        'eventID',
        'correletionID',
        'locationID',
        'transactionStart',
        'transactionEnd',
        'transactionDuration',
        'clientIPAddress',
        'clientPort',
        'serverIPAddress',
        'serverPort',
        'ipProtocol',
        'category',
        'bytesFromClient',
        'bytesToClient',
        'bytesFromServer',
        'bytesToServer',
        'subscriberID',
        'applicationProtocol',
        'applicationName',
        'domain',
        'deviceType',
        'networkType',
        'contentType',
        'lostBytesClient',
        'lostBytesServer',
        'srttMsClient',
        'srttMsServer',
    )
])


In [2]:
# Obtain the session from get_spark_session(), which ends in getOrCreate();
# `spark` is the handle the readStream cell uses.
spark = get_spark_session()

In [3]:
# Kafka source settings: comma-separated broker list and the topic to subscribe to.
KAFKA_BOOTSTRAP_SERVERS = "kafka1:19092,kafka2:19093,kafka3:19094"
KAFKA_TOPIC = "events"

# Open the topic as a streaming DataFrame, with startingOffsets set to "earliest".
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS)
    .option("subscribe", KAFKA_TOPIC)
    .option("startingOffsets", "earliest")
    .load()
)

In [7]:
# Cast the Kafka `value` column to a string, parse it as JSON using
# events_schema, then expand the parsed struct into one column per field.
# NOTE(review): this reassigns `df`, so the cell is not idempotent -- after one
# run `df` no longer has a `value` column, and re-running this cell without
# first re-running the readStream cell will fail.
df = (
    df.selectExpr("CAST(value AS STRING)")
    .select(from_json(col("value"), events_schema).alias("data"))
    .select("data.*")
)

In [None]:
# Start a streaming query that writes `df` to the console sink in append mode.
my_query = (
    df.writeStream
    .format("console")
    .outputMode("append")
    .start()
)

try:
    # Blocks this cell until the query terminates (or the cell is interrupted).
    my_query.awaitTermination()
finally:
    # Leaving the wait early (kernel interrupt, query error) is not guaranteed
    # to stop the streaming query itself, and re-running the cell would then
    # start a second one. Stop it explicitly; the original exception, if any,
    # still propagates.
    my_query.stop()
