In [25]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, LongType
from pyspark.sql.functions import from_json,col, to_timestamp, date_format

# Schema applied to the JSON payload of each Kafka message (see from_json in
# init_stream). Every field is declared nullable. Field names are runtime keys
# and are kept exactly as written, including the 'correletionID' spelling
# (NOTE(review): confirm this matches the key the producers actually emit).
events_schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in (
        ('timestamp', StringType),
        ('type', StringType),
        ('appName', StringType),
        ('appInstance', LongType),
        ('appID', StringType),
        ('probeID', StringType),
        ('eventID', StringType),
        ('correletionID', LongType),
        ('locationID', StringType),
        ('transactionStart', LongType),
        ('transactionEnd', LongType),
        ('transactionDuration', LongType),
        ('clientIPAddress', StringType),
        ('clientPort', IntegerType),
        ('serverIPAddress', StringType),
        ('serverPort', IntegerType),
        ('ipProtocol', StringType),
        ('category', StringType),
        ('bytesFromClient', LongType),
        ('bytesToClient', LongType),
        ('bytesFromServer', LongType),
        ('bytesToServer', LongType),
        ('subscriberID', StringType),
        ('applicationProtocol', StringType),
        ('applicationName', StringType),
        ('domain', StringType),
        ('deviceType', StringType),
        ('networkType', StringType),
        ('contentType', StringType),
        ('lostBytesClient', LongType),
        ('lostBytesServer', LongType),
        ('srttMsClient', LongType),
        ('srttMsServer', LongType),
    )
])


def get_spark_session():
    """Build (or reuse, via getOrCreate) the SparkSession for this notebook.

    The session is named "stream-from-Kafka2", points at the master
    spark://spark-master:7077, and requests the Kafka SQL connector through
    spark.jars.packages.

    Returns:
        The SparkSession returned by ``getOrCreate()``.
    """
    settings = {
        "spark.streaming.stopGracefullyOnShutdown": True,
        "spark.jars.packages": "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0",
        "spark.sql.shuffle.partitions": 4,
    }

    builder = SparkSession.builder.appName("stream-from-Kafka2")
    # Apply the settings in declaration order, one config() call per entry.
    for key, value in settings.items():
        builder = builder.config(key, value)

    return builder.master("spark://spark-master:7077").getOrCreate()



#| timestamp|         type|             appName|appInstance|appID|             probeID|             eventID|correletionID|  
#locationID|transactionStart|transactionEnd|transactionDuration|clientIPAddress|clientPort|serverIPAddress|serverPort|ipProtocol| 
#category|bytesFromClient|bytesToClient|bytesFromServer|bytesToServer| subscriberID|applicationProtocol| applicationName|          
# domain|   deviceType|networkType|contentType|lostBytesClient|lostBytesServer|srttMsClient|srttMsServer|

def init_stream(
    spark,
    *,
    bootstrap_servers="kafka1:19092,kafka2:19093,kafka3:19094",
    topic="events",
    starting_offsets="earliest",
):
    """Define the streaming DataFrame that reads and parses events from Kafka.

    The Kafka message value is cast to a string, parsed as JSON with
    ``events_schema``, and projected to a ``bucket`` column plus a fixed set
    of event fields.

    Args:
        spark: SparkSession used to create the stream reader.
        bootstrap_servers: Comma-separated ``host:port`` list passed as
            ``kafka.bootstrap.servers``.
        topic: Kafka topic passed as the ``subscribe`` option.
        starting_offsets: Value of the ``startingOffsets`` option.

    Returns:
        A streaming DataFrame with ``bucket`` (``transactionEnd`` formatted
        as ``yyyyMMddHHmm``) followed by the selected event columns.
    """
    # Event fields carried through to the output, in output column order.
    selected_fields = (
        "probeID", "eventID", "locationID", "transactionStart",
        "transactionEnd", "transactionDuration", "clientIPAddress",
        "clientPort", "serverIPAddress", "serverPort", "ipProtocol",
        "category", "bytesFromClient", "bytesToClient", "bytesFromServer",
        "bytesToServer", "subscriberID", "applicationProtocol",
        "applicationName", "domain", "deviceType", "networkType",
        "contentType", "lostBytesClient", "lostBytesServer", "srttMsClient",
        "srttMsServer",
    )

    raw = (
        spark.readStream.format("kafka")
        .option("kafka.bootstrap.servers", bootstrap_servers)
        .option("subscribe", topic)
        .option("startingOffsets", starting_offsets)
        .load()
    )

    # Kafka delivers the payload as bytes; cast to string before JSON parsing.
    parsed = raw.selectExpr("CAST(value AS STRING)").select(
        from_json(col("value"), events_schema).alias("data")
    )

    # NOTE(review): transactionEnd is declared LongType in events_schema.
    # Spark's numeric-to-timestamp conversion is understood to treat the value
    # as epoch seconds; confirm the producer does not emit milliseconds.
    bucket = date_format(
        to_timestamp("data.transactionEnd"), "yyyyMMddHHmm"
    ).alias("bucket")

    return parsed.select(bucket, *[f"data.{name}" for name in selected_fields])

In [26]:
# Create (or reuse) the Spark session for this notebook.
spark = get_spark_session()
# Define the parsed Kafka event stream; the query that consumes `df` is
# started in a later cell.
df = init_stream(spark)

In [None]:
# Start a streaming query that writes `df` to the console sink in append mode.
my_query = df.writeStream.format("console").outputMode("append").start()

# Block this cell until the streaming query terminates.
my_query.awaitTermination()