# Spark Kafka Data Streaming

In [None]:
import json
import os

import pandas as pd
from pyspark.sql import DataFrame
from pyspark.sql.functions import udf, col, from_json, from_csv, sum as _sum
from pyspark.sql.streaming import StreamingQuery
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, DateType

# Pass --packages to PySpark so the Kafka source (spark-sql-kafka-0-10) and the
# Avro package, both built for Scala 2.12 / version 3.3.1, are requested.
# NOTE(review): presumably this must be set before the SparkSession is created
# for the packages to be picked up - confirm.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1,org.apache.spark:spark-avro_2.12:3.3.1 pyspark-shell'


In [None]:
## Configuration file reader

def read_config(config_file):
    """
    Read a ``key=value`` properties file into a dictionary.

    Blank lines and lines whose first non-blank character is ``#`` are
    skipped. Every other line is split on the first ``=`` only, so a value
    may itself contain ``=``. Both the key and the value are stripped of
    surrounding whitespace, so ``key = value`` is stored under ``"key"``.

    Args:
        config_file: path of the properties file to read.

    Returns:
        dict mapping each parameter name to its (string) value.

    Raises:
        ValueError: if a non-blank, non-comment line has no ``=`` separator.
    """
    conf = {}
    with open(config_file) as fh:
        for line_number, raw_line in enumerate(fh, start=1):
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            parameter, separator, value = line.partition("=")
            if not separator:
                # fail with the offending location instead of an opaque unpacking error
                raise ValueError(
                    f"{config_file}:{line_number}: expected 'key=value', got {line!r}"
                )
            # strip the key too, otherwise "key = value" is stored as "key "
            conf[parameter.strip()] = value.strip()
    return conf

In [None]:
# load the local Kafka client properties and show what was read
config_path = os.path.join(
    os.path.dirname('/home/ozkary/.kafka/'),
    'localhost-nosasl.properties',
)
config = read_config(config_path)
print(config)

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.types as T
import pyspark.sql.functions as F

# create the Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("Spark-Notebook")
    .getOrCreate()
)

## Read from Kafka Stream

In [None]:

topic = 'mta-turnstile'
client_id = 'Spark-Notebook-Session'
group_id = 'turnstile'

# SASL is used only when the properties file provides a non-empty mechanism.
# Config values are plain strings, so an empty "sasl.mechanism=" entry must
# count as "not configured" rather than enabling SASL with an empty mechanism.
use_sasl = bool(config.get("sasl.mechanism"))

# NOTE(review): the Spark Kafka source forwards only "kafka."-prefixed keys to
# the Kafka consumer; the un-prefixed consumer settings below (client.id,
# group.id, auto.offset.reset, enable.*) and "checkpointLocation" (a sink
# option) look like they have no effect on this reader - confirm and prune.
kafka_options = {
            "kafka.bootstrap.servers": config["bootstrap.servers"],
            "subscribe": topic,
            "startingOffsets": "latest",
            "failOnDataLoss": "false",
            "client.id": client_id,
            "group.id": group_id,
            "auto.offset.reset": "latest",
            "checkpointLocation": "checkpoint",
            "minPartitions": "2",
            "enable.auto.commit": "false",
            "enable.partition.eof": "true"
        }

if use_sasl:
    # set the JAAS configuration only when use_sasl is True
    # (credentials come from the loaded properties file; there is no `self` in a notebook cell)
    sasl_config = (
        'org.apache.kafka.common.security.plain.PlainLoginModule required '
        f'serviceName="kafka" username="{config["sasl.username"]}" '
        f'password="{config["sasl.password"]}";'
    )

    login_options = {
        # the Java Kafka client property is "sasl.mechanism" (singular)
        "kafka.sasl.mechanism": config["sasl.mechanism"],
        "kafka.security.protocol": config["security.protocol"],
        # NOTE(review): username/password are presumably only honoured through
        # the JAAS string below; these two keys are kept as in the original.
        "kafka.sasl.username": config["sasl.username"],
        "kafka.sasl.password": config["sasl.password"],
        "kafka.sasl.jaas.config": sasl_config
    }
    # merge the login options with the kafka options
    kafka_options = {**kafka_options, **login_options}

In [None]:
# create the streaming DataFrame that reads from Kafka with the options built above
# (startingOffsets defaults to "latest")
stream = (
    spark.readStream
    .format("kafka")
    .options(**kafka_options)
    .load()
)

# deserializer options that were left disabled:
# .option("key.deserializer", value_deserializer) \
# .option("value.deserializer", value_deserializer) \

stream.printSchema()

In [None]:

def write_to_console(df: "DataFrame", output_mode: str = 'append', processing_time: str = '15 seconds',
                     await_termination: bool = True) -> "StreamingQuery":
    """
    Start a streaming query that prints the rows of ``df`` to the console.

    Args:
        df: streaming Spark DataFrame to output.
        output_mode: output mode passed to the stream writer.
        processing_time: processing-time trigger interval, e.g. ``'15 seconds'``.
        await_termination: when True (the default, matching the previous
            behavior) block until the query terminates. Pass False to start
            the query and return immediately, so that other queries can be
            started afterwards.

    Returns:
        The started streaming query, so the caller can await or stop it.
    """
    console_query = (
        df.writeStream
        .outputMode(output_mode)
        .trigger(processingTime=processing_time)
        .format("console")
        .option("truncate", False)  # print full column values
        .start()
    )

    if await_termination:
        console_query.awaitTermination()

    return console_query

# write a streaming data frame to storage (./storage by default)
def write_to_storage(df: "DataFrame", output_mode: str = 'append', processing_time: str = '15 seconds',
                     path: str = "./storage", checkpoint_location: str = "./checkpoint",
                     await_termination: bool = True) -> "StreamingQuery":
    """
    Start a streaming query that writes the windowed rows of ``df`` as CSV files.

    The ``window`` struct column is flattened into ``START_DT`` / ``END_DT``
    columns before writing, so the rows can be stored as flat CSV.

    Args:
        df: streaming Spark DataFrame that has a ``window`` column plus the
            turnstile columns selected below.
        output_mode: output mode passed to the stream writer.
        processing_time: processing-time trigger interval, e.g. ``'15 seconds'``.
        path: output directory for the CSV files.
        checkpoint_location: checkpoint directory for this query.
        await_termination: when True (the default, matching the previous
            behavior) block until the query terminates. Pass False to start
            the query and return immediately.

    Returns:
        The started streaming query, so the caller can await or stop it.
    """

    # flatten the timestamp structure to a START_DT and END_DT columns, so they can be written to storage
    df_csv = df.select(
        col("window.start").alias("START_DT"),
        col("window.end").alias("END_DT"),
        "A/C", "UNIT", "SCP", "STATION", "LINENAME", "DIVISION", "DATE", "DESC",
        "ENTRIES", "EXITS"
    )

    storage_query = (
        df_csv.writeStream
        .outputMode(output_mode)
        .trigger(processingTime=processing_time)
        .format("csv")
        .option("header", True)
        .option("path", path)
        .option("checkpointLocation", checkpoint_location)
        .start()
    )

    if await_termination:
        storage_query.awaitTermination()

    return storage_query

# Schema of the incoming turnstile data, one field per column, in order
turnstiles_schema = (
    StructType()
    .add("`A/C`", StringType())
    .add("UNIT", StringType())
    .add("SCP", StringType())
    .add("STATION", StringType())
    .add("LINENAME", StringType())
    .add("DIVISION", StringType())
    .add("DATE", StringType())
    .add("TIME", StringType())
    .add("DESC", StringType())
    .add("ENTRIES", IntegerType())
    .add("EXITS", IntegerType())
    .add("ID", StringType())
    .add("TIMESTAMP", TimestampType())
)

In [None]:

def parse_messages(stream: "DataFrame", schema: "StructType") -> "DataFrame":
    """
    Parse the messages and use the provided schema to type cast the fields.

    The Kafka ``value`` is cast to a string and split on ``','``; the n-th
    piece is cast to the data type of the n-th schema field and named after
    that field (with any backticks removed from the name).

    Args:
        stream: streaming Spark DataFrame with ``key`` and ``value`` columns.
        schema: iterable of fields exposing ``name`` and ``dataType``, in the
            same order as the comma-separated values of a message.

    Returns:
        A streaming DataFrame with one typed column per schema field.

    Raises:
        AssertionError: if ``stream`` is not a streaming DataFrame.
    """
    # explicit raise (same exception type as before) so the check is not
    # stripped when Python runs with -O
    if stream.isStreaming is not True:
        raise AssertionError("DataFrame doesn't receive streaming data")

    df = stream.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

    # split attributes to nested array in one Column
    parts = F.split(df['value'], ',')

    # expand the array to typed top-level columns in a single projection
    columns = [
        parts.getItem(idx).cast(field.dataType).alias(field.name.replace('`', ''))
        for idx, field in enumerate(schema)
    ]

    result = df.select(columns)
    result.printSchema()

    return result
    

In [59]:
def agg_messages(df: "DataFrame", window_duration: str, window_slide: str,
                 watermark_delay: "Optional[str]" = None) -> "DataFrame":
    """
    Aggregate ENTRIES and EXITS over time windows on the TIMESTAMP column.

    Rows are grouped by the time window and by A/C, UNIT, SCP, LINENAME,
    DIVISION, STATION, DATE and DESC; ENTRIES and EXITS are summed per group.

    Args:
        df: Spark DataFrame with the parsed turnstile columns.
        window_duration: window length, e.g. ``'2 minutes'``.
        window_slide: window slide interval, e.g. ``'2 minutes'``.
        watermark_delay: watermark delay threshold on TIMESTAMP. Defaults to
            ``window_duration`` when omitted, which is the previous behavior.

    Returns:
        The aggregated DataFrame with a ``window`` column, the grouping
        columns and the summed ENTRIES / EXITS.
    """
    # fill null values with 0 before aggregating
    df = df.na.fill(0)

    delay = window_duration if watermark_delay is None else watermark_delay

    df_windowed = (
        df.withWatermark("TIMESTAMP", delay)
        .groupBy(
            F.window("TIMESTAMP", window_duration, window_slide),
            "A/C", "UNIT", "SCP", "LINENAME", "DIVISION", "STATION", "DATE", "DESC",
        )
        .agg(
            _sum("ENTRIES").alias("ENTRIES"),
            _sum("EXITS").alias("EXITS"),
        )
    )

    df_windowed.printSchema()

    return df_windowed

In [58]:
# convert the schema to string
schema_string = turnstiles_schema.simpleString()
df_messages = parse_messages(stream, schema=turnstiles_schema)

window_duration = '2 minutes'
window_slide = '2 minutes'

df_windowed = agg_messages(df_messages, window_duration, window_slide)

# write_to_console blocks until its query terminates, so calling it directly
# would mean write_to_storage below is never reached and nothing is written to
# storage. Run the console sink in a background (daemon) thread instead, and
# keep the storage sink in the foreground.
import threading

console_thread = threading.Thread(
    target=write_to_console,
    args=(df_windowed,),
    name="console-sink",
    daemon=True,
)
console_thread.start()

# blocks until the storage query terminates (interrupt the cell to stop)
write_to_storage(df_windowed)


                                                                                

-------------------------------------------
Batch: 123
-------------------------------------------
+------+---+----+-------+-------+-----+----+
|window|A/C|UNIT|STATION|ENTRIES|EXITS|type|
+------+---+----+-------+-------+-----+----+
+------+---+----+-------+-------+-----+----+

23/11/15 16:03:39 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 5000 milliseconds, but spent 9313 milliseconds


                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+------+---+----+---+--------+--------+-------+----+----+-------+-----+
|window|A/C|UNIT|SCP|LINENAME|DIVISION|STATION|DATE|DESC|ENTRIES|EXITS|
+------+---+----+---+--------+--------+-------+----+----+-------+-----+
+------+---+----+---+--------+--------+-------+----+----+-------+-----+



                                                                                

-------------------------------------------
Batch: 27
-------------------------------------------
+------+---+----+---+--------+--------+-------+----+----+-------+-----+
|window|A/C|UNIT|SCP|LINENAME|DIVISION|STATION|DATE|DESC|ENTRIES|EXITS|
+------+---+----+---+--------+--------+-------+----+----+-------+-----+
+------+---+----+---+--------+--------+-------+----+----+-------+-----+

23/11/15 16:03:46 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 5000 milliseconds, but spent 14270 milliseconds


In [None]:
# load every CSV file under ./storage (first line as header) and display the rows
df = spark.read.option("header", True).csv('./storage/*.csv')
df.show()
