# Spark Kafka Data Streaming

In [None]:
# Standard library imports
import os
import json
from datetime import datetime

# Third-party library imports
import pandas as pd
import numpy
import pyspark
from pyspark.sql import DataFrame
from pyspark.sql.functions import udf, col, from_json, from_csv, sum as _sum
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, DateType

# Ask pyspark to pull the Kafka source and Avro packages when it launches.
# NOTE(review): presumably this has to run before the SparkSession is created - confirm
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages '
    'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1,'
    'org.apache.spark:spark-avro_2.12:3.3.1 '
    'pyspark-shell'
)

# Report the library versions this notebook session is running with
print("pyspark ", pyspark.__version__)
print("pandas ", pd.__version__)
print("numpy ", numpy.__version__)


In [None]:
## Configuration file reader

def read_config(config_file):
    """
    Reads the kafka configuration information that is stored in the system

    The file holds one ``parameter=value`` pair per line. Blank lines and
    lines starting with ``#`` are ignored. Only the first ``=`` separates the
    parameter from its value, so a value may itself contain ``=``.

    Args:
        config_file: path of the properties file to read.

    Returns:
        dict mapping every parameter name to its value; both are stripped of
        surrounding whitespace.

    Raises:
        OSError: when the file cannot be opened.
        ValueError: when a non-comment line has no ``=`` separator.
    """
    conf = {}
    with open(config_file) as fh:
        for line_number, raw_line in enumerate(fh, start=1):
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue

            parameter, separator, value = line.partition("=")
            if not separator:
                raise ValueError(
                    f"{config_file}:{line_number}: expected 'parameter=value', got {line!r}"
                )

            # strip the key as well, so "key = value" is stored under "key"
            conf[parameter.strip()] = value.strip()
    return conf

In [None]:
# Load the local Kafka client properties (the no-SASL localhost profile)
config_path = os.path.join('/home/ozkary/.kafka', 'localhost-nosasl.properties')
config = read_config(config_path)

# Echo the parsed settings for a quick visual check
print(config)

In [None]:
from pyspark.sql import SparkSession, DataFrame
import pyspark.sql.types as T
import pyspark.sql.functions as F

# Create (or reuse) the Spark session used by the remaining cells
spark = (
    SparkSession.builder
    .appName("Spark-Notebook")
    .getOrCreate()
)

## Read from Kafka Stream

In [None]:

# Topic to subscribe to and the identifiers used for this notebook session
topic = 'mta-turnstile'
client_id = 'Spark-Notebook-Session'
group_id = 'turnstile'

# SASL is enabled only when the loaded properties declare a mechanism
use_sasl = config.get("sasl.mechanism") is not None

# Options handed to the Spark Kafka source.
# NOTE(review): only keys prefixed with "kafka." are forwarded to the Kafka
# consumer; confirm whether the un-prefixed consumer settings below
# (client.id, group.id, auto.offset.reset, ...) have any effect.
kafka_options = {
            "kafka.bootstrap.servers": config["bootstrap.servers"],
            "subscribe": topic,
            "startingOffsets": "latest",
            "failOnDataLoss": "false",
            "client.id": client_id,
            "group.id": group_id,
            "auto.offset.reset": "latest",
            "checkpointLocation": "checkpoint",
            "minPartitions": "2",
            "enable.auto.commit": "false",
            "enable.partition.eof": "true"
        }

if use_sasl:
    # set the JAAS configuration only when use_sasl is True
    # (the credentials come from the same `config` dictionary loaded from the properties file)
    sasl_config = f'org.apache.kafka.common.security.plain.PlainLoginModule required serviceName="kafka" username="{config["sasl.username"]}" password="{config["sasl.password"]}";'

    login_options = {
        # the Java Kafka client used by Spark names this setting "sasl.mechanism" (singular)
        "kafka.sasl.mechanism": config["sasl.mechanism"],
        "kafka.security.protocol": config["security.protocol"],
        # NOTE(review): the credentials are already carried by the JAAS config;
        # confirm whether these two extra keys are needed at all
        "kafka.sasl.username": config["sasl.username"],
        "kafka.sasl.password": config["sasl.password"],
        "kafka.sasl.jaas.config": sasl_config
    }
    # merge the login options with the kafka options
    kafka_options = {**kafka_options, **login_options}

In [None]:
def value_deserializer(value: bytes) -> any:
    """
    Decode a JSON encoded message value into the matching Python object
    """
    decoded = json.loads(value)
    return decoded

# set the stream source
# default for startingOffsets is "latest"
#
# No key/value deserializer options are set: the Spark Kafka source always
# delivers `key` and `value` as binary columns, and a Python function passed
# as an option value would only be converted to its string representation.
# The payload is decoded later with DataFrame operations (CAST ... AS STRING).
stream = (
    spark.readStream
    .format("kafka")
    .options(**kafka_options)
    .load()
)

# show the raw Kafka record columns exposed by the source
stream.printSchema()

In [None]:
# Spark-side current timestamp expression, printed only to inspect it
current_timestamp = F.current_timestamp()
print(current_timestamp[0])

# Driver-side wall clock time
timestamp = datetime.now()
print(timestamp)

# Compact form of the same instant, suitable for use in file names
time = format(timestamp, "%Y%m%d_%H%M%S")
print(time)


In [None]:

def write_to_console(df: DataFrame, output_mode: str = 'append', processing_time: str = '60 seconds') -> None:
    """
        Start a streaming query that prints the rows of df to the console

        output_mode is the streaming output mode and processing_time the
        trigger interval. The query is started and left running; this
        function does not wait for it to terminate.
    """
    writer = df.writeStream.outputMode(output_mode)
    writer = writer.trigger(processingTime=processing_time)

    # console sink, showing full column values instead of truncated ones
    writer = writer.format("console").option("truncate", False)
    writer.start()

def process_mini_batch(df, batch_id, path):
    """Converts a mini-batch to Pandas and writes it as a CSV file under `path`.

    The file is named ``batch_{batch_id}_{YYYYmmdd_HHMMSS}.csv`` using the
    local time of the write. Empty batches are skipped with a console message.

    Args:
        df: batch DataFrame; only its ``toPandas()`` method is used.
        batch_id: identifier of the batch, used in the file name.
        path: destination directory. A local directory is created when it is
            missing; a location containing ``://`` is handed to pandas as is.
    """
    # Convert once and test the result, instead of running a separate count()
    # action before the conversion
    df_pandas = df.toPandas()
    if df_pandas.empty:
        print(f"DataFrame for batch {batch_id} is empty. Skipping processing.")
        return

    print(df_pandas.head())

    # Timestamp suffix for the output file name
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # pandas does not create missing folders, so make sure a local target exists
    if "://" not in path:
        os.makedirs(path, exist_ok=True)

    # Drop a trailing separator so "./storage/" does not yield "./storage//batch_..."
    file_path = f"{path.rstrip('/')}/batch_{batch_id}_{stamp}.csv"
    df_pandas.to_csv(file_path, index=False)


# write a streaming data frame to storage ./storage
def write_to_storage(df: DataFrame, output_mode: str = 'append', processing_time: str = '60 seconds') -> None:
    """
        Start a streaming query that saves every micro-batch of df under ./storage/

        Only the turnstile columns selected below are written. Each
        micro-batch is handed to process_mini_batch, which does the actual
        file write; the query keeps its progress in ./checkpoint.

        output_mode is the streaming output mode and processing_time the
        trigger interval. The query is started and left running; this
        function does not wait for it to terminate.
    """
    df_csv = df.select(
        "AC", "UNIT", "SCP", "STATION", "LINENAME", "DIVISION", "DATE", "DESC", "TIME", "ENTRIES", "EXITS"
    )

    path = "./storage/"

    # foreachBatch is the sink of this query, so no format("csv") / header /
    # path / truncate settings are configured on the writer: the batch
    # function alone decides how and where each batch is written.
    writer = (
        df_csv.writeStream
        .outputMode(output_mode)
        .trigger(processingTime=processing_time)
        .option("checkpointLocation", "./checkpoint")
        .foreachBatch(lambda batch, batch_id: process_mini_batch(batch, batch_id, path))
    )
    writer.start()

# Schema of one turnstile message; the field order is the order of the
# comma separated values in the message payload
turnstiles_schema = (
    StructType()
    .add("AC", StringType())
    .add("UNIT", StringType())
    .add("SCP", StringType())
    .add("STATION", StringType())
    .add("LINENAME", StringType())
    .add("DIVISION", StringType())
    .add("DATE", StringType())
    .add("TIME", StringType())
    .add("DESC", StringType())
    .add("ENTRIES", IntegerType())
    .add("EXITS", IntegerType())
    .add("ID", StringType())
    .add("TIMESTAMP", StringType())
)

In [None]:

def parse_messages(stream, schema) -> DataFrame:
    """
    Parse the messages and use the provided schema to type cast the fields

    The Kafka `value` is read as a string, split on commas, and the n-th piece
    is cast to the type of the n-th schema field. Double quotes are removed
    from the AC and TIMESTAMP columns, only the schema columns are kept, and
    rows sharing the same ID, STATION and TIMESTAMP are de-duplicated.

    Args:
        stream: streaming DataFrame with the Kafka `key`, `value` and
            `timestamp` columns.
        schema: StructType listing the fields in payload order; it must
            contain ID, STATION and TIMESTAMP.

    Returns:
        Streaming DataFrame with one typed column per schema field.

    Raises:
        AssertionError: when `stream` is not a streaming DataFrame.
    """
    assert stream.isStreaming is True, "DataFrame doesn't receive streaming data"

    df = stream.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)", "timestamp")

    # split attributes to nested array in one Column
    parts = F.split(df['value'], ',')

    # expand the array to multiple top-level columns
    for idx, field in enumerate(schema):
        df = df.withColumn(field.name, parts.getItem(idx).cast(field.dataType))

    # remove quotes from the TIMESTAMP and AC columns
    # (presumably the payload is wrapped in quotes, which end up on the first
    # and last field - confirm against the producer)
    df = df.withColumn("TIMESTAMP", F.regexp_replace(F.col("TIMESTAMP"), '"', ''))
    df = df.withColumn("AC", F.regexp_replace(F.col("AC"), '"', ''))

    result = df.select([field.name for field in schema])

    # dropDuplicates returns a new DataFrame, so its result has to be kept.
    # NOTE(review): streaming de-duplication without a watermark keeps state
    # for every key seen - confirm that is acceptable for this topic.
    result = result.dropDuplicates(["ID", "STATION", "TIMESTAMP"])

    result.printSchema()

    return result
    

In [None]:
def agg_messages(df, window_duration: str, window_slide: str) -> DataFrame:
    """
        Sum ENTRIES and EXITS per time window and turnstile attributes

        Rows are grouped by a window over the parsed TIMESTAMP plus AC, UNIT,
        SCP, STATION, LINENAME, DIVISION, DATE and DESC. window_duration is
        used both as the window length and as the watermark delay. The result
        carries the window START and END, and TIME holds the window end
        formatted as HH:mm:ss.
    """
    group_columns = ["AC", "UNIT", "SCP", "STATION", "LINENAME", "DIVISION", "DATE", "DESC"]

    # Parse the TIMESTAMP string into a timestamp column named TS
    event_time = F.to_timestamp("TIMESTAMP", "yyyy-MM-dd HH:mm:ss")
    timed = df.withColumn("TS", event_time).withWatermark("TS", window_duration)

    totals = timed.groupBy(
        F.window("TS", window_duration, window_slide), *group_columns
    ).agg(
        F.sum("ENTRIES").alias("ENTRIES"),
        F.sum("EXITS").alias("EXITS"),
    )

    # Flatten the window struct into START / END / TIME columns
    flattened = (
        totals
        .withColumn("START", F.col("window.start"))
        .withColumn("END", F.col("window.end"))
        .withColumn("TIME", F.date_format("window.end", "HH:mm:ss"))
        .drop("window")
    )
    df_windowed = flattened.select(*group_columns, "TIME", "START", "END", "ENTRIES", "EXITS")

    df_windowed.printSchema()

    return df_windowed


In [None]:
def add_by_station(df, window_duration: str, window_slide: str) -> DataFrame:
    """
    Sum ENTRIES and EXITS per station and time window

    Rows are grouped by a window over the parsed TIMESTAMP plus STATION.
    window_duration is used both as the window length and as the watermark
    delay. The result carries the window START and END, and TIME holds the
    window start formatted as HH:mm:ss.
    """
    # Parse the TIMESTAMP string into a timestamp column named TS
    event_time = F.to_timestamp("TIMESTAMP", "yyyy-MM-dd HH:mm:ss")
    timed = df.withColumn("TS", event_time).withWatermark("TS", window_duration)

    totals = timed.groupBy(
        F.window("TS", window_duration, window_slide), "STATION"
    ).agg(
        F.sum("ENTRIES").alias("ENTRIES"),
        F.sum("EXITS").alias("EXITS"),
    )

    # Flatten the window struct into START / END / TIME columns
    flattened = (
        totals
        .withColumn("START", F.col("window.start"))
        .withColumn("END", F.col("window.end"))
        .withColumn("TIME", F.date_format("window.start", "HH:mm:ss"))
        .drop("window")
    )
    df_windowed = flattened.select("STATION", "TIME", "START", "END", "ENTRIES", "EXITS")

    df_windowed.printSchema()
    return df_windowed

In [None]:
def process_batch(df, id, tag='message'):
    """
    Print a one-line summary of a micro-batch

    The summary shows the tag, the batch id, the number of records and the
    first row of the batch (None when the batch has no rows).
    """
    # first row of the batch, shown as a sample in the summary
    first_row = df.first()

    # TODO: validate that the TIMESTAMP of the first row can be cast to a
    # timestamp and skip the batch when it cannot

    record_count = df.count()
    print(f"Processing {tag} batch {id} with {record_count} records. {first_row}")

In [42]:
# convert the schema to string
schema_string = turnstiles_schema.simpleString()

# parse the raw Kafka records and echo every parsed message to the console
df_messages = parse_messages(stream, turnstiles_schema)
write_to_console(df=df_messages)

# aggregate with a window whose slide equals its length
window_duration = '5 minutes'
window_slide = '5 minutes'
df_windowed = agg_messages(df_messages, window_duration=window_duration, window_slide=window_slide)

# save the aggregated windows, triggering once per window length
write_to_storage(df_windowed, processing_time=window_duration)


root
 |-- AC: string (nullable = true)
 |-- UNIT: string (nullable = true)
 |-- SCP: string (nullable = true)
 |-- STATION: string (nullable = true)
 |-- LINENAME: string (nullable = true)
 |-- DIVISION: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- TIME: string (nullable = true)
 |-- DESC: string (nullable = true)
 |-- ENTRIES: integer (nullable = true)
 |-- EXITS: integer (nullable = true)
 |-- ID: string (nullable = true)
 |-- TIMESTAMP: string (nullable = true)

24/01/29 14:09:39 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-cd47533e-7061-46bf-ab06-35ead7996814. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/01/29 14:09:39 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets an

[Stage 67:>                                                         (0 + 2) / 2]

24/01/29 14:09:40 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:40 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:40 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:40 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:40 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapsh



24/01/29 14:09:41 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:41 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:41 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:41 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:41 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapsh



24/01/29 14:09:41 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:41 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:41 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:41 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:41 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapsh



24/01/29 14:09:42 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:42 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:42 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:42 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:42 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapsh



24/01/29 14:09:42 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:42 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:42 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:42 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:42 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapsh



24/01/29 14:09:42 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:43 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:43 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:43 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:43 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapsh



24/01/29 14:09:43 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:43 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:43 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:43 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:43 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapsh



24/01/29 14:09:43 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:43 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:43 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:43 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:43 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapsh



24/01/29 14:09:44 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:44 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:44 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:44 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/01/29 14:09:44 WARN HDFSBackedStateStoreProvider: The state for version 9 doesn't exist in loadedMaps. Reading snapsh

                                                                                

     AC  UNIT       SCP        STATION LINENAME DIVISION      DATE     DESC  \
0  A001  R002  02-00-00  Test-Station1   456NQR      BMT  01-29-24  REGULAR   
1  A002  R002  02-00-00  Test-Station2   456NQR      BMT  01-29-24  REGULAR   
2  A001  R001  02-00-00  Test-Station2   456NQR      BMT  01-29-24  REGULAR   
3  A001  R001  02-00-00  Test-Station1   456NQR      BMT  01-29-24  REGULAR   
4  A002  R001  02-00-00  Test-Station1   456NQR      BMT  01-29-24  REGULAR   

       TIME  ENTRIES  EXITS  
0  13:55:00       32     27  
1  13:55:00       33     29  
2  13:55:00       19     19  
3  13:55:00       28     34  
4  13:55:00       48     44  
-------------------------------------------
Batch: 1
-------------------------------------------
+----+----+--------+-------------+--------+--------+--------+--------+-------+-------+-----+------------------------------------+-------------------+
|AC  |UNIT|SCP     |STATION      |LINENAME|DIVISION|DATE    |TIME    |DESC   |ENTRIES|EXITS|ID    

                                                                                

     AC  UNIT       SCP        STATION LINENAME DIVISION      DATE     DESC  \
0  A002  R001  02-00-00  Test-Station1   456NQR      BMT  01-29-24  REGULAR   
1  A001  R001  02-00-00  Test-Station2   456NQR      BMT  01-29-24  REGULAR   
2  A002  R001  02-00-00  Test-Station2   456NQR      BMT  01-29-24  REGULAR   
3  A001  R002  02-00-00  Test-Station2   456NQR      BMT  01-29-24  REGULAR   
4  A001  R001  02-00-00  Test-Station1   456NQR      BMT  01-29-24  REGULAR   

       TIME  ENTRIES  EXITS  
0  14:00:00       22     19  
1  14:00:00       27     33  
2  14:00:00       13     15  
3  14:00:00       24     25  
4  14:00:00       31     25  
-------------------------------------------
Batch: 2
-------------------------------------------
+----+----+--------+-------------+--------+--------+--------+--------+-------+-------+-----+------------------------------------+-------------------+
|AC  |UNIT|SCP     |STATION      |LINENAME|DIVISION|DATE    |TIME    |DESC   |ENTRIES|EXITS|ID    

                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+----+----+--------+-------------+--------+--------+--------+--------+-------+-------+-----+------------------------------------+-------------------+
|AC  |UNIT|SCP     |STATION      |LINENAME|DIVISION|DATE    |TIME    |DESC   |ENTRIES|EXITS|ID                                  |TIMESTAMP          |
+----+----+--------+-------------+--------+--------+--------+--------+-------+-------+-----+------------------------------------+-------------------+
|A001|R002|02-00-00|Test-Station2|456NQR  |BMT     |01-29-24|14:12:01|REGULAR|8      |8    |a2a2e18f-9c59-4951-a372-53802d5105a6|2024-01-29 14:12:01|
|A001|R001|02-00-00|Test-Station1|456NQR  |BMT     |01-29-24|14:12:11|REGULAR|8      |9    |9858d675-edd5-4b06-8bfb-a96d5f6a4b7f|2024-01-29 14:12:11|
|A002|R001|02-00-00|Test-Station1|456NQR  |BMT     |01-29-24|14:12:21|REGULAR|10     |9    |a2dd934d-9e4e-4d31-9ba0-c50d764c4faa|2024-01-29 14:12:21|
|A0

                                                                                

DataFrame for batch 11 is empty. Skipping processing.
-------------------------------------------
Batch: 7
-------------------------------------------
+----+----+--------+-------------+--------+--------+--------+--------+-------+-------+-----+------------------------------------+-------------------+
|AC  |UNIT|SCP     |STATION      |LINENAME|DIVISION|DATE    |TIME    |DESC   |ENTRIES|EXITS|ID                                  |TIMESTAMP          |
+----+----+--------+-------------+--------+--------+--------+--------+-------+-------+-----+------------------------------------+-------------------+
|A002|R002|02-00-00|Test-Station2|456NQR  |BMT     |01-29-24|14:15:02|REGULAR|5      |6    |6fad3a0e-8cb3-4354-a6b6-a5dccf649b48|2024-01-29 14:15:02|
|A002|R002|02-00-00|Test-Station1|456NQR  |BMT     |01-29-24|14:15:12|REGULAR|5      |9    |e139af12-cdc5-412f-ab9e-053b915eebcc|2024-01-29 14:15:12|
|A001|R002|02-00-00|Test-Station2|456NQR  |BMT     |01-29-24|14:15:22|REGULAR|5      |9    |3bf4c3d

In [43]:
# clean-up session 

# Stop every streaming query that is still active in this session
active_queries = spark.streams.active
if not active_queries:
    print("No active queries")
else:
    for query in active_queries:
        print(f'Stopping {query.id} {query.name}')
        query.stop()


# Stop existing SparkSession
# spark.stop()


Stopping c3659f60-3c2f-4dd2-9eb2-0f105d651232 None
Stopping 374c432d-957f-434f-932d-0421f2cb8368 None


In [None]:
import os
import shutil
# Create a new SparkSession to cleanup folders
# spark = SparkSession.builder.appName("cleanup").getOrCreate()
# spark.sql("DROP TABLE IF EXISTS your_checkpoint_table")  # Drop the checkpoint table if using structured streaming
# spark.stop()


def remove_dir(path):
    """
    Delete the directory tree at path and report the outcome on the console

    A path that does not exist is only reported, nothing is raised for it.
    """
    if not os.path.exists(path):
        print(f'Path {path} is not found')
        return

    shutil.rmtree(path)
    print(f'Path {path} removed successfully.')

try:
    # Folders produced by the streaming queries of this notebook
    checkpoint_dir = "./checkpoint/"
    storage_dir = "./storage/"

    # Delete the checkpoint directory first, then the storage directory
    for stale_dir in (checkpoint_dir, storage_dir):
        remove_dir(stale_dir)
except Exception as ex:
     print(f"Error found {ex}")

