# Data Ingestion

Before proceeding we need to ensure kafka has the topic we will use.
`raw-taxi-data`.
For the partitions, we are using the `medallion` as the partition key ensuring the related records go to the same partition.

We will produce the data using the csv as the datasource and then we read the stream from kafka, load into dataframe and start doing the EDA.


In [57]:
from pyspark.sql import SparkSession
from delta import *
from delta.tables import *
from pyspark.sql.functions import col, to_json, struct, lit, current_timestamp, expr
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    DoubleType,
    TimestampType,
    IntegerType,
)
import pandas as pd

In [2]:
# IMPORTANT! Remove the empty space in each cell of the header, schema parsing was erroring.
# If erroring run it from the shell.
# !for file in input/trip_data_*.csv; do sed -i '1s/, /,/g' "$file"; done
# !for file in input/sample.csv; do sed -i '1s/, /,/g' "$file"; done

In [3]:
raw_sample_df = pd.read_csv('input/sample.csv', header=0)

In [4]:
raw_sample_df.columns

Index(['medallion', 'hack_license', 'vendor_id', 'rate_code',
       'store_and_fwd_flag', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'trip_time_in_secs', 'trip_distance',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude'],
      dtype='object')

In [58]:
builder = SparkSession.builder.appName("NYC Taxi Data Kafka ETL") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.3") \
    .config("spark.sql.session.timeZone", "UTC") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()

# noise reduction
spark.sparkContext.setLogLevel("WARN")

ConnectionRefusedError: [Errno 111] Connection refused

In [6]:
raw_taxi_schema = StructType(
    [
        StructField("medallion", StringType(), True),
        StructField("hack_license", StringType(), True),
        StructField("vendor_id", StringType(), True),
        StructField("rate_code", StringType(), True),
        StructField("store_and_fwd_flag", StringType(), True),
        StructField("pickup_datetime", TimestampType(), True),
        StructField("dropoff_datetime", TimestampType(), True),
        StructField("passenger_count", IntegerType(), True),
        StructField("trip_time_in_secs", IntegerType(), True),
        StructField("trip_distance", DoubleType(), True),
        StructField("pickup_longitude", DoubleType(), True),
        StructField("pickup_latitude", DoubleType(), True),
        StructField("dropoff_longitude", DoubleType(), True),
        StructField("dropoff_latitude", DoubleType(), True),
    ]
)


In [7]:
def ingest_csv_to_kafka(csv_path, batch_size=1000):
    """
    Read a CSV file and publish records to Kafka using Spark's partitioning
    """
    print(
        f"Ingesting data from {csv_path} to Kafka topic 'raw-taxi-data'"
    )
    
    # Load the data
    taxi_data = spark.read.csv(csv_path, header=True, schema=raw_taxi_schema)
    
    # Get total count for reporting
    total_count = taxi_data.count()
    print(f"Total records to process: {total_count}")
    
    # Add partition key
    taxi_data = taxi_data.withColumn("kafka_key", col("medallion"))
    
    # Create the JSON structure
    kafka_batch = taxi_data.select(
        col("kafka_key").cast("string"),
        to_json(
            struct(*[col(c) for c in taxi_data.columns if c != "kafka_key"])
        ).alias("value")
    )
    
    # Write to Kafka in one go (Spark will handle the batching internally)
    kafka_batch.write.format("kafka") \
        .option("kafka.bootstrap.servers", "kafka:9092") \
        .option("topic", "raw-taxi-data") \
        .option("maxOffsetsPerTrigger", batch_size) \
        .save()
    
    print(f"Finished ingesting data to Kafka topic 'raw-taxi-data'")

## Producer

We are doing simulation of producing events and then sending them to kafka. Next step is to subscribe to topic and read the events as a stream.

In [22]:
csv_path = "input/trip_data_8.csv"
ingest_csv_to_kafka(csv_path, batch_size=10000)

Ingesting data from input/trip_data_8.csv to Kafka topic 'raw-taxi-data'
Total records to process: 12597109
Finished ingesting data to Kafka topic 'raw-taxi-data'


In [9]:
# spark.stop() # Double check if we might need to improve run-time by restarting, since this is a "data pipeline"

## Consumer

In [10]:
raw_stream = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:9092") \
    .option("subscribe", "raw-taxi-data") \
    .option("startingOffsets", "earliest") \
    .load()

In [11]:
filtered_kafka_stream = raw_stream.select(
    col("key").cast("string").alias("kafka_key"),
    col("value").cast("string").alias("json_data"),
    col("timestamp").alias("kafka_timestamp")
)

In [12]:
#raw_taxi_schema

from pyspark.sql.functions import from_json # extract data from json

raw_data_df = filtered_kafka_stream.select(
    "kafka_key",
    from_json("json_data", raw_taxi_schema).alias("data"),
    "kafka_timestamp"
).select("kafka_key", "kafka_timestamp", "data.*")

In [13]:
raw_data_df.printSchema()

root
 |-- kafka_key: string (nullable = true)
 |-- kafka_timestamp: timestamp (nullable = true)
 |-- medallion: string (nullable = true)
 |-- hack_license: string (nullable = true)
 |-- vendor_id: string (nullable = true)
 |-- rate_code: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_time_in_secs: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)



In [14]:
raw_data_df.explain(True)

== Parsed Logical Plan ==
'Project ['kafka_key, 'kafka_timestamp, data.*]
+- Project [kafka_key#21, from_json(StructField(medallion,StringType,true), StructField(hack_license,StringType,true), StructField(vendor_id,StringType,true), StructField(rate_code,StringType,true), StructField(store_and_fwd_flag,StringType,true), StructField(pickup_datetime,TimestampType,true), StructField(dropoff_datetime,TimestampType,true), StructField(passenger_count,IntegerType,true), StructField(trip_time_in_secs,IntegerType,true), StructField(trip_distance,DoubleType,true), StructField(pickup_longitude,DoubleType,true), StructField(pickup_latitude,DoubleType,true), StructField(dropoff_longitude,DoubleType,true), StructField(dropoff_latitude,DoubleType,true), json_data#22, Some(UTC)) AS data#27, kafka_timestamp#23]
   +- Project [cast(key#7 as string) AS kafka_key#21, cast(value#8 as string) AS json_data#22, timestamp#12 AS kafka_timestamp#23]
      +- StreamingRelationV2 org.apache.spark.sql.kafka010.Kafk

In [15]:
# checking stream retrieval of data
# raw_stream.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)").writeStream.format("console").start()

```bash
big_data_project_pyspark  | Batch: 0
big_data_project_pyspark  | -------------------------------------------
big_data_project_pyspark  | +----+--------------------+
big_data_project_pyspark  | | key|               value|
big_data_project_pyspark  | +----+--------------------+
big_data_project_pyspark  | |NULL|{"medallion":"341...|
big_data_project_pyspark  | |NULL|{"medallion":"6D3...|
big_data_project_pyspark  | |NULL|{"medallion":"6D4...|
big_data_project_pyspark  | |NULL|{"medallion":"4C4...|
big_data_project_pyspark  | |NULL|{"medallion":"125...|
big_data_project_pyspark  | |NULL|{"medallion":"3B0...|
big_data_project_pyspark  | |NULL|{"medallion":"1A5...|
big_data_project_pyspark  | |NULL|{"medallion":"A30...|
big_data_project_pyspark  | |NULL|{"medallion":"EF7...|
big_data_project_pyspark  | |NULL|{"medallion":"C64...|
big_data_project_pyspark  | |NULL|{"medallion":"820...|
big_data_project_pyspark  | |NULL|{"medallion":"820...|
big_data_project_pyspark  | |NULL|{"medallion":"C4C...|
big_data_project_pyspark  | |NULL|{"medallion":"6B8...|
big_data_project_pyspark  | |NULL|{"medallion":"EB2...|
big_data_project_pyspark  | |NULL|{"medallion":"EB2...|
big_data_project_pyspark  | |NULL|{"medallion":"6D4...|
big_data_project_pyspark  | |NULL|{"medallion":"AA5...|
big_data_project_pyspark  | |NULL|{"medallion":"733...|
big_data_project_pyspark  | |NULL|{"medallion":"733...|
big_data_project_pyspark  | +----+--------------------+
big_data_project_pyspark  | only showing top 20 rows

```

In [21]:
for query in spark.streams.active:
    print(f"Query name: {query.name}")
    print(f"Status: {query.status}")
    print(f"Is active: {query.isActive}")

Query name: raw_taxi_query
Status: {'message': 'Waiting for next trigger', 'isDataAvailable': False, 'isTriggerActive': False}
Is active: True


### Checkpoint

In [17]:
import time
import os
import uuid

table_name = "raw_taxi_data"
warehouse_dir = spark.conf.get('spark.sql.warehouse.dir').replace('file:', '')
output_path = os.path.join(warehouse_dir, table_name)
checkpoint_path = "streaming/taxis/_checkpoint" 

# output_path = f"{table_name}" # XXX: check this issue later. we already point the store of delta tables to the spark-warehouse dir in the config of spark

raw_query = (raw_data_df.writeStream
  .outputMode("append")
  .format("delta")
  .queryName("raw_taxi_query")
  .trigger(processingTime="10 second")
  .option("checkpointLocation", checkpoint_path)
  .start(output_path) 
)

def create_table_if_exists(output_path, table_name):
    data_exists = False
    for _i in range(60):
        try:
            time.sleep(1)
            files = os.listdir(output_path)
            for _f in files:
                if ".parquet" in _f:
                    if len(os.listdir(f"{output_path}/_delta_log")) > 0:
                        print("data exists")
                        data_exists = True
                        break
            if data_exists:
                spark.sql(f"DROP TABLE IF EXISTS {table_name}")
                spark.sql(f"CREATE TABLE IF NOT EXISTS {table_name} USING DELTA")
                break
        except Exception as e:
            print(e)
            pass

create_table_if_exists(output_path,table_name)

[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/raw_taxi_data'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/raw_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/raw_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/raw_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/raw_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/raw_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/raw_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/raw_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/raw_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/raw_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/raw_taxi_data/

In [18]:
print(raw_query.status)
print(f"Is active: {raw_query.isActive}")
print(f"Recent progress: {raw_query.recentProgress}")

{'message': 'Processing new data', 'isDataAvailable': True, 'isTriggerActive': True}
Is active: True
Recent progress: []


In [36]:
spark.sql(f"DROP TABLE IF EXISTS {table_name}")
spark.sql(f"CREATE TABLE IF NOT EXISTS {table_name} USING DELTA LOCATION '{output_path}'")

DataFrame[]

In [37]:
spark.sql("SHOW TABLES").show()

+---------+-------------+-----------+
|namespace|    tableName|isTemporary|
+---------+-------------+-----------+
|  default|raw_taxi_data|      false|
+---------+-------------+-----------+



In [39]:
spark.sql("DESCRIBE EXTENDED raw_taxi_data;").show()

+--------------------+--------------------+-------+
|            col_name|           data_type|comment|
+--------------------+--------------------+-------+
|           kafka_key|              string|   NULL|
|     kafka_timestamp|           timestamp|   NULL|
|           medallion|              string|   NULL|
|        hack_license|              string|   NULL|
|           vendor_id|              string|   NULL|
|           rate_code|              string|   NULL|
|  store_and_fwd_flag|              string|   NULL|
|     pickup_datetime|           timestamp|   NULL|
|    dropoff_datetime|           timestamp|   NULL|
|     passenger_count|                 int|   NULL|
|   trip_time_in_secs|                 int|   NULL|
|       trip_distance|              double|   NULL|
|    pickup_longitude|              double|   NULL|
|     pickup_latitude|              double|   NULL|
|   dropoff_longitude|              double|   NULL|
|    dropoff_latitude|              double|   NULL|
|           

In [None]:
#import os
#print(os.listdir(output_path))
#print("Delta log:", os.listdir(f"{output_path}/_delta_log") if os.path.exists(f"{output_path}/_delta_log") else "No delta log")

In [40]:
import json
with open(f"{output_path}/_delta_log/00000000000000000000.json", "r") as f:
    delta_log = [json.loads(line) for line in f]
print("Delta log entries:", len(delta_log))
print("First few log entries:")
for entry in delta_log[:3]:
    print(entry)

Delta log entries: 5
First few log entries:
{'commitInfo': {'timestamp': 1743097340555, 'operation': 'STREAMING UPDATE', 'operationParameters': {'outputMode': 'Append', 'queryId': '0bf902da-0d45-4099-8b1b-8121a9bb7f49', 'epochId': '0'}, 'isolationLevel': 'Serializable', 'isBlindAppend': True, 'operationMetrics': {'numRemovedFiles': '0', 'numOutputRows': '25194218', 'numOutputBytes': '1085501949', 'numAddedFiles': '1'}, 'engineInfo': 'Apache-Spark/3.5.3 Delta-Lake/3.3.0', 'txnId': 'a3c503d0-e07a-4342-9b1b-ffa3d5ce1724'}}
{'metaData': {'id': 'ebc28110-31a2-4892-a9bc-14be5daced42', 'format': {'provider': 'parquet', 'options': {}}, 'schemaString': '{"type":"struct","fields":[{"name":"kafka_key","type":"string","nullable":true,"metadata":{}},{"name":"kafka_timestamp","type":"timestamp","nullable":true,"metadata":{}},{"name":"medallion","type":"string","nullable":true,"metadata":{}},{"name":"hack_license","type":"string","nullable":true,"metadata":{}},{"name":"vendor_id","type":"string","nul

In [None]:
# debugging parquet file
#parquet_file = "spark-warehouse/raw_taxi_data/part-00000-f0351f15-e91c-457a-8015-e1150ef04ff4-c000.snappy.parquet"
#parquet_df = spark.read.parquet(parquet_file)
#print(f"Number of records in parquet file: {parquet_df.count()}")
#display(parquet_df.limit(5))

In [None]:
# display(parquet_df.describe().show())

In [41]:
raw_df_batch = spark.read.format("delta").table("default.raw_taxi_data")

In [42]:
raw_df_batch.explain(True)

== Parsed Logical Plan ==
'UnresolvedRelation [default, raw_taxi_data], [], false

== Analyzed Logical Plan ==
kafka_key: string, kafka_timestamp: timestamp, medallion: string, hack_license: string, vendor_id: string, rate_code: string, store_and_fwd_flag: string, pickup_datetime: timestamp, dropoff_datetime: timestamp, passenger_count: int, trip_time_in_secs: int, trip_distance: double, pickup_longitude: double, pickup_latitude: double, dropoff_longitude: double, dropoff_latitude: double
SubqueryAlias spark_catalog.default.raw_taxi_data
+- Relation spark_catalog.default.raw_taxi_data[kafka_key#2964,kafka_timestamp#2965,medallion#2966,hack_license#2967,vendor_id#2968,rate_code#2969,store_and_fwd_flag#2970,pickup_datetime#2971,dropoff_datetime#2972,passenger_count#2973,trip_time_in_secs#2974,trip_distance#2975,pickup_longitude#2976,pickup_latitude#2977,dropoff_longitude#2978,dropoff_latitude#2979] parquet

== Optimized Logical Plan ==
Relation spark_catalog.default.raw_taxi_data[kafka_k

In [None]:
# desc_stats = delta_df.describe()

# stats
# stats = desc_stats.toPandas()

In [None]:
# stats

## Data Clean-up

In [43]:
spark.sql("SHOW TABLES").show()

+---------+-------------+-----------+
|namespace|    tableName|isTemporary|
+---------+-------------+-----------+
|  default|raw_taxi_data|      false|
+---------+-------------+-----------+



In [44]:
spark.sql(f"DESCRIBE TABLE raw_taxi_data").show()

+------------------+---------+-------+
|          col_name|data_type|comment|
+------------------+---------+-------+
|         kafka_key|   string|   NULL|
|   kafka_timestamp|timestamp|   NULL|
|         medallion|   string|   NULL|
|      hack_license|   string|   NULL|
|         vendor_id|   string|   NULL|
|         rate_code|   string|   NULL|
|store_and_fwd_flag|   string|   NULL|
|   pickup_datetime|timestamp|   NULL|
|  dropoff_datetime|timestamp|   NULL|
|   passenger_count|      int|   NULL|
| trip_time_in_secs|      int|   NULL|
|     trip_distance|   double|   NULL|
|  pickup_longitude|   double|   NULL|
|   pickup_latitude|   double|   NULL|
| dropoff_longitude|   double|   NULL|
|  dropoff_latitude|   double|   NULL|
+------------------+---------+-------+



In [45]:
print(f"Spark warehouse dir: {spark.conf.get('spark.sql.warehouse.dir')}")



Spark warehouse dir: file:/home/jovyan/spark-warehouse


In [46]:
spark.sql(f"SELECT * FROM {table_name} LIMIT 5").show()

+---------+--------------------+--------------------+--------------------+---------+---------+------------------+-------------------+-------------------+---------------+-----------------+-------------+----------------+---------------+-----------------+----------------+
|kafka_key|     kafka_timestamp|           medallion|        hack_license|vendor_id|rate_code|store_and_fwd_flag|    pickup_datetime|   dropoff_datetime|passenger_count|trip_time_in_secs|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|
+---------+--------------------+--------------------+--------------------+---------+---------+------------------+-------------------+-------------------+---------------+-----------------+-------------+----------------+---------------+-----------------+----------------+
|     NULL|2025-03-27 17:12:...|3418135604CD3F357...|B25386A1F259C8744...|      VTS|        1|              NULL|2013-08-30 07:57:00|2013-08-30 08:30:00|              5|             1980|   

In [47]:
from pyspark.sql.functions import col, when, isnull, isnan, count, lit, expr

# cleaned_data paths
clean_table_name = "clean_taxi_data"

checkpoint_path = "streaming/taxis/_checkpoint" 
clean_checkpoint_path = "streaming/taxis/_checkpoint_clean" 
clean_output_path = os.path.join(warehouse_dir, clean_table_name)

# Define cleaning logic to apply to the stream
def clean_taxi_data(df):
    return df \
        .filter(col("medallion").isNotNull()) \
        .filter(col("trip_distance") > 0) \
        .filter(col("passenger_count") > 0) \
        .filter(col("trip_time_in_secs") > 0) \
        .filter(
            (col("pickup_longitude") != 0) & 
            (col("pickup_latitude") != 0) & 
            (col("dropoff_longitude") != 0) & 
            (col("dropoff_latitude") != 0)
        ) \
        .filter(
            (col("pickup_longitude") >= -180) & 
            (col("pickup_longitude") <= 180) & 
            (col("pickup_latitude") >= -90) & 
            (col("pickup_latitude") <= 90) &
            (col("dropoff_longitude") >= -180) & 
            (col("dropoff_longitude") <= 180) &
            (col("dropoff_latitude") >= -90) & 
            (col("dropoff_latitude") <= 90)
        ) \
        .withColumn(
            "trip_speed_mph", 
            when(col("trip_time_in_secs") > 0, 
                 (col("trip_distance") / (col("trip_time_in_secs") / 3600))
            ).otherwise(0)
        ) \
        .withColumn(
            "is_valid_trip", 
            (col("trip_distance") > 0) & 
            (col("trip_time_in_secs") > 0) &
            (col("trip_speed_mph") < 100)  # Filter unrealistic speeds
        )

# Set up the streaming query from the raw table to the clean table
raw_df = spark.readStream.format("delta").load(output_path)

# Apply the cleaning transformations
clean_df = clean_taxi_data(raw_df)

# Start the streaming query to the clean table
clean_query = (clean_df.writeStream
    .outputMode("append")
    .format("delta")
    .queryName("clean_taxi_query")
    .trigger(processingTime="10 second")
    .option("mergeSchema", "true")
    .option("checkpointLocation", clean_checkpoint_path)
    .start(clean_output_path)
)

# Wait for some data to be processed
import time
print("Waiting for streaming query to process data...")
time.sleep(30)  # Wait 30 seconds to allow some data to be processed

# Check if clean data exists and create table
create_table_if_exists(clean_output_path, clean_table_name)

Waiting for streaming query to process data...
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Er

In [54]:
# Check if clean data exists and create table
create_table_if_exists(clean_output_path, clean_table_name)

[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'


KeyboardInterrupt: 

In [56]:
spark

ConnectionRefusedError: [Errno 111] Connection refused

<pyspark.sql.session.SparkSession at 0xffff4151dfd0>

In [51]:
print(clean_query.status)
print(f"Is active: {clean_query.isActive}")
print(f"Recent progress: {clean_query.recentProgress}")

ConnectionRefusedError: [Errno 111] Connection refused