# Data Ingestion

Before proceeding we need to ensure kafka has the topic we will use.
`raw-taxi-data`.
For the partitions, we are using the `medallion` as the partition key ensuring the related records go to the same partition.

We will produce the data using the csv as the datasource and then we read the stream from kafka, load into dataframe and start doing the EDA.


In [1]:
from pyspark.sql import SparkSession
from delta import *
from delta.tables import *
from pyspark.sql.functions import col, to_json, struct, lit, current_timestamp, expr
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    DoubleType,
    TimestampType,
    IntegerType,
)
import pandas as pd

In [2]:
builder = SparkSession.builder.appName("NYC Taxi Data Kafka ETL") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.3") \
    .config("spark.sql.session.timeZone", "UTC") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()

# noise reduction
spark.sparkContext.setLogLevel("WARN")

In [3]:
raw_taxi_schema = StructType(
    [
        StructField("medallion", StringType(), True),
        StructField("hack_license", StringType(), True),
        StructField("vendor_id", StringType(), True),
        StructField("rate_code", StringType(), True),
        StructField("store_and_fwd_flag", StringType(), True),
        StructField("pickup_datetime", TimestampType(), True),
        StructField("dropoff_datetime", TimestampType(), True),
        StructField("passenger_count", IntegerType(), True),
        StructField("trip_time_in_secs", IntegerType(), True),
        StructField("trip_distance", DoubleType(), True),
        StructField("pickup_longitude", DoubleType(), True),
        StructField("pickup_latitude", DoubleType(), True),
        StructField("dropoff_longitude", DoubleType(), True),
        StructField("dropoff_latitude", DoubleType(), True),
    ]
)


In [4]:
for query in spark.streams.active:
    print(f"Query name: {query.name}")
    print(f"Status: {query.status}")
    print(f"Is active: {query.isActive}")

### Checkpoint

In [5]:
import time
import os
import uuid

def create_table_if_exists(output_path, table_name):
    data_exists = False
    for _i in range(60):
        try:
            time.sleep(1)
            files = os.listdir(output_path)
            for _f in files:
                if ".parquet" in _f:
                    if len(os.listdir(f"{output_path}/_delta_log")) > 0:
                        print("data exists")
                        data_exists = True
                        break
            if data_exists:
                spark.sql(f"DROP TABLE IF EXISTS {table_name}")
                spark.sql(f"CREATE TABLE IF NOT EXISTS {table_name} USING DELTA")
                break
        except Exception as e:
            print(e)
            pass

In [8]:
spark.sql("SHOW TABLES").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [7]:
raw_df_batch = spark.read.format("delta").table("raw_taxi_data")

AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `raw_taxi_data` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.;
'UnresolvedRelation [raw_taxi_data], [], false


In [42]:
raw_df_batch.explain(True)

== Parsed Logical Plan ==
'UnresolvedRelation [default, raw_taxi_data], [], false

== Analyzed Logical Plan ==
kafka_key: string, kafka_timestamp: timestamp, medallion: string, hack_license: string, vendor_id: string, rate_code: string, store_and_fwd_flag: string, pickup_datetime: timestamp, dropoff_datetime: timestamp, passenger_count: int, trip_time_in_secs: int, trip_distance: double, pickup_longitude: double, pickup_latitude: double, dropoff_longitude: double, dropoff_latitude: double
SubqueryAlias spark_catalog.default.raw_taxi_data
+- Relation spark_catalog.default.raw_taxi_data[kafka_key#2964,kafka_timestamp#2965,medallion#2966,hack_license#2967,vendor_id#2968,rate_code#2969,store_and_fwd_flag#2970,pickup_datetime#2971,dropoff_datetime#2972,passenger_count#2973,trip_time_in_secs#2974,trip_distance#2975,pickup_longitude#2976,pickup_latitude#2977,dropoff_longitude#2978,dropoff_latitude#2979] parquet

== Optimized Logical Plan ==
Relation spark_catalog.default.raw_taxi_data[kafka_k

In [None]:
# desc_stats = delta_df.describe()

# stats
# stats = desc_stats.toPandas()

In [None]:
# stats

## Data Clean-up

In [43]:
spark.sql("SHOW TABLES").show()

+---------+-------------+-----------+
|namespace|    tableName|isTemporary|
+---------+-------------+-----------+
|  default|raw_taxi_data|      false|
+---------+-------------+-----------+



In [44]:
spark.sql(f"DESCRIBE TABLE raw_taxi_data").show()

+------------------+---------+-------+
|          col_name|data_type|comment|
+------------------+---------+-------+
|         kafka_key|   string|   NULL|
|   kafka_timestamp|timestamp|   NULL|
|         medallion|   string|   NULL|
|      hack_license|   string|   NULL|
|         vendor_id|   string|   NULL|
|         rate_code|   string|   NULL|
|store_and_fwd_flag|   string|   NULL|
|   pickup_datetime|timestamp|   NULL|
|  dropoff_datetime|timestamp|   NULL|
|   passenger_count|      int|   NULL|
| trip_time_in_secs|      int|   NULL|
|     trip_distance|   double|   NULL|
|  pickup_longitude|   double|   NULL|
|   pickup_latitude|   double|   NULL|
| dropoff_longitude|   double|   NULL|
|  dropoff_latitude|   double|   NULL|
+------------------+---------+-------+



In [45]:
print(f"Spark warehouse dir: {spark.conf.get('spark.sql.warehouse.dir')}")



Spark warehouse dir: file:/home/jovyan/spark-warehouse


In [46]:
spark.sql(f"SELECT * FROM {table_name} LIMIT 5").show()

+---------+--------------------+--------------------+--------------------+---------+---------+------------------+-------------------+-------------------+---------------+-----------------+-------------+----------------+---------------+-----------------+----------------+
|kafka_key|     kafka_timestamp|           medallion|        hack_license|vendor_id|rate_code|store_and_fwd_flag|    pickup_datetime|   dropoff_datetime|passenger_count|trip_time_in_secs|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|
+---------+--------------------+--------------------+--------------------+---------+---------+------------------+-------------------+-------------------+---------------+-----------------+-------------+----------------+---------------+-----------------+----------------+
|     NULL|2025-03-27 17:12:...|3418135604CD3F357...|B25386A1F259C8744...|      VTS|        1|              NULL|2013-08-30 07:57:00|2013-08-30 08:30:00|              5|             1980|   

In [47]:
from pyspark.sql.functions import col, when, isnull, isnan, count, lit, expr

# cleaned_data paths
clean_table_name = "clean_taxi_data"

checkpoint_path = "streaming/taxis/_checkpoint" 
clean_checkpoint_path = "streaming/taxis/_checkpoint_clean" 
clean_output_path = os.path.join(warehouse_dir, clean_table_name)

# Define cleaning logic to apply to the stream
def clean_taxi_data(df):
    return df \
        .filter(col("medallion").isNotNull()) \
        .filter(col("trip_distance") > 0) \
        .filter(col("passenger_count") > 0) \
        .filter(col("trip_time_in_secs") > 0) \
        .filter(
            (col("pickup_longitude") != 0) & 
            (col("pickup_latitude") != 0) & 
            (col("dropoff_longitude") != 0) & 
            (col("dropoff_latitude") != 0)
        ) \
        .filter(
            (col("pickup_longitude") >= -180) & 
            (col("pickup_longitude") <= 180) & 
            (col("pickup_latitude") >= -90) & 
            (col("pickup_latitude") <= 90) &
            (col("dropoff_longitude") >= -180) & 
            (col("dropoff_longitude") <= 180) &
            (col("dropoff_latitude") >= -90) & 
            (col("dropoff_latitude") <= 90)
        ) \
        .withColumn(
            "trip_speed_mph", 
            when(col("trip_time_in_secs") > 0, 
                 (col("trip_distance") / (col("trip_time_in_secs") / 3600))
            ).otherwise(0)
        ) \
        .withColumn(
            "is_valid_trip", 
            (col("trip_distance") > 0) & 
            (col("trip_time_in_secs") > 0) &
            (col("trip_speed_mph") < 100)  # Filter unrealistic speeds
        )

# Set up the streaming query from the raw table to the clean table
raw_df = spark.readStream.format("delta").load(output_path)

# Apply the cleaning transformations
clean_df = clean_taxi_data(raw_df)

# Start the streaming query to the clean table
clean_query = (clean_df.writeStream
    .outputMode("append")
    .format("delta")
    .queryName("clean_taxi_query")
    .trigger(processingTime="10 second")
    .option("mergeSchema", "true")
    .option("checkpointLocation", clean_checkpoint_path)
    .start(clean_output_path)
)

# Wait for some data to be processed
import time
print("Waiting for streaming query to process data...")
time.sleep(30)  # Wait 30 seconds to allow some data to be processed

# Check if clean data exists and create table
create_table_if_exists(clean_output_path, clean_table_name)

Waiting for streaming query to process data...
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Er

In [54]:
# Check if clean data exists and create table
create_table_if_exists(clean_output_path, clean_table_name)

[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'
[Errno 2] No such file or directory: '/home/jovyan/spark-warehouse/clean_taxi_data/_delta_log'


KeyboardInterrupt: 

In [56]:
spark

ConnectionRefusedError: [Errno 111] Connection refused

<pyspark.sql.session.SparkSession at 0xffff4151dfd0>

In [51]:
print(clean_query.status)
print(f"Is active: {clean_query.isActive}")
print(f"Recent progress: {clean_query.recentProgress}")

ConnectionRefusedError: [Errno 111] Connection refused