# In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, BooleanType, ArrayType

# Spark entry point for the whole job. getOrCreate() hands back an existing
# session when one is already active instead of building a second one.
spark = (
    SparkSession.builder
    .appName("FlightStreamAnalysis")
    .getOrCreate()
)

# Schema used by from_json to parse each flight record.
#
# Every field is nullable: from_json yields null for a key that is absent
# from a record, so a missing key never fails the stream.
# Timestamps are read as plain strings here and cast to "timestamp" by the
# individual analyses further down.
flight_schema = StructType([
    StructField("estimatedLandingTime", StringType(), True),
    StructField("expectedTimeBoarding", StringType(), True),
    StructField("expectedTimeGateClosing", StringType(), True),
    StructField("expectedTimeGateOpen", StringType(), True),
    StructField("expectedTimeOnBelt", StringType(), True),
    StructField("expectedSecurityFilter", StringType(), True),
    StructField("flightDirection", StringType(), True),
    StructField("flightName", StringType(), True),
    StructField("flightNumber", StringType(), True),
    StructField("gate", StringType(), True),
    StructField("pier", StringType(), True),
    StructField("id", StringType(), True),
    StructField("isOperationalFlight", BooleanType(), True),
    StructField("mainFlight", StringType(), True),
    StructField("prefixIATA", StringType(), True),
    StructField("prefixICAO", StringType(), True),
    StructField("airlineCode", StringType(), True),
    StructField("publicEstimatedOffBlockTime", StringType(), True),
    StructField("publicFlightState", StructType([
        StructField("flightStates", ArrayType(StringType()), True)
    ]), True),
    StructField("route", StructType([
        StructField("destinations", ArrayType(StringType()), True),
        StructField("eu", StringType(), True),
        StructField("visa", BooleanType(), True)
    ]), True),
    StructField("scheduleDateTime", StringType(), True),
    StructField("scheduleDate", StringType(), True),
    StructField("scheduleTime", StringType(), True),
    StructField("serviceType", StringType(), True),
    StructField("terminal", StringType(), True),
    StructField("transferPositions", StringType(), True),
    StructField("schemaVersion", StringType(), True),
    # "Actual" timestamps read by the delay analyses below. They must be
    # declared here: a column that is not part of the parsed struct cannot be
    # resolved, and Spark raises AnalysisException as soon as the analysis
    # DataFrame is built. Appended last so the position of every pre-existing
    # column is unchanged.
    # NOTE(review): assumes the producer emits these keys under exactly these
    # names - confirm against the upstream payload; absent keys parse as null.
    StructField("actualLandingTime", StringType(), True),
    StructField("actualBoardingTime", StringType(), True),
    StructField("actualTimeOnBelt", StringType(), True),
])

# Options handed to the Kafka source. Both values are placeholders and have
# to be replaced with a real broker list and topic before the job is run.
kafka_params = {
    "kafka.bootstrap.servers": "your_kafka_bootstrap_servers",
    "subscribe": "your_kafka_topic",
}

# Streaming DataFrame over the Kafka topic; only the record value is kept,
# cast to a string so it can be parsed as JSON afterwards.
flight_stream = (
    spark.readStream
    .format("kafka")
    .options(**kafka_params)
    .load()
    .selectExpr("CAST(value AS STRING)")
)

# Decode each JSON payload with flight_schema into a struct, then flatten
# that struct so every schema field becomes a top-level column.
flight_stream_parsed = (
    flight_stream
    .select(from_json(col("value"), flight_schema).alias("flight_data"))
    .select("flight_data.*")
)

# Arrival delay per flight: rows with flightDirection "A" only.
# Both landing times are cast to timestamps; arrivalDelay is the difference
# of their epoch seconds divided by 60, i.e. minutes (actual - estimated).
# NOTE: actualLandingTime has to be a column of flight_stream_parsed (hence a
# field of flight_schema), otherwise Spark cannot resolve it.
arrival_statistics = (
    flight_stream_parsed
    .where(col("flightDirection") == "A")
    .withColumn("estimatedLandingTime", col("estimatedLandingTime").cast("timestamp"))
    .withColumn("actualLandingTime", col("actualLandingTime").cast("timestamp"))
    .withColumn(
        "arrivalDelay",
        (
            col("actualLandingTime").cast("long")
            - col("estimatedLandingTime").cast("long")
        ) / 60,
    )
)

# Boarding delay per flight: rows with flightDirection "D" only.
# scheduledBoardingTime is derived from expectedTimeBoarding; boardingDelay
# is the difference in epoch seconds divided by 60, i.e. minutes
# (actual - scheduled).
# NOTE: actualBoardingTime has to be a column of flight_stream_parsed (hence
# a field of flight_schema), otherwise Spark cannot resolve it.
boarding_time_analysis = (
    flight_stream_parsed
    .where(col("flightDirection") == "D")
    .withColumn("scheduledBoardingTime", col("expectedTimeBoarding").cast("timestamp"))
    .withColumn("actualBoardingTime", col("actualBoardingTime").cast("timestamp"))
    .withColumn(
        "boardingDelay",
        (
            col("actualBoardingTime").cast("long")
            - col("scheduledBoardingTime").cast("long")
        ) / 60,
    )
)

# Number of flights with flightDirection "D" per (gate, pier) combination.
# This is a streaming aggregation and is not started by the query below.
gate_pier_utilization = (
    flight_stream_parsed
    .where(col("flightDirection") == "D")
    .groupBy("gate", "pier")
    .count()
)

# On-belt delay per flight, in minutes (actual - expected).
# Baggage-belt times apply to arriving flights, so the filter keeps
# flightDirection "A" (the arrival code used by arrival_statistics); the
# previous "D" filter selected departures, for which no belt time applies.
# NOTE(review): confirm against the upstream feed that expectedTimeOnBelt is
# populated only for arrivals.
# NOTE: actualTimeOnBelt has to be a column of flight_stream_parsed (hence a
# field of flight_schema), otherwise Spark cannot resolve it.
on_belt_time_analysis = (
    flight_stream_parsed
    .filter(col("flightDirection") == "A")
    .withColumn("expectedTimeOnBelt", col("expectedTimeOnBelt").cast("timestamp"))
    .withColumn("actualTimeOnBelt", col("actualTimeOnBelt").cast("timestamp"))
    .withColumn(
        "onBeltDelay",
        # Epoch-second difference converted to minutes.
        (
            col("actualTimeOnBelt").cast("long")
            - col("expectedTimeOnBelt").cast("long")
        ) / 60,
    )
)

# Row count per flightDirection value across the whole stream.
# This is a streaming aggregation and is not started by the query below.
flight_direction_distribution = (
    flight_stream_parsed
    .groupBy("flightDirection")
    .count()
)

# Launch the one streaming query of this script: arrival_statistics rows are
# printed to the console in append mode. The other analysis DataFrames
# defined above are not written anywhere here.
query = (
    arrival_statistics.writeStream
    .outputMode("append")
    .format("console")
    .start()
)

# Block the driver until the query stops or fails.
query.awaitTermination()
