In [28]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("SparkStreamFlightData")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")
# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

flight_schema = StructType([
    StructField("lastUpdatedAt", StringType(), True),
    StructField("actualLandingTime", StringType(), True),
    StructField("actualOffBlockTime", StringType(), True),
    StructField("aircraftRegistration", StringType(), True),
    StructField("aircraftType", StructType([
        StructField("iataMain", StringType(), True),
        StructField("iataSub", StringType(), True)
    ]), True),
    StructField("baggageClaim", StructType([
        StructField("belts", ArrayType(StringType()), True)
    ]), True),
    StructField("checkinAllocations", StringType(), True),
    StructField("codeshares", StructType([
        StructField("codeshares", ArrayType(StringType()), True)
    ]), True),
    StructField("estimatedLandingTime", StringType(), True),
    StructField("expectedTimeBoarding", StringType(), True),
    StructField("expectedTimeGateClosing", StringType(), True),
    StructField("expectedTimeGateOpen", StringType(), True),
    StructField("expectedTimeOnBelt", StringType(), True),
    StructField("expectedSecurityFilter", StringType(), True),
    StructField("flightDirection", StringType(), True),
    StructField("flightName", StringType(), True),
    StructField("flightNumber", IntegerType(), True),
    StructField("gate", StringType(), True),
    StructField("pier", StringType(), True),
    StructField("id", StringType(), True),
    StructField("isOperationalFlight", BooleanType(), True),
    StructField("mainFlight", StringType(), True),
    StructField("prefixIATA", StringType(), True),
    StructField("prefixICAO", StringType(), True),
    StructField("airlineCode", IntegerType(), True),
    StructField("publicEstimatedOffBlockTime", StringType(), True),
    StructField("publicFlightState", StructType([
        StructField("flightStates", ArrayType(StringType()), True)
    ]), True),
    StructField("route", StructType([
        StructField("destinations", ArrayType(StringType()), True),
        StructField("eu", StringType(), True),
        StructField("visa", BooleanType(), True)
    ]), True),
    StructField("scheduleDateTime", StringType(), True),
    StructField("scheduleDate", StringType(), True),
    StructField("scheduleTime", StringType(), True),
    StructField("serviceType", StringType(), True),
    StructField("terminal", IntegerType(), True),
    StructField("transferPositions", StringType(), True),
    StructField("schemaVersion", StringType(), True)
])

# Read the whole dataset as a batch
kafkaStream = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka1:9092") \
    .option("failOnDataLoss", "false") \
    .option("subscribe", "flights") \
    .option("startingOffsets", "latest") \
    .load()

df = kafkaStream.selectExpr("CAST(value AS STRING)")

df1 = df.select(from_json(df.value, flight_schema.simpleString()))

df1.printSchema()

sdf = df1.select(col("from_json(value).*"))

sdf.printSchema()

query = sdf \
    .writeStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka1:9092") \
    .option("checkpointLocation", "/home/jovyan/checkpoint/flights") \
    .option("topic", "flights_results") \
    .outputMode("append") \
    .format("console") \
    .start()
try:
    query.awaitTermination()
except KeyboardInterrupt:
    query.stop()
    # Stop the spark context
    spark.stop()
    print("Stoped the streaming query and the spark context")

root
 |-- from_json(value): struct (nullable = true)
 |    |-- lastUpdatedAt: string (nullable = true)
 |    |-- actualLandingTime: string (nullable = true)
 |    |-- actualOffBlockTime: string (nullable = true)
 |    |-- aircraftRegistration: string (nullable = true)
 |    |-- aircraftType: struct (nullable = true)
 |    |    |-- iataMain: string (nullable = true)
 |    |    |-- iataSub: string (nullable = true)
 |    |-- baggageClaim: struct (nullable = true)
 |    |    |-- belts: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |-- checkinAllocations: string (nullable = true)
 |    |-- codeshares: struct (nullable = true)
 |    |    |-- codeshares: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |-- estimatedLandingTime: string (nullable = true)
 |    |-- expectedTimeBoarding: string (nullable = true)
 |    |-- expectedTimeGateClosing: string (nullable = true)
 |    |-- expectedTimeGateOpen: string (nulla

StreamingQueryException: [STREAM_FAILED] Query [id = 51a14f61-682c-49d6-a8dd-fd158ec2ffda, runId = 130ad0bc-592e-498c-a198-56f5e4a4fb35] terminated with exception: org.apache.kafka.common.errors.TimeoutException: Timed out waiting for a node assignment. Call: describeTopics