In [1]:
from pyspark.sql import SparkSession

# Kafka broker address used by the cells below.
broker = "localhost:9092"

# Kafka topic names.
topic_real_time_states = "real-time-states"
topic_raw_json = "raw_json"
topic_flat_json = "flat_json"
topic_test = "topic_test"


In [2]:
import os

# Request the Spark SQL Kafka connector package for the PySpark shell.
# This is set ahead of the SparkSession creation in the following cell.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.1 "
    "pyspark-shell"
)

In [3]:
# Create (or reuse) the Spark session used by every later cell.
spark = (
    SparkSession.builder
    .appName("StructuredRealTimeState")
    .getOrCreate()
)

In [4]:
from pyspark.sql import types as T
from pyspark.sql import functions as F
import json

In [5]:
# Schema applied by from_json to the Kafka message value in the read cell.
# Every field is nullable (the StructField default, same as StructType.add).
schema = T.StructType([
    T.StructField("time", T.IntegerType()),
    T.StructField("icao24", T.StringType()),
    T.StructField("callsign", T.StringType()),
    T.StructField("last_contact", T.IntegerType()),
    T.StructField("longitude", T.FloatType()),
    T.StructField("latitude", T.FloatType()),
    T.StructField("baro_altitude", T.FloatType()),
    T.StructField("on_ground", T.IntegerType()),
    T.StructField("velocity", T.FloatType()),
    T.StructField("geo_altitude", T.FloatType()),
    T.StructField("squawk", T.StringType()),
    T.StructField("position_source", T.IntegerType()),
])



In [8]:
# Batch-read the test topic from its earliest offsets, cast each message value
# to a string and parse it as JSON with `schema`. The parsed struct is kept in
# a single column named "value".
df = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", broker)
    .option("subscribe", topic_test)
    .option("startingOffsets", "earliest")
    .load()
    .select(
        F.from_json(F.col("value").cast("string"), schema).alias("value")
    )
)

#.selectExpr("CAST(value AS STRING)") 


In [9]:
# Preview the parsed struct column (defaults spelled out: 20 rows, truncated).
df.show(n=20, truncate=True)

+-----+
|value|
+-----+
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
+-----+



In [86]:
# Expand the fields of the "value" struct into top-level columns.
df1 = df.select(F.col("value.*"))

In [88]:
# Preview the flattened columns (defaults spelled out: 20 rows, truncated).
df1.show(n=20, truncate=True)

+----------+------+--------+------------+---------+--------+-------------+---------+--------+------------+------+---------------+
|      time|icao24|callsign|last_contact|longitude|latitude|baro_altitude|on_ground|velocity|geo_altitude|squawk|position_source|
+----------+------+--------+------------+---------+--------+-------------+---------+--------+------------+------+---------------+
|1576513660|ac96b8|  AAL115|  1576513660| -84.5568| 33.7335|      1996.44|        0|  109.06|     2080.26|  5671|              0|
|1576513660|ac96b8|  AAL115|  1576513660| -84.5568| 33.7335|      1996.44|        0|  109.06|     2080.26|  5671|              0|
|1576513660|ac96b8|  AAL115|  1576513660| -84.5568| 33.7335|      1996.44|        0|  109.06|     2080.26|  5671|              0|
|1576513660|ac96b8|  AAL115|  1576513660| -84.5568| 33.7335|      1996.44|        0|  109.06|     2080.26|  5671|              0|
|1576513660|ac96b8|  AAL115|  1576513660| -84.5568| 33.7335|      1996.44|        0|  109.

In [7]:
# Debug check: parse the raw Kafka message value with `schema` and show it.
# In notebook order `df.value` is already the parsed struct (see the read
# cell), so re-parsing it would only work after out-of-order execution. The
# raw topic is read here instead, which keeps the cell correct on a fresh
# kernel. The parsed column is left unaliased, as before.
(
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", broker)
    .option("subscribe", topic_test)
    .option("startingOffsets", "earliest")
    .load()
    .select(F.from_json(F.col("value").cast("string"), schema))
    .show()
)

+------------------------------------+
|jsontostructs(CAST(value AS STRING))|
+------------------------------------+
|                        [1576513660]|
|                        [1576513660]|
|                        [1576513660]|
|                        [1576513660]|
|                        [1576513660]|
+------------------------------------+



In [15]:
# Rebuild the flattened view: one top-level column per field of "value".
df1 = df.select(F.col("value.*"))

In [16]:
# Show only the first flattened row.
df1.show(n=1)

+----+---------+
|time|longitude|
+----+---------+
|null|     null|
+----+---------+
only showing top 1 row



In [None]:
# Alternative schema: `time` stays an integer and `longitude` a float, while
# every other field is read as a string. Running this cell rebinds `schema`,
# replacing the definition from the earlier schema cell.
schema = T.StructType([
    T.StructField(field_name, field_type)
    for field_name, field_type in (
        ("time", T.IntegerType()),
        ("icao24", T.StringType()),
        ("callsign", T.StringType()),
        ("last_contact", T.StringType()),
        ("longitude", T.FloatType()),
        ("latitude", T.StringType()),
        ("baro_altitude", T.StringType()),
        ("on_ground", T.StringType()),
        ("velocity", T.StringType()),
        ("geo_altitude", T.StringType()),
        ("squawk", T.StringType()),
        ("position_source", T.StringType()),
    )
])

In [11]:
# Delete the scratch topic `topic_test`. The captured output below shows that
# kafka-topics reports an error when the topic does not exist on this ZK path.
!kafka-topics --zookeeper localhost:2181/kafka --delete --topic topic_test
    
# List the topics registered on ZK path localhost:2181/kafka.
!kafka-topics --zookeeper localhost:2181/kafka --list
    

Final CLASSPATH is /etc/hadoop/conf:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop/lib/kerb-client-1.0.0.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop/lib/kerb-util-1.0.0.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop/lib/json-smart-2.3.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop/lib/jsr311-api-1.1.1.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop/lib/jsr305-3.0.0.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop/lib/jetty-xml-9.3.20.v20170531.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop/lib/snappy-java-1.1.4.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop/lib/junit-4.11.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop/

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/jars/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/kafka/libs/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
19/12/16 21:33:23 INFO zkclient.ZkEventThread: Starting ZkClient event thread.
19/12/16 21:33:23 INFO zookeeper.ZooKeeper: Client environment:zookeeper.version=3.4.5-cdh6.0.0--1, built on 08/17/2018 22:35 GMT
19/12/16 21:33:23 INFO zookeeper.ZooKeeper: Client environment:host.name=cnt7-naya-cdh6.org
19/12/16 21:33:23 INFO zookeeper.ZooKeeper: Client environment:java.version=1.8.0_141
19/12/16 21:33:23 INFO zookeeper.ZooKeeper: Client environment:java.vendor=Oracle C

Error while executing topic command : Topic topic_test does not exist on ZK path localhost:2181/kafka
19/12/16 21:33:23 ERROR admin.TopicCommand$: java.lang.IllegalArgumentException: Topic topic_test does not exist on ZK path localhost:2181/kafka
	at kafka.admin.TopicCommand$.deleteTopic(TopicCommand.scala:180)
	at kafka.admin.TopicCommand$.main(TopicCommand.scala:71)
	at kafka.admin.TopicCommand.main(TopicCommand.scala)

19/12/16 21:33:23 INFO zkclient.ZkEventThread: Terminate ZkClient event thread.
19/12/16 21:33:23 INFO zookeeper.ZooKeeper: Session: 0x16f0d9c347708ac closed
19/12/16 21:33:23 INFO zookeeper.ClientCnxn: EventThread shut down
Final CLASSPATH is /etc/hadoop/conf:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop/lib/kerb-client-1.0.0.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop/lib/kerb-util-1.0.0.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop/lib/json

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/jars/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/kafka/libs/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
19/12/16 21:33:25 INFO zkclient.ZkEventThread: Starting ZkClient event thread.
19/12/16 21:33:25 INFO zookeeper.ZooKeeper: Client environment:zookeeper.version=3.4.5-cdh6.0.0--1, built on 08/17/2018 22:35 GMT
19/12/16 21:33:25 INFO zookeeper.ZooKeeper: Client environment:host.name=cnt7-naya-cdh6.org
19/12/16 21:33:25 INFO zookeeper.ZooKeeper: Client environment:java.version=1.8.0_141
19/12/16 21:33:25 INFO zookeeper.ZooKeeper: Client environment:java.vendor=Oracle C

19/12/16 21:33:25 INFO zookeeper.ClientCnxn: Session establishment complete on server cnt7-naya-cdh6.org/127.0.0.1:2181, sessionid = 0x16f0d9c347708ad, negotiated timeout = 30000
19/12/16 21:33:25 INFO zkclient.ZkClient: zookeeper state changed (SyncConnected)
__consumer_offsets
flat_json
kafka-tst-01
kafka-tst-02
kafka-tst-03
raw_json
real-time-states
topic1
19/12/16 21:33:25 INFO zkclient.ZkEventThread: Terminate ZkClient event thread.
19/12/16 21:33:25 INFO zookeeper.ZooKeeper: Session: 0x16f0d9c347708ad closed
19/12/16 21:33:25 INFO zookeeper.ClientCnxn: EventThread shut down


In [39]:
# List the topics registered on ZK path localhost:2181/kafka.
!kafka-topics --zookeeper localhost:2181/kafka --list

Final CLASSPATH is /etc/hadoop/conf:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop/lib/kerb-client-1.0.0.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop/lib/kerb-util-1.0.0.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop/lib/json-smart-2.3.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop/lib/jsr311-api-1.1.1.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop/lib/jsr305-3.0.0.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop/lib/jetty-xml-9.3.20.v20170531.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop/lib/snappy-java-1.1.4.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop/lib/junit-4.11.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop/

0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop-hdfs/lib/nimbus-jose-jwt-4.41.1.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop-hdfs/lib/jetty-servlet-9.3.20.v20170531.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop-hdfs/lib/commons-logging-1.1.3.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop-hdfs/lib/kerby-xdr-1.0.0.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop-hdfs/lib/okhttp-2.7.5.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop-hdfs/lib/jetty-util-ajax-9.3.20.v20170531.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop-hdfs/lib/httpclient-4.5.3.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/libexec/../../hadoop-hdfs/lib/jackson-core-asl-1.9.13.jar:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/hadoop/lib

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/jars/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/cloudera/parcels/CDH-6.0.0-1.cdh6.0.0.p0.537114/lib/kafka/libs/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
19/12/16 21:01:13 INFO zkclient.ZkEventThread: Starting ZkClient event thread.
19/12/16 21:01:13 INFO zookeeper.ZooKeeper: Client environment:zookeeper.version=3.4.5-cdh6.0.0--1, built on 08/17/2018 22:35 GMT
19/12/16 21:01:13 INFO zookeeper.ZooKeeper: Client environment:host.name=cnt7-naya-cdh6.org
19/12/16 21:01:13 INFO zookeeper.ZooKeeper: Client environment:java.version=1.8.0_141
19/12/16 21:01:13 INFO zookeeper.ZooKeeper: Client environment:java.vendor=Oracle C

19/12/16 21:01:13 INFO zkclient.ZkClient: Waiting for keeper state SyncConnected
19/12/16 21:01:13 INFO zookeeper.ClientCnxn: Opening socket connection to server cnt7-naya-cdh6.org/127.0.0.1:2181. Will not attempt to authenticate using SASL (unknown error)
19/12/16 21:01:13 INFO zookeeper.ClientCnxn: Socket connection established, initiating session, client: /127.0.0.1:35534, server: cnt7-naya-cdh6.org/127.0.0.1:2181
19/12/16 21:01:13 INFO zookeeper.ClientCnxn: Session establishment complete on server cnt7-naya-cdh6.org/127.0.0.1:2181, sessionid = 0x16f0d9c3477082b, negotiated timeout = 30000
19/12/16 21:01:13 INFO zkclient.ZkClient: zookeeper state changed (SyncConnected)
__consumer_offsets
flat_json
kafka-tst-01
kafka-tst-02
kafka-tst-03
raw_json
real-time-states
topic1
topic_test
19/12/16 21:01:13 INFO zkclient.ZkEventThread: Terminate ZkClient event thread.
19/12/16 21:01:13 INFO zookeeper.ZooKeeper: Session: 0x16f0d9c3477082b closed
19/12/16 21:01:13 INFO zookeeper