In [1]:
# Reach into the JVM behind the Spark context (`_jvm` is a private PySpark
# attribute) and look up the com.esri.gdb.FileGDB class so its methods can be
# called directly from Python in later cells.
jvm = spark.sparkContext._jvm
gdb = jvm.com.esri.gdb.FileGDB

In [2]:
# List the tables stored in Miami.gdb. Each entry prints as
# NameIndex(<table name>,<index>); the names are what the data source's
# `name` option takes when a table is loaded further down.
for tab in gdb.listTables("Miami.gdb"):
    print(tab)

NameIndex(MiamiExtent,9)
NameIndex(Voyage,10)
NameIndex(Broadcast,11)
NameIndex(Vessel,12)
NameIndex(BaseStations,13)
NameIndex(AttributeUnits,14)
NameIndex(Extent,15)


In [9]:
# Load the Broadcast table from Miami.gdb through the com.esri.gdb data source.
# The chain is wrapped in parentheses so no line-continuation backslashes are needed.
df = (
    spark.read
    .format('com.esri.gdb')
    .options(path='Miami.gdb', name='Broadcast')
    .load()
)

In [10]:
# Inspect the column names, types and nullability of the loaded Broadcast table.
# Note that Shape is a struct with x and y fields.
df.printSchema()

root
 |-- OBJECTID: integer (nullable = false)
 |-- Shape: struct (nullable = true)
 |    |-- x: double (nullable = true)
 |    |-- y: double (nullable = true)
 |-- SOG: integer (nullable = true)
 |-- COG: integer (nullable = true)
 |-- Heading: integer (nullable = true)
 |-- ROT: integer (nullable = true)
 |-- BaseDateTime: timestamp (nullable = true)
 |-- Status: integer (nullable = true)
 |-- VoyageID: integer (nullable = true)
 |-- MMSI: integer (nullable = true)
 |-- ReceiverType: string (nullable = true)
 |-- ReceiverID: string (nullable = true)



In [11]:
# Number of rows in the Broadcast table.
df.count()

1365578

In [12]:
# Preview the first rows; show() truncates long cell values (see the Shape column).
df.show()

+--------+--------------------+---+---+-------+---+-------------------+------+--------+---------+------------+----------+
|OBJECTID|               Shape|SOG|COG|Heading|ROT|       BaseDateTime|Status|VoyageID|     MMSI|ReceiverType|ReceiverID|
+--------+--------------------+---+---+-------+---+-------------------+------+--------+---------+------------+----------+
|      11|[-80.161767, 25.7...|  0| 77|    511|128|2008-12-31 18:59:00|     0|      11|366883280|           b| 003669972|
|      24|[-80.142313, 25.7...|  0|253|    329|  0|2008-12-31 18:59:00|     0|      24|319930000|           r|   07DMIA1|
|      28|[-80.015947, 25.6...|  8|184|    185|  0|2008-12-31 18:59:00|     0|      28|366098000|           r|   07DMIA2|
|      31|[-80.155533, 25.7...|  0|333|    511|128|2008-12-31 18:59:00|     0|      31|367302470|           r|   07DMIA2|
|      33|[-80.246237, 25.7...|  0|347|    309|  0|2008-12-31 18:59:00|     5|      33|351291000|           r|   07DMIA2|
|      44|[-79.9608, 25.

In [13]:
# Expose the Broadcast DataFrame to Spark SQL under the name "bc".
# createOrReplaceTempView replaces registerTempTable, which has been deprecated
# since Spark 2.0; the resulting session-scoped view is the same.
df.createOrReplaceTempView("bc")

In [14]:
# For each MMSI, pair every broadcast (dt1, x1, y1) with the next one in
# BaseDateTime order (dt2, x2, y2) using lead() over a per-MMSI window.
# The outer `where dt1 < dt2` drops the last broadcast of each MMSI (lead()
# yields NULL there) as well as pairs whose timestamps are equal.
#
# The query goes through `spark.sql` rather than the bare `sql` name, which is
# not defined in any visible cell of this notebook.
spark.sql("""
select mmsi,dt1,x1,y1,dt2,x2,y2 from (
select MMSI mmsi,
BaseDateTime as dt1,
Shape.x as x1,
Shape.y as y1,
lead(BaseDateTime) over (partition by MMSI order by BaseDateTime) as dt2,
lead(Shape.x) over (partition by MMSI order by BaseDateTime) as x2,
lead(Shape.y) over (partition by MMSI order by BaseDateTime) as y2
from bc
)
where dt1 < dt2
""").show(100, truncate=True)

+---------+--------------------+----------+------------------+--------------------+----------+------------------+
|     mmsi|                 dt1|        x1|                y1|                 dt2|        x2|                y2|
+---------+--------------------+----------+------------------+--------------------+----------+------------------+
|209570000| 2009-01-03 23:20:00|-79.983802|25.950670000000002|2009-01-03 23:21:...|-79.982727|25.946623000000002|
|209570000|2009-01-03 23:21:...|-79.982727|25.946623000000002| 2009-01-03 23:22:00|-79.981605|25.942572999999996|
|209570000| 2009-01-03 23:22:00|-79.981605|25.942572999999996| 2009-01-03 23:23:00|-79.980565|25.938595000000007|
|209570000| 2009-01-03 23:23:00|-79.980565|25.938595000000007| 2009-01-03 23:24:00| -79.97948|25.934588000000005|
|209570000| 2009-01-03 23:24:00| -79.97948|25.934588000000005|2009-01-03 23:25:...|-79.978423|25.930507000000006|
|209570000|2009-01-03 23:25:...|-79.978423|25.930507000000006| 2009-01-03 23:26:00| -79.