In [20]:
import folium
from folium import Marker, GeoJson
from folium.plugins import HeatMap

import pandas as pd
import geopandas as gpd
from pyspark.sql import SparkSession
import os
from pyspark.sql.functions import col
from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer

from sedona.sql import st_constructors as stc

ImportError: cannot import name 'st_constructors' from 'sedona.sql' (/opt/mamba/lib/python3.10/site-packages/sedona/sql/__init__.py)

In [11]:
local=False
if local:
    spark=SparkSession.builder.master("local[4]") \
                  .appName("spark_ais_eda").getOrCreate()
else:
    spark=SparkSession.builder \
                      .master("k8s://https://kubernetes.default.svc:443") \
                      .appName("spark_ais_eda") \
                      .config("spark.kubernetes.container.image",os.environ["IMAGE_NAME"]) \
                      .config("spark.kubernetes.authenticate.driver.serviceAccountName",os.environ['KUBERNETES_SERVICE_ACCOUNT']) \
                      .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE']) \
                      .config("spark.executor.instances", "4") \
                      .config("spark.executor.memory","8g") \
                      .config("spark.driver.memory","16g") \
                      .config("spark.network.timeout","1000s") \
                      .config("spark.driver.maxResultsize","5g") \
                      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
                      .config("spark.kryo.registrator", "org.apache.sedona.core.serde.SedonaKryoRegistrator") \
                      .config('spark.jars.packages',
                              'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.2.1-incubating,'
                              'org.datasyslab:geotools-wrapper:1.1.0-25.2') \
                      .enableHiveSupport() \
                      .getOrCreate()

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/onyxia/.ivy2/cache
The jars for the packages stored in: /home/onyxia/.ivy2/jars
org.apache.sedona#sedona-python-adapter-3.0_2.12 added as a dependency
org.datasyslab#geotools-wrapper added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-b3c4e923-074b-4e63-90b3-7d80e500b973;1.0
	confs: [default]
	found org.apache.sedona#sedona-python-adapter-3.0_2.12;1.2.1-incubating in central
	found org.locationtech.jts#jts-core;1.18.2 in central
	found org.wololo#jts2geojson;0.16.1 in central
	found org.apache.sedona#sedona-core-3.0_2.12;1.2.1-incubating in central
	found org.scala-lang.modules#scala-collection-compat_2.12;2.5.0 in central
	found org.apache.sedona#sedona-sql-3.0_2.12;1.2.1-incubating in central
	found org.datasyslab#geotools-wrapper;1.1.0-25.2 in central
downloading https://repo1.maven.org/maven2/org/apache/sedona/sedona-python-adapter-3.0_2.12/1.2.1-incubating/sedona-python-adapter-3.0_2.12-1.2.1-incubating.jar ...
	[

# Sedona with pyspark

https://sedona.apache.org/setup/install-python/

- **spark.driver.memory**: tells Spark to allocate enough memory for the driver program because Sedona needs to build global grid files (global index) on the driver program. If you have a large amount of data (normally, over 100 GB), set this parameter to 2~5 GB will be good. Otherwise, you may observe "out of memory" error.

- **spark.network.timeout**: is the default timeout for all network interactions. Sometimes, spatial join query takes longer time to shuffle data. This will ensure Spark has enough patience to wait for the result.

- **spark.driver.maxResultSize**: is the limit of total size of serialized results of all partitions for each Spark action. Sometimes, the result size of spatial queries is large. The "Collect" operation may throw errors.

In [14]:
if local:
    root_path="../data"
else:
    root_path="s3a://pengfei/kaggle/geo_spatial/L07"
    
ais_path=f"{root_path}/AIS_2022_01_01.csv"

In [19]:
SedonaRegistrator.registerAll(spark)

True

# Use geopandas to explorer AIS

With only 1 file of 700 MO, Geopandas requires more than 16GO memory to load the data. And it take long time to do that. So Geopandas can't handle big data. We need to find an alternative

In [3]:
ais=gpd.read_file(ais_path)

In [4]:
ais.head()

Unnamed: 0,MMSI,BaseDateTime,LAT,LON,SOG,COG,Heading,VesselName,IMO,CallSign,VesselType,Status,Length,Width,Draft,Cargo,TransceiverClass,geometry
0,368084090,2022-01-01T00:00:00,29.93174,-89.99243,6.0,296.2,299.0,LARRY B WHIPPLE,,WDK7401,57,12.0,23,10,3.0,57.0,A,
1,368140160,2022-01-01T00:00:00,30.33475,-87.14429,0.0,312.0,87.0,TWISTED ANGEL,IMO0000000,WDL5339,36,,12,7,,,B,
2,366941830,2022-01-01T00:00:02,29.30919,-94.79702,0.0,180.2,511.0,SAN PATRICIO,,WCX6675,31,5.0,18,7,,57.0,A,
3,316005971,2022-01-01T00:00:06,46.50268,-84.35674,2.4,258.6,257.0,BEVERLY M I,IMO9084047,CFP2004,31,0.0,34,10,5.3,99.0,A,
4,316004054,2022-01-01T00:00:07,46.50326,-84.37506,0.3,61.9,511.0,ADANAC III,IMO8745333,VCLT,31,0.0,24,5,3.0,50.0,A,


In [5]:
ais.shape

(7239758, 18)

In [12]:
m_1 = folium.Map(location=[29.93, -89.99], tiles='cartodbpositron',zoom_start=11)

# Your code here: Visualize the collision data
for idx,row in ais.head(5).iterrows():
    Marker([row['LAT'], row['LON']]).add_to(m_1)

# Show the map
m_1

NameError: name 'embed_map' is not defined

# Use sedona to explorer AIS

1. Read raw csv file
2. Build geometry column(e.g. point, polygon, lines, etc.)
3. Convert to geo pandas dataframe

In [15]:

ships=spark.read.option("delimiter",",").option("header","true").csv(ais_path)

                                                                                

In [16]:
ships.show(5)

+---------+-------------------+--------+---------+---+-----+-------+---------------+----------+--------+----------+------+------+-----+-----+-----+----------------+
|     MMSI|       BaseDateTime|     LAT|      LON|SOG|  COG|Heading|     VesselName|       IMO|CallSign|VesselType|Status|Length|Width|Draft|Cargo|TransceiverClass|
+---------+-------------------+--------+---------+---+-----+-------+---------------+----------+--------+----------+------+------+-----+-----+-----+----------------+
|368084090|2022-01-01T00:00:00|29.93174|-89.99243|6.0|296.2|  299.0|LARRY B WHIPPLE|      null| WDK7401|        57|    12|    23|   10|  3.0|   57|               A|
|368140160|2022-01-01T00:00:00|30.33475|-87.14429|0.0|312.0|   87.0|  TWISTED ANGEL|IMO0000000| WDL5339|        36|  null|    12|    7| null| null|               B|
|366941830|2022-01-01T00:00:02|29.30919|-94.79702|0.0|180.2|  511.0|   SAN PATRICIO|      null| WCX6675|        31|     5|    18|    7| null|   57|               A|
|316005971

In [None]:
latitude="LAT"
longitude="LON"
geo_ships=ships.withColumn("geometry",stc.ST_Point(col(latitude),col(longitude)))
