# Spatial join



In [25]:
import os

from sedona.spark import *
from pyspark.sql.functions import col
import geopandas as gpd
from keplergl import KeplerGl


In [2]:
# Build the Spark session used by Sedona (SedonaContext API, sedona >= 1.4.1).
# spark.jars.packages lists the Maven coordinates of the shaded Sedona jar and
# the GeoTools wrapper so Spark fetches them at start-up.
# Named `spark_config` (not `config`) because `config` is rebound further down
# in this notebook to the Kepler.gl map configuration dict.
spark_config = (
    SedonaContext.builder()
    .config(
        'spark.jars.packages',
        'org.apache.sedona:sedona-spark-shaded-3.0_2.12:1.4.1,'
        'org.datasyslab:geotools-wrapper:1.4.0-28.2',
    )
    .getOrCreate()
)

# create a sedona context
sedona = SedonaContext.create(spark_config)

# get the spark context
sc = sedona.sparkContext
# Sets the "sedona.global.charset" system property to utf8 — presumably so that
# non-ASCII attribute values in the shapefiles are decoded correctly (confirm
# against the Sedona shapefile reader docs).
sc.setSystemProperty("sedona.global.charset", "utf8")

23/07/21 10:06:54 WARN Utils: Your hostname, pengfei-Virtual-Machine resolves to a loopback address: 127.0.1.1; using 10.50.2.80 instead (on interface eth0)
23/07/21 10:06:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/home/pengfei/opt/spark-3.3.0/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/pengfei/.ivy2/cache
The jars for the packages stored in: /home/pengfei/.ivy2/jars
org.apache.sedona#sedona-spark-shaded-3.0_2.12 added as a dependency
org.datasyslab#geotools-wrapper added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ec2a8460-9114-4802-a262-94a3de9ffc6a;1.0
	confs: [default]
	found org.apache.sedona#sedona-spark-shaded-3.0_2.12;1.4.1 in central
	found org.datasyslab#geotools-wrapper;1.4.0-28.2 in central
:: resolution report :: resolve 260ms :: artifacts dl 20ms
	:: modules in use:
	org.apache.sedona#sedona-spark-shaded-3.0_2.12;1.4.1 from central in [default]
	org.datasyslab#geotools-wrapper;1.4.0-28.2 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------

23/07/21 10:06:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
                                                                                

## 1. Read shape files

In [3]:
# Root folder holding the shapefile directories. Override it by setting the
# GEOSPATIAL_DATA_DIR environment variable; the default is the original
# author's local layout, so existing runs are unaffected.
DATA_DIR = os.environ.get(
    "GEOSPATIAL_DATA_DIR", "/home/pengfei/data_set/kaggle/geospatial"
).rstrip("/")

# Each path points at a directory containing one shapefile (read below with
# ShapefileReader.readToGeometryRDD).
airports_file_path = f"{DATA_DIR}/airports_shape"
countries_file_path = f"{DATA_DIR}/countries_shape"

In [4]:
# Load the countries shapefile: geometry RDD first, then a Sedona DataFrame
countries = ShapefileReader.readToGeometryRDD(sc, countries_file_path)
countries_df = Adapter.toDf(countries, sedona)

# Show the available columns, then expose the frame to SQL as "country"
countries_df.printSchema()
countries_df.createOrReplaceTempView("country")

23/07/21 10:07:09 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
root
 |-- geometry: geometry (nullable = true)
 |-- featurecla: string (nullable = true)
 |-- scalerank: string (nullable = true)
 |-- LABELRANK: string (nullable = true)
 |-- SOVEREIGNT: string (nullable = true)
 |-- SOV_A3: string (nullable = true)
 |-- ADM0_DIF: string (nullable = true)
 |-- LEVEL: string (nullable = true)
 |-- TYPE: string (nullable = true)
 |-- ADMIN: string (nullable = true)
 |-- ADM0_A3: string (nullable = true)
 |-- GEOU_DIF: string (nullable = true)
 |-- GEOUNIT: string (nullable = true)
 |-- GU_A3: string (nullable = true)
 |-- SU_DIF: string (nullable = true)
 |-- SUBUNIT: string (nullable = true)
 |-- SU_A3: string (nullable = true)
 |-- BRK_DIFF: string (nullable = true)
 |-- NAME: string (nullable = true)
 |-- NAME_LONG: string (nullable = true)
 |-- BRK_A3: string (nullable = t

In [5]:
# Load the airports shapefile: geometry RDD first, then a Sedona DataFrame
airports = ShapefileReader.readToGeometryRDD(sc, airports_file_path)
airports_df = Adapter.toDf(airports, sedona)

# Show the available columns, then expose the frame to SQL as "airport"
airports_df.printSchema()
airports_df.createOrReplaceTempView("airport")

root
 |-- geometry: geometry (nullable = true)
 |-- scalerank: string (nullable = true)
 |-- featurecla: string (nullable = true)
 |-- type: string (nullable = true)
 |-- name: string (nullable = true)
 |-- abbrev: string (nullable = true)
 |-- location: string (nullable = true)
 |-- gps_code: string (nullable = true)
 |-- iata_code: string (nullable = true)
 |-- wikipedia: string (nullable = true)
 |-- natlscale: string (nullable = true)



## Join the data frame

In this example, we join the country data frame with the airport data frame using the condition **ST_Contains(c.geometry, a.geometry)**. A row is kept only when the airport (a point) lies inside the country (a polygon).

In [6]:
# Spatial join: one output row per (country, airport) pair where the country
# geometry contains the airport geometry.
countries_airport_df = sedona.sql(
    "SELECT c.geometry as country_location, "
    "c.NAME_EN as country_name, "
    "a.geometry as airport_location, "
    "a.name as airport_name "
    "FROM country c, airport a "
    "WHERE ST_Contains(c.geometry, a.geometry)"
)
countries_airport_df.show()

                                                                                

23/07/21 10:07:18 WARN JoinQuery: UseIndex is true, but no index exists. Will build index on the fly.


                                                                                

+--------------------+--------------------+--------------------+--------------------+
|    country_location|        country_name|    airport_location|        airport_name|
+--------------------+--------------------+--------------------+--------------------+
|MULTIPOLYGON (((1...|Taiwan           ...|POINT (121.231370...|Taoyuan          ...|
|MULTIPOLYGON (((5...|Netherlands      ...|POINT (4.76437693...|Schiphol         ...|
|POLYGON ((103.969...|Singapore        ...|POINT (103.986413...|Singapore Changi ...|
|MULTIPOLYGON (((-...|United Kingdom   ...|POINT (-0.4531566...|London Heathrow  ...|
|MULTIPOLYGON (((-...|United States of ...|POINT (-149.98172...|Anchorage Int'l  ...|
|MULTIPOLYGON (((-...|United States of ...|POINT (-84.425397...|Hartsfield-Jackso...|
|MULTIPOLYGON (((1...|People's Republic...|POINT (116.588174...|Beijing Capital  ...|
|MULTIPOLYGON (((-...|Colombia         ...|POINT (-74.143371...|Eldorado Int'l   ...|
|MULTIPOLYGON (((6...|India            ...|POINT (72.8

In [7]:
# Print the column names and types of the joined data frame
countries_airport_df.printSchema()

root
 |-- country_location: geometry (nullable = true)
 |-- country_name: string (nullable = true)
 |-- airport_location: geometry (nullable = true)
 |-- airport_name: string (nullable = true)



In [11]:
# Register the join result as a temporary view named "country_airport" so it
# can be queried with SQL (a temp view, not a persisted table)
countries_airport_df.createOrReplaceTempView("country_airport")

In [18]:
# Number of airports per country, largest first.
# ORDER BY is used instead of SORT BY: in Spark SQL, SORT BY only orders rows
# within each partition, so the first rows shown would not be guaranteed to be
# the global top entries once the result spans more than one partition.
airports_count = sedona.sql(
    "SELECT c.country_name, c.country_location, count(*) as airport_count "
    "FROM country_airport c "
    "GROUP BY c.country_name, c.country_location "
    "ORDER BY airport_count DESC"
)
airports_count.show(5)

23/07/21 10:52:32 WARN JoinQuery: UseIndex is true, but no index exists. Will build index on the fly.


[Stage 53:>                                                         (0 + 1) / 1]

+--------------------+--------------------+-------------+
|        country_name|    country_location|airport_count|
+--------------------+--------------------+-------------+
|United States of ...|MULTIPOLYGON (((-...|           35|
|Canada           ...|MULTIPOLYGON (((-...|           15|
|Mexico           ...|MULTIPOLYGON (((-...|           12|
|Brazil           ...|MULTIPOLYGON (((-...|           12|
|People's Republic...|MULTIPOLYGON (((1...|            7|
+--------------------+--------------------+-------------+
only showing top 5 rows



                                                                                

In [20]:
# Airport counts for the countries whose name begins with "F"
name_starts_with_f = col("country_name").startswith("F")
airports_count.filter(name_starts_with_f).show()

23/07/21 10:56:06 WARN JoinQuery: UseIndex is true, but no index exists. Will build index on the fly.
+--------------------+--------------------+-------------+
|        country_name|    country_location|airport_count|
+--------------------+--------------------+-------------+
|France           ...|MULTIPOLYGON (((9...|            2|
|Finland          ...|MULTIPOLYGON (((2...|            1|
|Fiji             ...|MULTIPOLYGON (((1...|            1|
+--------------------+--------------------+-------------+



In [15]:
# Airports whose country name begins with "Fr": restrict on country_name
# first, then keep only the two airport columns for display.
(
    countries_airport_df
    .filter(col("country_name").startswith("Fr"))
    .select("airport_name", "airport_location")
    .show(truncate=False)
)

23/07/21 10:12:00 WARN JoinQuery: UseIndex is true, but no index exists. Will build index on the fly.
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|airport_name                                                                                                                                                                                            |airport_location                            |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|Charles de Gaulle Int'l                                                                                                                                  

## Visualize the number of airports for each country

In [27]:
# Collect the per-country airport counts to the driver as a pandas DataFrame
raw_pdf = airports_count.toPandas()

# Wrap it in a GeoDataFrame and rename the geometry column to "geometry", the
# column name the Kepler.gl layer config refers to. rename_geometry() renames
# the column and keeps it registered as the active geometry, whereas a plain
# .rename(columns=...) leaves the frame tracking the old column name.
geo_pdf = gpd.GeoDataFrame(raw_pdf, geometry="country_location").rename_geometry("geometry")

In [28]:

# Kepler.gl map configuration for the dataset registered as "AirportCount".
# Field names used here must match columns of the data frame handed to
# KeplerGl (country_name, geometry, airport_count); the colour channel and the
# tooltip previously pointed at 'AirportCount' / 'NAME_EN', which are not
# columns of that frame.
config = {
    'version': 'v1',
    'config': {
        'visState': {
            'filters': [],
            'layers': [{
                'id': 'ikzru0t',
                'type': 'geojson',
                'config': {
                    # dataId is the key of the dataset in KeplerGl(data={...})
                    'dataId': 'AirportCount',
                    'label': 'AirportCount',
                    'color': [218, 112, 191],
                    'highlightColor': [252, 242, 26, 255],
                    'columns': {'geojson': 'geometry'},
                    'isVisible': True,
                    'visConfig': {
                        'opacity': 0.8,
                        'strokeOpacity': 0.8,
                        'thickness': 0.5,
                        'strokeColor': [18, 92, 119],
                        'colorRange': {
                            'name': 'Uber Viz Sequential 6',
                            'type': 'sequential',
                            'category': 'Uber',
                            'colors': ['#E6FAFA',
                                       '#C1E5E6',
                                       '#9DD0D4',
                                       '#75BBC1',
                                       '#4BA7AF',
                                       '#00939C',
                                       '#108188',
                                       '#0E7077'],
                        },
                        'strokeColorRange': {
                            'name': 'Global Warming',
                            'type': 'sequential',
                            'category': 'Uber',
                            'colors': ['#5A1846',
                                       '#900C3F',
                                       '#C70039',
                                       '#E3611C',
                                       '#F1920E',
                                       '#FFC300'],
                        },
                        'radius': 10,
                        'sizeRange': [0, 10],
                        'radiusRange': [0, 50],
                        'heightRange': [0, 500],
                        'elevationScale': 5,
                        'enableElevationZoomFactor': True,
                        'stroked': False,
                        'filled': True,
                        'enable3d': False,
                        'wireframe': False,
                    },
                    'hidden': False,
                    'textLabel': [{
                        'field': None,
                        'color': [255, 255, 255],
                        'size': 18,
                        'offset': [0, 0],
                        'anchor': 'start',
                        'alignment': 'center',
                    }],
                },
                'visualChannels': {
                    # Colour each country polygon by its airport count
                    'colorField': {'name': 'airport_count', 'type': 'integer'},
                    'colorScale': 'quantize',
                    'strokeColorField': None,
                    'strokeColorScale': 'quantile',
                    'sizeField': None,
                    'sizeScale': 'linear',
                    'heightField': None,
                    'heightScale': 'linear',
                    'radiusField': None,
                    'radiusScale': 'linear',
                },
            }],
            'interactionConfig': {
                'tooltip': {
                    # Keyed by dataId; lists the columns shown on hover
                    'fieldsToShow': {
                        'AirportCount': [
                            {'name': 'country_name', 'format': None},
                            {'name': 'airport_count', 'format': None},
                        ],
                    },
                    'compareMode': False,
                    'compareType': 'absolute',
                    'enabled': True,
                },
                'brush': {'size': 0.5, 'enabled': False},
                'geocoder': {'enabled': False},
                'coordinate': {'enabled': False},
            },
            'layerBlending': 'normal',
            'splitMaps': [],
            'animationConfig': {'currentTime': None, 'speed': 1},
        },
        'mapState': {
            'bearing': 0,
            'dragRotate': False,
            'latitude': 56.422456606624316,
            'longitude': 9.778836615231771,
            'pitch': 0,
            'zoom': 0.4214991225736964,
            'isSplit': False,
        },
        'mapStyle': {
            'styleType': 'dark',
            'topLayerGroups': {},
            'visibleLayerGroups': {
                'label': True,
                'road': True,
                'border': False,
                'building': True,
                'water': True,
                'land': True,
                '3d building': False,
            },
            'threeDBuildingColor': [9.665468314072013,
                                    17.18305478057247,
                                    31.1442867897876],
            'mapStyles': {},
        },
    },
}


In [29]:
# Build the Kepler.gl widget from the per-country GeoDataFrame and the config
# above. Named `airport_map` so the Python builtin `map` is not shadowed for
# the rest of the kernel session.
airport_map = KeplerGl(data={"AirportCount": geo_pdf}, config=config)
airport_map

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(config={'version': 'v1', 'config': {'visState': {'filters': [], 'layers': [{'id': 'ikzru0t', 'type': …