# Sedona quick start

In this tutorial, we show how to use Apache Sedona to perform some basic geometry analytics.

In [2]:
from sedona.spark import *
from pathlib import Path

# import Sedona constructor functions, often used to build a geometry-typed column from native data types
from sedona.sql import st_constructors as stc

# import Sedona geometry functions, often used for geometry calculations such as ST_Distance
from sedona.sql import st_functions as stf

# import Sedona predicate functions, often used to test the relation between two geometry columns, such as ST_Contains
from sedona.sql import st_predicates as stp
# import Sedona aggregate functions, which operate on a whole geometry column rather than row by row
from sedona.sql import st_aggregates

In [3]:
# Build a Sedona session offline: the required jars are read from a local folder
# and handed to Spark through `spark.jars`, so no internet access is needed.
# NOTE(review): this absolute path only exists on the original author's machine;
# point it at the folder that holds your own Sedona / GeoTools jars.
jar_folder = Path(r"/home/pengfei/git/PySparkCommonFunc/jars")

# Keep only *.jar files so that any other file stored in the folder is not passed
# to Spark, and sort the result so the jar list is the same on every run.
jar_list = sorted(
    str(jar)
    for jar in jar_folder.iterdir()
    if jar.is_file() and jar.suffix == ".jar"
)
jar_path = ",".join(jar_list)

# The original comment states the jars were Sedona 1.5.1 when this was written.
# Despite its name, `config` holds the object returned by getOrCreate(); it is
# passed to SedonaContext.create() in a later cell.
config = (
    SedonaContext.builder()
    .master("local[*]")
    .config("spark.jars", jar_path)
    .getOrCreate()
)



24/04/09 13:48:59 WARN Utils: Your hostname, pengfei-Virtual-Machine resolves to a loopback address: 127.0.1.1; using 10.50.2.80 instead (on interface eth0)
24/04/09 13:48:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/04/09 13:49:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
                                                                                

In [3]:
# Build a Sedona session online: the Sedona and GeoTools jars are requested by
# their Maven coordinates through `spark.jars.packages` (internet access needed).
# This cell and the offline cell both assign `config`; run only the one that
# fits your environment.
# NOTE(review): the `_2.13` suffix presumably has to match the Scala build of the
# installed Spark — confirm against your PySpark distribution.
config = (
    SedonaContext.builder()
    .config(
        "spark.jars.packages",
        "org.apache.sedona:sedona-spark-3.5_2.13:1.6.1,"
        "org.datasyslab:geotools-wrapper:1.6.1-28.2",
    )
    .config(
        "spark.jars.repositories",
        "https://artifacts.unidata.ucar.edu/repository/unidata-all",
    )
    .getOrCreate()
)

In [4]:
# Create the Sedona context from the session built above.
# `sedona` is the handle the following cells use to run SQL (`sedona.sql(...)`)
# and to convert spatial RDDs to data frames (`Adapter.toDf(..., sedona)`).
sedona = SedonaContext.create(config)

In [5]:
# Get the Spark context from the Sedona context.
# `sc` is passed to ShapefileReader.readToGeometryRDD() when the shapefiles are read below.
sc = sedona.sparkContext

In [None]:
# Read the shapefiles

In [6]:
# Relative locations of the two shapefile folders used in this tutorial.
root_dir = "../data"
airports_file_path = "/".join((root_dir, "shapefile", "airports_shape"))
countries_file_path = "/".join((root_dir, "shapefile", "countries_shape"))

In [7]:
# Load the countries shapefile as a geometry RDD, then convert it to a data frame.
countries = ShapefileReader.readToGeometryRDD(sc, countries_file_path)
countries_df = Adapter.toDf(countries, sedona)

# Show the columns that were read from the shapefile.
countries_df.printSchema()

# Expose the data frame to SQL under the name "country".
countries_df.createOrReplaceTempView("country")

root
 |-- geometry: geometry (nullable = true)
 |-- featurecla: string (nullable = true)
 |-- scalerank: string (nullable = true)
 |-- LABELRANK: string (nullable = true)
 |-- SOVEREIGNT: string (nullable = true)
 |-- SOV_A3: string (nullable = true)
 |-- ADM0_DIF: string (nullable = true)
 |-- LEVEL: string (nullable = true)
 |-- TYPE: string (nullable = true)
 |-- ADMIN: string (nullable = true)
 |-- ADM0_A3: string (nullable = true)
 |-- GEOU_DIF: string (nullable = true)
 |-- GEOUNIT: string (nullable = true)
 |-- GU_A3: string (nullable = true)
 |-- SU_DIF: string (nullable = true)
 |-- SUBUNIT: string (nullable = true)
 |-- SU_A3: string (nullable = true)
 |-- BRK_DIFF: string (nullable = true)
 |-- NAME: string (nullable = true)
 |-- NAME_LONG: string (nullable = true)
 |-- BRK_A3: string (nullable = true)
 |-- BRK_NAME: string (nullable = true)
 |-- BRK_GROUP: string (nullable = true)
 |-- ABBREV: string (nullable = true)
 |-- POSTAL: string (nullable = true)
 |-- FORMAL_EN: st

In [8]:
# Load the airports shapefile as a geometry RDD, then convert it to a data frame.
airports = ShapefileReader.readToGeometryRDD(sc, airports_file_path)
airports_df = Adapter.toDf(airports, sedona)

# Show the columns that were read from the shapefile.
airports_df.printSchema()

# Expose the data frame to SQL under the name "airport".
airports_df.createOrReplaceTempView("airport")

root
 |-- geometry: geometry (nullable = true)
 |-- scalerank: string (nullable = true)
 |-- featurecla: string (nullable = true)
 |-- type: string (nullable = true)
 |-- name: string (nullable = true)
 |-- abbrev: string (nullable = true)
 |-- location: string (nullable = true)
 |-- gps_code: string (nullable = true)
 |-- iata_code: string (nullable = true)
 |-- wikipedia: string (nullable = true)
 |-- natlscale: string (nullable = true)



# Join the data frames
In this example, we join the country data frame and the airport data frame using the condition ST_Contains(c.geometry, a.geometry). This means that a row is returned only when the airport (a point) lies inside the country (a polygon).

In [9]:
# Spatial join: pair every airport with the country whose geometry contains it,
# and keep the result in a new data frame.
countries_airport_df = sedona.sql(
    "SELECT c.geometry as country_location, c.NAME_EN as country_name, "
    "a.geometry as airport_location, a.name as airport_name "
    "FROM country c, airport a "
    "WHERE ST_Contains(c.geometry, a.geometry)"
)
countries_airport_df.show()

+--------------------+--------------------+--------------------+--------------------+
|    country_location|        country_name|    airport_location|        airport_name|
+--------------------+--------------------+--------------------+--------------------+
|MULTIPOLYGON (((1...|              Taiwan|POINT (121.231370...|             Taoyuan|
|MULTIPOLYGON (((5...|         Netherlands|POINT (4.76437693...|            Schiphol|
|POLYGON ((103.969...|           Singapore|POINT (103.986413...|    Singapore Changi|
|MULTIPOLYGON (((-...|      United Kingdom|POINT (-0.4531566...|     London Heathrow|
|MULTIPOLYGON (((-...|United States of ...|POINT (-149.98172...|     Anchorage Int'l|
|MULTIPOLYGON (((-...|United States of ...|POINT (-84.425397...|Hartsfield-Jackso...|
|MULTIPOLYGON (((1...|People's Republic...|POINT (116.588174...|     Beijing Capital|
|MULTIPOLYGON (((-...|            Colombia|POINT (-74.143371...|      Eldorado Int'l|
|MULTIPOLYGON (((6...|               India|POINT (72.8

In [10]:
# Print the column names and types of the join result.
countries_airport_df.printSchema()

root
 |-- country_location: geometry (nullable = true)
 |-- country_name: string (nullable = true)
 |-- airport_location: geometry (nullable = true)
 |-- airport_name: string (nullable = true)



In [11]:
# Register the join result as a temporary view named "country_airport"
# so that the next SQL query can refer to it by name.
countries_airport_df.createOrReplaceTempView("country_airport")

In [12]:
# Count the airports per country and list the countries with the most airports first.
# ORDER BY is used instead of SORT BY: in Spark SQL, SORT BY only orders the rows
# inside each partition, so it does not guarantee that show(5) returns the
# overall top 5 countries.
airports_count = sedona.sql(
    "SELECT c.country_name, c.country_location, count(*) as airport_count "
    "FROM country_airport c "
    "GROUP BY c.country_name, c.country_location "
    "ORDER BY airport_count DESC"
)
airports_count.show(5)

+--------------------+--------------------+-------------+
|        country_name|    country_location|airport_count|
+--------------------+--------------------+-------------+
|United States of ...|MULTIPOLYGON (((-...|           35|
|              Canada|MULTIPOLYGON (((-...|           15|
|              Mexico|MULTIPOLYGON (((-...|           12|
|              Brazil|MULTIPOLYGON (((-...|           12|
|People's Republic...|MULTIPOLYGON (((1...|            7|
+--------------------+--------------------+-------------+
only showing top 5 rows

