### Notebook demonstrating how to convert a Spark BDT DataFrame to a feature class using Arrow.

In [None]:
import os
import arcpy
import pyarrow as pa

import sparkgeo.functions as S
import pyspark.sql.functions as F

from pathlib import Path
from pyspark.sql import SparkSession

### Create a Spark Session.

Note how `arrow` is enabled and a "window" will appear. This will NOT happen with GAE :-)

In [None]:
# JVM flags passed to both the driver and the executor (see the two
# extraJavaOptions settings below).
extra_java_options = "-XX:+UseCompressedOops -XX:+AggressiveHeap"

# Path to the sparkgeo jar, resolved under the current user's home directory.
jars = str(Path.home() / "sparkgeo-3.2-0.53" / "sparkgeo-3.2-0.53.jar")

# Build (or reuse) a local-mode Spark session with the sparkgeo jar loaded.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "16G")
    .config("spark.executor.memory", "16G")
    .config("spark.driver.extraJavaOptions", extra_java_options)
    .config("spark.executor.extraJavaOptions", extra_java_options)
    .config("spark.sql.catalogImplementation", "in-memory")
    # Spark 3.x deprecates "spark.sql.execution.arrow.enabled"; this is the
    # replacement key and avoids the deprecation warning at start-up.
    .config("spark.sql.execution.arrow.pyspark.enabled", True)
    .config("spark.ui.enabled", False)
    .config("spark.ui.showConsoleProgress", True)
    .config("spark.jars", jars)
    .getOrCreate()
)

### Define the spatial reference (SR) in WKT.

In [None]:
# WKT definition of the WGS_1984_Web_Mercator_Auxiliary_Sphere projected
# coordinate system. It is kept as a plain string because it is stored as the
# 'esri.sr_wkt' value in the SHAPE column's Arrow field metadata.
sr_wkt = """
PROJCS["WGS_1984_Web_Mercator_Auxiliary_Sphere",
    GEOGCS["GCS_WGS_1984",
        DATUM["D_WGS_1984",
            SPHEROID["WGS_1984",6378137.0,298.257223563]],
        PRIMEM["Greenwich",0.0],
        UNIT["Degree",0.0174532925199433]],
    PROJECTION["Mercator_Auxiliary_Sphere"],
    PARAMETER["False_Easting",0.0],
    PARAMETER["False_Northing",0.0],
    PARAMETER["Central_Meridian",0.0],
    PARAMETER["Standard_Parallel_1",0.0],
    PARAMETER["Auxiliary_Sphere_Type",0.0],
    UNIT["Meter",1.0]]
"""

### Define shape column metadata.

In [None]:
# Arrow field metadata for the geometry column: the encoding of the binary
# shape values and the spatial reference they are expressed in.
metadata_shp = {
    "esri.encoding": "EsriShape",
    "esri.sr_wkt": sr_wkt,
}

### Create arrow table schema.

Note how we define the fields' nullability and metadata.

In [None]:
# Geometry column first: non-nullable binary carrying the shape metadata.
fields = [pa.field("SHAPE", pa.binary(), nullable=False, metadata=metadata_shp)]
# The two coordinate columns share the same type and nullability.
fields.extend(pa.field(name, pa.float64(), nullable=False) for name in ("x", "y"))

schema = pa.schema(fields)
# schema

### Create a Spark DataFrame of points, making sure the column order matches the Arrow schema.


In [None]:
# 100,000 random points: a random longitude/latitude pair is converted with
# lon_to_x / lat_to_y, then encoded as an Esri shape. The final select puts
# the columns in the order SHAPE, x, y.
df = (
    spark.range(100_000)
    .select(
        S.lon_to_x(F.rand() * 360 - 180).alias("x"),
        S.lat_to_y(F.rand() * 180 - 90).alias("y"),
    )
    .withColumn("SHAPE", S.st_as_esrishape(S.st_point("x", "y")))
    .select("SHAPE", "x", "y")
    .cache()
)

In [None]:
# df.printSchema()

### Get Spark DF as Arrow (Thanks Jordan :-)

In [None]:
# NOTE(review): _collect_as_arrow is underscore-prefixed, i.e. not part of
# PySpark's public API, so it may change between releases — confirm on upgrade.
# The result is fed to pa.Table.from_batches, so it is presumably a list of
# Arrow record batches.
batches = df._collect_as_arrow()

### Create Arrow table with explicit schema, as metadata is missing :-(

In [None]:
# The schema is passed explicitly so the table is built with the field
# definitions declared in `schema` (nullability and the SHAPE metadata).
tab = pa.Table.from_batches(batches, schema=schema)

In [None]:
# tab.schema

### Create ephemeral feature class.

In [None]:
# Output path inside the "memory" workspace.
fc = os.path.join("memory", "SparkPoints")
# Delete first, presumably to clear a feature class left by an earlier run.
arcpy.management.Delete(in_data=fc)
# Copy the Arrow table into the feature class.
arcpy.management.CopyFeatures(in_features=tab, out_feature_class=fc)

In [None]:
# df.write.parquet("delete_me.prq", mode="overwrite")

In [None]:
# spark.read.parquet("delete_me.prq").show()