### Notebook to draw the errors of the predicted trip durations from a selected location.

### Import the required modules.

In [None]:
import os
import arcpy

from spark_esri import spark_start, spark_stop

from pyspark.sql.functions import col, lit

### Start a Spark instance.

Note the `config` argument to [configure the Spark instance](https://spark.apache.org/docs/latest/configuration.html).

In [None]:
# spark_stop() is called before spark_start() — presumably to clear an instance
# left over from an earlier run of this cell.
# NOTE(review): confirm spark_stop() is a no-op when nothing is running.
spark_stop()

# Spark properties handed to spark_start(); only the driver memory is set here.
config = {"spark.driver.memory":"2G"}
spark = spark_start(config=config)

### Create a Spark data frame from the selected `Predictions` features, and create a view named `v0`.

A new column (`error`) is added which is the square of the difference between `duration` and `duration_predicted`. 

In [None]:
# Columns read from the "Predictions" layer; each one is declared as a double
# in the Spark schema string built below.
fields = ['plon','plat','dlon','dlat','duration','duration_predicted']

schema = ",".join(f"{f} double" for f in fields)

with arcpy.da.SearchCursor("Predictions",fields) as data:
    # Difference between the observed and the predicted duration.
    residual = col("duration")-col("duration_predicted")
    # Register the cursor rows, plus the squared residual as `error`, as view v0.
    (
        spark
        .createDataFrame(data,schema)
        .withColumn("error", residual*residual)
        .createOrReplaceTempView("v0")
    )

### Calculate the average of the pickup locations.

In [None]:
# One aggregate row over the whole view: mean pickup longitude and latitude.
query = """select avg(plon) plon,avg(plat) plat from v0"""
rows = spark.sql(query).collect()

# Unpack the single result row into the pickup coordinates used later on.
plon,plat = rows[0]

### Aggregate the dropoff locations into bins.

In [None]:
# cell1 is the bin size (in the same units as dlon/dlat); cell2 is the half-cell
# offset that moves a bin's lower-left corner to its center.
cell1 = 0.05
cell2 = cell1 * 0.5

# Assign every dropoff to an integer bin index (dq, dr) and expose it as view v1.
# floor() is applied before the cast because a bare cast to long truncates
# toward zero: negative coordinates would land in a bin index one too high, so
# the center computed below would be a full cell away from the data, and the
# bins on either side of zero would merge into one double-width bin.
spark\
    .sql(f"""
select
cast(floor(dlon/{cell1}) as long) dq,
cast(floor(dlat/{cell1}) as long) dr,
error
from v0
""")\
    .createOrReplaceTempView('v1')

# Per bin: the center coordinate (lower-left corner + half a cell) and the mean
# of the squared errors (MSE) of the trips that end in that bin.
rows = spark\
    .sql(f"""
select
dq*{cell1}+{cell2} dlon,
dr*{cell1}+{cell2} dlat,
avg(error) mse
from v1
group by dq,dr
""")\
    .collect()

### Create in-memory linestring features between the average pickup location and the dropoff bins.

In [None]:
# Workspace and name of the output feature class, and its full path.
ws = "memory"
nm = "Trips"
fc = os.path.join(ws,nm)

# Delete is issued before the feature class is (re)created below.
arcpy.management.Delete(fc)

# Polyline feature class in spatial reference 4326 with one extra MSE field.
sp_ref = arcpy.SpatialReference(4326)
arcpy.management.CreateFeatureclass(ws,nm,"POLYLINE",spatial_reference=sp_ref)
arcpy.management.AddField(fc, "MSE", "DOUBLE")

with arcpy.da.InsertCursor(fc, ["SHAPE@WKT","MSE"]) as cursor:
    # Every line starts at the same pickup coordinate, so format it once.
    origin = f"{plon} {plat}"
    for dlon,dlat,mse in rows:
        # One two-point line per bin, from the pickup point to the bin center.
        cursor.insertRow((f"LINESTRING({origin},{dlon} {dlat})",mse))

### Stop the Spark instance.

In [None]:
# Shut down the Spark instance that was started earlier in this notebook.
spark_stop()