In [1]:
import numpy as np
import pandas as pd
import yaml
from geospark.register import GeoSparkRegistrator
from pyspark import SparkConf, SparkContext
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import FloatType, IntegerType, LongType
from pyspark.sql.window import Window
from tqdm import tqdm

# Load the notebook configuration. The file is opened in a context manager so
# the handle is closed as soon as parsing finishes instead of being left open.
with open(r"config/config.yaml") as config_file:
    # NOTE(review): the config is assumed to be a trusted local file. If it only
    # holds plain mappings/scalars, yaml.safe_load is the more restrictive choice.
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# Spark settings come from the key/value pairs under the "sparkConf" key.
conf = SparkConf().setAll(config["sparkConf"].items())

# Create the Hive-enabled session, or reuse one that is already running.
spark = (
    SparkSession.builder.appName("DistancesDistribution")
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)

# Register GeoSpark on this session before any table is queried.
GeoSparkRegistrator.registerAll(spark)

# Reload imported modules automatically before each cell runs, so edits to the
# project's lib/ package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Project-local helpers.
# NOTE(review): GridDistribution, saveTableOverwritePartition, dbName and
# prefixName are used below but never defined in this notebook, so they
# presumably arrive through these star imports — TODO confirm and import them
# by name.
from lib.distribution import *
from lib.utils import *

## Initialization

In [11]:
# Feature columns and the function names handed to GridDistribution with them.
numericalFeatures = ["distance"]
numericalFunctions = ["min"]
categoricalFeatures = None
categoricalFunctions = None

# Source tables: the grid, and one row per locality with nulls in the
# numerical feature columns replaced by 0.
grid = spark.table(config["tableConf"]["gridCentroids"])
dataFeatures = (
    spark.table(config["tableConf"]["locality_with_centroids"])
    .drop_duplicates(["locality_id"])
    .fillna(0, numericalFeatures)
)

# Column names used in the grid table and in the features table.
gridID, gridGeometry = "gid", "geom_wkt"
featuresGeometry = "geometry"
gridCentrLon, gridCentrLat = "lon", "lat"
featuresCentrLon, featuresCentrLat = "lon", "lat"

# Population column of the features table and the name of its binned column.
featurePopulation, binPopulation = "locality_population", "bin_population"

# Output table for the grid distances.
toWriteTbl = f"{dbName}.{prefixName}_geo_grid_distances"

## Calculation

In [None]:
# Build the distribution helper from the grid and the locality features.
# `grid` is the grid table loaded in the Initialization cell; the `gridData`
# name used here before is not defined anywhere in this notebook and raised
# NameError on a fresh kernel.
gridDistrib = GridDistribution(
    grid,
    gridID,
    gridGeometry,
    dataFeatures,
    featuresGeometry,
    numericalFeatures,
    categoricalFeatures,
    categoricalFunctions,
    numericalFunctions,
)

# Compute the grid distances and persist them to `toWriteTbl`.
# NOTE(review): 50 is presumably a partition count and "overwrite" a write
# mode — confirm against saveTableOverwritePartition in lib.
saveTableOverwritePartition(
    gridDistrib.gridDistances(
        featurePopulation,
        binPopulation,
        gridCentrLon,
        gridCentrLat,
        featuresCentrLon,
        featuresCentrLat,
    ),
    50,
    [gridID],
    toWriteTbl,
    "overwrite",
)

# Read the result back from the table rather than reusing the in-memory plan.
allDistancesData = spark.table(toWriteTbl)

# Distinct population-bin values present in the saved distances.
binsPop = allDistancesData.select(
    collect_set(col(binPopulation)).alias(binPopulation)
).first()[binPopulation]

# One filtered DataFrame per population bin, mapped to the alias that is
# passed to GridDistribution.joining for that bin.
distancesTblAndAliases = {
    allDistancesData.filter(col(binPopulation) == binPop): f"lower_{binPop}"
    for binPop in binsPop
}

# The joined output gets its own variable. Reassigning `toWriteTbl` here made
# the cell non-idempotent: a second run would have saved the raw distances
# into the joined table.
localityDistancesTbl = f"{dbName}.{prefixName}_geo_grid_locality_distances"

DistancesJoining = gridDistrib.joining(distancesTblAndAliases)

# NOTE(review): unlike the call above, no fifth argument is passed here, so
# the helper's default applies — confirm this is intended.
saveTableOverwritePartition(
    DistancesJoining,
    135,
    [gridID],
    localityDistancesTbl,
)

## Visualisation

In [None]:
import geopandas
import keplergl
from shapely import wkt

### Take only one region

In [None]:
# Region to visualise: replace None with the id of the region to plot.
# The previous bare `regionID = # insert your value` was a SyntaxError, so the
# cell could not even be parsed; the guard below fails with a clear message
# instead of silently filtering on a missing value.
regionID = None  # insert your value
if regionID is None:
    raise ValueError("Set regionID to the id of the region to visualise.")

# Grid cells that belong to the selected region.
gridLocal = grid.filter(col("region_id") == regionID)

# Joined distances restricted to that region, with the grid geometry column
# exposed under the name "geometry" for the geopandas conversion.
featuresDF = (
    spark.table(f"{dbName}.{prefixName}_geo_grid_locality_distances")
    .join(gridLocal, [gridID], "inner")
    .withColumnRenamed(gridGeometry, "geometry")
)

# Show the row count before the whole result is collected into pandas.
print(featuresDF.count())

df = featuresDF.toPandas()

### Transform to geopandas and add to map

In [None]:
# Cast every non-geometry column to int on a NEW frame, so `df` from the
# previous cell is left untouched and this cell can be re-run safely.
intColumns = [column for column in df.columns if column != "geometry"]
regionDF = df.astype({column: "int" for column in intColumns})

# Parse the WKT text into shapely geometries. Mapping over the single column
# avoids building a full row object per record as apply(axis=1) does.
regionDF["geometry"] = regionDF["geometry"].map(lambda geom: wkt.loads(str(geom)))

# "EPSG:4326" replaces the deprecated {"init": "epsg:4326"} dictionary form.
poly_sectors_gdf = geopandas.GeoDataFrame(
    regionDF, crs="EPSG:4326", geometry="geometry"
)

# Render the region's grid cells on an interactive Kepler.gl map.
map_1 = keplergl.KeplerGl(height=900)
map_1.add_data(data=poly_sectors_gdf)
map_1