In [1]:
import numpy as np
import pandas as pd
import yaml
from geospark.register import GeoSparkRegistrator
from pyspark import SparkConf, SparkContext
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import FloatType, IntegerType, LongType
from pyspark.sql.window import Window
from tqdm import tqdm

# Load the job configuration. The context manager closes the file handle,
# which the previous bare open() call left open for the kernel's lifetime.
with open(r"config/config.yaml") as configFile:
    # NOTE(review): FullLoader is kept for compatibility with the existing
    # config; if the file only holds plain scalars/lists/maps, yaml.safe_load
    # would be the stricter choice.
    config = yaml.load(configFile, Loader=yaml.FullLoader)

# Every key/value pair under "sparkConf" is passed to Spark as-is.
conf = SparkConf().setAll(config["sparkConf"].items())

# Create (or reuse) the Hive-enabled session for this notebook.
spark = (
    SparkSession.builder.appName("GKHDistribution")
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)

# Register GeoSpark with the session before any spatial processing is done.
GeoSparkRegistrator.registerAll(spark)

# Enable IPython's autoreload extension in mode 2: all imported modules are
# reloaded before each cell executes, so edits to the project's lib/ code are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

from lib.distribution import *
from lib.utils import *

## Initialization

In [11]:
# Feature columns taken from the GKH table
categoricalFeatures = ["houseguid"]
numericalFeatures = [
    "floor_count_max",
    "quarters_count",
    "entrance_count",
    "area_total",
    "parking_square",
    "living_quarters_count",
]

# Column names used to address the grid and the feature geometries
gridID = "gid"
gridGeometry = "geom_wkt"
featuresGeometry = "geometry"

# Target database and table-name prefix -- fill these in before running
dbName = ""
prefixName = ""

# Source tables: the grid and the GKH features, one row per houseguid,
# with missing numerical values replaced by 0
grid = spark.table(config["tableConf"]["gridBuffers"])
dataFeatures = spark.table(config["tableConf"]["gkh"])
dataFeatures = dataFeatures.dropDuplicates(["houseguid"])
dataFeatures = dataFeatures.na.fill(0, numericalFeatures)

# Dictionary tables: which cell buffer intersects which cell.
# The first entry ("" -> None) stands for the plain grid without a buffer;
# the remaining entries map each 1..5 km dictionary table to its buffer column.
buffersDicts = {"": None}
buffersDicts.update(
    {
        spark.table(f"{dbName}.{prefixName}_grid_dict_buffer_{km}km"): f"buffer_{km}km"
        for km in range(1, 6)
    }
)

# Output tables and their aliases, in the same order as buffersDicts
tblAndAliases = {f"{dbName}.{prefixName}_grid_gkh_grid": "grid"}
tblAndAliases.update(
    {f"{dbName}.{prefixName}_grid_gkh_{km}km": f"{km}km" for km in range(1, 6)}
)

toWriteTbls = [*tblAndAliases]

## Calculation

In [None]:
# Helper object that holds the grid, the features and their column names
gridDistrib = GridDistribution(
    grid, gridID, gridGeometry,
    dataFeatures, featuresGeometry,
    numericalFeatures, categoricalFeatures,
)

# Intersection of features and grids
featuresAndGrid = gridDistrib.featuresByGrid()

# One aggregation per buffersDicts entry; the i-th result is written to the
# i-th output table, so buffersDicts and toWriteTbls must stay aligned.
for tblNumber, (bufferData, bufferColumn) in enumerate(buffersDicts.items()):
    gridDistribGKH = gridDistrib.gridFeaturesAgg(
        featuresAndGrid, bufferData, bufferColumn
    )
    # NOTE(review): 50 is presumably the number of output partitions/files -- confirm in lib.utils
    saveTableOverwritePartition(
        gridDistribGKH, 50, [gridID], toWriteTbls[tblNumber]
    )

# Combine the per-buffer tables into a single result table
toWriteTbl = f"{dbName}.{prefixName}_geo_grid_gkh"
GKHJoining = gridDistrib.joining(tblAndAliases)
saveTableOverwritePartition(GKHJoining, 135, [gridID], toWriteTbl)

## Visualisation

In [None]:
import geopandas
import keplergl
from shapely import wkt

### Take only one region

In [None]:
# Region to visualise. Replace None with the region_id value of interest.
# (The previous placeholder left the right-hand side empty, which made the
# whole cell a SyntaxError.)
regionID = None  # insert your value
if regionID is None:
    raise ValueError("Set regionID to the region_id value you want to visualise")

# Keep only the grid cells that belong to the selected region
gridLocal = grid.filter(col("region_id") == regionID)

# Attach the cell geometry to the aggregated features; the inner join drops
# every cell outside the region. The grid geometry column is renamed to
# "geometry" for the geopandas conversion further down.
featuresDF = (
    spark.table(f"{dbName}.{prefixName}_geo_grid_gkh")
    .join(gridLocal, [gridID], "inner")
    .withColumnRenamed(gridGeometry, "geometry")
)

# Row count is printed before the data is collected to the driver
print(featuresDF.count())

df = featuresDF.toPandas()

### Transform to geopandas and add to map

In [None]:
# Cast every non-geometry column to integer.
# NOTE(review): astype("int") raises on NaN/None -- assumes the joined table
# has no missing values; confirm against the data.
for column in df.columns:
    if column != "geometry":
        df[column] = df[column].astype("int")

# Parse the WKT strings into shapely geometries. Mapping over the column
# (instead of a row-wise DataFrame.apply) also works when df is empty, where
# apply(axis=1) would return a DataFrame and break the assignment.
df["geometry"] = df["geometry"].map(lambda geom: wkt.loads(str(geom)))

# "EPSG:4326" replaces the deprecated {"init": "epsg:4326"} dictionary form.
poly_sectors_gdf = geopandas.GeoDataFrame(
    df, crs="EPSG:4326", geometry="geometry"
)

# Render the region's grid cells on an interactive Kepler map
map_1 = keplergl.KeplerGl(height=900)
map_1.add_data(data=poly_sectors_gdf)
map_1