# Comparing GHSL and POI data sources

In [1]:
# data
import matplotlib.pyplot as plt

import scalenav.oop as snoo

import ibis as ib
from ibis import _
import ibis.selectors as s

# Interactive mode: displaying an expression runs it and shows the result
# rather than the unevaluated expression.
ib.options.interactive = True
# Turn on the graphviz-based expression repr option.
# NOTE(review): rendering presumably needs graphviz to be installed — confirm.
ib.options.graphviz_repr = True

## Parameters 

In [2]:
from parameters import *

## Analysis

In [3]:
# Reference documentation for the DuckDB extensions relevant to this notebook
# (presumably loaded by sn_connect — TODO confirm).
# the spatial extension
# https://duckdb.org/docs/extensions/spatial/functions

# the h3 extension in duckdb
# https://github.com/isaacbrodsky/h3-duckdb?tab=readme-ov-file

# Connection used by every read below; per the cell output this is a
# temporary in-memory DB instance.
conn = snoo.sn_connect(interactive=True)

Connecting to a temporary in-memory DB instance.


In [4]:
## Aggregation resolution
# Used in the GHSL file name and as the `res` argument of sn_project below.
# NOTE(review): presumably an H3 resolution, given the h3_id columns — confirm.
agg_res = 6

## Reading processed and aggregated files 

In [5]:
# Load the GHSL layer that was pre-aggregated at `agg_res`, with the band
# value stored as int32.
ghsl_path = f"../datasets/JRC/processed/aggregated/S_NRES_10_res_{agg_res}.parquet"
try:
    ghsl = (conn
            .read_parquet(ghsl_path)
            .cast({"band_var" : "int32"}))
except Exception as err:
    # Catch Exception (not a bare except) so Ctrl-C still interrupts the cell,
    # and chain the original error so the real cause (missing file, missing
    # column, failed cast, ...) stays visible in the traceback.
    raise IOError(f"Could not load the aggregated file {ghsl_path!r}") from err

In [6]:
# Foursquare: processed places file
fsq_processed_file = "../datasets/foursquare/processed/places.parquet"
fsq_pois_ = conn.read_parquet(fsq_processed_file)

# Overture: processed places / land-use file
ov_processed_filename="../datasets/overture/processed/places_landuses.parquet"
ov_pois_ = conn.read_parquet(ov_processed_filename)

## Projecting

In [7]:
# Project the Foursquare POIs at the aggregation resolution; per the cell
# output, sn_project assumes the coordinate columns ('longitude','latitude').
fsq_pois = snoo.sn_project(fsq_pois_,res=agg_res)

Assuming coordinates columns ('longitude','latitude')


In [8]:
# Project the Overture POIs at the aggregation resolution; per the cell output,
# the existing h3_id column is overwritten and the coordinate columns are
# assumed to be ('x','y').
ov_pois = snoo.sn_project(ov_pois_,res=agg_res)

Existing h3_id column will be overwritten
Assuming coordinates columns ('x','y')


## Aggregating

In [9]:
def sn_aggregate(input : ib.Table,exprs = None):
    """Aggregate a table of features per `h3_id` cell.

    Parameters
    ----------
    input : ib.Table
        Table of features; must have an `h3_id` column.
    exprs : expression or list/tuple of expressions, optional
        Aggregation expressions to evaluate for each `h3_id`. When omitted
        (or empty), the number of rows per cell is returned.

    Returns
    -------
    ib.Table
        Without `exprs`: the columns `h3_id` and `count_var` (rows per cell).
        With `exprs`: `h3_id` plus one column per aggregation expression.
    """
    # Normalise `exprs` to a list without ever testing the truthiness of an
    # expression object itself (only the container is tested below).
    if exprs is None:
        metrics = []
    elif isinstance(exprs, (list, tuple)):
        metrics = list(exprs)
    else:
        metrics = [exprs]

    if metrics:
        return input.group_by("h3_id").aggregate(*metrics)

    # Default: simple feature count per cell, exposed as `count_var`.
    return (input
            .h3_id
            .value_counts()
            .rename(count_var="h3_id_count")
            )

In [10]:
# Per-cell feature counts for each POI source (default sn_aggregate behaviour:
# value counts of h3_id, with the count column renamed to count_var).
ov_pois_agg = sn_aggregate(ov_pois)
fsq_pois_agg = sn_aggregate(fsq_pois)

## Joining the tables

In [11]:
# Outer join of the GHSL table with the Overture counts on h3_id, so cells
# present in only one of the two tables are kept; `rname` gives clashing
# right-hand column names an "ov_" prefix.
nres_poi = ib.join(ghsl,ov_pois_agg,predicates="h3_id",how="outer",rname="ov_{name}")

In [12]:
# Second outer join: add the Foursquare counts, prefixing clashing right-hand
# column names with "fsq_".
# NOTE(review): the key is the left table's h3_id, which is presumably NULL for
# rows that came only from the Overture side of the previous outer join —
# confirm such cells still pair up with the matching Foursquare cells.
nres_poi = (nres_poi
            .join(fsq_pois_agg,"h3_id",how="outer",rname="fsq_{name}")
            )

In [13]:
# Preview the first rows of the joined table.
nres_poi.head()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [None]:
# Consolidate the cell id columns left behind by the outer joins into a single
# h3_id. The right-hand copies are named through the joins' `rname` templates
# ("ov_{name}", "fsq_{name}"), so they are looked up by suffix rather than
# hard-coded; coalesce needs the individual columns, not a table selection.
h3_id_cols = [col for col in nres_poi.columns if col.endswith("h3_id")]
nres_poi = (nres_poi
 .mutate(h3_id = ib.coalesce(*[nres_poi[col] for col in h3_id_cols]))
 .drop(*[col for col in h3_id_cols if col != "h3_id"])
 )

In [None]:
# Preview the table again after the h3_id clean-up step.
nres_poi.head()

In [None]:
# Replace missing values in the numeric columns with 0 and store them as int32.
# Columns whose name contains "h3_id" are excluded: if the cell id is stored as
# a 64-bit integer it would not survive an int32 cast.
nres_poi = nres_poi.mutate(
    s.across(s.numeric() & ~s.matches("h3_id"), _.fill_null(0).try_cast("int32"))
)

## Processing into df

In [None]:
# Run the query and pull the result into memory; the `.plot` calls below use
# it as a pandas DataFrame.
nres_poi_df= nres_poi.execute()

In [None]:
# Display the materialised result.
nres_poi_df

In [None]:
# Keyword arguments shared by the scatter plots: log scale on both axes.
scatter_params = dict(logx=True, logy=True)

In [None]:
# GHSL band value against the POI count joined in first (the Overture table).
# sn_aggregate renames "h3_id_count" to "count_var", so that is the column
# name present in the joined frame.
# NOTE(review): assumes ghsl has no count_var column of its own; otherwise the
# join would have named this one "ov_count_var" — confirm.
nres_poi_df.plot.scatter(x="band_var",y="count_var",**scatter_params)
plt.show()

In [None]:
# GHSL band value against the Foursquare POI count. The Foursquare count column
# is "count_var" on the right side of the second join and clashes with the
# existing one, so the join's rname template names it "fsq_count_var".
nres_poi_df.plot.scatter(x="band_var",
                        y="fsq_count_var",
                        **scatter_params
                        )
plt.show()

In [None]:
# Distribution of the GHSL band value. Select the Series first: the first
# positional argument of DataFrame.plot.hist is `by` (a grouping column), not
# the column to plot.
nres_poi_df["band_var"].plot.hist(title="band_var")
plt.show()

In [None]:
# Distribution of the Foursquare POI count per cell. The column is named
# "fsq_count_var" (sn_aggregate's "count_var" plus the join's "fsq_" prefix),
# and the Series is selected first because DataFrame.plot.hist's first
# positional argument is `by`, not the column to plot.
nres_poi_df["fsq_count_var"].plot.hist(title="fsq_count_var")
plt.show()

In [None]:
# Distribution of the POI count joined in first (the Overture table). The
# column is named "count_var" by sn_aggregate, and the Series is selected first
# because DataFrame.plot.hist's first positional argument is `by`, not the
# column to plot.
nres_poi_df["count_var"].plot.hist(title="count_var")
plt.show()