# Draft of semi-automated banana validation
### with building data from OSM

In [1]:
## shapely 2.0.0.!
import shapely
from shapely import strtree
from shapely.geometry import LineString
from shapely.validation import make_valid

import os
os.environ['USE_PYGEOS'] = '0'
# import pygeos

import geopandas
import pandas

import numpy
import matplotlib.pyplot as plt
import seaborn as sns
from palettable.cartocolors.qualitative import Bold_6

from scipy.signal import find_peaks
from scipy.stats import gaussian_kde

import os
import warnings

import pickle
from collections import Counter

from tqdm import tqdm

import osmnx as ox

### Read in face polygons
"Mini" version looks at only 1 city (Vienna)

In [3]:
# Name of the city (eFUA) analysed in this "mini" version of the notebook
city = "Vienna"

# sample meta data; sample_mini keeps only the row(s) of the selected city
sample = geopandas.read_parquet("../data/sample.parquet")
sample_mini = sample[sample.eFUA_name.isin([city])]

# face polygon data
# Look the ID up via `city` (not a second hard-coded name) and fail early with
# a clear message if the city is missing or ambiguous. `.iloc[0]` avoids
# calling int() on a whole Series, which pandas has deprecated.
if len(sample_mini) != 1:
    raise ValueError(
        f"expected exactly one sample row for city {city!r}, found {len(sample_mini)}"
    )
city_id = int(sample_mini["eFUA_ID"].iloc[0])

# read in face polygons (stored in a folder named after the city ID)
facepoly = geopandas.read_parquet(f"../data/{city_id}/polygons/")
# drop not needed columns and reset index to 0..n-1
facepoly = facepoly[["geometry", "circular_compactness_index"]]
facepoly = facepoly.reset_index(drop=True)

### Classify face polygons into bananas and non-bananas with help of banana index:
- read in data
- apply peak finding procedure from `04_DRAFT_peaks` (with circular compactness index as indicator)
- classify face polygons into (potential) bananas and non-bananas

In [5]:
# peak finding procedure from notebook 04

# work on the natural log of the indicator
data = numpy.log(facepoly["circular_compactness_index"])

# adjust linspace to be sparser than observations (linspace contains 10x less datapoints than original data)
n = int(len(data)/10)
mylinspace = numpy.linspace(data.min(), data.max(), n)

# fit Gaussian KDE and evaluate it on the sparse grid
kde = gaussian_kde(data, bw_method="silverman")
pdf = kde.pdf(mylinspace)

# find peaks of the flipped density (-pdf + 1), i.e. local minima of the pdf
peaks, d = find_peaks(
    x = -pdf +1,
    height = (0,.995),
    threshold = None,
    distance = None,
    prominence = 0.0005,
    width = 1,
    plateau_size = None)

# The threshold is only well defined if exactly one minimum is found.
# Fail with an explicit message instead of the opaque TypeError that
# float() raises on an array that does not hold exactly one element.
if len(peaks) != 1:
    raise ValueError(
        f"expected exactly one density minimum for {city}, found {len(peaks)}; "
        "adjust the find_peaks parameters"
    )

# back-transform from log space; index with peaks[0] to get a true scalar
banana = float(numpy.exp(mylinspace[peaks[0]]))
print(f"banana value found for {city}: {banana}")

# add banana/non-banana classification to face polygon gdf
# (rows matching neither comparison, e.g. NaN index values, keep None)
facepoly["banana"] = None
facepoly.loc[facepoly["circular_compactness_index"]<banana, "banana"] = "banana"
facepoly.loc[facepoly["circular_compactness_index"]>=banana, "banana"] = "urbanblock"

banana value found for Vienna: 523.0864515304407


### Semi-automated validation with building data from OSM


Read in / download OSM building data for city

In [6]:
# Filter warnings about GeoParquet implementation.
warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')

# Location of the cached building data for the selected city
buildings_path = f"../data/buildings/buildings_{city}.gpkg"

# check if file exists - if it doesn't, download from OSM:
if not os.path.exists(buildings_path):

    # make sure the target folder exists before writing into it
    os.makedirs(os.path.dirname(buildings_path), exist_ok=True)

    # Loop over all samples (sample_mini only holds the selected city)
    for ix, row in tqdm(sample_mini.iterrows(), total=len(sample_mini)):

        print(row.eFUA_name)

        # Download OSM buildings
        # NOTE(review): osmnx expects the polygon in lon/lat (EPSG:4326) - confirm
        # the CRS of `sample`. Newer osmnx releases rename this function to
        # `features_from_polygon`; verify against the installed version.
        buildings_osm = ox.geometries_from_polygon(
            row.geometry,
            tags = {"building": True})
        buildings_small = buildings_osm[["building", "geometry"]] # saving only most relevant columns
        buildings_small.to_file(f"../data/buildings/buildings_{row.eFUA_name}.gpkg", index = False)

# Always load from the cached file, so a fresh download and a cached run give
# the same `buildings` frame: only the saved columns and a plain 0..n-1 index,
# which the positional STRtree indices used further down rely on.
buildings = geopandas.read_file(buildings_path)

# project to same (projected) CRS as the face polygons
buildings = buildings.to_crs(facepoly.crs)

For each face polygon, use a spatial index (STRtree) to find the building polygons it contains, then compute the intersection area to measure how much built area lies inside.

**Note:** We introduce a "building tolerance" of 100 m². Reason: bananas can contain smaller objects tagged as buildings in OSM, such as bus stop roofs. We therefore say that a face polygon contains a building only if it contains more than 100 m² of built area.
**This also excludes all buildings saved as Points in the building data, since points have zero area.**

In [7]:
# Spatial index (STRtree) built over the *building* geometries
mytree = strtree.STRtree(geoms = buildings.geometry)


def _contained_building_indices(face_row):
    """Return the tree indices of the buildings CONTAINED by the row's face polygon."""
    return mytree.query(face_row.geometry, predicate = "contains")


# One array of building indices per face polygon
# (original author's note: attention, false positives are possible!)
facepoly["contains_indeces"] = facepoly.apply(_contained_building_indices, axis = 1)
    

In [8]:
# add areas of buildings-in-facepolygon
# For every face polygon, sum the area of its intersection with each building
# found by the STRtree query. The tree returns *positional* indices into
# `buildings.geometry`, so they are resolved with `.iloc` (label-based `.loc`
# is only correct while `buildings` has a default 0..n-1 index).
# Building the whole column in one pass avoids slow cell-by-cell `.loc` writes
# and gives a float column instead of an object column.
building_geoms = buildings.geometry

facepoly["contains_area"] = [
    building_geoms.iloc[building_idx].intersection(face_geom).area.sum()
    for face_geom, building_idx in zip(facepoly["geometry"], facepoly["contains_indeces"])
]


In [9]:
# Boolean flag: does the face polygon hold any built area at all (> 0m2)?
facepoly["contains_buildings"] = facepoly["contains_area"] > 0

In [10]:
# add an area cutoff of 100m2 to the contains yes/no

# "Building tolerance": built area a face polygon must exceed to count as
# containing buildings.
# NOTE(review): the value is compared against areas in squared CRS units; it is
# 100 m2 only if the face polygons use a metric projected CRS - confirm.
BUILDING_TOLERANCE = 100

# vectorised comparison instead of a row-wise apply
facepoly["contains_buildings_over100m2"] = facepoly["contains_area"] > BUILDING_TOLERANCE


**Confusion matrix for bananas:**
* *TP - true positive*: we correctly identified a banana. 
* *TN - true negative*: we correctly identified an urban block (non-banana)
* *FP - false positive*: we thought it's a banana, but it's actually an urban block.
* *FN - false negative*: we thought it's not a banana, but it actually is.

**Partial automation of setting up the confusion matrix with help of building validation from OSM**:
preliminary classifications with help of face polygon gdf:
* *TP*: `banana=banana` and `contains_buildings_over100m2=False` 
* *TN*: `banana=urbanblock` and `contains_buildings_over100m2=True`
* *FP*: `banana=banana` and `contains_buildings_over100m2=True`
* *FN*: `banana=urbanblock` and `contains_buildings_over100m2=False` 
After this, the preliminary classifications are checked and corrected manually in QGIS.

In [13]:
# Preliminary confusion-matrix label per face polygon.
# Every row starts as "unknown" and is overwritten when it matches a rule.
facepoly["confusion"] = "unknown"

# label -> (banana classification, contains more than the building tolerance?)
confusion_rules = {
    "tp": ("banana", False),      # true positive: banana without buildings
    "tn": ("urbanblock", True),   # true negative: urban block with buildings
    "fp": ("banana", True),       # false positive: banana with buildings
    "fn": ("urbanblock", False),  # false negative: urban block without buildings
}

for confusion_label, (banana_class, has_buildings) in confusion_rules.items():
    rule_mask = (
        (facepoly["banana"] == banana_class)
        & (facepoly["contains_buildings_over100m2"] == has_buildings)
    )
    facepoly.loc[rule_mask, "confusion"] = confusion_label

In [14]:
# Number of face polygons per confusion-matrix label
Counter(facepoly["confusion"])

Counter({'tn': 13117, 'tp': 3438, 'fn': 725, 'fp': 79})

In [15]:
# export as pickle for further data analysis in notebooks
# (file name kept fixed so notebooks that read it keep working)
facepoly.to_pickle("../results/facepoly_classified.pickle")

# export as gpkg for manual checks in QGIS
# The file name follows `city` instead of a hard-coded "Vienna", so switching
# city cannot silently overwrite another city's file.
# NOTE(review): the target folder lies outside this repository - confirm it exists.
facepoly[["geometry", "confusion"]].to_file(
    f"../../../bananas-qgis/confusionmatrix_{city}.gpkg", index = None)
