In [1]:
# %load_ext autoreload
%reload_ext autoreload
%autoreload 3
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import os, sys, json
import pprint

# Use seaborn's default theme for every plot in this notebook.
sns.set_theme()

# Shared pretty-printer for nested structures.
pp = pprint.PrettyPrinter(indent=1)

In [None]:
# Public Amsterdam tree inventory (one row per tree).
url = "https://treesai-gus-public.s3.eu-west-3.amazonaws.com/inputs/amsterdam_all_trees.csv"
df_pop = pd.read_csv(url)

# Report how many rows the raw source contains.
print(
    "Based on the source, there are {} Trees in the public Amsterdam Trees Dataset.".format(
        df_pop.shape[0]
    )
)

In [None]:
# Keep only the six columns used below and give them short lowercase names.
df_pop = df_pop[
    ["OBJECTNUMMER", "Soortnaam_WTS", "Boomhoogte", "RADIUS", "LNG", "LAT"]
].rename(
    columns={
        "OBJECTNUMMER": "id",
        "Soortnaam_WTS": "species",
        "Boomhoogte": "height",
        "RADIUS": "radius",
        "LNG": "lng",
        "LAT": "lat",
    }
)

In [None]:
# `id` deliberately stays a regular column: the merge cell further down selects
# it by name. The former `df_pop.set_index("id")` call discarded its return
# value (set_index is not in-place by default), so it never changed the frame
# and has been removed.
# NOTE(review): dbh is derived as radius * 2.54 — presumably a unit conversion;
# confirm the units of the RADIUS column.
df_pop = df_pop.assign(dbh=df_pop["radius"] * 2.54)
df_pop.tail()

#### 1.3 Take avg. Height

In [None]:
import re

# Drop rows whose height is the literal "Onbekend" (Dutch for "unknown").
# .copy() makes the filtered result an independent frame, so the column
# assignment in the next cell does not raise SettingWithCopyWarning.
df_pop = df_pop[df_pop["height"] != "Onbekend"].copy()

df_pop.tail()

In [None]:
def get_average_height(height_string):
    """
    Extracts the average height from a string of the format 'X tot Y m'.

    Every run of digits in the string is read as an integer and the mean of
    those integers is returned, so a string with a single number yields
    that number.

    Args:
        height_string: Height description as text. Non-string values are
            accepted too: numeric values are passed through as floats (this
            keeps a re-run of the calling cell harmless), anything else is
            treated as missing.

    Returns:
        float: The mean of the integers found, or NaN when the value is
        missing or contains no digits.
    """
    if not isinstance(height_string, str):
        # Missing cells arrive as float NaN / None; already-converted cells
        # arrive as numbers. Neither can be fed to re.findall.
        try:
            return float(height_string)
        except (TypeError, ValueError):
            return float("nan")

    matches = re.findall(r"\d+", height_string)
    if not matches:
        # No digits at all: avoid dividing by zero.
        return float("nan")
    return sum(map(int, matches)) / len(matches)


# Replace every height-range string with its numeric average, element by element.
df_pop["height"] = df_pop["height"].map(get_average_height)

df_pop.tail()

#### 1.4 Map LAT,LNG to x,y POS on MESA Grid.

In [None]:
# Take the lat and lng columns of df_pop and convert them to grid positions.
# NOTE(review): `latlng_array_to_xy` is neither defined nor imported in the
# cells visible here — confirm where it comes from; on a fresh kernel this
# call raises NameError unless it is imported first.
# NOTE(review): the return value is discarded; presumably the helper adds the
# `gus_x` / `gus_y` columns to df_pop in place (a later cell selects them) —
# verify against the helper's implementation.
latlng_array_to_xy(df_pop)
df_pop.tail()

In [None]:
# Species list for Amsterdam; its key column is renamed to match df_pop's "species".
url = "https://treesai-gus-public.s3.eu-west-3.amazonaws.com/inputs/species_list_amsterdam.csv"
df = pd.read_csv(url).rename(columns={"Species": "species"})
df.tail()

### 2. Data Prep

Merge the two dataframes on the `species` column.
Map each tree species to the category it belongs to,
so that the category's generic allometric equations can be used.

# Join the species list onto the tree population via the shared "species" key.
merged_df = df.merge(df_pop, on="species")

# Continue with the per-tree fields plus the species' Category.
df_pop = merged_df[["id", "Category", "height", "dbh", "gus_x", "gus_y"]]
df_pop.tail()

In [None]:
# The category takes over the "species" name; grid coordinates become xpos/ypos.
df_pop = df_pop.rename(
    columns={
        "Category": "species",
        "gus_x": "xpos",
        "gus_y": "ypos",
    }
)
df_pop.tail()

In [None]:
# Distribute the conditions of Trees over the df.
# Every complete group of three rows gets fair/good/excellent; the 0-2 rows
# left over are labelled "good", so the list always has exactly len(df_pop)
# entries. The former fixed `+ ["good"]` tail only matched when
# len(df_pop) % 3 == 1 and made the assignment raise ValueError otherwise.
full_cycles, leftover = divmod(len(df_pop), 3)
conditions = ["fair", "good", "excellent"] * full_cycles + ["good"] * leftover
df_pop["condition"] = conditions

In [None]:
from datetime import datetime

print(datetime.now())
# df_pop['species'].isna().sum() was 368 when checked; those rows are dropped
# for this case. Reassigning (instead of inplace=True) keeps the step
# chainable and avoids mutating a frame that earlier cells displayed.
df_pop = df_pop.dropna(subset=["species"])
df_pop.tail()