`scripts/make_presto_data.py` extracts the data from `zarr` files into a single `GeoDataFrame`. The processing steps and the sampling of external data are explained here.

The field data is sampled only from within the Sentinel-2 tile 35WNT. In addition to these data, we add 100 randomly sampled points each for asphalt roads and buildings, using the Topographic Database as the reference.

```python
# Read the area of interest and the field plots inside its bounding box,
# both expressed in EPSG:3067.
aoi = gpd.read_file('data/AOI.geojson').to_crs('epsg:3067')
# NOTE(review): a tuple bbox is presumably interpreted in the shapefile's own
# CRS, so this assumes the plot data is stored in EPSG:3067 -- confirm.
plots = gpd.read_file('data/koealat_kaikki3101.shp', bbox=tuple(aoi.total_bounds)).to_crs('epsg:3067')

# Reference points for roads and buildings; only the listed road classes are kept.
roadpoints = gpd.read_file('data/mtkpisteet.gpkg', layer='tiet')
roadpoints = roadpoints[roadpoints.kohdeluokka.isin([12111, 12112, 12121, 12122])].copy()
buildings = gpd.read_file('data/mtkpisteet.gpkg', layer='rakennus')

# Work on an explicit copy (as is done for roadpoints) so the in-place edits
# below do not target a slice of the original frame.
plots = plots[['Inventoint', 'InvLK', 'geometry']].copy()
# Give the bare '430' value the same "<code> <name>" shape as the other values
# before the prefix is stripped.
plots.loc[plots.Inventoint == '430', 'Inventoint'] = '430 Järvi tai lampi'
# Drop the first four characters to form the label; the vectorised string
# accessor leaves missing values as NaN instead of raising on them.
plots['Inventoint'] = plots.Inventoint.str[4:]
plots = plots.rename(columns={'Inventoint': 'label'})

# Draw 100 reproducible random points per class and bring them to the same
# columns (InvLK, geometry, label) and CRS as the field plots.
sample_roads = (
    roadpoints.sample(100, random_state=66)[['kohdeluokka', 'geometry']]
    .rename(columns={'kohdeluokka': 'InvLK'})
    .to_crs('epsg:3067')
)
sample_roads['label'] = 'Tie'

sample_buildings = (
    buildings.sample(100, random_state=66)[['kohdeluokka', 'geometry']]
    .rename(columns={'kohdeluokka': 'InvLK'})
    .to_crs('epsg:3067')
)
sample_buildings['label'] = 'Rakennus'

# The three sources carry overlapping index labels; renumber the rows so that
# every point has a unique index in the combined frame.
df = pd.concat((plots, sample_buildings, sample_roads), ignore_index=True)
```

Next, sample elevation and slope:

```python
# Wrap the point coordinates in DataArrays that share the 'points' dimension,
# so that selecting with them returns one value per point.
point_ids = np.arange(len(df))
point_geometry = df.geometry
xs = xr.DataArray(point_geometry.x.to_numpy(), dims='points', coords={'points': point_ids})
ys = xr.DataArray(point_geometry.y.to_numpy(), dims='points', coords={'points': point_ids})

# Look up the nearest grid cell of each point in both DTM variables.
dtm = xr.open_dataset('data/dtm.zarr')
nearest_cell = {'x': xs, 'y': ys, 'method': 'nearest'}
elevations = dtm['elevation'].sel(**nearest_cell)
slopes = dtm['slope'].sel(**nearest_cell)

# Store the sampled values as plain arrays in the point table.
df['elevation'] = elevations.values
df['slope'] = slopes.values
```

Then temperature and precipitation:

```python
# Pick the nearest grid cell of every point once; the result keeps the
# 'year' and 'month' coordinates that are iterated below.
fmi_data = xr.open_dataset('data/fmi.zarr').sel(x=xs, y=ys, method='nearest')

# One column per variable, year and month, named '<variable>_<year>_<month>'.
# The loop variable is called `year` rather than `y` to keep it apart from the
# spatial y coordinate used in the selection above.
for variable in ['precipitation', 'temperature']:
    for year in fmi_data.year.values:
        for month in fmi_data.month.values:
            monthly = fmi_data[variable].sel({'year': year, 'month': month})
            # Assign a plain array, as is done for elevation and slope, rather
            # than relying on pandas to convert the DataArray implicitly.
            df[f'{variable}_{year}_{month}'] = monthly.values
```

Finally, sample the monthly Sentinel-2 data:

```python
# Reproject the points once to EPSG:32635 (presumably the CRS of the
# Sentinel-2 stores -- confirm) and reuse the result for both coordinates.
df_utm = df.to_crs('epsg:32635')
point_ids = np.arange(len(df))
xs_s2 = xr.DataArray(df_utm.geometry.x.to_numpy(), dims='points', coords={'points': point_ids})
ys_s2 = xr.DataArray(df_utm.geometry.y.to_numpy(), dims='points', coords={'points': point_ids})

months = list(range(1, 13))

# Every year is processed identically: sample the nearest pixel of each point
# and write one '<band>_<year>_<month>' column per band and calendar month.
for year in (2020, 2021, 2022):
    s2 = xr.open_dataset(f's2-data/medians/{year}/35WNT.zarr').sel(x=xs_s2, y=ys_s2, method='nearest')
    available_months = s2.month.values
    for band in s2.data_vars:
        for month in months:
            column = f'{band}_{year}_{month}'
            if month in available_months:
                df[column] = s2[band].sel({'month': month}).values
            else:
                # Months missing from the store still get a column, filled
                # with NaN, so every year has the same set of columns.
                df[column] = np.nan

# Name the driver explicitly instead of relying on it being inferred from the
# file extension.
df.to_file('sampled_data.gpkg', driver='GPKG')

```