In [1]:
import pyproj
import geojson
import pandas as pd

source_crs = pyproj.CRS.from_epsg(4326)  
target_crs = pyproj.CRS.from_epsg(25832) 
transformer = pyproj.Transformer.from_crs(source_crs, target_crs, always_xy=True)

In [2]:
%%time

# This file contains annotations for different types of nature phenomena (e.g. lakes, forests)
path_to_file = './naturtyper_layer.geojson'

with open(path_to_file, 'r') as f:
    gj = geojson.load(f)
print(len(gj['features']), gj.keys())

# Filter to the annotations of the lakes around Denmark
gj_features = []
for feature in gj['features']:
    if feature['properties']['Natyp_kode'] == 6: # Code for lakes is 6
        gj_features.append(feature)
print(len(gj_features))
gj_features[0]

312325 dict_keys(['type', 'name', 'crs', 'features'])
152920
CPU times: user 57 s, sys: 2.82 s, total: 59.8 s
Wall time: 59.8 s


{"geometry": {"coordinates": [[[[501332.248, 6224773.935], [501334.244, 6224779.934], [501334.244, 6224784.933], [501333.246, 6224790.932], [501327.243, 6224789.932], [501317.249, 6224783.933], [501316.243, 6224780.934], [501319.244, 6224774.935], [501324.25, 6224771.936], [501329.247, 6224770.936], [501332.248, 6224773.935]]]], "type": "MultiPolygon"}, "properties": {"Aendr_kode": 0, "Aendrbegr": "Ikke udfyldt", "Besig_dato": null, "Bruger_id": "00000000-0000-0000-0000-000000000000", "CVR_kode": 29189919, "CVR_navn": "Herning kommune", "Gl_sys_ref": null, "Journalnr": null, "Link": null, "Natyp_kode": 6, "Natyp_navn": "Sø", "Objekt_id": "0460cd7c-5353-11e2-af2b-00155d01e765", "Off_kode": 1, "Offentlig": "Synlig for alle", "Oprettet": "2006-12-31T01:00:00", "Oprindelse": "Ikke udfyldt", "Oprindkode": 0, "Sagsbeh": null, "Shape_area": 252.94599999301087, "Shape_length": 0.0, "Status": "Gældende / Vedtaget", "Statuskode": 3, "Systid_fra": "2006-12-31T01:00:00", "Systid_til": null, "Temak

In [3]:
from tqdm import tqdm
import shapely

polys = []

failed, skipped = 0, 0

# Loop over the annotations, get all the polygons
for i, feature in tqdm(enumerate(gj_features)):
    geometry = feature['geometry']
    coords = geometry['coordinates']
    try:
        poly = shapely.geometry.shape(geometry)
        if not poly.is_valid: # Skip invalid shapes
            skipped += 1
            continue
        polys.append(poly)
    except:
        pass 
print('Number of polygons:', len(polys))

152920it [00:13, 11545.10it/s]

Number of polygons: 152886





# Danish water sample data

In [4]:
base_path = './denmark_data/water_samples_{}.xlsx'
xls_2020 = pd.ExcelFile(base_path.format(2020))
xls_2021 = pd.ExcelFile(base_path.format(2021))
xls_2022 = pd.ExcelFile(base_path.format(2022))
xls_2023 = pd.ExcelFile(base_path.format(2023))

In [5]:
df_2020 = pd.read_excel(xls_2020, 'Opdeling')

# There's a few sheets with lakes info: 'Sø' is another one which seems to include more?
df_2021 = pd.read_excel(xls_2021, 'Vandprøver 2021', header = 1)
df_2022 = pd.read_excel(xls_2022, 'Vandprøver 2022', header = 1)
df_2023 = pd.read_excel(xls_2023, 'Master', header = 1)

# Gather the dataframes together for easier access
dfs = [df_2020, df_2021, df_2022, df_2023]

In [16]:
# Obtain the longitudes (x) and latitudes (y) of the lakes from which we have water samples
latitude_str = 'Breddegrad'	
longitude_str = 'Længdegrad'
latitudes = []
longitudes = []
for df in dfs:
    latitudes_df = df[latitude_str]
    longitudes_df = df[longitude_str]
    latitudes.extend(latitudes_df.tolist())
    longitudes.extend(longitudes_df.tolist())

# Convert to the target CRS system (25832) that's used for the orthographic map of Denmark
# We also ensure NaN ('Ingen' in Danish) values are skipped by checking that the data is float
coordinates = [transformer.transform(long, lat) for long, lat in zip(longitudes, latitudes) if isinstance(long, float) and isinstance(lat, float)]
len(coordinates), len(latitudes), len(longitudes)

(1024, 1075, 1075)

# Connect the location of water samples to lakes

In [14]:
from shapely.geometry import Point, Polygon
import geopandas as gpd

# Convert x, y coordinates to Point objects
points = [Point(x, y) for x, y in coordinates]

# Add the polygons to a GeoDF so we can easily compute distances
gdf_polygons = gpd.GeoDataFrame({'geometry': polys})

# List to store the index of the closest polygon (one for each point)
closest_polygon_idx = []

# Calculate the closest polygon for each point, and store the index of that polygon
for point in tqdm(points):
    distances = gdf_polygons.distance(point) # Calculate the distance from the point to each polygon
    closest_idx = distances.idxmin() # The index of the polygon that's closest to our point
    closest_polygon_idx.append(closest_idx)

# Print the results
for i, point in enumerate(points):
    print(f"Coordinate {point} is closest to polygon number {closest_polygon_idx[i]}")

  closest_idx = distances.idxmin() # The index of the polygon that's closest to our point
  closest_idx = distances.idxmin() # The index of the polygon that's closest to our point
  closest_idx = distances.idxmin() # The index of the polygon that's closest to our point
  closest_idx = distances.idxmin() # The index of the polygon that's closest to our point
  closest_idx = distances.idxmin() # The index of the polygon that's closest to our point
  closest_idx = distances.idxmin() # The index of the polygon that's closest to our point
  closest_idx = distances.idxmin() # The index of the polygon that's closest to our point
  closest_idx = distances.idxmin() # The index of the polygon that's closest to our point
  closest_idx = distances.idxmin() # The index of the polygon that's closest to our point
  closest_idx = distances.idxmin() # The index of the polygon that's closest to our point
  closest_idx = distances.idxmin() # The index of the polygon that's closest to our point
  closest_

Coordinate POINT (600796.0012059553 6104007.465808964) is closest to polygon number 48724
Coordinate POINT (568378.2096251181 6158120.037465337) is closest to polygon number 87392
Coordinate POINT (594786.1401881839 6130065.447078809) is closest to polygon number 62231
Coordinate POINT (592394.123745309 6137709.017955546) is closest to polygon number 138746
Coordinate POINT (591481.2221185759 6136412.35701762) is closest to polygon number 20812
Coordinate POINT (565640.4526632151 6188510.109303815) is closest to polygon number 147767
Coordinate POINT (567080.5166618623 6137576.6754385745) is closest to polygon number 41800
Coordinate POINT (566985.2428472362 6136714.173147949) is closest to polygon number 75162
Coordinate POINT (591988.3744673001 6138611.706105445) is closest to polygon number 73462
Coordinate POINT (591423.474147991 6130891.221272387) is closest to polygon number 94268
Coordinate POINT (591345.9811052275 6132275.331788059) is closest to polygon number 23732
Coordinate


