# Preprocessing of raw data from GeoFA

In [None]:
import geopandas as gpd
import neatnet
import networkx as nx
import momepy
import shapely
from collections import Counter
import matplotlib.pyplot as plt
import contextily as cx
import folium
import pandas as pd
from shapely.geometry import Point
import time

**Author:** @anastassiavybornova

**Data source:** Data is available through [GeoFA](https://geofa-kort.geodanmark.dk/app/fkg/?config=/api/v2/configuration/fkg/configuration_fkg_udgivet_5f465f5d3181f687353260.json#Basis_kort/8/9.8328/55.9892/fkg.t_5609_cykelknudepunktsstraekninger) (download from web interface, menu point "Vej og Trafik" > "Cykelknudepunktsstrækninger")

This notebook takes the raw data (obtained as described above), preprocesses it with the help of `geopandas` (and visual/manual inspection), simplifies geometries with the help of `neatnet`, and postprocesses the simplified geometries with `geopandas` and `neatnet` (and visual/manual inspection).

**Final output:** `../data/raw/edges_final.gpkg`, which contains all geometries necessary to create the 25-component bicycle node network of DK with simplified geometries.

***

## load and preprocess raw data

In [None]:
# Load the raw GeoFA export (bicycle node network for all of DK) and clean it
# with geopandas + neatnet's native helpers before any simplification.
gdf = gpd.read_file("../data/raw/2026-01-22-cykelknudepunktsstraekninger.gpkg")

# reproject to the projected CRS used throughout this notebook
proj_crs = "EPSG:25832"
gdf = gdf.to_crs(proj_crs)

# geometry column only -> single-part geometries -> no exact duplicates
gdf = gdf[["geometry"]].copy()
gdf = gdf.explode().reset_index(drop=True)
gdf = gdf.drop_duplicates().reset_index(drop=True)

# sanity check: after exploding, every geometry must be a LineString
assert (gdf.geometry.geom_type == "LineString").all()

# topology clean-up with neatnet, in this order:
# 1. remove interstitial nodes
gdf = neatnet.remove_interstitial_nodes(gdf)
# 2. snap endpoints (close_gaps, tolerance 2)
gdf.geometry = neatnet.close_gaps(gdf, 2)
# 3. close remaining gaps (extend_lines, tolerance 2)
gdf = neatnet.extend_lines(gdf, tolerance=2)
# 4. induce nodes
ind = neatnet.induce_nodes(gdf)

# build a networkx graph and list its connected components, ordered from
# most to fewest nodes; the processed data must have exactly 25 of them
G = momepy.gdf_to_nx(ind)
comps = sorted(nx.connected_components(G), key=len, reverse=True)
assert len(comps) == 25

# Collect one frame per component and stack them:
#   edge_collect ... edges + component ID ("comp"); passed to neatnet comp-wise
#   node_collect ... nodes + component ID ("comp"); only used for plotting
edge_parts = []
node_parts = []

for comp_id, comp_nodes in enumerate(comps):
    sub = G.subgraph(comp_nodes)

    # all edge geometries of this component
    edge_geoms = [attrs["geometry"] for _, _, attrs in sub.edges(data=True)]
    edge_part = gpd.GeoDataFrame({"geometry": edge_geoms}, crs=proj_crs)
    edge_part["comp"] = comp_id
    edge_parts.append(edge_part)

    # all nodes of this component (each node is turned into a Point)
    node_geoms = [Point(xy) for xy in sub.nodes(data=False)]
    node_part = gpd.GeoDataFrame({"geometry": node_geoms}, crs=proj_crs)
    node_part["comp"] = comp_id
    node_parts.append(node_part)

edge_collect = gpd.GeoDataFrame(
    pd.concat(edge_parts, ignore_index=True), crs=proj_crs
)
node_collect = gpd.GeoDataFrame(pd.concat(node_parts, ignore_index=True))

# persist the pre-neatnet state
edge_collect.to_file("../data/raw/edge_collect_preneat.gpkg", index=False)
node_collect.to_file("../data/raw/node_collect_preneat.gpkg", index=False)

In [None]:
# Plausibility check: edge_collect is the 25-component object that will be
# passed on to neatnet; colour every edge by its component ID.
fig, ax = plt.subplots(figsize=(15, 15))
edge_collect.plot(column="comp", cmap="jet", ax=ax)
# cx.add_basemap(ax=ax, crs=proj_crs)  # optional basemap, disabled
ax.set_axis_off()

In [None]:
# # RUN TO DECIDE WHICH COMPS NEED NEATNET SIMPLIFICATION
# for i, _ in enumerate(comps):
#     fig, ax = plt.subplots(1,1, figsize=(30,30))
#     edge_collect[edge_collect.comp==i].plot(ax=ax, color = "black")
#     node_collect[node_collect.comp==i].plot(ax=ax, color = "red")
#     ax.set_axis_off()
#     plt.title(f"Comp {i}")
#     plt.show()
# BASED ON VISUAL INSPECTION:
# simplify separately: 0, 1, 2, 3, 4, 5, 10, 16 (?)
# all other components can be left as is

***

## neatify with neatnet

In [None]:
# Run neatnet component by component: only the components listed in
# comps_to_simplify are neatified, all others are passed through unchanged.
comps_to_simplify = [0, 1, 2, 3, 4, 5, 10]
simplified_parts = []

for comp, group in edge_collect.groupby("comp"):
    # guard clause: components that need no simplification are kept as they are
    if comp not in comps_to_simplify:
        simplified_parts.append(group)
        continue

    print(f"simplifying {comp}!")
    time_start = time.time()
    group_simplified = neatnet.neatify(group).copy()
    time_total = round(time.time() - time_start)
    print(f"**********finished in {time_total} seconds**********")

    # re-attach the component ID to the neatified edges
    group_simplified["comp"] = comp
    simplified_parts.append(group_simplified)

# stack the per-component results into one gdf and save it to file
simplified_comps = gpd.GeoDataFrame(
    pd.concat(simplified_parts, ignore_index=True), crs=proj_crs
)
simplified_comps.to_file("../data/raw/simplified_comps.gpkg", index=False)

In [None]:
# # WHAT CHANGED? neatnet viz
# fig, ax = plt.subplots(1,1,figsize=(40,40))
# simplified_comps.plot(ax=ax, color = "blue", zorder=1)
# edge_collect.plot(ax=ax, color = "red", zorder=0)
# simplified_comps[simplified_comps._status.isin(["changed","new"])].plot(ax=ax,zorder=2, color = "orange")
# ax.set_axis_off()

In [None]:
# How many connected components does the neatified network have?
G = momepy.gdf_to_nx(simplified_comps)
comps = list(nx.connected_components(G))
print(len(comps))

***

## postprocessing of neatnet network:

In [None]:
# Load the neatified network written by the neatnet step and run it through
# the same geopandas + neatnet clean-up that was applied to the raw data.
gdf = gpd.read_file("../data/raw/simplified_comps.gpkg")

# reproject to the projected CRS used throughout this notebook
proj_crs = "EPSG:25832"
gdf = gdf.to_crs(proj_crs)

# geometry column only -> single-part geometries -> no exact duplicates
gdf = gdf[["geometry"]].copy()
gdf = gdf.explode().reset_index(drop=True)
gdf = gdf.drop_duplicates().reset_index(drop=True)

# sanity check: after exploding, every geometry must be a LineString
assert (gdf.geometry.geom_type == "LineString").all()

# topology clean-up with neatnet, in this order:
# 1. remove interstitial nodes
gdf = neatnet.remove_interstitial_nodes(gdf)
# 2. snap endpoints (close_gaps, tolerance 2)
gdf.geometry = neatnet.close_gaps(gdf, 2)
# 3. close remaining gaps (extend_lines, tolerance 2)
gdf = neatnet.extend_lines(gdf, tolerance=2)
# 4. induce nodes
ind = neatnet.induce_nodes(gdf)

# build a networkx graph and list its connected components, ordered from
# most to fewest nodes (the component count is not asserted at this stage)
G = momepy.gdf_to_nx(ind)
comps = sorted(nx.connected_components(G), key=len, reverse=True)

# Collect one frame per component and stack them:
#   edge_collect ... edges + component ID ("comp")
#   node_collect ... nodes + component ID ("comp"); only used for plotting
edge_parts = []
node_parts = []

for comp_id, comp_nodes in enumerate(comps):
    sub = G.subgraph(comp_nodes)

    # all edge geometries of this component
    edge_geoms = [attrs["geometry"] for _, _, attrs in sub.edges(data=True)]
    edge_part = gpd.GeoDataFrame({"geometry": edge_geoms}, crs=proj_crs)
    edge_part["comp"] = comp_id
    edge_parts.append(edge_part)

    # all nodes of this component (each node is turned into a Point)
    node_geoms = [Point(xy) for xy in sub.nodes(data=False)]
    node_part = gpd.GeoDataFrame({"geometry": node_geoms}, crs=proj_crs)
    node_part["comp"] = comp_id
    node_parts.append(node_part)

edge_collect = gpd.GeoDataFrame(
    pd.concat(edge_parts, ignore_index=True), crs=proj_crs
)
node_collect = gpd.GeoDataFrame(pd.concat(node_parts, ignore_index=True))

# persist the post-neatnet state
node_collect.to_file("../data/raw/node_collect_postneat.gpkg", index=False)
edge_collect.to_file("../data/raw/edge_collect_postneat.gpkg", index=False)

In [None]:
# Drop noisy components introduced by neatnet: every component whose edges
# sum up to a total length below 1000 (i.e. < 1km) is discarded.
noisy_comps = [
    comp
    for comp, group in edge_collect.groupby("comp")
    if group.geometry.length.sum() < 1000
]
is_noisy = edge_collect["comp"].isin(noisy_comps)
edges_final = edge_collect.loc[~is_noisy].reset_index(drop=True)

In [None]:
# Write the final edges to file; only the geometry column is exported.
final_geometries = edges_final[["geometry"]]
final_geometries.to_file("../data/raw/edges_final.gpkg", index=False)

***

## How to use `edges_final.gpkg`

In [13]:
# Load the final edges, build a networkx graph from them and split it into
# connected components, ordered from most to fewest nodes.
gdf = gpd.read_file("../data/raw/edges_final.gpkg")
G = momepy.gdf_to_nx(gdf)
comps = sorted(nx.connected_components(G), key=len, reverse=True)
# the final network must consist of exactly 25 components
assert len(comps) == 25

***

below this line: outdated code snippets

In [None]:
# nodes = {
#     "pre": gpd.read_file("node_collect_preneat.gpkg"),
#     "post": gpd.read_file("node_collect_postneat.gpkg"),
# }

# edges = {
#     "pre": gpd.read_file("edge_collect_preneat.gpkg"),
#     "post": gpd.read_file("edge_collect_postneat.gpkg"),
# }

In [None]:
# # RUN TO DECIDE WHICH COMPS NEED NEATNET SIMPLIFICATION
# for i in range(edges["post"].comp.max()+1):
#     fig, axs = plt.subplots(1,2, figsize=(20,10))
    
#     # pre
#     j = 0
#     edges["pre"][edges["pre"].comp==i].plot(ax=axs[j], color = "black")
#     nodes["pre"][nodes["pre"].comp==i].plot(ax=axs[j], color = "red")
#     axs[j].set_axis_off()
#     axs[j].set_title(f"Comp {i} PRE")
    
#     # post
#     j = 1
#     edges["post"][edges["post"].comp==i].plot(ax=axs[j], color = "black")
#     nodes["post"][nodes["post"].comp==i].plot(ax=axs[j], color = "red")
#     axs[j].set_axis_off()
#     axs[j].set_title(f"Comp {i} post")
    
#     plt.show()

In [None]:
# # edge is our 25-component object that we will pass on to neatnet.
# # plot for plausibility check
# fig, ax = plt.subplots(1,1, figsize=(15,15))
# edge_collect.plot(ax=ax, column="comp",cmap="turbo")
# #cx.add_basemap(ax=ax, crs = proj_crs)
# ax.set_axis_off()

In [None]:
# # RUN TO DECIDE WHICH COMPS NEED NEATNET SIMPLIFICATION
# for i, _ in enumerate(comps):
#     fig, ax = plt.subplots(1,1, figsize=(30,30))
#     edge_collect[edge_collect.comp==i].plot(ax=ax, color = "black")
#     node_collect[node_collect.comp==i].plot(ax=ax, color = "red")
#     ax.set_axis_off()
#     plt.title(f"Comp {i}")
#     plt.show()
# # BASED ON VISUAL INSPECTION:
# # simplify separately: 0, 1, 2, 3, 4, 5, 10, 16 (?)
# # all other components can be left as is

In [None]:
## OUTDATED DATA PREPROC

# def add_nodes(gdf):
#     collection = shapely.GeometryCollection(gdf.geometry.array)  # combine to a single object
#     noded = shapely.node(collection)  # add missing nodes
#     noded_gdf = gpd.GeoDataFrame(
#         {
#             "geometry": list(noded.geoms),
#         },
#         crs = gdf.crs
#     )
#     return noded_gdf

# # load raw data (bicycle node network for all of DK)
# gdf = gpd.read_file("../data/raw/2026-01-22-cykelknudepunktsstraekninger.gpkg")

# # convert to proj crs
# proj_crs = "EPSG:25832"
# gdf = gdf.to_crs(proj_crs)
# # keep only geometry column
# gdf = gdf[["geometry"]].copy()
# # explode multigeometries and reset index
# gdf = gdf.explode().reset_index(drop=True)
# # drop duplicates
# gdf = gdf.drop_duplicates().reset_index(drop=True)
# # make sure we only have linestrings now
# assert all(gdf.geometry.geom_type == "LineString")

# noded_gdf = add_nodes(gdf)
# # additional noding (at intersections etc)
# # collection = shapely.GeometryCollection(gdf.geometry.array)  # combine to a single object
# # noded = shapely.node(collection)  # add missing nodes
# # noded_gdf = gpd.GeoDataFrame(
# #     {
# #         "geometry": list(noded.geoms),
# #     },
# #     crs = gdf.crs
# # )

# # close minor gaps
# closed_gaps = momepy.close_gaps(noded_gdf, 1)
# noded_gdf["geometry"] = closed_gaps

# # node again
# noded_gdf = add_nodes(noded_gdf)


In [None]:
### OUTDATED EXPLORATIONS: NEED TO RENODE AFTER CLOSING GAPS!

# gdf_all = gpd.GeoDataFrame(
#     {
#         "geometry": list(nx.get_edge_attributes(G, "geometry").values())
#     },
#     crs = proj_crs
# )
# #gdf_all.plot()

# m = gdf_all.explore(tiles = "CartoDB positron", name="all", color = "red")

# for i, comp in enumerate(comps):

#     S = G.subgraph(comp)
#     edgedata = list(S.edges(data=True))
#     gdf_comp = gpd.GeoDataFrame(
#         {
#             "geometry": [edge[2]["geometry"] for edge in edgedata],
#         },
#         crs = proj_crs
#     )
#     gdf_comp["comp"] = i

#     gdf_comp.explore(m=m, name=f"{i} comp")

# folium.LayerControl().add_to(m)
# m

# gdf_collect = []

# for i, comp in enumerate(comps):

#     S = G.subgraph(comp)
#     edgedata = list(S.edges(data=True))
#     gdf_comp = gpd.GeoDataFrame(
#         {
#             "geometry": [edge[2]["geometry"] for edge in edgedata],
#         },
#         crs = proj_crs
#     )
#     gdf_comp["comp"] = i
#     gdf_collect.append(gdf_comp)

# gdf_temp = pd.concat(gdf_collect).reset_index(drop=True)
# gdf_temp = gpd.GeoDataFrame(gdf_temp, crs = proj_crs)

# # drop, in their current state: comps 26, 28, 29, 32, 33, 38

In [None]:
# for comp in comps:
#     if len(comp) in range(10,101):
#         S = G.subgraph(comp)

#         fig, ax = plt.subplots(1,1)
#         edgedata = list(S.edges(data=True))
#         gpd.GeoDataFrame(
#             {
#                 "geometry": [edge[2]["geometry"] for edge in edgedata]
#             },
#             crs = proj_crs
#         ).plot(ax=ax)
#         ax.set_axis_off()
#         cx.add_basemap(ax=ax, crs = proj_crs)
#         plt.show()