# 00. Network preprocessing, single study area
## Project: Bicycle node network loop analysis

This notebook preprocesses the network data of a single study area (*not* the whole of Denmark) and saves it in an intermediate format.  
Please select an appropriate study area in `config.yml` (*not* `denmark`).  

The steps are:
- Take largest component
- Simplify: Drop self-loops, dangling nodes, degree 2 nodes

Contact: Michael Szell (michael.szell@gmail.com)

Created: 2024-10-01  
Last modified: 2025-08-07

## Parameters

In [None]:
%run -i setup_parameters.py
debug = True  # True: print table/edge previews and draw a check plot after each step

## Functions

In [None]:
%run -i functions.py

## Processing data

### Load data

In [None]:
# Read the edge table (including slope data) of the selected study area
edges = gpd.read_file(PATH["data_in_network"] + "edges_slope.gpkg")
edges["edge_id"] = edges.index  # Make index the edge id
if not MAXSLOPES_AVAILABLE:  # Generate random data if max_slopes not available
    # NOTE: unseeded, so the placeholder slopes differ between runs
    edges["max_slope"] = np.random.normal(2, 2, edges.shape[0]).clip(
        min=0
    ) + np.random.rand(edges.shape[0])
nodes = gpd.read_file(PATH["data_in_network"] + "nodes.gpkg")
# Set CRS
edges.set_crs(epsg=25832, inplace=True)
nodes.set_crs(epsg=25832, inplace=True)
# Keep an independent copy of the full edge table to save the geometries.
# A plain `edges_orig = edges` would only alias the same object, so any later
# in-place change to `edges` would silently change `edges_orig` as well.
# The copy is taken here, after edge_id / max_slope / CRS were set, so that it
# holds exactly the columns the previous alias exposed.
edges_orig = edges.copy(deep=True)

In [None]:
# Preview the first rows of the edge table
if debug:
    print(edges.head(n=5))

In [None]:
# Preview the first rows of the node table
if debug:
    print(nodes.head(n=5))

In [None]:
# Plain Python lists of node ids and coordinates, used by the check plots
node_points = nodes.geometry
nodes_id = list(nodes["nodeID"])
nodes_x = list(node_points.x)
nodes_y = list(node_points.y)
# Normalize each axis separately, then pair the values up per node
normalized_x = normalize_data(nodes_x)
normalized_y = normalize_data(nodes_y)
nodes_coords = list(zip(normalized_x, normalized_y))

In [None]:
# Rename mm_len to weight for igraph
edges = edges.rename(columns={"mm_len": "weight"})
# Columns to keep, in the order needed for building the graph: the first two
# are the edge endpoints, the remaining ones become edge attributes.
used_columns = [
    "node_start",
    "node_end",
    "weight",
    "edge_id",
    "max_slope",
    "geometry",
]
# Selecting the columns drops all unused ones and reorders in one step.
# (Deleting columns while iterating over edges.items() mutates the frame
# during iteration and is not needed, since this selection removes them.)
edges = edges[used_columns]
# Drop edges with a missing value in any kept column
# (e.g. None node_start or node_end)
edges = edges.dropna()
edges.head()

### Turn into igraph object

In [None]:
# Build an undirected graph: the first two columns of each row are the
# endpoint names, the remaining columns become the listed edge attributes
G = ig.Graph.TupleList(
    edges.itertuples(index=False),
    directed=False,
    weights=False,
    edge_attrs=["weight", "edge_id", "max_slope", "geometry"],
)
if STUDY_AREA == "bornholm":
    delta = BORNHOLM_DELTA
    for e in G.es:  # translate edges, below nodes
        e["geometry"] = gpd.GeoSeries(e["geometry"]).translate(
            xoff=delta[0], yoff=delta[1]
        )[0]
else:
    delta = [0, 0]
for v in G.vs:  # Add node infos: id_cykelknudepkt, x, y
    # Look the node up once per vertex (first row whose nodeID matches the
    # vertex name) instead of repeating the same table scan per attribute
    node_row = nodes.loc[nodes["nodeID"] == v["name"]].iloc[0]
    v["id_cykelknudepkt"] = node_row["id_cykelknudepkt"]
    # Coordinates are shifted by the same delta as the edges, rounded to 3 decimals
    v["x"] = round(node_row.geometry.x + delta[0], 3)
    v["y"] = round(node_row.geometry.y + delta[1], 3)

In [None]:
# Print the first ten edges, then show the graph summary
if debug:
    first_edges = G.es[:10]
    for edge in first_edges:
        print(edge)
G.summary()

In [None]:
# Visual check of the freshly built graph
if debug:
    plot_kwargs = {
        "vertex_size": get_vertex_size_constant(G.vcount()),
        "edge_width": get_edgewidth_constant(G.ecount()),
    }
    plot_check(G, nodes_id, nodes_coords, **plot_kwargs)

### Take largest connected component

In [None]:
# Label every vertex with its connected component, count the vertices per
# label, and keep only the component with the most vertices
components = G.connected_components()
component_ids, component_sizes = np.unique(
    components.membership, return_counts=True
)
memberships = (component_ids, component_sizes)
index_gcc = component_ids[component_sizes.argmax(axis=0)]
G = G.subgraph(components[index_gcc])

In [None]:
# Visual check after taking the largest connected component
if debug:
    plot_kwargs = {
        "vertex_size": get_vertex_size_constant(G.vcount()),
        "edge_width": get_edgewidth_constant(G.ecount()),
    }
    plot_check(G, nodes_id, nodes_coords, **plot_kwargs)

### Drop self-loops

Self-loops can be dangling links that lead outside the region and were mistakenly connected to themselves. The same call also merges parallel edges between the same pair of nodes.

In [None]:
# Remove self-loops and merge parallel edges in place.
# Each attribute is combined on its own: smallest weight, largest max_slope,
# smallest edge_id, and the geometry of the first edge.
G.simplify(
    loops=True,
    multiple=True,
    combine_edges={
        "weight": min,
        "max_slope": max,
        "edge_id": min,
        "geometry": "first",
    },
);

In [None]:
# Visual check after dropping self-loops
if debug:
    plot_kwargs = {
        "vertex_size": get_vertex_size_constant(G.vcount()),
        "edge_width": get_edgewidth_constant(G.ecount()),
    }
    plot_check(G, nodes_id, nodes_coords, **plot_kwargs)

### Drop dangling nodes

In [None]:
# Source: https://codereview.stackexchange.com/questions/284246/deletion-of-nodes-of-degree-1-from-a-python-igraph-graph
# Start from all nodes of degree <= 1 and grow the removal set: a neighbor of
# a removed node is removed too once at most one of its neighbors would remain.
doomed = set(G.vs.select(_degree_le=1))
frontier = set(doomed)
while frontier:
    current = frontier.pop()
    for neighbor in current.neighbors():
        if neighbor in doomed:
            continue
        remaining = sum(1 for other in neighbor.neighbors() if other not in doomed)
        if remaining > 1:
            continue  # still has at least two surviving neighbors
        doomed.add(neighbor)
        frontier.add(neighbor)
G.delete_vertices(doomed)

In [None]:
# Visual check after dropping dangling nodes
if debug:
    plot_kwargs = {
        "vertex_size": get_vertex_size_constant(G.vcount()),
        "edge_width": get_edgewidth_constant(G.ecount()),
    }
    plot_check(G, nodes_id, nodes_coords, **plot_kwargs)

### Drop degree 2 nodes

In [None]:
# Contract degree 2 nodes: each one is replaced by a single edge between its
# two neighbors (lengths summed, maximum slope kept, geometries merged).
# Adding the replacement edge raises the degree of both neighbors until the
# contracted node is actually deleted, so of two adjacent degree 2 nodes only
# one can be contracted per sweep. Sweep until no degree 2 node is left;
# a single sweep leaves chains of such nodes only partially contracted.
while True:
    to_delete_ids = []

    # Unclear how to select nodes in igraph by name, so let's iterate through them
    for v in G.vs:
        if v.degree() != 2:
            continue
        incident_edges = v.incident()
        neighbors = v.neighbors()
        if len(incident_edges) < 2 or len(neighbors) < 2:
            # Defensive: nothing to combine if fewer than two entries are listed
            continue
        first_edge, second_edge = incident_edges[0], incident_edges[1]
        # Remember node to delete
        to_delete_ids.append(v.index)
        # An edge created in an earlier sweep carries a MultiLineString when
        # line_merge could not join its pieces, so flatten to plain parts
        line_parts = []
        for geom in (first_edge["geometry"], second_edge["geometry"]):
            if geom.geom_type == "MultiLineString":
                line_parts.extend(geom.geoms)
            else:
                line_parts.append(geom)
        both_edges = shapely.geometry.MultiLineString(line_parts)
        # Add a new edge that combines the deleted ones
        G.add_edge(
            neighbors[0].index,
            neighbors[1].index,
            weight=first_edge["weight"] + second_edge["weight"],
            max_slope=max(first_edge["max_slope"], second_edge["max_slope"]),
            edge_id=first_edge["edge_id"],
            geometry=shapely.line_merge(both_edges),
        )

    if not to_delete_ids:
        break  # no degree 2 node was contracted in this sweep
    # Delete only after the sweep so vertex indices stay valid during it
    G.delete_vertices(to_delete_ids)

# Re-simplify
G.simplify(
    multiple=True,
    loops=True,
    combine_edges=dict(weight=min, edge_id=min, max_slope=max, geometry="first"),
);

In [None]:
# Visual check after dropping degree 2 nodes
if debug:
    plot_kwargs = {
        "vertex_size": get_vertex_size_constant(G.vcount()),
        "edge_width": get_edgewidth_constant(G.ecount()),
    }
    plot_check(G, nodes_id, nodes_coords, **plot_kwargs)

### Re-Drop dangling nodes

In [None]:
# Source: https://codereview.stackexchange.com/questions/284246/deletion-of-nodes-of-degree-1-from-a-python-igraph-graph
# Start from all nodes of degree <= 1 and grow the removal set: a neighbor of
# a removed node is removed too once at most one of its neighbors would remain.
doomed = set(G.vs.select(_degree_le=1))
frontier = set(doomed)
while frontier:
    current = frontier.pop()
    for neighbor in current.neighbors():
        if neighbor in doomed:
            continue
        remaining = sum(1 for other in neighbor.neighbors() if other not in doomed)
        if remaining > 1:
            continue  # still has at least two surviving neighbors
        doomed.add(neighbor)
        frontier.add(neighbor)
G.delete_vertices(doomed)

In [None]:
# Visual check after dropping dangling nodes again
if debug:
    plot_kwargs = {
        "vertex_size": get_vertex_size_constant(G.vcount()),
        "edge_width": get_edgewidth_constant(G.ecount()),
    }
    plot_check(G, nodes_id, nodes_coords, **plot_kwargs)

### Re-take largest connected component

In [None]:
# Label every vertex with its connected component, count the vertices per
# label, and keep only the component with the most vertices
components = G.connected_components()
component_ids, component_sizes = np.unique(
    components.membership, return_counts=True
)
memberships = (component_ids, component_sizes)
index_gcc = component_ids[component_sizes.argmax(axis=0)]
G = G.subgraph(components[index_gcc])

In [None]:
# Visual check of the final largest connected component
if debug:
    plot_kwargs = {
        "vertex_size": get_vertex_size_constant(G.vcount()),
        "edge_width": get_edgewidth_constant(G.ecount()),
    }
    plot_check(G, nodes_id, nodes_coords, **plot_kwargs)

## Save preprocessed network data

In [None]:
# Show igraph's text summary of the final graph before it is saved
G.summary()

In [None]:
# Write the preprocessed graph as an LZMA-compressed pickle
out_file = PATH["data_out"] + "network_preprocessed0.xz"
with lzma.open(out_file, "wb") as f:
    pickle.dump(G, f)