# 00. Network preprocessing
## Project: Bicycle node network loop analysis

This notebook preprocesses the network data and saves an intermediate format.
The steps are:
- Split up into components
- Simplify: Drop self-loops, dangling nodes, degree 2 nodes
- Snap POIs

Contact: Michael Szell (michael.szell@gmail.com)

Created: 2024-10-01  
Last modified: 2024-10-02  

## To do

- [ ] Handle multiple components (as list?)

## Imports

In [None]:
import geopandas as gpd
import igraph as ig
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
from functools import reduce
import pickle

## Parameters

In [None]:
# Run the parameter script inside this notebook's namespace (-i).
# NOTE(review): PATH, MAXSLOPES_AVAILABLE, POIS_AVAILABLE and SNAP_THRESHOLD are
# used below but not defined in this notebook - presumably they come from here.
%run -i setup_parameters.py
# Fixed seed so the randomly generated test data below is reproducible
np.random.seed(42)

## Functions

In [None]:
# Run the helper script inside this notebook's namespace (-i).
# NOTE(review): NormalizeData, plotCheck, get_vertexsize and get_edgewidth are
# used below but not defined in this notebook - presumably they come from here.
%run -i functions.py

## Processing data

### Load data

In [None]:
# Load the network links and give every link a stable id
edges = gpd.read_file(PATH["data_in_network"] + "edges.gpkg")
edges["edge_id"] = edges.index  # Make index the edge id
if not MAXSLOPES_AVAILABLE:  # Generate random data if max_slopes not available
    edges["max_slope"] = np.random.normal(2, 0.8, edges.shape[0])
nodes = gpd.read_file(PATH["data_in_network"] + "nodes.gpkg")

# Set CRS. set_crs returns a new object unless inplace=True, so the result
# has to be assigned; calling it without assignment leaves the CRS unset.
edges = edges.set_crs("epsg:25832")
nodes = nodes.set_crs("epsg:25832")

# Independent copy that keeps the geometries (and edge_id) for POI snapping.
# Taken after edge_id is assigned so the copy can be matched back to the graph.
edges_orig = edges.copy()

In [None]:
# Preview the first rows of the edge table
edges.head()

In [None]:
# Preview the first rows of the node table
nodes.head()

In [None]:
# Node ids and raw coordinates as plain lists
nodes_id = nodes["nodeID"].tolist()
nodes_x = nodes.geometry.x.tolist()
nodes_y = nodes.geometry.y.tolist()

# One (x, y) pair per node, each axis passed through NormalizeData separately
normalized_x = NormalizeData(nodes_x)
normalized_y = NormalizeData(nodes_y)
nodes_coords = list(zip(normalized_x, normalized_y))

In [None]:
# Rename mm_len to weight for igraph
edges = edges.rename(columns={"mm_len": "weight"})

# Keep only the columns needed for the graph, in the order used downstream:
# the two endpoint columns first, then the edge attributes.
# Selecting the columns directly replaces the earlier loop that deleted
# columns from the table while iterating over it.
used_columns = ["node_start", "node_end", "weight", "max_slope", "edge_id"]
edges = edges[used_columns]

# Drop edges with a missing value in any remaining column
# (this includes, but is not limited to, a missing node_start or node_end)
edges = edges.dropna()
edges

### Turn into igraph object

In [None]:
# Build an undirected graph from the edge table.
# TupleList reads each row positionally: items 0 and 1 are the endpoints and
# the remaining items are assigned to edge_attrs in order. The attribute names
# are therefore taken from the column order of `edges`, so that names and
# values cannot get out of sync (a hand-written list in a different order
# silently swaps attributes, e.g. edge_id and max_slope).
G = ig.Graph.TupleList(
    edges.itertuples(index=False),
    directed=False,
    weights=False,
    edge_attrs=list(edges.columns[2:]),
)

In [None]:
# Show igraph's summary of the freshly built graph
G.summary()

In [None]:
# Visual sanity check of the graph as built from the edge table
check_vertex_size = get_vertexsize(G.vcount())
check_edge_width = get_edgewidth(G.ecount())
plotCheck(
    G,
    nodes_id,
    nodes_coords,
    vertex_size=check_vertex_size,
    edge_width=check_edge_width,
);

### Take largest connected component

In [None]:
# Keep only the connected component with the most nodes
components = G.connected_components()
component_ids, component_sizes = np.unique(
    components.membership, return_counts=True
)
# argmax returns the first maximum, so ties go to the lowest component id
index_gcc = component_ids[component_sizes.argmax()]
G = G.subgraph(components[index_gcc])

In [None]:
# Visual sanity check after keeping the largest connected component
check_vertex_size = get_vertexsize(G.vcount())
check_edge_width = get_edgewidth(G.ecount())
plotCheck(
    G,
    nodes_id,
    nodes_coords,
    vertex_size=check_vertex_size,
    edge_width=check_edge_width,
);

### Drop self-loops

Self-loops can be dangling links that lead outside the region and were mistakenly connected to themselves.

In [None]:
# Remove self-loops and merge parallel edges in place.
# Merged parallel edges keep the shortest weight, the steepest max_slope
# and the smallest edge_id.
merge_rules = {"weight": min, "max_slope": max, "edge_id": min}
G.simplify(multiple=True, loops=True, combine_edges=merge_rules);

In [None]:
# Visual sanity check after removing self-loops and parallel edges
check_vertex_size = get_vertexsize(G.vcount())
check_edge_width = get_edgewidth(G.ecount())
plotCheck(
    G,
    nodes_id,
    nodes_coords,
    vertex_size=check_vertex_size,
    edge_width=check_edge_width,
);

### Drop dangling nodes

In [None]:
# Source: https://codereview.stackexchange.com/questions/284246/deletion-of-nodes-of-degree-1-from-a-python-igraph-graph
# Start from all nodes with degree <= 1 and keep peeling: a neighbor of a
# removed node is removed too once at most one of its own neighbors survives.
doomed = set(G.vs.select(_degree_le=1))
frontier = set(doomed)
while frontier:
    current = frontier.pop()
    for neighbor in current.neighbors():
        if neighbor in doomed:
            continue
        surviving = [u for u in neighbor.neighbors() if u not in doomed]
        if len(surviving) <= 1:
            doomed.add(neighbor)
            frontier.add(neighbor)
G.delete_vertices(doomed)

In [None]:
# Visual sanity check after dropping dangling nodes
check_vertex_size = get_vertexsize(G.vcount())
check_edge_width = get_edgewidth(G.ecount())
plotCheck(
    G,
    nodes_id,
    nodes_coords,
    vertex_size=check_vertex_size,
    edge_width=check_edge_width,
);

### Drop degree 2 nodes

In [None]:
# Contract degree 2 nodes: each such node and its two links are replaced by a
# single link between its two neighbors (weight = sum of both lengths,
# max_slope = max of both slopes).
#
# Two things differ from a naive "add edge now, delete node later" loop:
# - The merged link gets an edge_id (the smaller of the two it replaces, the
#   same convention simplify() uses below). Without one it would carry
#   edge_id None and could not be told apart from a link to be discarded.
# - The two replaced links are deleted right away, so the neighbors keep their
#   degree and whole chains of degree 2 nodes are contracted in one pass.
#   Vertex indices stay valid because vertices are only deleted at the end.
#
# NOTE(review): a merged link keeps only one of the original edge_ids, so POIs
# snapped to the other original segments are not attributed to it - confirm
# whether a list of constituent ids is needed downstream.
to_delete_ids = []

for v in G.vs:
    incident_ids = G.incident(v.index)
    if len(incident_ids) != 2:
        continue
    first, second = G.es[incident_ids[0]], G.es[incident_ids[1]]
    neighbor_ids = [
        e.source if e.target == v.index else e.target for e in (first, second)
    ]
    # Skip self-loops and nodes tied to one neighbor by two links:
    # contracting them would create a self-loop.
    if v.index in neighbor_ids or neighbor_ids[0] == neighbor_ids[1]:
        continue
    G.add_edge(
        neighbor_ids[0],
        neighbor_ids[1],
        weight=first["weight"] + second["weight"],
        max_slope=max(first["max_slope"], second["max_slope"]),
        edge_id=min(first["edge_id"], second["edge_id"]),
    )
    G.delete_edges(incident_ids)
    to_delete_ids.append(v.index)
G.delete_vertices(to_delete_ids)

# Re-simplify: contraction can create parallel edges between two neighbors
G.simplify(
    multiple=True,
    loops=True,
    combine_edges=dict(weight=min, max_slope=max, edge_id=min),
);

In [None]:
# Visual sanity check after dropping degree 2 nodes
check_vertex_size = get_vertexsize(G.vcount())
check_edge_width = get_edgewidth(G.ecount())
plotCheck(
    G,
    nodes_id,
    nodes_coords,
    vertex_size=check_vertex_size,
    edge_width=check_edge_width,
);

### Re-Drop dangling nodes

In [None]:
# Source: https://codereview.stackexchange.com/questions/284246/deletion-of-nodes-of-degree-1-from-a-python-igraph-graph
# Same peeling as before, repeated because the previous step can leave new
# nodes with degree <= 1 behind.
doomed = set(G.vs.select(_degree_le=1))
frontier = set(doomed)
while frontier:
    current = frontier.pop()
    for neighbor in current.neighbors():
        if neighbor in doomed:
            continue
        surviving = [u for u in neighbor.neighbors() if u not in doomed]
        if len(surviving) <= 1:
            doomed.add(neighbor)
            frontier.add(neighbor)
G.delete_vertices(doomed)

In [None]:
# Visual sanity check after the second removal of dangling nodes
check_vertex_size = get_vertexsize(G.vcount())
check_edge_width = get_edgewidth(G.ecount())
plotCheck(
    G,
    nodes_id,
    nodes_coords,
    vertex_size=check_vertex_size,
    edge_width=check_edge_width,
);

### Re-take largest connected component

In [None]:
# The simplification steps can split the graph, so keep the largest
# connected component once more
components = G.connected_components()
component_ids, component_sizes = np.unique(
    components.membership, return_counts=True
)
# argmax returns the first maximum, so ties go to the lowest component id
index_gcc = component_ids[component_sizes.argmax()]
G = G.subgraph(components[index_gcc])

In [None]:
# Visual sanity check of the final simplified network
check_vertex_size = get_vertexsize(G.vcount())
check_edge_width = get_edgewidth(G.ecount())
plotCheck(
    G,
    nodes_id,
    nodes_coords,
    vertex_size=check_vertex_size,
    edge_width=check_edge_width,
);

### Snap POIs

Snap POIs to network. POIs come in 3 categories:
- Facilities (water station, bicycle repair station, supermarket, etc.)
- Services (camping ground, hotel, gas station, etc.)
- Attractions (church, museum, beach, etc.)  

If any of these is within reach of a link, the link is assumed to provide water. Further, we define a link's POI diversity $Y \in \{0,1,2,3\}$ as the number of distinct POI categories within reach. For simplicity, reach is defined with the same constant `SNAP_THRESHOLD` for all POI categories.

We have a small number of POIs and links, so computationally it should be fine to check all POI–link pairs.

In [None]:
# POI source files per category; each category may list several files
poi_files = dict(
    facility=["facility.gpkg"],
    service=["service.gpkg"],
    attraction=["poi.gpkg"],
)

In [None]:
# Derive the POI attributes of every link:
#   has_facility / has_service / has_attraction: category within reach
#   has_water: any category within reach
#   poi_diversity: number of categories within reach (0-3)
categories = ("facility", "service", "attraction")

if not POIS_AVAILABLE:  # Create random data for testing
    # Reasonably looking shares of links per category. One draw per link and
    # category, in this order, so results stay reproducible for a given seed.
    random_share = {"facility": 0.1, "service": 0.15, "attraction": 0.05}
    edge_flags = [
        {cat: bool(np.random.rand() < share) for cat, share in random_share.items()}
        for _ in range(G.ecount())
    ]

else:  # Use available poi files
    # edge_ids of the links that have a POI of the category within reach
    e_haspoi = {cat: set() for cat in categories}
    edge_geometries = edges_orig.geometry
    for cat in poi_files:
        for fname in poi_files[cat]:
            print("Adding POIs from file: " + fname)
            pois = gpd.read_file(PATH["data_in_pois"] + fname)
            for _, poirow in tqdm(pois.iterrows(), total=pois.shape[0]):
                poi_this = poirow["geometry"]
                # Could add conditions on type later, like Vandpost
                if not poirow["type"] or poi_this is None:
                    continue
                # Distance from this POI to every original link at once
                distances = edge_geometries.distance(poi_this)
                if not distances.notna().any():
                    continue
                nearest = distances.idxmin()  # first link with minimal distance
                if distances.loc[nearest] <= SNAP_THRESHOLD:
                    # edge_id 0 is a valid id, so no truthiness test here
                    e_haspoi[cat].add(edges_orig.at[nearest, "edge_id"])
    # NOTE(review): POIs snap to the nearest ORIGINAL link; if that link was
    # removed during simplification the POI is not attributed to any link.
    edge_flags = [
        {cat: e["edge_id"] in e_haspoi[cat] for cat in categories} for e in G.es
    ]

# Write the attributes for all links in one go
G.es["has_water"] = [any(flags.values()) for flags in edge_flags]
for cat in categories:
    G.es["has_" + cat] = [flags[cat] for flags in edge_flags]
G.es["poi_diversity"] = [sum(flags.values()) for flags in edge_flags]

#### Plot POI diversity

In [None]:
# Link width encodes POI diversity: 0.25 for none, plus 2 per category
edge_widths = [2 * diversity + 0.25 for diversity in G.es["poi_diversity"]]

check_vertex_size = get_vertexsize(G.vcount())
fig = plotCheck(
    G,
    nodes_id,
    nodes_coords,
    vertex_size=check_vertex_size,
    edge_width=edge_widths,
)
plt.text(0, 0.04, "POI diversity")
plt.tight_layout()

#### Plot water links

In [None]:
# Links with water are drawn blue, all others grey
edge_colors = ["blue" if has_water else "grey" for has_water in G.es["has_water"]]

check_vertex_size = get_vertexsize(G.vcount())
fig = plotCheck(
    G,
    nodes_id,
    nodes_coords,
    vertex_size=check_vertex_size,
    edge_color=edge_colors,
)
plt.text(0, 0.04, "Water links highlighted")
plt.tight_layout()

## Save preprocessed network data

In [None]:
# Show igraph's summary of the preprocessed graph before saving it
G.summary()

In [None]:
# Pickle the preprocessed graph into the output folder
out_path = PATH["data_out"] + "network_preprocessed.pkl"
with open(out_path, "wb") as out_file:
    pickle.dump(G, out_file)