# NETWORKS TRANSFORMATION

This notebook contains all the functions needed to perform the various transformations on the networks of the tracing use case and to prepare the data for harmonization.

Datasets Needed:

- Hydro-network dataset
- Sewer network dataset / Discharge Points dataset


## Expected Outputs

- Connection nodes
- Discharge points
- Start and end nodes for water
- Water dataset with start and end ID
- Split water dataset with new start and end ids
- Fully connected water dataset

In [1]:
import time
import pandas as pd
import geopandas as gpd

import os
import sys


In [2]:
# Resolve the project root, i.e. the directory that contains the `processing`
# package imported below. On a fresh kernel the working directory is the
# notebook's own folder, so the root is its parent. Because this cell also
# changes the working directory, a second run would otherwise climb one more
# level up; the check below keeps the cell idempotent.
# NOTE(review): assumes `processing` lives directly under the project root,
# which is what the imports below rely on -- confirm for your layout.
cwd = os.path.abspath('')
if os.path.isdir(os.path.join(cwd, "processing")):
    # Already at the project root (cell re-run after the chdir below).
    path = cwd
else:
    path = os.path.dirname(cwd)
os.chdir(path)

print(path)
# Register the root once so project modules can be imported; skipping the
# insert when it is already present avoids piling up duplicate entries.
if path not in sys.path:
    sys.path.insert(0, path)
from processing.config import data_src, data_dest

from processing import funcs as f
from processing import config as cfg

c:\Workdir\Develop\GOPEG\preprocessing


In [3]:
# Column names and id settings shared by the cells below.

# Node-id column of the discharge-points table (`sewer_points`); used as the
# merge key against the sewer nodes and as the de-duplication key.
sewer_node_id = "CODEKOPPNT"
# Water-line id column; the key on which discharge points are matched to the
# water edges.
water_line_id = "VHAS"
# Name of the node-id column on the generated water nodes; connection nodes
# have their sewer node id renamed to this column before being combined.
water_node_id = "node_id"
# Prefix handed to the node-generating helpers in `processing.funcs`
# (presumably prepended to generated node ids -- confirm in the helper).
prefix = "VL"

## Load Water Network



In [4]:
def prepare_water_data():
    """Load the hydro network and build its node table.

    Every step is delegated to ``processing.funcs``: load ``cfg.type1_water``,
    determine the project CRS from it, convert multi-part lines to single
    linestrings, extract the begin and end points of each line, unify those
    points into one node table and finally attach the node ids to the edges.

    Returns:
        tuple: ``(edges_with_node_ids, node_table, project_crs)``.
    """
    raw_edges = f.load_data(cfg.type1_water)
    project_crs = f.set_project_crs(raw_edges)

    # Single-part lines are needed so each edge has one begin and one end point.
    single_lines = f.multiline_to_linestring(raw_edges, project_crs)

    begin_points = f.add_beginpoints(single_lines, project_crs)
    end_points = f.add_endpoints(single_lines, project_crs)

    node_table = f.unify_nodes(
        begin_points, end_points, water_line_id, prefix, project_crs
    )
    edges_with_ids = f.add_node_ids_to_edges(
        begin_points, end_points, node_table, water_line_id, single_lines
    )

    return edges_with_ids, node_table, project_crs

water_final, water_nodes_df, PROJ_CRS = prepare_water_data()

Data successfully loaded
Checking for multiline strings...
MultiLinesStrings: 3


  gdf = linestrings_df.append(multiline_gdf).reset_index(drop=True)


Checking for multiline strings after...
MultiLinesStrings: 0


## Load Sewer Network


Load the sewer network edges and nodes files.
If there is no sewer network, then load discharge points.

In [7]:
def prepare_sewer_data():
    """Load the three sewer inputs configured in ``processing.config``.

    The edges, nodes and discharge points are read (in that order) with
    ``f.load_data``.

    Returns:
        tuple: ``(discharge_points, nodes, edges)`` -- note the order differs
        from the load order.
    """
    # NOTE: a fallback for a missing discharge-points dataset was sketched
    # here previously but is not implemented; all three inputs are required.
    edges = f.load_data(cfg.type1_sewer_edges)
    nodes = f.load_data(cfg.type1_sewer_nodes)
    points = f.load_data(cfg.type1_sewer_points)

    return points, nodes, edges

sewer_points, sewer_nodes, sewer_edges = prepare_sewer_data()

Data successfully loaded
Data successfully loaded
Data successfully loaded


In [8]:
def get_sewer_nodes():
    """Attach discharge-point attributes to the sewer nodes.

    Left-joins the notebook-level ``sewer_points`` onto ``sewer_nodes``,
    matching ``sewer_nodes["NRKPNT"]`` with ``sewer_points[sewer_node_id]``,
    so every sewer node is kept even when it has no discharge point.

    Returns:
        The merged frame.
    """
    return pd.merge(
        left=sewer_nodes,
        right=sewer_points,
        how="left",
        left_on="NRKPNT",
        right_on=sewer_node_id,
    )

full_sewer_nodes = get_sewer_nodes()

## Prepare Water Network
A water network is received as an edge-only network with no nodes. Here, we generate hydro-nodes from the beginning and end points of each linestring geometry and assign them unique ids, which can then be added to the water segments as begin and end points.

The following steps are performed:

### 1. Turn multiline water network into single line water network

Multiline water networks are converted into single line water networks by splitting the linestrings into individual linestrings. This is done by 'flattening' the nested list that makes up a multilinestring structure.

This ensures we can extract begin and end points of a water segment.

### 2. Generate begin and end nodes

Get begin and end point geometries by extracting the first and the last point geometries of a linestring.

### 3. Document the nodes

Perform a spatial join of the start and end dataframes to create a single nodes geometry, and label each node as either start_id or end_id.

### 4. Add the nodes to water segments, and create start and end id columns

In [9]:
def join_water_and_sewer():
    """Link external sewer nodes to the water lines they discharge into.

    Works on the notebook-level ``sewer_edges``, ``sewer_points``,
    ``sewer_nodes``, ``water_final`` and ``PROJ_CRS``.

    Steps:
      1. Find the external nodes of the sewer network.
      2. Keep the discharge points that are external nodes and carry a
         non-zero water-line id, and give them the sewer-node geometry.
      3. Join each of them to its water line and compute a point on that
         line with ``f.get_nearest_point``.

    The column filters use ``sewer_node_id`` / ``water_line_id`` from the
    configuration cell (rather than hard-coded column names) so they stay in
    sync with the merges and column selections below.

    Returns:
        tuple: ``(connection_nodes_gdf, water_gdf, sewer_water_df)`` --
        the connection nodes (geometry = computed point), the matched water
        lines (geometry = line geometry), and the intermediate joined frame.
    """
    external_nodes = f.find_external_nodes(sewer_edges, "BEGINKPNT", "EINDKPNT")

    ext_nodes_df = (
        # `@external_nodes` refers to the local variable above.
        sewer_points.query(f"{sewer_node_id} in @external_nodes")
        .query(f"{water_line_id} != 0")
        .drop_duplicates(subset=sewer_node_id)
        # Replace the discharge-point geometry with the sewer-node geometry.
        .drop(columns=["geometry"])
        .merge(
            sewer_nodes[["NRKPNT", "geometry"]],
            left_on=sewer_node_id,
            right_on="NRKPNT",
        )
        .drop(columns=["NRKPNT"])
        .drop_duplicates(subset="geometry")
    )
    water_final_cols = [water_line_id, "geometry"]
    ext_nodes_cols = ["NRHPNT", sewer_node_id, water_line_id, "geometry"]
    sewer_water_df = (
        ext_nodes_df[ext_nodes_cols]
        # After the merge: geometry_x = node point, geometry_y = water line.
        .merge(water_final[water_final_cols], on=water_line_id, how="left")
        .drop_duplicates(subset="geometry_x", keep="first")
        # Drop nodes whose water-line id has no match in the water network.
        .query("geometry_y.notnull()")
        .assign(
            new_points=lambda x: f.get_nearest_point(x, "geometry_y", "geometry_x")
        )
    )
    print(sewer_water_df.shape)

    # Several nodes can yield the same computed point; keep one row per point.
    sewer_water_df = gpd.GeoDataFrame(
        sewer_water_df, geometry="new_points", crs=PROJ_CRS
    ).drop_duplicates(subset="new_points")

    conn_node_cols = ["NRHPNT", sewer_node_id, water_line_id, "new_points"]
    water_cols = [water_line_id, sewer_node_id, "geometry_y"]
    connection_nodes_df = (
        sewer_water_df[conn_node_cols]
        .rename(columns={"new_points": "geometry"})
        .reset_index(drop=True)
    )

    connection_nodes_gdf = gpd.GeoDataFrame(
        connection_nodes_df, geometry="geometry", crs=PROJ_CRS
    )
    print("Connection_nodes_df: ", connection_nodes_gdf.shape)

    water_df = (
        sewer_water_df[water_cols]
        .rename(columns={"geometry_y": "geometry"})
        .reset_index(drop=True)
    )
    water_gdf = gpd.GeoDataFrame(water_df, geometry="geometry", crs=PROJ_CRS)
    return connection_nodes_gdf, water_gdf, sewer_water_df

connection_nodes_gdf, water_gdf, sewer_water_df = join_water_and_sewer()

(13904, 6)
Connection_nodes_df:  (11699, 4)


In [10]:
def project_and_split_lines():
    """Split the matched water lines at the connection nodes.

    Adds a ``coords`` column to a copy of ``connection_nodes_gdf`` and to the
    notebook-level ``water_gdf``, prints the shape of ``water_gdf`` and hands
    both frames to ``f.split_lines``.

    Returns:
        Whatever ``f.split_lines`` returns (the split line segments).
    """

    def _row_line_coords(row):
        # Coordinates of the row's line geometry, as produced by the helper.
        return f.get_line_coords(row.geometry)

    connection_points = connection_nodes_gdf.copy()
    connection_points["coords"] = f.get_point_coords(connection_points)

    # NOTE: this writes onto the notebook-level ``water_gdf`` (no copy taken),
    # so the ``coords`` column stays visible to later cells.
    water_gdf["coords"] = water_gdf.apply(_row_line_coords, axis=1)
    print(water_gdf.shape)

    return f.split_lines(water_gdf, connection_points, water_line_id, PROJ_CRS)

splitlines_df = project_and_split_lines()

(11699, 4)
Number of split segments:  15067
15067


In [11]:
def add_splitlines_to_waterGDF():
    """Combine water and connection nodes and merge split segments back in.

    Uses the notebook-level ``water_nodes_df``, ``connection_nodes_gdf``,
    ``full_sewer_nodes``, ``splitlines_df``, ``water_final`` and ``PROJ_CRS``.
    Prints the shape of the final segment table.

    Returns:
        tuple: ``(final_waternodes, segments_to_water)`` -- the node table
        enriched with sewer-node attributes, and the water network with the
        split segments merged in.
    """
    # Tag the original water nodes. NOTE: this writes onto the notebook-level
    # frame, so the ``source`` column persists after the call.
    water_nodes_df["source"] = "water_node"

    # Connection nodes take their sewer node id as their water node id.
    tagged_connections = connection_nodes_gdf[[sewer_node_id, "geometry"]]
    tagged_connections = tagged_connections.rename(
        columns={sewer_node_id: water_node_id}
    )
    tagged_connections = tagged_connections.assign(source="connection_node")

    # Water nodes come first, so they win when a geometry occurs in both sets.
    all_nodes = pd.concat([water_nodes_df, tagged_connections])
    all_nodes = all_nodes.drop_duplicates(subset="geometry", keep="first")
    all_nodes = all_nodes.reset_index(drop=True)

    waternodes = f.get_new_water_nodes(all_nodes, water_nodes_df, prefix, PROJ_CRS)

    # Enrich the nodes with sewer attributes via the ``sewernode_id`` column
    # present on the helper's output; keep one row per geometry.
    sewer_attributes = full_sewer_nodes[["STATUS", "LBLTYPE", "NRKPNT"]]
    final_waternodes = waternodes.merge(
        sewer_attributes,
        left_on="sewernode_id",
        right_on="NRKPNT",
        how="left",
    )
    final_waternodes = final_waternodes.drop_duplicates(
        subset="geometry", keep="first"
    ).reset_index(drop=True)

    node_lookup = waternodes[["geometry", water_node_id]]
    splitlines_with_ids = f.line_segments_start_end_ids(
        splitlines_df, node_lookup, water_line_id, water_node_id, PROJ_CRS
    )

    splitlines_final = f.get_new_water_unique_ID(
        splitlines_with_ids, water_final, water_line_id, PROJ_CRS
    )

    segments_to_water = f.merge_segments_to_water(
        splitlines_df, splitlines_final, water_final, water_line_id, PROJ_CRS
    )
    print("Segments to water: ", segments_to_water.shape)

    return final_waternodes, segments_to_water

final_waternodes, segments_to_water = add_splitlines_to_waterGDF()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Segments to water:  (71983, 22)
