# NETWORKS TRANSFORMATION

This notebook contains all the functions needed to perform various trasformations to the networks of the tracing usecase, and prepare the data for harmonization.

Datasets Needed:

- Hydro-network dataset
- Sewer network dataset
- Discharge Points dataset

## Expected Outputs
- Connection nodes
- Discharge points
- Start and end nodes for water
- Water dataset with start and end ID
- Split water dataset with new start and end ids
- Fully connected water dataset

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from shapely.geometry import Point, Polygon, LineString, MultiLineString, MultiPoint
from shapely.wkt import loads
from shapely.ops import nearest_points
from shapely import wkt
import shapely.wkt

import random

import fiona

In [2]:
#pwd

In [3]:
import os
os.chdir('../')

In [4]:
PROJ_CRS = 'EPSG:31370'

In [5]:
def load_data(path, crs=PROJ_CRS):
    """
    Loads the data from the given path, 
    and prints the shape and crs of the data.
    """
    data = gpd.read_file(path)
    print(data.shape)
    print("Original crs:", data.crs)
    data = data.to_crs(crs)
    print("Project crs:", data.crs)
    return data

In [6]:
#wal_water = load_data(r'C:\workdir\Develop\Data_collection\BE\WA\WaterNetwork\HY.PhysicalWaters.Watercourse.gml')

wal_water = load_data(r'data_preprocess\wal_hydronetwork\MESU__CENTRELINES.shp')

(1489, 19)
Original crs: PROJCS["BD72 / Belgian Lambert 72",GEOGCS["BD72",DATUM["Reseau_National_Belge_1972",SPHEROID["International 1924",6378388,297,AUTHORITY["EPSG","7022"]],AUTHORITY["EPSG","6313"]],PRIMEM["Greenwich",0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["latitude_of_origin",90],PARAMETER["central_meridian",4.36748666666667],PARAMETER["standard_parallel_1",49.8333339],PARAMETER["standard_parallel_2",51.1666672333333],PARAMETER["false_easting",150000.01256],PARAMETER["false_northing",5400088.4378],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH]]
Project crs: EPSG:31370


In [7]:
assert wal_water.geometry.nunique() == wal_water.CENTRELINE.nunique()

In [8]:
assert wal_water.EU_SWB_COD.nunique() == wal_water.MS_SWD_COD.nunique()

## Add begin and end points to the water network

In [9]:
#Check for multiline strings in a dataset
def check_multiline(df):
    """This function checks for multiline strings
        from the geometry column in a given dataset"""
    lst = df['geometry'].to_list()
    multiline_count = 0
    for item in lst:
        if isinstance(item, MultiLineString):
            multiline_count += 1
    print("MultiLinesStrings:" , multiline_count)
    

#filter out multilinestring dataset
def multiline_to_linestring(df):
    #filter out multilinestring dataset
    multiline_df = df[df['geometry'].apply(lambda x: isinstance(x, MultiLineString))]
    linestrings_df = df[df.geom_type == 'LineString']
    if len(linestrings_df) == len(df):
        print("No multiline strings found")
        return df

    else:
        print("Checking for multiline strings...")
        check_multiline(df)
        # turn multilinestrings into linestrings
        linestrings = []
        for idx, row in multiline_df.iterrows():
            inlines = row.geometry
            outcoords = [list(item.coords) for item in inlines]
            outline = shapely.geometry.LineString([i for sublist in outcoords for i in sublist])
            #outline_geom = shapely.wkt.dumps(outline)
            linestrings.append(outline)

            
        # add  linestrings to dataframe and drop original geom column
        multiline_df['exploded'] = linestrings
        multiline_df = multiline_df.drop(['geometry'], axis=1).rename(columns={'exploded': 'geometry'}).reset_index(drop=True)
        multiline_gdf = gpd.GeoDataFrame(multiline_df, geometry='geometry', crs=PROJ_CRS)

        gdf = linestrings_df.append(multiline_gdf).reset_index(drop=True)
        print("Checking for multiline strings after...") 
        check_multiline(gdf)

    return gdf

In [10]:
water_data_df = multiline_to_linestring(wal_water)

No multiline strings found


In [11]:
water_data_df.head(2)

Unnamed: 0,OBJECTID,CENTRELINE,SWB_NAME,EU_SWB_COD,MS_SWD_COD,MS_TYPO_CO,MS_TYPO_NA,SWB_CARACT,ID_BASSIN_,NOM_BASSIN,ID_DISTRIC,ID_BASSI00,NOM_DISTRI,AREA_KM2,LENGTH_KM,CONTINUA,FICTICIOUS,REMARQUE,geometry
0,748.0,VE01L_01,Réservoir de la Vesdre,BERW17_VE01L,VE01L,RES_06,Réservoirs fagnards,fortement modifiée,BE_MEUSE_RW_M_VE,Vesdre,BE_MEUSE_RW,BE_MEUSE_RW_M,Meuse,10.790448,5.909195,virtualSegmentLake,True,réservoir,"LINESTRING (272046.907 146660.529, 272472.407 ..."
1,749.0,OU06R_02,Ourthe occidentale III,BERW17_OU06R,OU06R,RIV_06,Rivières ardennaises à pente moyenne,naturelle,BE_MEUSE_RW_M_OU,Ourthe,BE_MEUSE_RW,BE_MEUSE_RW_M,Meuse,98.29956,25.528706,realSurfaceWaterSegment,False,,"LINESTRING (232526.758 85282.629, 232532.005 8..."


Add begin and end point  to the water network.

In [12]:
def add_beginpoints(df):
    startnodes_gdf = df.copy()
    lst = startnodes_gdf['geometry'].to_list()
    beginpoints = []
    for item in lst:
        first = Point(item.coords[0])
        first_precise = shapely.wkt.dumps(first)
        beginpoints.append(first_precise)

    startnodes_gdf['start_point'] = [wkt.loads(g) for g in beginpoints]
    startnodes_gdf = startnodes_gdf.drop(['geometry'], axis=1).rename(columns={'start_point': 'geometry'})
    
    startnodes_gdf = gpd.GeoDataFrame(startnodes_gdf, geometry=startnodes_gdf['geometry'], crs=PROJ_CRS) #.drop(columns=[col])
    return startnodes_gdf


def add_endpoints(df):
    endnodes_gdf = df.copy()
    lst = endnodes_gdf['geometry'].to_list()
    endpoints = []
    for item in lst:
        last = Point(item.coords[-1])
        last_precise = shapely.wkt.dumps(last)
        endpoints.append(last_precise)

    endnodes_gdf['end_point'] = [wkt.loads(g) for g in endpoints]
    endnodes_gdf = endnodes_gdf.drop(['geometry'], axis=1).rename(columns={'end_point': 'geometry'})

    endnodes_gdf = gpd.GeoDataFrame(endnodes_gdf, geometry=endnodes_gdf['geometry'], crs=PROJ_CRS) #.drop(columns=[col])
    return endnodes_gdf

In [13]:
startnodes_gdf = add_beginpoints(water_data_df)
endnodes_gdf = add_endpoints(water_data_df)
print(startnodes_gdf.shape)
print(endnodes_gdf.shape)

  arr = construct_1d_object_array_from_listlike(values)


(1489, 19)
(1489, 19)


  arr = construct_1d_object_array_from_listlike(values)


In [14]:
endnodes_gdf.head(2)

Unnamed: 0,OBJECTID,CENTRELINE,SWB_NAME,EU_SWB_COD,MS_SWD_COD,MS_TYPO_CO,MS_TYPO_NA,SWB_CARACT,ID_BASSIN_,NOM_BASSIN,ID_DISTRIC,ID_BASSI00,NOM_DISTRI,AREA_KM2,LENGTH_KM,CONTINUA,FICTICIOUS,REMARQUE,geometry
0,748.0,VE01L_01,Réservoir de la Vesdre,BERW17_VE01L,VE01L,RES_06,Réservoirs fagnards,fortement modifiée,BE_MEUSE_RW_M_VE,Vesdre,BE_MEUSE_RW,BE_MEUSE_RW_M,Meuse,10.790448,5.909195,virtualSegmentLake,True,réservoir,POINT (274635.469 147570.011)
1,749.0,OU06R_02,Ourthe occidentale III,BERW17_OU06R,OU06R,RIV_06,Rivières ardennaises à pente moyenne,naturelle,BE_MEUSE_RW_M_OU,Ourthe,BE_MEUSE_RW,BE_MEUSE_RW_M,Meuse,98.29956,25.528706,realSurfaceWaterSegment,False,,POINT (240220.769 88038.101)


## Document the nodes

Perform spatial join the start and end dataframes to create one nodes geometry, and Label nodes as either start_id or end_id

In [15]:
def get_nodes(id_col, region):
    nodes_geom = pd.merge(startnodes_gdf[[id_col, 'geometry']], endnodes_gdf[[id_col, 'geometry']], on='geometry', how='outer')\
                    .reset_index(drop=True)
    unique_id_df = nodes_geom[['geometry']].drop_duplicates()\
                                            .reset_index()\
                                            .drop(columns=['index'])
    assert len(unique_id_df) == nodes_geom.geometry.nunique()

    unique_id_df['New_ID'] = range(1, len(unique_id_df) + 1)
    unique_id_df['node_id'] = region + unique_id_df['New_ID'].astype(str)
    gdf = gpd.GeoDataFrame(unique_id_df, geometry=unique_id_df['geometry'], crs=PROJ_CRS).drop(columns=['New_ID'])
    return gdf

In [16]:
water_nodes_df = get_nodes('CENTRELINE', 'WAL')

In [17]:
water_nodes_df.head()

Unnamed: 0,geometry,node_id
0,POINT (272046.907 146660.529),WAL1
1,POINT (232526.758 85282.629),WAL2
2,POINT (193357.056 93211.661),WAL3
3,POINT (188304.050 95172.237),WAL4
4,POINT (201795.869 103739.854),WAL5


In [18]:
water_nodes_df.to_file(r"data_transform\\wal_water_nodes.shp")

  pd.Int64Index,


## Add start_id and end_id to water dataset

In [19]:
def id_to_edges(line_unique_id):
    #Label nodes as either start_id or end_id
    startnodes_merged = gpd.sjoin(startnodes_gdf, water_nodes_df, how='left').rename(columns={'node_id': 'start_ID'}).drop('index_right', axis=1)
    endnodes_merged = gpd.sjoin(endnodes_gdf, water_nodes_df, how='left').rename(columns={'node_id': 'end_ID'}).drop('index_right', axis=1)

    nodes_geom = pd.merge(startnodes_merged, endnodes_merged, on=line_unique_id)

    nodes = nodes_geom[[line_unique_id, 'start_ID', 'end_ID']]

    water_edges_nodes = pd.merge(water_data_df, nodes, left_on=line_unique_id, right_on=line_unique_id) #.drop('id', axis=1)
    return water_edges_nodes

In [20]:
water_final = id_to_edges('CENTRELINE')

In [21]:
water_final.head(2)

Unnamed: 0,OBJECTID,CENTRELINE,SWB_NAME,EU_SWB_COD,MS_SWD_COD,MS_TYPO_CO,MS_TYPO_NA,SWB_CARACT,ID_BASSIN_,NOM_BASSIN,...,ID_BASSI00,NOM_DISTRI,AREA_KM2,LENGTH_KM,CONTINUA,FICTICIOUS,REMARQUE,geometry,start_ID,end_ID
0,748.0,VE01L_01,Réservoir de la Vesdre,BERW17_VE01L,VE01L,RES_06,Réservoirs fagnards,fortement modifiée,BE_MEUSE_RW_M_VE,Vesdre,...,BE_MEUSE_RW_M,Meuse,10.790448,5.909195,virtualSegmentLake,True,réservoir,"LINESTRING (272046.907 146660.529, 272472.407 ...",WAL1,WAL618
1,749.0,OU06R_02,Ourthe occidentale III,BERW17_OU06R,OU06R,RIV_06,Rivières ardennaises à pente moyenne,naturelle,BE_MEUSE_RW_M_OU,Ourthe,...,BE_MEUSE_RW_M,Meuse,98.29956,25.528706,realSurfaceWaterSegment,False,,"LINESTRING (232526.758 85282.629, 232532.005 8...",WAL2,WAL1160


In [22]:
#water_final.to_file(r"data_transform\\wal_water_edges.shp")

## Join water and sewer network

### Load discharge points

DIscharge points are used to connect a sewer network to a water network.

In [23]:
wal_disharge_pts = load_data(r'data_preprocess\wal_hydronetwork\disharge_points.shp')

(206, 33)
Original crs: epsg:4258
Project crs: EPSG:31370


In [73]:
drop_cols_pts = ['rcaCode', 'dcpDischar', 'repCode', 'dcpLatitud', 'dcpLongitu', 'dcpSurface', 'dcpTypeOfR', 'dcpState', 'dcpIrrigat', 'dcpNotAffe', 'dcpMSProvi', 'dcpCOMAcce', 'dcpGroundW', 'dcpWater_2', 'dcpGroun_1', 'dcpRecei_1', 'dcpWFDSu_1', 'dcpWFDRBDR', 'dcpBeginLi', 'dcpEndLife', 'dcpRemarks', 'dcpGeometr', 'rptMStateK']

discharge_pts_df = wal_disharge_pts.drop(drop_cols_pts, axis=1)
discharge_pts_df.drop(discharge_pts_df[discharge_pts_df['dcpWFDRBD'] == 'FRA'].index, inplace=True)

In [75]:
discharge_pts_df.dcpWaterBo.nunique()

1

In [26]:
#wal_disharge_pts.to_file(r'C:\Workdir\Develop\TRACING\geopackage\wal_layers.gpkg', layer='wal_disharge_pts', driver='GPKG')

### 2. Join the two datasets to the nearest geometries

In [27]:
print(discharge_pts_df.shape)
sewer_water_df = gpd.sjoin_nearest(discharge_pts_df, water_final).merge(water_final['geometry'], left_on="index_right", right_index=True)
print(sewer_water_df.shape)

(205, 9)
(205, 31)


In [28]:
sewer_water_df.head(2)

Unnamed: 0,uwwCode,dcpCode,dcpName,dcpNuts,dcpWater_1,dcpReceivi,dcpWFDSubU,dcpWFDRBD,geometry_x,index_right,...,ID_BASSI00,NOM_DISTRI,AREA_KM2,LENGTH_KM,CONTINUA,FICTICIOUS,REMARQUE,start_ID,end_ID,geometry_y
0,BERW9200305,BERWST9200305,Rj Step : ANDENNE (Seilles),BE352,BERW17_MV35R,Meuse II,BEMeuse aval_RW,BEMeuse_RW,POINT (203399.495 132360.671),1450,...,BE_MEUSE_RW_M,Meuse,392.636556,107.424608,realSurfaceWaterSegment,False,,WAL1139,WAL1042,"LINESTRING (202631.963 132060.413, 202635.418 ..."
1,BERW5600101,BERWST5600101,Rj Step : ANDERLUES,BE326,BERW17_HN01R,Haine I,BEHaine_RW,BEEscaut_RW,POINT (142780.026 123729.797),471,...,BE_ESCAUT_RW_E,Escaut,159.610053,53.291554,realSurfaceWaterSegment,False,,WAL417,WAL593,"LINESTRING (143819.969 121844.607, 143824.299 ..."


In [29]:
sewer_water_df[['CENTRELINE', 'dcpWater_1']].sample(5)

Unnamed: 0,CENTRELINE,dcpWater_1
163,SA25R_08,BERW17_SA25R
128,EL14R_01,BERW17_EL14R
40,SA27R_04,BERW17_SA27R
106,MV15R_01,BERW17_MV15R
152,LE30R_06D,BERW17_LE30R


## Find connection point on network

In [30]:
def get_nearest_point(df, line_col, point_col):
    """
    For each point in points_df, find the nearest point in lines_df.
    This is achieved by converting geometry objects to coordinates, 
    and then finding the nearest coordinate in the line.
    """
    geoms = []
    for idx, row in df.iterrows():
        destinations = MultiPoint(row[line_col].coords) #geometry_y
        nearest_geoms = nearest_points(row[point_col], destinations) #geometry_x
        try:
            for coord in destinations:
                if coord == nearest_geoms[1]:
                    geoms.append(coord)
        except ValueError:
            print("No nearest point found for {}".format(row.CODEKOPPNT))
    return geoms

In [31]:
sewer_water_df['new_points'] = get_nearest_point(sewer_water_df, 'geometry_y', 'geometry_x')

  for coord in destinations:
  arr = construct_1d_object_array_from_listlike(values)


In [32]:
assert len(sewer_water_df) == sewer_water_df.geometry_x.nunique()

In [33]:
# 2. Split the dataframe into two dataframes, one for the nodes and one for the water.
#not using new linestrings

nodes_df = sewer_water_df[['dcpCode', 'CENTRELINE', 'dcpWater_1', 'new_points']].rename(columns={'new_points': 'geometry'})
connection_nodes_gdf = gpd.GeoDataFrame(nodes_df, geometry='geometry', crs=PROJ_CRS)

water_df = sewer_water_df[['CENTRELINE', 'EU_SWB_COD', 'geometry_y']].rename(columns={'geometry_y': 'geometry'})
water_gdf = gpd.GeoDataFrame(water_df, geometry='geometry', crs=PROJ_CRS)

In [34]:
print(sewer_water_df.shape)
sewer_water_df.head(2)

(205, 32)


Unnamed: 0,uwwCode,dcpCode,dcpName,dcpNuts,dcpWater_1,dcpReceivi,dcpWFDSubU,dcpWFDRBD,geometry_x,index_right,...,NOM_DISTRI,AREA_KM2,LENGTH_KM,CONTINUA,FICTICIOUS,REMARQUE,start_ID,end_ID,geometry_y,new_points
0,BERW9200305,BERWST9200305,Rj Step : ANDENNE (Seilles),BE352,BERW17_MV35R,Meuse II,BEMeuse aval_RW,BEMeuse_RW,POINT (203399.495 132360.671),1450,...,Meuse,392.636556,107.424608,realSurfaceWaterSegment,False,,WAL1139,WAL1042,"LINESTRING (202631.963 132060.413, 202635.418 ...",POINT (203371.88653999244 132388.86128517054)
1,BERW5600101,BERWST5600101,Rj Step : ANDERLUES,BE326,BERW17_HN01R,Haine I,BEHaine_RW,BEEscaut_RW,POINT (142780.026 123729.797),471,...,Escaut,159.610053,53.291554,realSurfaceWaterSegment,False,,WAL417,WAL593,"LINESTRING (143819.969 121844.607, 143824.299 ...",POINT (142851.3261399978 123686.05688512325)


In [35]:
connection_nodes_gdf

Unnamed: 0,dcpCode,CENTRELINE,dcpWater_1,geometry
0,BERWST9200305,MV35R_12,BERW17_MV35R,POINT (203371.887 132388.861)
1,BERWST5600101,HN01R_01,BERW17_HN01R,POINT (142851.326 123686.057)
94,BERWST5502206,HN01R_01,BERW17_HN01R,POINT (136065.392 127258.512)
95,BERWST5608701,HN01R_01,BERW17_HN01R,POINT (139890.622 127215.504)
2,BERWST8100102,ML16R_03G,BERW17_ML16R,POINT (255773.320 40113.584)
...,...,...,...,...
196,BERWST5206303,SN02C_01,BERW17_SN12R,POINT (141974.632 140020.302)
198,BERWST6212102,MV14R_01,BERW17_MV14R,POINT (227178.142 138738.457)
200,BERWST9308802,SA08R_02,BERW17_SA08R,POINT (154676.543 106116.623)
201,BERWDP253,DE01B_01,BERW17_DE02R,POINT (105096.836 135947.677)


In [36]:
connection_nodes_gdf.to_file(r"data_transform\\wal_connection_nodes.shp")


  pd.Int64Index,


## Join the sewer network to the water network

In [37]:
def get_point_coords(gdf):
    
    """ Returns coordinates as tuples of coordinates"""

    return gdf.geometry.apply(lambda geom: (geom.x, geom.y))


def get_line_coords(line):

    """ Returns a list of tuples of coordinates"""
    
    coords_list = []
    multi_points = MultiPoint(line.coords)

    multi_points_list = [shapely.wkt.dumps(g) for g in multi_points]
    multi_points_geoms = [shapely.wkt.loads(i) for i in multi_points_list]
    for i in multi_points_geoms:
        long, lat = i.x, i.y
        coords_list.append((long, lat))

    return coords_list

In [38]:
nodes_gdf= connection_nodes_gdf.copy()
nodes_gdf['coords'] = get_point_coords(connection_nodes_gdf)

water_gdf['coords'] = water_gdf.apply(lambda row: get_line_coords(row.geometry), axis=1)

  multi_points_list = [shapely.wkt.dumps(g) for g in multi_points]


In [39]:
#nodes_gdf_upload = nodes_gdf.drop(['coords'], axis=1)
#nodes_gdf_upload.to_file(r"C:\Workdir\Develop\TR_USECASE\data_transform\wal_conn_nodes.shp")

In [40]:
#water_discharge_df['connection_nodes'] = get_nearest_point(water_discharge_df, 'geometry_y', 'geometry_x')

## Project and Split Lines

### 1. Split the dataframe into water and discharge points dataframes

In [41]:
def get_line_segments(l, points_list):

    idx_list = [i for i, item in enumerate(l) if item in points_list] # compares the two lists and returns the indexes of occurence
    
    p = [l[i] for i in idx_list] # get correct order of points list on the line

    super_list = []

    start_idx = 0



    #print("Index list: ", idx_list)

    if len(idx_list) == 1 and (p[0] == l[0] or p[0] == l[-1]): #      (i == 0 or i == len(l)-1) and len(idx_list) == 1:
        #print("One split point, at first or last index")
        line_segment = LineString(l)
        super_list.append(line_segment)

    elif len(idx_list) == 2 and (p[0] == l[0] or p[1] == l[-1]):
        #print("Two split points, at first and last index")
        line_segment = LineString(l)
        super_list.append(line_segment)

    else:

        #import pdb; pdb.set_trace()
        for i in idx_list:
            # In the case of the first coordinates of a line being a split point but there are other split points
            if i == 0 and len(idx_list) > 1:
                index_list = len(idx_list)
                
                #print(f"First index is a split point, with {index_list} split points")
                continue

            else:
                #print("Many split points")
                stop_idx = i + 1 #grab list elements until index i
                #print(f"stop index is {stop_idx}")

                line_list = l[start_idx: stop_idx]
                line_segment = LineString(line_list)
                super_list.append(line_segment)
                start_idx = i #reset the start index to the number of the prevous stop index

                #Catch if a split point is at the end of the line list
                #if stop_idx == len(l) or p[-1] == line_list[-1]:
                    #print("Many split points, with a split point at end of list") # stop index goes beyond the line list
                #    break

                 #super list still has one more segment to add
                if len(super_list) == len(idx_list):
                    #print("Add last segment after last split point") # stop index goes beyond the line list
                    #super list still has one more segment to add
                    ##print("Add last segment after last split point") # stop index goes beyond the line list
                    #print("super list still has one more segment to add")
                    #print(f"stop index is {stop_idx}")
                    #print(f'Linestring length {len(l)}')
                    #print(f'Index list {len(idx_list)}')
                    #print(f'Length of superlist {len(super_list)}')
                    #print('Last sgment is "last_segment = l[stop_idx-1:len(l)]"')
                    #n = len(l) - len(super_list)

                    last_segment = l[stop_idx-1:len(l)]
                    if stop_idx == len(l):
                        #print("Split point at end of list") # stop index goes beyond the line list
                        break
                    # n = len(l) - len(super_list)
                    #last_segment = l[stop_idx-1:len(l)] # Grab the last segments of the list from the prevous stop_idx-1, to the end of the lin len(l)
                    else:
                        #print("Split point at end of list")
                        last_segment_geom = LineString(last_segment)
                        super_list.append(last_segment_geom)
                    
                    #if stop_idx == len(l) or p[-1] == line_list[-1]:
                        #print("Split point at end of list") # stop index goes beyond the line list
                        #break

        #if points_list[0] == line_list[0] and points_list[-1] == line_list[-1]:
        #    assert len(super_list) == len(idx_list) - 1
        #elif points_list[0] == line_list[0] or points_list[-1] == line_list[-1]:
        #    assert len(super_list) == len(idx_list)
        #else:
        #    assert len(super_list) == len(idx_list) + 1

    return super_list

# pass a dataframe to the function

def split_lines(water_gdf, nodes_gdf, unique_id):

    #nodes_no_duplicates = nodes_gdf.drop_duplicates(subset='CODEKOPPNT')
    water_no_duplicates = water_gdf.drop_duplicates(subset=unique_id)

    groups = nodes_gdf.groupby(unique_id)

    codes_list = nodes_gdf[unique_id].to_list()
    unique_code_list = list(set(codes_list))
    #print(len(unique_code_list))

    #print(unique_code_list[71:73])

    all_segments = []
    ids = []
    #counter = 0

    for num, i in enumerate(unique_code_list):
        #print(f"Start loop {num, i}")
        
        #group = groups.get_group(i)
        #print("Get points list")

        points_list = groups.get_group(i).coords.to_list()
        #print("Points list: ", points_list)
        
        #print(points_list)

        line = water_no_duplicates[water_no_duplicates[unique_id] == i]['coords'][:1]
        #indx = water_no_duplicates[water_no_duplicates[unique_id] == i].index [0]

        points_list = groups.get_group(i).coords.to_list()
        
        line_segments = get_line_segments(*line, points_list)
        num_segments = len(line_segments)

        all_segments.extend(line_segments)
        #ids.append(indx)
        num_unique_ids = [i] * num_segments
        #get group for each unique code
        #group_ids = groups.get_group(i)[unique_id].to_list()

        #assert len(flat_list) == len(water_no_duplicates)
        ids.extend(num_unique_ids)
    
        #print(f"Line: {i}")
        #print(f'len({line_segments}) line_segments added')
        #print(line_segments)

        #all_segments = all_segments.extend(line_segment)

    #print(all_segments)
        #counter += 1

    #print(unique_code_list[72:74])
    ## Create dataframe with all segments
    print(len(all_segments))
    print(len(ids))
    #ids_list = list(range(len(ids)))
    gdf_segments = gpd.GeoDataFrame(list(range(len(all_segments))), geometry=all_segments, crs=PROJ_CRS)
    #gdf_segments = gpd.GeoDataFrame(gdf_segments, geometry='geometry', crs=PROJ_CRS)
    gdf_segments.columns = ['index', 'geometry']
    gdf_segments[unique_id] = ids
    return gdf_segments

In [42]:
water_gdf.columns

Index(['CENTRELINE', 'EU_SWB_COD', 'geometry', 'coords'], dtype='object')

In [43]:
nodes_gdf.columns

Index(['dcpCode', 'CENTRELINE', 'dcpWater_1', 'geometry', 'coords'], dtype='object')

In [44]:
import time

initialTime = time.time()
splitlines_df = split_lines(water_gdf, nodes_gdf, 'CENTRELINE')
finishTime = time.time()
print(f"Time taken: {finishTime - initialTime}")
print("********************************************************")

384
384
Time taken: 0.8165185451507568
********************************************************


In [45]:
print(splitlines_df.shape)
splitlines_df.head(2)

(384, 3)


Unnamed: 0,index,geometry,CENTRELINE
0,0,"LINESTRING (275109.344 106354.443, 275107.766 ...",ML04R_02_1
1,1,"LINESTRING (275478.907 106761.901, 275482.282 ...",ML04R_02_1


### Combine Nodes

In [86]:
def combine_nodes():
    water_nodes_df['source'] = 'water_node'
    assert len(water_nodes_df) == water_nodes_df.geometry.nunique()

    connection_nodes = connection_nodes_gdf[['dcpCode', 'geometry']].rename(columns={'dcpCode': 'node_id'})
    connection_nodes['source'] = 'connection_node'
    assert connection_nodes.geometry.nunique() == connection_nodes.node_id.nunique()

    wal_nodes_combined = pd.concat([water_nodes_df, connection_nodes])

    final_nodes_combined = wal_nodes_combined.drop_duplicates(subset='geometry', keep='first').reset_index(drop=True) #keep last keeps the connection nodes over the water nodes
    assert len(final_nodes_combined) == wal_nodes_combined.geometry.nunique()

    return final_nodes_combined

In [87]:
final_nodes_combined = combine_nodes()

In [88]:
final_nodes_combined

Unnamed: 0,geometry,node_id,source
0,POINT (272046.907 146660.529),WAL1,water_node
1,POINT (232526.758 85282.629),WAL2,water_node
2,POINT (193357.056 93211.661),WAL3,water_node
3,POINT (188304.050 95172.237),WAL4,water_node
4,POINT (201795.869 103739.854),WAL5,water_node
...,...,...,...
1675,POINT (141974.632 140020.302),BERWST5206303,connection_node
1676,POINT (227178.142 138738.457),BERWST6212102,connection_node
1677,POINT (154676.543 106116.623),BERWST9308802,connection_node
1678,POINT (105096.836 135947.677),BERWDP253,connection_node


In [51]:
#add a sewer node column into the connection nodes

def add_sewernode_id(row):
    if row['source'] == 'connection_node':
        return row['node_id']
    else:
        return None

nodes_all = final_nodes_combined.copy()

nodes_all['sewernode_id'] = nodes_all.apply(add_sewernode_id, axis=1)

In [94]:
def get_water_nodes(prefix):
    nodes_all = final_nodes_combined.copy()
    nodes_all['sewernode_id'] = nodes_all.apply(add_sewernode_id, axis=1)

    conn_df = nodes_all[nodes_all['source'] == 'connection_node']
    water_nodes = nodes_all.loc[nodes_all['source'] == 'water_node']

    nodes_list = water_nodes['node_id'].to_list()
    start_num = max([int(i[len(prefix):]) for i in nodes_list])

    diff = len(nodes_all.index) - len(water_nodes_df.index)

    conn_df['node_id'] = range((start_num + 1), (start_num + diff +1))
    conn_df['node_id'] = prefix + conn_df['node_id'].astype(str)

    nodes_all = pd.concat([water_nodes, conn_df])

    nodes_all_merged = pd.merge(nodes_all, wal_disharge_pts[['dcpCode', 'dcpName', 'dcpWaterBo', 'dcpState']], left_on='sewernode_id', right_on='dcpCode', how='left')

    nodes_all_gdf = gpd.GeoDataFrame(nodes_all_merged, geometry='geometry', crs=PROJ_CRS).reset_index(drop=True)

    return nodes_all_gdf

In [95]:
waternodes = get_water_nodes('WAL')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [96]:
waternodes

Unnamed: 0,geometry,node_id,source,sewernode_id,dcpCode,dcpName,dcpWaterBo,dcpState
0,POINT (272046.907 146660.529),WAL1,water_node,,,,,
1,POINT (232526.758 85282.629),WAL2,water_node,,,,,
2,POINT (193357.056 93211.661),WAL3,water_node,,,,,
3,POINT (188304.050 95172.237),WAL4,water_node,,,,,
4,POINT (201795.869 103739.854),WAL5,water_node,,,,,
...,...,...,...,...,...,...,...,...
1675,POINT (141974.632 140020.302),WAL1676,connection_node,BERWST5206303,BERWST5206303,Rj Step : FELUY (NIE-PRE),FW,1.0
1676,POINT (227178.142 138738.457),WAL1677,connection_node,BERWST6212102,BERWST6212102,Rj Step : NEUVILLE,FW,1.0
1677,POINT (154676.543 106116.623),WAL1678,connection_node,BERWST9308802,BERWST9308802,Rj Step : WALCOURT,FW,1.0
1678,POINT (105096.836 135947.677),WAL1679,connection_node,BERWDP253,BERWDP253,Station de pompage Ecacheries,FW,1.0


In [97]:
waternodes.to_file(r"data_transform\final_wal_nodes_combined.shp")

  pd.Int64Index,
  waternodes.to_file(r"data_transform\final_wal_nodes_combined.shp")


## Get start and end IDs for the line segments, to include new connection nodes

In [56]:
def line_segments_start_end_ids(splitlines_df, water_final, all_nodes_gdf, node_id):
    """ Returns a dataframe with the start and end ids of each line segment """
    #splitlines_df = splitlines_df.drop(columns=['index'])

    splitlines_df['coords'] = splitlines_df.apply(lambda row: get_line_coords(row.geometry), axis=1)
    all_nodes_gdf['coords'] = get_point_coords(all_nodes_gdf)
    #join linestrings to the nearest node, in this case the node attached to the line
    joined_lines_nodes = gpd.sjoin_nearest(splitlines_df, all_nodes_gdf, how='left').reset_index()
    

    idx_start = []
    start_id = []
    idx_end = []
    end_id = []
    for idx, row in joined_lines_nodes.iterrows():
        if row.coords_right == row.coords_left[0]:
            idx_start.append(row['index'])
            start_id.append(row[node_id])
        elif row.coords_right == row.coords_left[-1]:
            idx_end.append(row['index'])
            end_id.append(row[node_id])


    start_id_df = pd.DataFrame({'line_index': idx_start, f'start_{node_id}': start_id})
    end_id_df = pd.DataFrame({'line_index': idx_end, f'end_{node_id}': end_id})
    
    
    start_id_df_all = pd.merge(start_id_df, joined_lines_nodes[['index', node_id, 'CENTRELINE', 'geometry']], left_on='line_index', right_on = 'index', how='left').drop_duplicates('geometry')
    end_id_df_all = pd.merge(end_id_df, joined_lines_nodes[['index',  node_id, 'CENTRELINE', 'geometry']], left_on='line_index', right_on = 'index', how='left').drop_duplicates('geometry')

    
    merged_start_end_df = pd.merge(start_id_df_all, end_id_df_all, on='geometry', how='outer')
    merged_start_end_df = gpd.GeoDataFrame(merged_start_end_df, geometry='geometry', crs=PROJ_CRS)

    assert merged_start_end_df.CENTRELINE_x.all() == merged_start_end_df.CENTRELINE_y.all()

    splitlines_with_ids = merged_start_end_df.drop(['line_index_x', 'index_x', 'node_id_x',	'index_y', 'line_index_y', 'node_id_y', 'CENTRELINE_y'], axis=1)\
                                            .rename(columns={'start_node_id': 'start_ID', 'end_node_id': 'end_ID'})

    assert len(splitlines_with_ids) == splitlines_with_ids.geometry.nunique()

    return splitlines_with_ids

In [57]:
node_id = 'node_id'
splitlines_with_ids = line_segments_start_end_ids(splitlines_df, water_final, waternodes, node_id)

  multi_points_list = [shapely.wkt.dumps(g) for g in multi_points]


In [58]:
splitlines_with_ids.head(2)

Unnamed: 0,start_ID,CENTRELINE_x,geometry,end_ID
0,WAL236,ML04R_02_1,"LINESTRING (275109.344 106354.443, 275107.766 ...",WAL1635
1,WAL1635,ML04R_02_1,"LINESTRING (275478.907 106761.901, 275482.282 ...",WAL235


### Get unique ids for the new split segments based on original string ID

In [59]:
def get_unique_ID(df, col):
    """ Get unique ID for each new split segment in a dataframe
    Assert that the number of unique IDs is equal to the number of split segments
     """
    # the new split lines need a new unique uniqueID value
    df['num_id'] = df.groupby(col).cumcount()+1
    df["new_string_id"] = df[col].astype(str) + '_' + df['num_id'].astype(str)

    #df = df.drop(columns=['num_id', col]).rename(columns={'new_string_id': col})
    #assert df[col].nunique() == len(df)

    return df

In [60]:
splitlines_with_ids.head(2)

Unnamed: 0,start_ID,CENTRELINE_x,geometry,end_ID
0,WAL236,ML04R_02_1,"LINESTRING (275109.344 106354.443, 275107.766 ...",WAL1635
1,WAL1635,ML04R_02_1,"LINESTRING (275478.907 106761.901, 275482.282 ...",WAL235


In [61]:
splitlines_CENTRELINE = get_unique_ID(splitlines_with_ids, 'CENTRELINE_x')
assert len(splitlines_CENTRELINE) == splitlines_CENTRELINE.new_string_id.nunique()

In [62]:
splitlines_CENTRELINE.head(2)

Unnamed: 0,start_ID,CENTRELINE_x,geometry,end_ID,num_id,new_string_id
0,WAL236,ML04R_02_1,"LINESTRING (275109.344 106354.443, 275107.766 ...",WAL1635,1,ML04R_02_1_1
1,WAL1635,ML04R_02_1,"LINESTRING (275478.907 106761.901, 275482.282 ...",WAL235,2,ML04R_02_1_2


In [63]:
splitlines_merged = splitlines_CENTRELINE.merge(water_final, left_on='CENTRELINE_x', right_on='CENTRELINE', how='left')
assert splitlines_merged.CENTRELINE_x.all() == splitlines_merged.CENTRELINE.all()

In [64]:
splitlines_merged.columns

Index(['start_ID_x', 'CENTRELINE_x', 'geometry_x', 'end_ID_x', 'num_id',
       'new_string_id', 'OBJECTID', 'CENTRELINE', 'SWB_NAME', 'EU_SWB_COD',
       'MS_SWD_COD', 'MS_TYPO_CO', 'MS_TYPO_NA', 'SWB_CARACT', 'ID_BASSIN_',
       'NOM_BASSIN', 'ID_DISTRIC', 'ID_BASSI00', 'NOM_DISTRI', 'AREA_KM2',
       'LENGTH_KM', 'CONTINUA', 'FICTICIOUS', 'REMARQUE', 'geometry_y',
       'start_ID_y', 'end_ID_y'],
      dtype='object')

In [65]:
splitlines_final = splitlines_merged.drop(['CENTRELINE_x', 'CENTRELINE', 'num_id', 'start_ID_y', 'end_ID_y', 'geometry_y'], axis=1)\
                                    .rename(columns={'new_string_id': 'CENTRELINE', 'start_ID_x':'start_ID', 'end_ID_x':'end_ID', 'geometry_x': 'geometry'})
splitlines_final.head(2)

Unnamed: 0,start_ID,geometry,end_ID,CENTRELINE,OBJECTID,SWB_NAME,EU_SWB_COD,MS_SWD_COD,MS_TYPO_CO,MS_TYPO_NA,...,ID_BASSIN_,NOM_BASSIN,ID_DISTRIC,ID_BASSI00,NOM_DISTRI,AREA_KM2,LENGTH_KM,CONTINUA,FICTICIOUS,REMARQUE
0,WAL236,"LINESTRING (275109.344 106354.443, 275107.766 ...",WAL1635,ML04R_02_1_1,920.0,Braunlauf II,BERW17_ML04R,ML04R,RIV_04,Ruisseaux ardennais à pente moyenne,...,BE_RHIN_RW_R_ML,Moselle,BE_RHIN_RW,BE_RHIN_RW_R,Rhin,35.037682,10.837123,realSurfaceWaterSegment,False,
1,WAL1635,"LINESTRING (275478.907 106761.901, 275482.282 ...",WAL235,ML04R_02_1_2,920.0,Braunlauf II,BERW17_ML04R,ML04R,RIV_04,Ruisseaux ardennais à pente moyenne,...,BE_RHIN_RW_R_ML,Moselle,BE_RHIN_RW,BE_RHIN_RW_R,Rhin,35.037682,10.837123,realSurfaceWaterSegment,False,


## Gather all linestrings into one dataset

In [66]:
def merge_segments_to_water(split_segments, split_segments_final, water_df, col):
    """This function merges segments to water polygons"""
    # drop the linestrings to be split and merge the df with split lines
    #water_final_str = water_final_str.astype({"VHAS": str}, errors='raise')
    
    split_segments = split_segments.astype({col: str}, errors='raise')
    split_segments_final = split_segments_final.astype({col: str}, errors='raise')
    water_df = water_df.astype({col: str}, errors='raise')

    assert split_segments_final[col].nunique() == split_segments_final['geometry'].nunique()
    assert water_df[col].nunique() == water_df['geometry'].nunique()

    print(len(split_segments_final))
    print(len(water_df))
    linestrings_to_drop = split_segments[col].to_list()
    print('linestrings_to_drop:', len(set(linestrings_to_drop)))

    print("linestrings to drop: ", len(linestrings_to_drop))
    water_df_trimmed = water_df[~water_df[col].isin(linestrings_to_drop)].reset_index(drop=True)
    water_df_drop = water_df[water_df[col].isin(linestrings_to_drop)]
    assert water_df_trimmed[col].nunique() == water_df_trimmed['geometry'].nunique()
    print("water df trimmed: ", len(water_df_trimmed))
    print("water df drop: ", len(water_df_drop))

    #merge the split lines with the original water lines
    #water_df_trimmed_merged = water_df_trimmed.merge(split_segments_line_ids, on=col, how='outer')
    merged_df = gpd.GeoDataFrame(pd.concat([split_segments_final, water_df_trimmed], ignore_index=True), geometry='geometry', crs=PROJ_CRS)
    print("merged df: ", len(merged_df))
    assert merged_df['geometry'].nunique() == merged_df[col].nunique()

    #assert merged_df['geometry'].nunique() == merged_df[col].nunique()
    return merged_df

In [67]:
segments_to_water = merge_segments_to_water(splitlines_df, splitlines_final, water_final, 'CENTRELINE')

384
1489
linestrings_to_drop: 183
linestrings to drop:  384
water df trimmed:  1306
water df drop:  183
merged df:  1690


In [68]:
assert segments_to_water['CENTRELINE'].nunique() == segments_to_water['geometry'].nunique()

In [69]:
network_conn_df = segments_to_water[['CENTRELINE', 'start_ID', 'end_ID']]
network_conn_df.head()

Unnamed: 0,CENTRELINE,start_ID,end_ID
0,ML04R_02_1_1,WAL236,WAL1635
1,ML04R_02_1_2,WAL1635,WAL235
2,EL15R_01_1,WAL168,WAL1610
3,EL15R_01_2,WAL1610,WAL1200
4,EL03C_01_1,WAL216,WAL1494


In [70]:
segments_to_water['LENGTH_M'] = segments_to_water['geometry'].apply(lambda x: x.length)

In [71]:
segments_to_water.head()

Unnamed: 0,start_ID,geometry,end_ID,CENTRELINE,OBJECTID,SWB_NAME,EU_SWB_COD,MS_SWD_COD,MS_TYPO_CO,MS_TYPO_NA,...,NOM_BASSIN,ID_DISTRIC,ID_BASSI00,NOM_DISTRI,AREA_KM2,LENGTH_KM,CONTINUA,FICTICIOUS,REMARQUE,LENGTH_M
0,WAL236,"LINESTRING (275109.344 106354.443, 275107.766 ...",WAL1635,ML04R_02_1_1,920.0,Braunlauf II,BERW17_ML04R,ML04R,RIV_04,Ruisseaux ardennais à pente moyenne,...,Moselle,BE_RHIN_RW,BE_RHIN_RW_R,Rhin,35.037682,10.837123,realSurfaceWaterSegment,False,,753.738794
1,WAL1635,"LINESTRING (275478.907 106761.901, 275482.282 ...",WAL235,ML04R_02_1_2,920.0,Braunlauf II,BERW17_ML04R,ML04R,RIV_04,Ruisseaux ardennais à pente moyenne,...,Moselle,BE_RHIN_RW,BE_RHIN_RW_R,Rhin,35.037682,10.837123,realSurfaceWaterSegment,False,,250.024687
2,WAL168,"LINESTRING (73874.981 159186.774, 73871.194 15...",WAL1610,EL15R_01_1,1376.0,Grande Espierres,BERW17_EL15R,EL15R,RIV_20,Ruisseaux limoneux à pente moyenne,...,Escaut-Lys,BE_ESCAUT_RW,BE_ESCAUT_RW_E,Escaut,21.594859,12.557251,realSurfaceWaterSegment,False,,2774.20292
3,WAL1610,"LINESTRING (71453.101 160100.358, 71438.755 16...",WAL1200,EL15R_01_2,1376.0,Grande Espierres,BERW17_EL15R,EL15R,RIV_20,Ruisseaux limoneux à pente moyenne,...,Escaut-Lys,BE_ESCAUT_RW,BE_ESCAUT_RW_E,Escaut,21.594859,12.557251,realSurfaceWaterSegment,False,,4700.87775
4,WAL216,"LINESTRING (99604.258 131439.791, 99588.008 13...",WAL1494,EL03C_01_1,1483.0,Canal Blaton-Ath,BERW17_EL03C,EL03C,RIV_25,Voies d'eau artificielles,...,Escaut-Lys,BE_ESCAUT_RW,BE_ESCAUT_RW_E,Escaut,11.70372,4.896187,realSurfaceWaterSegment,False,,1396.043895


In [72]:
segments_to_water.to_file(r"data_transform\wal_water_PROCESSED.shp")

  pd.Int64Index,
