In [1]:
from dask.distributed import Client, LocalCluster
cluster = LocalCluster()
client = Client(cluster)

In [2]:
client

0,1
Client  Scheduler: tcp://127.0.0.1:38961  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 8.35 GB


In [3]:
import pandas as pd

In [7]:
df = pd.DataFrame({'a': [0, 2, 4, 6] * 10, 'b': [1, 3, 5, 7] * 10})

In [11]:
df.a.apply(lambda x: x**3)

0       0
1       8
2      64
3     216
4       0
5       8
6      64
7     216
8       0
9       8
10     64
11    216
12      0
13      8
14     64
15    216
16      0
17      8
18     64
19    216
20      0
21      8
22     64
23    216
24      0
25      8
26     64
27    216
28      0
29      8
30     64
31    216
32      0
33      8
34     64
35    216
36      0
37      8
38     64
39    216
Name: a, dtype: int64

In [13]:
import dask
import dask.dataframe as dd

In [17]:
ddf = dd.from_pandas(df, npartitions=4)

In [18]:
ddf

Unnamed: 0_level_0,a,b
npartitions=4,Unnamed: 1_level_1,Unnamed: 2_level_1
0,int64,int64
10,...,...
20,...,...
30,...,...
39,...,...


In [25]:
appl = ddf.a.apply(lambda x: x**90)

In [26]:
appl.compute()

0                                                     0
1                          1237940039285380274899124224
2     1532495540865888858358347027150309183618739122...
3     1080469556235987051829919370389914884872401572...
4                                                     0
5                          1237940039285380274899124224
6     1532495540865888858358347027150309183618739122...
7     1080469556235987051829919370389914884872401572...
8                                                     0
9                          1237940039285380274899124224
10    1532495540865888858358347027150309183618739122...
11    1080469556235987051829919370389914884872401572...
12                                                    0
13                         1237940039285380274899124224
14    1532495540865888858358347027150309183618739122...
15    1080469556235987051829919370389914884872401572...
16                                                    0
17                         123794003928538027489

In [1]:
import os

import geopandas as gpd
import pandas as pd
import numpy as np
import momepy as mm
import networkx as nx

from sqlalchemy import create_engine

from consolidate import consolidate, roundabouts, topology

In [57]:
def check(x, y):
    user = os.environ.get('DB_USER')
    pwd = os.environ.get('DB_PWD')
    host = os.environ.get('DB_HOST')
    port = os.environ.get('DB_PORT')

    url = f"postgres+psycopg2://{user}:{pwd}@{host}:{port}/built_env"
    engine = create_engine(url)
    # x, y = xy  # coordinates in epsg 27700
    buffer = 1000  # radius in [m]

    sql = f'SELECT * FROM openroads_200803_topological WHERE ST_DWithin(geometry, ST_SetSRID(ST_Point({x}, {y}), 27700), {buffer})'

    df = gpd.read_postgis(sql, engine, geom_col='geometry')

    areas = [500, 2000]
    compactness = [0.7, 0.9]
    results = []

    for area in areas:
        for comp in compactness:
            topo = consolidate(df, filter_func=roundabouts, area=area, circom=comp)

            G = mm.gdf_to_nx(topo)
            mesh = mm.meshedness(G, radius=None)
            G = mm.subgraph(G, meshedness=True, cds_length=False, mean_node_degree=False, proportion={0: False, 3: False, 4: False}, cyclomatic=False, edge_node_ratio=False, gamma=False, local_closeness=True, closeness_weight=None, verbose=False)
            vals = list(nx.get_node_attributes(G, 'meshedness').values())
            l_mesh_mean = np.mean(vals)
            l_mesh_median = np.median(vals)
            l_mesh_dev = np.std(vals)
            vals = list(nx.get_node_attributes(G, 'local_closeness').values())
            l_close_mean = np.mean(vals)
            l_close_median = np.median(vals)
            l_close_dev = np.std(vals)
            node_density = nx.number_of_nodes(G) / topo.length.sum()

            results += [area, comp, mesh, l_mesh_mean, l_mesh_median, l_mesh_dev, l_close_mean, l_close_median, l_close_dev]
    
    return results

In [35]:
pts = pd.DataFrame([(334289.32, 390468.43), (413600.89, 130366.55), (355619.40, 145872.69), (253464.65, 62056.59)])

In [64]:
pas = pts.apply(lambda x:  check(x[0], x[1]), axis=1)

In [44]:
ddf = dd.from_pandas(pts, npartitions=4)

In [58]:
vals = ddf.apply(lambda x:  check(x[0], x[1]), axis=1, meta=pd.Series(dtype='object'))

In [61]:
computed = vals.compute()

In [65]:
computed

0    [500, 0.7, 0.181739879414298, 0.13087980656383...
1    [500, 0.7, 0.07715582450832073, 0.052926181290...
2    [500, 0.7, 0.052525252525252523, 0.04129909398...
3    [500, 0.7, 0.0851063829787234, 0.0880821904608...
dtype: object

In [66]:
pas

0    [500, 0.7, 0.181739879414298, 0.13087980656383...
1    [500, 0.7, 0.07715582450832073, 0.052926181290...
2    [500, 0.7, 0.052525252525252523, 0.04129909398...
3    [500, 0.7, 0.0851063829787234, 0.0880821904608...
dtype: object

In [67]:
pd.testing.assert_series_equal(computed, pas)

In [77]:
open_names = pd.read_csv('../opname_csv_gb/DATA/ST66.csv', index_col=None)
header = pd.read_csv('../opname_csv_gb/DOC/OS_Open_Names_Header.csv', index_col=None)


In [78]:
header

Unnamed: 0,ID,NAMES_URI,NAME1,NAME1_LANG,NAME2,NAME2_LANG,TYPE,LOCAL_TYPE,GEOMETRY_X,GEOMETRY_Y,...,COUNTY_UNITARY,COUNTY_UNITARY_URI,COUNTY_UNITARY_TYPE,REGION,REGION_URI,COUNTRY,COUNTRY_URI,RELATED_SPATIAL_OBJECT,SAME_AS_DBPEDIA,SAME_AS_GEONAMES


In [84]:
open_names.columns = header.columns

In [88]:
open_names.LOCAL_TYPE.unique()

array(['Suburban Area', 'Village', 'Hamlet', 'City', 'Other Settlement',
       'Town', 'Postcode', 'Named Road', 'Section Of Named Road',
       'Section Of Numbered Road', 'Numbered Road', 'Hill Or Mountain',
       'Woodland Or Forest', 'Inland Water', 'Other Landcover', 'Valley',
       'Railway', 'Urban Greenspace', 'Spot Height', 'Cirque Or Hollow',
       'Further Education', 'Primary Education', 'Secondary Education',
       'Railway Station', 'Non State Secondary Education',
       'Medical Care Accommodation', 'Special Needs Education',
       'Further Education,Non State Secondary Education', 'Hospital',
       'Higher or University Education', 'Non State Primary Education',
       'Hospice', 'Bus Station', 'Electricity Production', 'Airfield',
       'Non State Primary Education,Non State Secondary Education'],
      dtype=object)

In [89]:
open_names[open_names.LOCAL_TYPE == 'City']

Unnamed: 0,ID,NAMES_URI,NAME1,NAME1_LANG,NAME2,NAME2_LANG,TYPE,LOCAL_TYPE,GEOMETRY_X,GEOMETRY_Y,...,COUNTY_UNITARY,COUNTY_UNITARY_URI,COUNTY_UNITARY_TYPE,REGION,REGION_URI,COUNTRY,COUNTRY_URI,RELATED_SPATIAL_OBJECT,SAME_AS_DBPEDIA,SAME_AS_GEONAMES
45,osgb4000000074572822,http://data.ordnancesurvey.co.uk/id/4000000074...,Bath,,,,populatedPlace,City,375093,164923,...,Bath and North East Somerset,http://data.ordnancesurvey.co.uk/id/7000000000...,http://data.ordnancesurvey.co.uk/ontology/admi...,South West,http://data.ordnancesurvey.co.uk/id/7000000000...,England,http://data.ordnancesurvey.co.uk/id/country/en...,,,http://sws.geonames.org/2656173


In [91]:
import fiona

In [92]:
fiona.listlayers('../opname_gpkg_gb/data/opname_gb.gpkg')

['NamedPlace']

In [13]:
major = gpd.read_file('../Major_Towns_and_Cities__December_2015__Boundaries.geojson')

In [21]:
major['geometry'] = major.centroid.to_crs(27700)

In [22]:
major

Unnamed: 0,objectid,tcity15cd,tcity15nm,st_areashape,st_lengthshape,geometry
0,1,J01000001,Barnsley,2.568247e+07,115099.860000,POINT (434627.651 407669.833)
1,2,J01000002,Basildon,2.551499e+07,119299.838000,POINT (570873.671 189207.516)
2,3,J01000003,Basingstoke,2.918502e+07,93900.388003,POINT (463065.018 152019.027)
3,4,J01000004,Bath,2.423750e+07,92099.940000,POINT (374795.202 164832.639)
4,5,J01000005,Bedford,2.016749e+07,71300.186000,POINT (505742.180 250327.575)
...,...,...,...,...,...,...
107,108,J01000108,Woking,2.908998e+07,92799.937100,POINT (500684.336 159059.830)
108,109,J01000109,Wolverhampton,5.932501e+07,131300.020000,POINT (391271.002 299238.180)
109,110,J01000110,Worcester,2.423248e+07,89800.048000,POINT (385584.581 255410.125)
110,111,J01000111,Worthing,2.458249e+07,47000.022000,POINT (512796.047 103944.437)


In [41]:
pts = pd.DataFrame(major.set_index('tcity15nm').geometry.apply(lambda x: (x.x, x.y)))

In [42]:
pts

Unnamed: 0_level_0,geometry
tcity15nm,Unnamed: 1_level_1
Barnsley,"(434627.6513354856, 407669.832611784)"
Basildon,"(570873.670668373, 189207.51646598754)"
Basingstoke,"(463065.0179892903, 152019.0270463108)"
Bath,"(374795.2024792732, 164832.6389714771)"
Bedford,"(505742.17987574486, 250327.57510175492)"
...,...
Woking,"(500684.33641567733, 159059.83044815497)"
Wolverhampton,"(391271.00219945755, 299238.17997642065)"
Worcester,"(385584.5812961166, 255410.12549394125)"
Worthing,"(512796.0471801633, 103944.43660039204)"


In [44]:
pts.to_csv('major_cities.csv')

In [None]:
docker run --network="host"  --rm -ti -p 8786:8786 darribas/gds_dev:5.0 start.sh dask-worker 138.253.73.214:8786

In [3]:
import os

import geopandas as gpd
import pandas as pd
import numpy as np
import momepy as mm
import networkx as nx
import dask.dataframe as dd

from sqlalchemy import create_engine

# from data_processing.vector_data.consolidate import consolidate, roundabouts, topology

In [4]:
def check(xy):
    
    from itertools import combinations
    import collections

    import pygeos
    import numpy as np
    import pandas as pd
    import geopandas as gpd
    import momepy as mm

    from shapely.ops import polygonize
    from scipy.spatial import Voronoi


    # helper functions
    def get_ids(x, ids):
        return ids[x]


    mp = np.vectorize(get_ids, excluded=["ids"])


    def dist(p1, p2):
        return np.sqrt(((p1[0] - p2[0]) ** 2) + ((p1[1] - p2[1]) ** 2))


    def get_verts(x, voronoi_diagram):
        return voronoi_diagram.vertices[x]


    def _average_geometry(lines, poly=None, distance=2):
        """
        Returns average geometry.


        Parameters
        ----------
        lines : list
            LineStrings connected at endpoints forming a closed polygon
        poly : shapely.geometry.Polygon
            polygon enclosed by `lines`
        distance : float
            distance for interpolation

        Returns list of averaged geometries
        """
        if not poly:
            polygons = list(polygonize(lines))
            if len(polygons) == 1:
                poly = polygons[0]
            else:
                raise ValueError("given lines do not form a single polygon")
        # get an additional line around the lines to avoid infinity issues with Voronoi
        extended_lines = [poly.buffer(distance).exterior] + lines

        # interpolate lines to represent them as points for Voronoi
        points = np.empty((0, 2))
        ids = []

        pygeos_lines = pygeos.from_shapely(extended_lines)
        lengths = pygeos.length(pygeos_lines)
        for ix, (line, length) in enumerate(zip(pygeos_lines, lengths)):
            pts = pygeos.line_interpolate_point(
                line, np.linspace(0.1, length - 0.1, num=int((length - 0.1) // distance))
            )  # .1 offset to keep a gap between two segments
            points = np.append(points, pygeos.get_coordinates(pts), axis=0)
            ids += [ix] * len(pts)

            # here we might also want to append original coordinates of each line
            # to get a higher precision on the corners, but it does not seem to be
            # necessary based on my tests.

        # generate Voronoi diagram
        voronoi_diagram = Voronoi(points)

        # get all rigdes and filter only those between the two lines
        pts = voronoi_diagram.ridge_points
        mapped = mp(pts, ids=ids)

        # iterate over segment-pairs
        edgelines = []
        for a, b in combinations(range(1, len(lines) + 1), 2):
            mask = (
                np.isin(mapped[:, 0], [a, b])
                & np.isin(mapped[:, 1], [a, b])
                & (mapped[:, 0] != mapped[:, 1])
            )
            rigde_vertices = np.array(voronoi_diagram.ridge_vertices)
            verts = rigde_vertices[mask]

            # generate the line in between the lines
            edgeline = pygeos.line_merge(
                pygeos.multilinestrings(get_verts(verts, voronoi_diagram))
            )
            snapped = pygeos.snap(edgeline, pygeos_lines[a], distance)
            edgelines.append(snapped)
        return edgelines


    def consolidate(network, distance=2, epsilon=2, filter_func=None, **kwargs):
        """
        Consolidate edges of a network, takes care of geometry only. No
        attributes are preserved at the moment.

        The whole process is split into several steps:
        1. Polygonize network
        2. Find polygons which are likely caused by dual lines and other
           geometries to be consolidated.
        3. Iterate over those polygons and generate averaged geometry
        4. Remove invalid and merge together with new geometry.

        Step 2 needs work, this is just a first attempt based on shape and area
        of the polygon. We will have to come with clever options here and
        allow their specification, because each network will need different
        parameters.

        Either before or after these steps needs to be done node consolidation,
        but in a way which does not generate overlapping geometries.
        Overlapping geometries cause (unresolvable) issues with Voronoi.

        Parameters
        ----------
        network : GeoDataFrame (LineStrings)

        distance : float
            distance for interpolation

        epsilon : float
            tolerance for simplification

        filter_func : function
            function which takes gdf of polygonized network and returns mask of invalid
            polygons (those which should be consolidated)

        **kwargs
            Additional kwargs passed to filter_func
        """

        # polygonize network
        polygonized = polygonize(network.geometry)
        geoms = [g for g in polygonized]
        gdf = gpd.GeoDataFrame(geometry=geoms, crs=network.crs)

        # filter potentially incorrect polygons
        mask = filter_func(gdf, **kwargs)
        invalid = gdf.loc[mask]

        sindex = network.sindex

        # iterate over polygons which are marked to be consolidated
        # list segments to be removed and the averaged geoms replacing them
        averaged = []
        to_remove = []
        for poly in invalid.geometry:
            real = network.iloc[sindex.query(poly.exterior, predicate="intersects")]
            mask = real.intersection(poly.exterior).type.isin(
                ["LineString", "MultiLineString"]
            )
            real = real[mask]
            lines = list(real.geometry)
            to_remove += list(real.index)

            if lines:
                av = _average_geometry(lines, poly, distance)
                averaged += av

        # drop double lines
        clean = network.drop(set(to_remove))

        # merge new geometries with the existing network
        averaged = gpd.array.from_shapely(averaged, crs=network.crs).simplify(epsilon)
        result = pd.concat([clean, gpd.GeoDataFrame(geometry=averaged[~averaged.is_empty])])
        merge = topology(result)

        return merge


    def roundabouts(gdf, area=5000, circom=0.6):
        """
        Filter out roundabouts
        """

        # calculate parameters
        gdf["area"] = gdf.geometry.area
        gdf["circom"] = mm.CircularCompactness(gdf, "area").series
        # select valid and invalid network-net_blocks
        mask = (gdf["area"] < area) & (gdf["circom"] > circom)
        return mask


    def filter_comp(gdf, max_size=10000, circom_max=0.2):
        """
        Filter based on max size and compactness

        Parameters
        ----------
        gdf : GeoDataFrame
            polygonized network
        max_size : float
            maximum size of a polygon to be considered potentially invalid
        circom_max : float
            maximum circular compactness of a polygon to be considered
            potentially invalid.

        Returns boolean series

        """
        # calculate parameters
        gdf["area"] = gdf.geometry.area
        gdf["circom"] = mm.CircularCompactness(gdf, "area").series
        # select valid and invalid network-net_blocks
        mask = (gdf["area"] < max_size) & (gdf["circom"] < circom_max)
        return mask


    def topology(gdf):
        """
        Clean topology of existing LineString geometry by removal of nodes of degree 2.

        Parameters
        ----------
        gdf : GeoDataFrame
            (Multi)LineString data of street network
        """

        # explode to avoid MultiLineStrings
        # double reset index due to the bug in GeoPandas explode
        df = gdf.reset_index(drop=True).explode().reset_index(drop=True)

        # get underlying pygeos geometry
        geom = df.geometry.values.data

        # extract array of coordinates and number per geometry
        coords = pygeos.get_coordinates(geom)
        indices = pygeos.get_num_coordinates(geom)

        # generate a list of start and end coordinates and create point geometries
        edges = [0]
        i = 0
        for ind in indices:
            ix = i + ind
            edges.append(ix - 1)
            edges.append(ix)
            i = ix
        edges = edges[:-1]
        points = pygeos.points(np.unique(coords[edges], axis=0))

        # query LineString geometry to identify points intersecting 2 geometries
        tree = pygeos.STRtree(geom)
        inp, res = tree.query_bulk(points, predicate="intersects")
        unique, counts = np.unique(inp, return_counts=True)
        merge = res[np.isin(inp, unique[counts == 2])]

        # filter duplications and create a dictionary with indication of components to be merged together
        dups = [item for item, count in collections.Counter(merge).items() if count > 1]
        split = np.split(merge, len(merge) / 2)
        components = {}
        for i, a in enumerate(split):
            if a[0] in dups or a[1] in dups:
                if a[0] in components.keys():
                    i = components[a[0]]
                elif a[1] in components.keys():
                    i = components[a[1]]
            components[a[0]] = i
            components[a[1]] = i

        # iterate through components and create new geometries
        new = []
        for c in set(components.values()):
            keys = []
            for item in components.items():
                if item[1] == c:
                    keys.append(item[0])
            new.append(pygeos.line_merge(pygeos.union_all(geom[keys])))

        # remove incorrect geometries and append fixed versions
        df = df.drop(merge)
        final = gpd.GeoSeries(new).explode().reset_index(drop=True)
        return df.append(
            gpd.GeoDataFrame({df.geometry.name: final}, geometry=df.geometry.name),
            ignore_index=True,
        )

    user = 'martin'
    pwd = 'gdsl2020'
    host = '138.253.73.214'
    port = '5432'

    url = f"postgres+psycopg2://{user}:{pwd}@{host}:{port}/built_env"
    engine = create_engine(url)
    # x, y = xy  # coordinates in epsg 27700
    buffer = 10000  # radius in [m]

    sql = f'SELECT * FROM openroads_200803_topological WHERE ST_DWithin(geometry, ST_SetSRID(ST_Point({xy[0][1:-1]}), 27700), {buffer})'

    df = gpd.read_postgis(sql, engine, geom_col='geometry')

#     areas = range(500, 2501, 500)
#     compactness = np.linspace(0.7, 0.9, 5)
    areas=[500]
    compactness=[0.8]
    results = []

    for area in areas:
        for comp in compactness:
            topo = consolidate(df, filter_func=roundabouts, area=area, circom=comp)

            G = mm.gdf_to_nx(topo)
            mesh = mm.meshedness(G, radius=None)
            G = mm.subgraph(G, meshedness=True, cds_length=False, mean_node_degree=False, proportion={0: False, 3: False, 4: False}, cyclomatic=False, edge_node_ratio=False, gamma=False, local_closeness=True, closeness_weight=None, verbose=False)
            vals = list(nx.get_node_attributes(G, 'meshedness').values())
            l_mesh_mean = np.mean(vals)
            l_mesh_median = np.median(vals)
            l_mesh_dev = np.std(vals)
            vals = list(nx.get_node_attributes(G, 'local_closeness').values())
            l_close_mean = np.mean(vals)
            l_close_median = np.median(vals)
            l_close_dev = np.std(vals)
            node_density = nx.number_of_nodes(G) / topo.length.sum()

            results += [area, comp, mesh, l_mesh_mean, l_mesh_median, l_mesh_dev, l_close_mean, l_close_median, l_close_dev]
    
    return results

In [5]:
cities = pd.read_csv('https://gist.githubusercontent.com/martinfleis/4148eeb0ea19e7808e761e097a877196/raw/f3001eeb7294eb330c6675d1cab307cd562baa5c/major_cities.csv', index_col=0)

In [6]:
ddf = dd.from_pandas(cities.iloc[:4], npartitions=4)

In [7]:
vals = ddf.apply(lambda x:  check(x), axis=1, meta=pd.Series(dtype='object'))

In [10]:
computed = vals.compute()

In [9]:
computed

tcity15nm
Barnsley       [500, 0.8, 0.0897190163533298, 0.0637388607946...
Basildon       [500, 0.8, 0.0757020757020757, 0.0516538411705...
Basingstoke    [500, 0.8, 0.07523718467308965, 0.052268476211...
Bath           [500, 0.8, 0.08491286537526904, 0.057112326713...
dtype: object