# Prepare buildings data for matrix computations

In [1]:
import numpy as np
import geopandas as gpd
import pandas as pd

import dask.dataframe as dd
import dask_geopandas

# Number of dask partitions for the buildings frame; the later per-partition
# file loops (range(nparts)) also iterate over this count.
nparts = 4

In [2]:
import pyproj

In [3]:
# Load the Ontario data and label it as EPSG:4326 (lon/lat).
df = (
    gpd.read_file("Ontario.geojson")
    .set_crs(epsg=4326)
)

In [4]:
# Load the Toronto boundary and label it with the same EPSG:4326 CRS as the
# buildings frame so the two can be compared spatially.
toronto_bounds = (
    gpd.read_file("toronto_boundary.geojson")
    .set_crs(epsg=4326)
)

In [5]:
# Display the boundary frame to check what geometry it holds.
toronto_bounds

Unnamed: 0,geometry
0,GEOMETRYCOLLECTION (MULTIPOLYGON (((-79.49263 ...


In [6]:
# Wrap the in-memory GeoDataFrame in a dask-geopandas frame split into
# `nparts` partitions.
ddf = dask_geopandas.from_geopandas(df, npartitions=nparts)

In [7]:
# Keep only the rows whose geometry lies within the first (label 0) geometry
# of the Toronto boundary frame.
toronto_geom = toronto_bounds.geometry[0]
inside_toronto = ddf.within(toronto_geom)
ddf = ddf[inside_toronto]

In [8]:
# The Ontario data comes as polygons. Reproject to a projected CRS (EPSG:4087)
# before taking the centroid, then convert the centroid points back to
# lat/long (EPSG:4326).
projected_geoms = ddf.to_crs(epsg=4087).geometry
ddf["centroid"] = projected_geoms.centroid.to_crs(epsg=4326)

In [9]:
# Split the centroid points into plain numeric columns:
# latitude is the point's y coordinate, longitude its x coordinate.
centroids = ddf["centroid"]
ddf["lat"] = centroids.y
ddf["lon"] = centroids.x

In [10]:
# Run the lazy dask graph and display the resulting frame.
# NOTE(review): this materializes every filtered row in memory just to show a
# truncated preview; `ddf.head()` would be a cheaper check.
ddf.compute()

Unnamed: 0,geometry,centroid,lat,lon
1653,"POLYGON ((-79.41231 43.70879, -79.41237 43.708...",POINT (-79.41242 43.70886),43.708857,-79.412416
1654,"POLYGON ((-79.40951 43.70889, -79.40939 43.708...",POINT (-79.40948 43.70896),43.708958,-79.409479
1655,"POLYGON ((-79.41501 43.71027, -79.41495 43.710...",POINT (-79.41493 43.71022),43.710219,-79.414928
1656,"POLYGON ((-79.41321 43.71193, -79.41325 43.712...",POINT (-79.41330 43.71197),43.711973,-79.413304
1657,"POLYGON ((-79.41515 43.71513, -79.41519 43.715...",POINT (-79.41524 43.71517),43.715173,-79.415241
...,...,...,...,...
3779934,"POLYGON ((-79.47896 43.65441, -79.47893 43.654...",POINT (-79.47887 43.65436),43.654359,-79.478868
3779935,"POLYGON ((-79.47934 43.64538, -79.47929 43.645...",POINT (-79.47924 43.64533),43.645327,-79.479236
3779936,"POLYGON ((-79.47800 43.64637, -79.47798 43.646...",POINT (-79.47786 43.64636),43.646360,-79.477864
3779937,"POLYGON ((-79.47298 43.65540, -79.47287 43.655...",POINT (-79.47293 43.65546),43.655456,-79.472928


In [11]:
# Keep only the coordinate columns for the export step.
coord_columns = ["lat", "lon"]
coords_ddf = ddf[coord_columns]

In [12]:
# Display the lazy frame's structure (columns, dtypes, partition divisions).
coords_ddf

Unnamed: 0_level_0,lat,lon
npartitions=4,Unnamed: 1_level_1,Unnamed: 2_level_1
0,float64,float64
945462,...,...
1890924,...,...
2836386,...,...
3781846,...,...


In [13]:
import dask.bag as db
# Convert each partition to a list of {"lat": ..., "lon": ...} records, then
# collect the delayed partitions into a dask bag.
record_partitions = coords_ddf.map_partitions(pd.DataFrame.to_dict, orient='records')
coords_dict = db.from_delayed(record_partitions.to_delayed())

In [15]:
import json
# Serialize every record to a JSON string and write the bag out as text files
# (the `*` in the pattern is filled in per output file).
json_records = coords_dict.map(json.dumps)
json_records.to_textfiles('coords_*.json')

['/home/hadoop/coords_0.json',
 '/home/hadoop/coords_1.json',
 '/home/hadoop/coords_2.json',
 '/home/hadoop/coords_3.json']

In [16]:
def write_comma_sep_dict(filename, f_w):
    """Copy the records in ``filename`` to ``f_w`` as one comma-separated run.

    Every non-blank line of ``filename`` is treated as one record. Records are
    written to ``f_w`` with surrounding whitespace stripped and a single comma
    between consecutive records (no leading or trailing comma), so the output
    can be placed directly between ``[`` and ``]``.

    Args:
        filename: Path of the line-delimited text file to read.
        f_w: Writable text file-like object. It is written to but not closed.
    """
    wrote_record = False

    # The ``with`` block closes the input file; no explicit close() is needed.
    with open(filename) as f_r:
        # Iterate lazily instead of readlines() so large inputs are not held
        # in memory all at once.
        for line in f_r:
            record = line.strip()

            if not record:
                # A blank line would otherwise produce an empty element
                # (",," or a trailing ","), which is not valid in a JSON array.
                continue

            if wrote_record:
                f_w.write(",")

            f_w.write(record)
            wrote_record = True

In [17]:
def write_input_json(write_filename, coords_filename, costing_type="auto"):
    """Write a JSON request file using one coordinate list as sources and targets.

    The output has the form
    ``{"sources":[...],"targets":[...],"costing":"<costing_type>"}``, where
    both arrays contain the records read from ``coords_filename`` via
    ``write_comma_sep_dict``.

    Args:
        write_filename: Path of the JSON file to create (overwritten if present).
        coords_filename: Path of the line-delimited coordinate records file.
        costing_type: Value stored under the ``"costing"`` key.
    """
    # ``with`` guarantees the output handle is closed even if reading the
    # coordinate file fails part-way through.
    with open(write_filename, "w") as f:
        f.write('{"sources":[')
        write_comma_sep_dict(coords_filename, f)

        f.write('],"targets":[')
        write_comma_sep_dict(coords_filename, f)

        # json.dumps quotes and escapes the value, so a costing string that
        # contains quotes or backslashes cannot corrupt the document.
        f.write('],"costing":' + json.dumps(costing_type) + '}')

In [18]:
# Build one request file per coordinate file written earlier.
for part in range(nparts):
    write_input_json(f"Ontario_{part}.json", f"coords_{part}.json")

In [19]:
import boto3
import json

In [20]:
# Low-level S3 client.
# NOTE(review): `s3` is not referenced in the cells visible here — confirm it is still needed.
s3 = boto3.client('s3')
# Resource-style S3 handle; the upload loop below goes through this object.
s3_resource = boto3.resource('s3')

In [25]:
# Upload each generated request file to the 'bl-buildings' bucket; the object
# key is the file name without the ".json" extension.
buildings_bucket = s3_resource.Bucket('bl-buildings')
for part in range(nparts):
    buildings_bucket.upload_file(f"Ontario_{part}.json", Key=f"Ontario_{part}")