In [1]:
#Import packages:

import dask.dataframe as dd
import dask_geopandas as dg  
import geopandas as gpd
import pandas as pd
import numpy as np



In [2]:
def read_convert_crs_and_df(url, epsg, num_partitions, columns_to_include = None):
    '''
    Function to:
        (1) Read spatial data from a URL
        (2) Keep only the columns listed in columns_to_include (if given)
        (3) Convert it to the given EPSG code
        (4) Convert it to a dask geodataframe
    
    Inputs:
        url (string): a link to the data
        epsg (string or int): the EPSG code, e.g. "3435", 3435 or "EPSG:3435"
        num_partitions (int): the number of partitions the data should be stored on
        columns_to_include (list of strings): list of columns to include (OPTIONAL).
            The geometry column is always kept, even when it is not listed,
            because the CRS conversion cannot run without it.

    Output:
        dask_gdf (dask geodataframe)
    '''
    #Read in as geopandas df:
    df = gpd.read_file(url)
    print("Read in as gpd...")
    
    #Maybe remove columns:
    if columns_to_include is not None:
        # Work on a copy so the caller's list is never modified; a bare string is
        # treated as a single column name rather than being split into characters.
        if isinstance(columns_to_include, str):
            keep = [columns_to_include]
        else:
            keep = list(columns_to_include)
        
        # Selecting columns without the geometry would leave nothing to reproject,
        # so make sure the geometry column survives the selection.
        geometry_col = df.geometry.name
        if geometry_col not in keep:
            keep.append(geometry_col)
        
        df = df[keep]
        print("Converted columns ... ")
    
    #Convert CRS:
    # str() lets callers pass the code as an int; an existing "EPSG:" prefix is not doubled.
    epsg_code = str(epsg).strip()
    if not epsg_code.upper().startswith("EPSG:"):
        epsg_code = "EPSG:" + epsg_code
    df = df.to_crs(epsg_code)
    print("Changed CRS... ")
    
    #Convert to dask: 
    dask_gdf = dg.from_geopandas(df, npartitions= num_partitions)
    print("Converted to dask... ")
    
    return dask_gdf


#Test: try the reader on the community-area shapefile
com_area_link = "https://data.cityofchicago.org/api/geospatial/cauq-8yn6?method=export&format=Shapefile"
com_area_cols = ["area_numbe", "community", "geometry"]

ca = read_convert_crs_and_df(
    com_area_link,
    epsg="3435",
    num_partitions=4,
    columns_to_include=com_area_cols,
)

Read in as gpd...
Converted columns ... 
Changed CRS... 
Converted to dask... 


In [3]:
#Preview the first rows of the community-area frame:
ca.head()

Unnamed: 0,area_numbe,community,geometry
0,35,DOUGLAS,"POLYGON ((1181573.250 1886828.039, 1181571.200..."
1,36,OAKLAND,"POLYGON ((1186289.356 1876750.733, 1186247.037..."
2,37,FULLER PARK,"POLYGON ((1176344.998 1871187.546, 1176346.500..."
3,38,GRAND BOULEVARD,"POLYGON ((1182322.043 1876674.730, 1182323.670..."
4,39,KENWOOD,"POLYGON ((1186289.356 1876750.733, 1186290.775..."


In [4]:
#Community areas: boundaries later joined to the buildings
com_area_link = "https://data.cityofchicago.org/api/geospatial/cauq-8yn6?method=export&format=Shapefile"
com_area_cols = ["area_numbe", "community", "geometry"]

ca = read_convert_crs_and_df(
    com_area_link,
    epsg="3435",
    num_partitions=4,
    columns_to_include=com_area_cols,
)

Read in as gpd...
Converted columns ... 
Changed CRS... 
Converted to dask... 


In [5]:
#Current buildings: all columns are kept (no columns_to_include)
url_current = "https://data.cityofchicago.org/api/geospatial/hz9b-7nh8?method=export&format=Shapefile"
buildings = read_convert_crs_and_df(
    url_current,
    epsg="3435",
    num_partitions=4,
)

Read in as gpd...
Changed CRS... 
Converted to dask... 


In [None]:
#Buildings 2012:
url_2012 = "https://data.cityofchicago.org/download/w2v3-isjw/application%2Fzip"

buildings_2012 = read_convert_crs_and_df(
    url_2012,
    epsg="3435",
    num_partitions=4,
)


In [None]:
#Buildings 2013:
url_2013 = "https://data.cityofchicago.org/api/geospatial/6mpq-sfwi?method=export&format=Shapefile"
buildings_2013 = read_convert_crs_and_df(
    url_2013,
    epsg="3435",
    num_partitions=4,
)

In [None]:
#Buildings 2015:
url_2015 = "https://data.cityofchicago.org/api/geospatial/qv97-3bvb?method=export&format=Shapefile"

buildings_2015 = read_convert_crs_and_df(
    url_2015,
    epsg="3435",
    num_partitions=4,
)


In [6]:
#Get areas of buildings:
# NOTE(review): this adds the column to `buildings` in place. The values are in squared
# units of the CRS chosen at load time (presumably square feet for EPSG:3435 - confirm).
buildings["building_area"] = buildings.area



In [None]:
#Spatial Join: 

In [20]:
#Attach the community-area attributes to every building via a spatial join.
# NOTE(review): no join type or predicate is passed, so the library defaults apply -
# presumably an inner join on "intersects". If so, a building that touches two
# community areas would appear once per area. Confirm against the dask-geopandas docs.
buildings_current = dg.sjoin(buildings, ca)

In [10]:
def convert_to_dask_df(dask_geodf, geometry_col = "geometry"):
    '''
    Function that drops the geometry column from a dask geodataframe so that
    only the tabular attributes remain. The input is not modified.
    
    Input:
        dask_geodf: a dask geodataframe
        geometry_col (string): name of the geometry column to drop
                               (OPTIONAL, defaults to "geometry")
    Outputs:
        dask_df: the result of selecting every column except geometry_col, in
                 their original order (presumably a plain dask dataframe once
                 the geometry is gone - confirm against dask-geopandas)
    
    Raises:
        ValueError: if geometry_col is not one of the columns
    '''
    colnames = list(dask_geodf.columns)
    
    # Fail with a message that names the missing column rather than the generic
    # error raised by list.remove.
    if geometry_col not in colnames:
        raise ValueError(
            "Column '{}' not found; available columns: {}".format(geometry_col, colnames)
        )
    
    kept_columns = [name for name in colnames if name != geometry_col]
    
    return dask_geodf[kept_columns]

In [21]:
#Geometry-free version of the joined frame, used for the tabular aggregation.
# NOTE: `buildings_currrent` (three r's) is a separate variable from `buildings_current`;
# the groupby cell reads this spelling, so rename both together or not at all.
buildings_currrent = convert_to_dask_df(buildings_current)

In [25]:
#Preview the spatially joined frame (the version that still carries the geometry column):
buildings_current.head()

Unnamed: 0,date_bldg_,time_bldg_,bldg_condi,date_bld_2,time_bld_2,date_bld_3,time_bld_3,bldg_id,bldg_name1,bldg_name2,...,vacancy_st,x_coord,y_coord,year_built,z_coord,geometry,building_area,index_right,area_numbe,community
1,1998-04-01,08:00:00.000,SOUND,1998-04-01,08:00:00.000,,,46828.0,,,...,,1145427.0,1938878.0,1937.0,0.0,"POLYGON ((1145406.643 1938862.039, 1145406.143...",1138.125,11,12,FOREST GLEN
378,1998-04-01,08:00:00.000,SOUND,1998-04-01,08:00:00.000,,,16250.0,,,...,,1135930.0,1944821.0,1942.0,0.0,"POLYGON ((1135931.643 1944799.039, 1135922.643...",896.5,11,12,FOREST GLEN
545,1998-04-01,08:00:00.000,,1998-04-01,08:00:00.000,,,46368.0,,,...,,1143811.0,1938963.0,0.0,0.0,"POLYGON ((1143822.643 1938948.039, 1143799.643...",598.0,11,12,FOREST GLEN
556,1998-04-01,08:00:00.000,SOUND,1998-04-01,08:00:00.000,,,57154.0,,,...,,1143657.0,1937303.0,1965.0,0.0,"POLYGON ((1143640.643 1937268.539, 1143615.643...",2385.5,11,12,FOREST GLEN
640,1998-04-01,08:00:00.000,SOUND,1998-04-01,08:00:00.000,,,10672.0,,,...,,1133527.0,1946536.0,1953.0,0.0,"POLYGON ((1133522.143 1946502.539, 1133515.143...",1800.0,11,12,FOREST GLEN


In [27]:
#Mean building area and number of buildings per community area.
# Reads `buildings_currrent` (three r's), the frame without the geometry column.
sum_stats = buildings_currrent.groupby("community").agg({'building_area': ['mean', "count"]})

In [30]:
#Move "community" out of the index into a regular column, then preview the result:
sum_stats2 = sum_stats.reset_index()
sum_stats2.head()

Unnamed: 0_level_0,community,building_area,building_area
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
0,ALBANY PARK,1416.745552,11710
1,ARCHER HEIGHTS,2843.952289,5400
2,ARMOUR SQUARE,2014.677017,2454
3,ASHBURN,1138.85039,23356
4,AUBURN GRESHAM,1269.315974,20196


In [35]:
def wrapper_building_details_by_ca(url, epsg, num_partitions, year, columns_to_include = None,
                                   community_areas = None):
    '''
    Function to summarise the buildings of one dataset by community area:
        (1) Read the building data and convert it to the given EPSG code
        (2) Compute the area of every building
        (3) Spatially join the buildings to the community areas
        (4) Drop the geometry column
        (5) Compute the mean building area and the number of buildings per community
    
    Inputs:
        url (string): a link to the data
        epsg (string): the four digit epsg code
        num_partitions (int): the number of partitions the data should be stored on
        year (string): the year of the data
        columns_to_include (list of strings): list of columns to include (OPTIONAL)
        community_areas (dask geodataframe): community-area polygons with a
                             "community" column (OPTIONAL). When omitted, the
                             notebook-level variable `ca` is used, so `ca` must
                             already be defined in that case.
    Outputs:
        sum_stats (dask df): dataframe indexed by community with the mean building
                             area (in squared units of the chosen CRS) and the
                             number of buildings, plus a "year" column holding the
                             given year on every row

    '''
    # Fall back to the notebook global only when the caller did not supply the
    # polygons, which keeps existing calls working while making the dependency explicit.
    if community_areas is None:
        community_areas = ca
    
    buildings = read_convert_crs_and_df(url, epsg, num_partitions, columns_to_include)
   
    buildings["building_area"] = buildings.area
    buildings_joined = dg.sjoin(buildings, community_areas)
    
    # Only the tabular attributes are needed for the aggregation.
    buildings_attributes = convert_to_dask_df(buildings_joined)
    sum_stats = buildings_attributes.groupby("community").agg({'building_area': ['mean', "count"]})
    sum_stats["year"] = year
    
    return sum_stats

In [36]:
#Summary statistics for the current buildings; rebinds `sum_stats` from the earlier groupby cell
sum_stats = wrapper_building_details_by_ca(
    url_current,
    epsg="3435",
    num_partitions=4,
    year="2021",
    columns_to_include=None,
)

Read in as gpd...
Changed CRS... 
Converted to dask... 


In [37]:
#Preview the summary returned by the wrapper:
sum_stats.head()

Unnamed: 0_level_0,building_area,building_area,year
Unnamed: 0_level_1,mean,count,Unnamed: 3_level_1
community,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
ALBANY PARK,1416.745552,11710,2021
ARCHER HEIGHTS,2843.952289,5400,2021
ARMOUR SQUARE,2014.677017,2454,2021
ASHBURN,1138.85039,23356,2021
AUBURN GRESHAM,1269.315974,20196,2021
