-
Notifications
You must be signed in to change notification settings - Fork 0
/
OBPL-Canada-2021.py
186 lines (136 loc) · 9.67 KB
/
OBPL-Canada-2021.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
"""
This script is part of the Open Building Population project.
Copyright 2024 Maxim Fortin
Maxim Fortin (2024): Code for processing building population estimates provided by Maxim Fortin under the Apache 2.0 License.
For more information about Maxim Fortin and the original works included in this distribution, please visit: https://www.maximfortin.com/project/obpl-ca-2021/
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import geopandas as gpd
from pyogrio import read_dataframe, write_dataframe
import json
import time
import datetime
tstart = time.perf_counter()
# File path for census file
DA_fn = 'filepath' # Enter file path for Dissemination Area geospatial dataset, including population
CSD_fn = 'filepath' # Enter file path for Census Subdivision geospatial dataset, including population
CD_fn = 'filepath' # Enter file path for Census Division geospatial dataset, including population
PT_fn = 'filepath' # Enter file path for Province geospatial dataset, including population
# File path for building footprint file
footprints_fn = 'filepath' # Enter file path for building footprints file
# Load census data, re-project CRS to EPSG:3978
DA_df = read_dataframe(DA_fn)
CSD_df = read_dataframe(CSD_fn)
CD_df = read_dataframe(CD_fn)
PT_df = read_dataframe(PT_fn)
DA_df['geometry'] = DA_df['geometry'].to_crs(epsg=3978)
CSD_df['geometry'] = CSD_df['geometry'].to_crs(epsg=3978)
CD_df['geometry'] = CD_df['geometry'].to_crs(epsg=3978)
PT_df['geometry'] = PT_df['geometry'].to_crs(epsg=3978)
print('Census data loaded...')
# Load building footprints, re-project CRS to EPSG:3978 and extract centroids
footprints_df = read_dataframe(footprints_fn)
footprints_df['geometry'] = footprints_df['geometry'].to_crs(epsg=3978) # convert building footprints CRS to EPSG:3978
footprints_df['BLDG_ID'] = footprints_df.index + 1 # plus 1 to match FID in geopackage
points_df = footprints_df.copy()
points_df['geometry'] = points_df['geometry'].centroid # assign centroid of each footprint to building points layer
print('Building footprints loaded...')
# Join building points and census polygons, count buildings, add column names and datatype
pip_DA = gpd.sjoin(points_df, DA_df, how='left') # perform spatial join for building points in census polygons
list_bldg_DA = pip_DA.DAUID.value_counts() # create list with count of buildings in each census polygon
list_DA_df = list_bldg_DA.to_frame(name='COUNT_DA') # convert geoseries list to dataframe
list_DA_df['DAUID'] = list_DA_df.index # assign index value to new column to make future merging easier
list_DA_df['COUNT_DA'] = list_DA_df['COUNT_DA'].astype('int64') # change field type from object to int64
list_DA_df['DAUID'] = list_DA_df['DAUID'].astype('int64') # change field type from object to int64
list_DA_df.reset_index(drop=True, inplace=True) # reset index to remove ambiguity for later merge
pip_CSD = gpd.sjoin(points_df, CSD_df, how='left') # perform spatial join for building points in census polygons
list_bldg_CSD = pip_CSD.CSDUID.value_counts() # create list with count of buildings in each census polygon
list_CSD_df = list_bldg_CSD.to_frame(name='COUNT_CSD') # convert geoseries list to dataframe
list_CSD_df['CSDUID'] = list_CSD_df.index # assign index value to new column to make future merging easier
list_CSD_df['COUNT_CSD'] = list_CSD_df['COUNT_CSD'].astype('int64') # change field type from object to int64
list_CSD_df['CSDUID'] = list_CSD_df['CSDUID'].astype('int64') # change field type from object to int64
list_CSD_df.reset_index(drop=True, inplace=True) # reset index to remove ambiguity for later merge
pip_CD = gpd.sjoin(points_df, CD_df, how='left') # perform spatial join for building points in census polygons
list_bldg_CD = pip_CD.CDUID.value_counts() # create list with count of buildings in each census polygon
list_CD_df = list_bldg_CD.to_frame(name='COUNT_CD') # convert geoseries list to dataframe
list_CD_df['CDUID'] = list_CD_df.index # assign index value to new column to make future merging easier
list_CD_df['COUNT_CD'] = list_CD_df['COUNT_CD'].astype('int64') # change field type from object to int64
list_CD_df['CDUID'] = list_CD_df['CDUID'].astype('int64') # change field type from object to int64
list_CD_df.reset_index(drop=True, inplace=True) # reset index to remove ambiguity for later merge
pip_PT = gpd.sjoin(points_df, PT_df, how='left') # perform spatial join for building points in census polygons
list_bldg_PT = pip_PT.PRUID.value_counts() # create list with count of buildings in each census polygon
list_PT_df = list_bldg_PT.to_frame(name='COUNT_PT') # convert geoseries list to dataframe
list_PT_df['PRUID'] = list_PT_df.index # assign index value to new column to make future merging easier
list_PT_df['COUNT_PT'] = list_PT_df['COUNT_PT'].astype('int64') # change field type from object to int64
list_PT_df['PRUID'] = list_PT_df['PRUID'].astype('int64') # change field type from object to int64
list_PT_df.reset_index(drop=True, inplace=True) # reset index to remove ambiguity for later merge
print('Points-in-Polygon analysis and building count completed...')
# Merge census polygons with building value count, calculate population ratio for each census unit
DA_df['DAUID'] = DA_df['DAUID'].astype('int64')
DA_df = DA_df.merge(list_DA_df, how='left', on='DAUID', suffixes=('_orig', '_count'))
DA_df['DA_RATIO'] = DA_df['DA_Population_2021_C1_COUNT_TOTAL'] / DA_df['COUNT_DA']
CSD_df['CSDUID'] = CSD_df['CSDUID'].astype('int64')
CSD_df = CSD_df.merge(list_CSD_df, how='left', on='CSDUID', suffixes=('_orig', '_count'))
CSD_df['CSD_RATIO'] = CSD_df['CSD_Population_2021_C1_COUNT_TOTAL'] / CSD_df['COUNT_CSD']
CD_df['CDUID'] = CD_df['CDUID'].astype('int64')
CD_df = CD_df.merge(list_CD_df, how='left', on='CDUID', suffixes=('_orig', '_count'))
CD_df['CD_RATIO'] = CD_df['CD_Population_2021_C1_COUNT_TOTAL'] / CD_df['COUNT_CD']
PT_df['PRUID'] = PT_df['PRUID'].astype('int64')
PT_df = PT_df.merge(list_PT_df, how='left', on='PRUID', suffixes=('_orig', '_count'))
PT_df['PT_RATIO'] = PT_df['PT_Population_2021_C1_COUNT_TOTAL'] / PT_df['COUNT_PT']
print('Ratios calculated for each census unit...')
# Add census ratios to building points and assign population value
points_df = gpd.sjoin(points_df, DA_df, how='left')
points_df = points_df[['BLDG_ID', 'geometry', 'properties', 'DAUID','DA_RATIO']]
points_df = gpd.sjoin(points_df, CSD_df, how='left')
points_df = points_df[['BLDG_ID', 'geometry', 'properties', 'DAUID', 'DA_RATIO', 'CSDUID', 'CSD_RATIO']]
points_df = gpd.sjoin(points_df, CD_df, how='left')
points_df = points_df[['BLDG_ID', 'geometry', 'properties', 'DAUID', 'DA_RATIO', 'CSDUID', 'CSD_RATIO', 'CDUID', 'CD_RATIO']]
points_df = gpd.sjoin(points_df, PT_df, how='left')
points_df = points_df[['BLDG_ID', 'geometry', 'properties', 'DAUID', 'DA_RATIO', 'CSDUID', 'CSD_RATIO', 'CDUID', 'CD_RATIO', 'PRUID', 'PT_RATIO']]
points_df['BLDG_POP'] = points_df['DA_RATIO'].fillna(points_df['CSD_RATIO'].fillna(points_df['CD_RATIO'])).astype('float').round(3)
points_df['POP_SOURCE'] = points_df.apply(lambda x: 'DA_RATIO' if not gpd.pd.isnull(x['DA_RATIO']) else ('CSD_RATIO' if not gpd.pd.isnull(x['CSD_RATIO']) else 'CD_RATIO'), axis=1)
# Adjust number of decimals for columns DA_RATIO, CSD_RATIO, and CD_RATIO in points_df
points_df['DA_RATIO'] = points_df['DA_RATIO'].fillna(points_df['CSD_RATIO'].fillna(points_df['CD_RATIO'])).astype('float').round(3)
points_df['CSD_RATIO'] = points_df['CSD_RATIO'].astype('float').round(3)
points_df['CD_RATIO'] = points_df['CD_RATIO'].astype('float').round(3)
points_df['PT_RATIO'] = points_df['PT_RATIO'].astype('float').round(3)
print('Building population calculated and added to buildings...')
# Format building height field in building data
def extract_height(json_str):
try:
json_data = json.loads(json_str)
height = json_data.get('height', None)
if height is not None:
return round(height, 2)
except:
pass
return None
points_df['HEIGHT'] = points_df['properties'].apply(extract_height)
points_df.drop(columns=['properties'], inplace=True)
points_df = points_df[['BLDG_ID', 'geometry', 'HEIGHT', 'DAUID', 'DA_RATIO', 'CSDUID','CSD_RATIO','CDUID', 'CD_RATIO', 'PRUID', 'PT_RATIO', 'BLDG_POP', 'POP_SOURCE']]
print('Point layer finalized...')
# Create footprint layer with population data
footprints_df = footprints_df.merge(points_df, how='left', on='BLDG_ID')
footprints_df.drop(columns=['type', 'properties', 'geometry_y'], inplace=True)
footprints_df = footprints_df.rename(columns={'geometry_x':'geometry'})
footprints_df = footprints_df[['BLDG_ID', 'geometry', 'HEIGHT', 'DAUID', 'DA_RATIO', 'CSDUID','CSD_RATIO','CDUID', 'CD_RATIO', 'PRUID', 'PT_RATIO', 'BLDG_POP', 'POP_SOURCE']]
footprints_df = gpd.GeoDataFrame(footprints_df, geometry='geometry')
print('Footprint layer finalized...')
# Save to file
current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") # create timestamp to include in saved files
write_dataframe(points_df, f'Points_{current_time}.gpkg', driver='GPKG')
write_dataframe(footprints_df, f'Footprints_{current_time}.gpkg', driver='GPKG')
tend = time.perf_counter()
elapsed_time = tend - tstart
hours, remainder = divmod(elapsed_time, 3600)
minutes, seconds = divmod(remainder, 60)
print(f"Script execution time: {int(hours)} hours, {int(minutes)} minutes, {int(seconds)} seconds...")