<a href="https://colab.research.google.com/github/npr99/IN-CORE_notebooks/blob/main/IN_CORE_2dv2_Lumberton_AddressPointInventory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Read in and clean Lumberton Address Point file
This program reads in the Address Point Inventory and makes sure it is ready for IN-CORE.

Cleaning steps include:
1. Check Unique ID - non-missing primary key
2. Check projection
3. Save as CSV file with Well Know Text (WKT) polygon information

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import math as math
import numpy as np
import geopandas as gpd
import pandas as pd
import shapely
import folium as fm # folium has more dynamic maps - but requires internet connection
import os # For saving output to path



In [None]:
# Display versions being used - important information for replication
import sys
print("Python Version     ", sys.version)
print("numpy version:     ", np.__version__)
print("geopandas version: ", gpd.__version__)
print("pandas version:    ", pd.__version__)
print("shapely version:   ", shapely.__version__)
print("folium version:    ", fm.__version__)

Python Version      3.7.10 | packaged by conda-forge | (default, Feb 19 2021, 15:37:01) [MSC v.1916 64 bit (AMD64)]
numpy version:      1.20.2
geopandas version:  0.9.0
pandas version:     0.24.2
shapely version:    1.7.1
folium version:     0.9.1


In [None]:
# Store Program Name for output files to have the same name
programname = "IN-CORE_2dv2_Lumberton_AddressPointInventory_2021-04-27"
# Make directory to save output
if not os.path.exists(programname):
    os.mkdir(programname)

## Read in Building Inventory File
Read in CSV file created by the program "IN-CORE_1dv2_[]_EstimateAddressPoints_[]"

This file contains each building with an estimate for the number of housing units in each building. 

The estimate of housing units provides some information to deterine the number of address points in each building.


In [None]:
source_program = "IN-CORE_1dv2_Lumberton_EstimateAddressPoints_2021-04-15"
building_csv = source_program+"/"+source_program+"_huestimate_EPSG4326.csv"
building_df = pd.read_csv(building_csv, dtype={'blockid': object})
building_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,guid,BLOCKID10,STATEFP10,COUNTYFP10,TRACTCE10,PUMGEOID10,PUMNAME10,PLCGEOID10,...,ErrorCheck1,Res_Area,Sum_Res_Area,residentialAP2,residentialAP2v2,residentialAP2v2_sum,bldgcountv2_sum,DiffCount2,ErrorCheck2,residentialAP3v1
0,0,0,2d32aeff-7b75-47e6-b7a5-4f4adca4b021,371559613011113,37.0,155.0,961301.0,3705100.0,Robeson County (West)--Lumberton City PUMA,3739700.0,...,4. HU>AP,0,60596,0.0,0.0,82.0,25.0,2.0,4. HU>AP,0.0
1,1,11462,400586f1-a265-4454-ba08-30eda28b974c,371559613011113,37.0,155.0,961301.0,3705100.0,Robeson County (West)--Lumberton City PUMA,3739700.0,...,4. HU>AP,3434,60596,3.910258,5.0,82.0,25.0,2.0,4. HU>AP,6.0
2,2,14415,b7b5e4ce-431f-4e7d-8a8a-619a9c205571,371559613011113,37.0,155.0,961301.0,3705100.0,Robeson County (West)--Lumberton City PUMA,3739700.0,...,4. HU>AP,5685,60596,6.473447,7.0,82.0,25.0,2.0,4. HU>AP,8.0
3,3,14416,47fcaa3f-8590-4f7c-8764-405acb671b19,371559613011113,37.0,155.0,961301.0,3705100.0,Robeson County (West)--Lumberton City PUMA,3739700.0,...,4. HU>AP,2412,60596,2.746518,4.0,82.0,25.0,2.0,4. HU>AP,5.0
4,4,14417,63d1d4df-db7b-40f6-bace-a70fbb0466b1,371559613011113,37.0,155.0,961301.0,3705100.0,Robeson County (West)--Lumberton City PUMA,3739700.0,...,4. HU>AP,2976,60596,3.388739,4.0,82.0,25.0,2.0,4. HU>AP,5.0


## Check Unique ID


In [None]:
# Confirm Primary Key is Unique and Non-Missing
building_df.guid.describe()

count                                    20091
unique                                   20091
top       853ae178-f224-4c96-a127-cf5af89396bd
freq                                         1
Name: guid, dtype: object

## Keep primary columns
The building inventory with estimates of housing units has many columns but only a few are needed to generate the address point inventory. The variable residentialAP3v1 provides the estimate for the number of housing units in the building.

In [None]:
select_cols = ['guid','BLOCKID10','geometry','residentialAP3v1']
building_df_cols = building_df[select_cols]
building_df_cols.head()

Unnamed: 0,guid,BLOCKID10,geometry,residentialAP3v1
0,2d32aeff-7b75-47e6-b7a5-4f4adca4b021,371559613011113,POINT (-78.99633432765292 34.6543576103806),0.0
1,400586f1-a265-4454-ba08-30eda28b974c,371559613011113,POINT (-78.99711777506781 34.65410219535778),6.0
2,b7b5e4ce-431f-4e7d-8a8a-619a9c205571,371559613011113,POINT (-78.9964649803694 34.65469315140241),8.0
3,47fcaa3f-8590-4f7c-8764-405acb671b19,371559613011113,POINT (-78.99692677846114 34.65433561263469),5.0
4,63d1d4df-db7b-40f6-bace-a70fbb0466b1,371559613011113,POINT (-78.99688850436857 34.65469345430336),5.0


# Read in Residential Address Point Count File
The residential address point count file is based on the 2010 Census count of housing units and group quarters. This file provides the basis for estimating the number of housing units in buildings.

The address point inventory will be a combination of the Building Inventory and the Residenital Address Point Count file.

In [None]:
# Read in Census Block Data
source_program = 'IN-CORE_1av2_Lumberton_CleanBlockData_2021-04-15'
census_blocks_csv = source_program+"/"+source_program+"EPSG4269.csv"
census_blocks_df = pd.read_csv(census_blocks_csv, dtype={'BLOCKID10': object,'COUNTYFP10': str})
census_blocks_df.head()

Unnamed: 0.1,Unnamed: 0,STATEFP10,COUNTYFP10,TRACTCE10,BLOCKCE,BLOCKID10,PARTFLG,HOUSING10,POP10,geometry,...,blockid,tothupoints,popcount,HU100,POP100,popdiff,PLCGEOID10,PLCNAME10,PUMGEOID10,PUMNAME10
0,0,37,155,961900,2028,371559619002028,N,14,52,"POLYGON ((-79.22246 34.458838, -79.222528 34.4...",...,371559619002028,14,51,14,52,1,,,3705100,Robeson County (West)--Lumberton City PUMA
1,1,37,155,961900,2054,371559619002054,N,1,3,"POLYGON ((-79.179851 34.40192, -79.180036 34.4...",...,371559619002054,1,3,1,3,0,,,3705100,Robeson County (West)--Lumberton City PUMA
2,2,37,155,961700,2069,371559617002069,N,41,99,"POLYGON ((-79.172814 34.480917, -79.172749 34....",...,371559617002069,41,99,41,99,0,,,3705100,Robeson County (West)--Lumberton City PUMA
3,3,37,155,961700,2065,371559617002065,N,6,22,"POLYGON ((-79.15764299999999 34.503279, -79.15...",...,371559617002065,6,21,6,22,1,,,3705100,Robeson County (West)--Lumberton City PUMA
4,4,37,155,961700,2058,371559617002058,N,19,55,"POLYGON ((-79.15830299999999 34.497355, -79.15...",...,371559617002058,19,55,19,55,0,,,3705100,Robeson County (West)--Lumberton City PUMA


## Keep primary columns
The address point county file has many columns but only a few are needed to generate the address point inventory.

In [None]:
# Check Columns
cols = [col for col in census_blocks_df]
cols

['Unnamed: 0',
 'STATEFP10',
 'COUNTYFP10',
 'TRACTCE10',
 'BLOCKCE',
 'BLOCKID10',
 'PARTFLG',
 'HOUSING10',
 'POP10',
 'geometry',
 'CountySelect',
 'rppnt4269',
 'blk104269',
 'blockid',
 'tothupoints',
 'popcount',
 'HU100',
 'POP100',
 'popdiff',
 'PLCGEOID10',
 'PLCNAME10',
 'PUMGEOID10',
 'PUMNAME10']

In [None]:
select_cols = ['blockid','geometry','rppnt4269','tothupoints']
census_blocks_df_cols = census_blocks_df[select_cols]
census_blocks_df_cols.head()

Unnamed: 0,blockid,geometry,rppnt4269,tothupoints
0,371559619002028,"POLYGON ((-79.22246 34.458838, -79.222528 34.4...",POINT (-79.22459088452018 34.4587885),14
1,371559619002054,"POLYGON ((-79.179851 34.40192, -79.180036 34.4...",POINT (-79.18141042208816 34.4060815),1
2,371559617002069,"POLYGON ((-79.172814 34.480917, -79.172749 34....",POINT (-79.16201616013601 34.4876855),41
3,371559617002065,"POLYGON ((-79.15764299999999 34.503279, -79.15...",POINT (-79.16259555882354 34.5013035),6
4,371559617002058,"POLYGON ((-79.15830299999999 34.497355, -79.15...",POINT (-79.14702005235881 34.497883),19


# Prepare Building Inventory to Expand Based on Housing Unit Estimate
For the address point inventory to work there needs to be one observation for each possible housing unit. This means that for buildings that have muliple housing units there will be one address point for each housing unit.

For places that do not have buildings but have people the address point inventory will provide details on housing units impacted outside of the study area.

In [None]:
# The residential address point estimate provides information on how many housing units are in each building.
building_df_cols['residentialAP3v1'].describe()

count    20090.000000
mean         1.302140
std          1.009192
min          0.000000
25%          1.000000
50%          1.000000
75%          2.000000
max         41.000000
Name: residentialAP3v1, dtype: float64

In [None]:
# If the residentialAP3v1 is used to expand the dataset observations without residential address points will be lost.
# To keep all buildings add an expand variable
building_df_cols.loc[(building_df_cols['residentialAP3v1']==0),'expandvar'] = 1
building_df_cols.loc[(building_df_cols['residentialAP3v1']>0),'expandvar'] = building_df_cols['residentialAP3v1']
# Check to make sure expand variable was generated correctly
pd.crosstab(building_df_cols['expandvar'].loc[building_df_cols['expandvar']<=3],
            building_df_cols['residentialAP3v1'], margins=True, margins_name="Total")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


residentialAP3v1,0.0,1.0,2.0,3.0,Total
expandvar,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,2209,11171,0,0,13380
2.0,0,0,5983,0,5983
3.0,0,0,0,468,468
Total,2209,11171,5983,468,19831


## Expand Building Inventory
Using the expand variable expand building inventory.

In [None]:
building_df_cols['expandvar'].describe()

count    20090.000000
mean         1.412096
std          0.911027
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max         41.000000
Name: expandvar, dtype: float64

In [None]:
building_df_cols.loc[(building_df_cols['expandvar']<0)]

Unnamed: 0,guid,BLOCKID10,geometry,residentialAP3v1,expandvar


In [None]:
building_df_cols.loc[(building_df_cols.expandvar.isna())]

Unnamed: 0,guid,BLOCKID10,geometry,residentialAP3v1,expandvar
33,6b481629-e0c6-48f6-b1ce-d57f65d35cb6,371559608021059,POINT (-79.0284731543055 34.60277281976215),,


In [None]:
building_df_cols.loc[(building_df_cols.expandvar.isna(),'expandvar')] = 1
building_df_cols.loc[(building_df_cols.expandvar.isna())]

Unnamed: 0,guid,BLOCKID10,geometry,residentialAP3v1,expandvar


In [None]:
# The housing unit data frame is the expanded building dataframe
building_df_cols_expand = building_df_cols.reindex(
    building_df_cols.index.repeat(building_df_cols['expandvar']))

In [None]:
 building_df_cols_expand.guid.describe()

count                                    28370
unique                                   20091
top       ffa72530-7ac7-46f1-9cc9-7950b250e870
freq                                        41
Name: guid, dtype: object

In [None]:
building_df_cols_expand.loc[building_df_cols_expand.guid == "86823d05-10ce-4073-a8dd-68ee6917e8b0"]

Unnamed: 0,guid,BLOCKID10,geometry,residentialAP3v1,expandvar
14,86823d05-10ce-4073-a8dd-68ee6917e8b0,371559613011113,POINT (-78.99834381070323 34.65525992502589),8.0,8.0
14,86823d05-10ce-4073-a8dd-68ee6917e8b0,371559613011113,POINT (-78.99834381070323 34.65525992502589),8.0,8.0
14,86823d05-10ce-4073-a8dd-68ee6917e8b0,371559613011113,POINT (-78.99834381070323 34.65525992502589),8.0,8.0
14,86823d05-10ce-4073-a8dd-68ee6917e8b0,371559613011113,POINT (-78.99834381070323 34.65525992502589),8.0,8.0
14,86823d05-10ce-4073-a8dd-68ee6917e8b0,371559613011113,POINT (-78.99834381070323 34.65525992502589),8.0,8.0
14,86823d05-10ce-4073-a8dd-68ee6917e8b0,371559613011113,POINT (-78.99834381070323 34.65525992502589),8.0,8.0
14,86823d05-10ce-4073-a8dd-68ee6917e8b0,371559613011113,POINT (-78.99834381070323 34.65525992502589),8.0,8.0
14,86823d05-10ce-4073-a8dd-68ee6917e8b0,371559613011113,POINT (-78.99834381070323 34.65525992502589),8.0,8.0


## Expand Residential Address Point Count File
Using the count of address point variable expand residential address point count file

In [None]:
census_blocks_df_cols['blockid'].describe()

count    5.799000e+03
mean     3.715596e+14
std      6.468066e+06
min      3.715596e+14
25%      3.715596e+14
50%      3.715596e+14
75%      3.715596e+14
max      3.715596e+14
Name: blockid, dtype: float64

In [None]:
census_blocks_df_cols['tothupoints'].describe()

count    5799.000000
mean        9.105191
std        16.497869
min         0.000000
25%         0.000000
50%         3.000000
75%        11.000000
max       191.000000
Name: tothupoints, dtype: float64

In [None]:
# The expand variable can not have missing values
census_blocks_df_cols.loc[(census_blocks_df_cols['tothupoints'].isna()),'expandvar'] = 0
census_blocks_df_cols.loc[(census_blocks_df_cols['tothupoints']>=0),'expandvar'] = census_blocks_df_cols['tothupoints']
# Check to make sure expand variable was generated correctly
census_blocks_df_cols['expandvar'].describe()

count    5799.000000
mean        9.105191
std        16.497869
min         0.000000
25%         0.000000
50%         3.000000
75%        11.000000
max       191.000000
Name: expandvar, dtype: float64

In [None]:
census_blocks_df_cols.loc[(census_blocks_df_cols.expandvar.isna())]

Unnamed: 0,blockid,geometry,rppnt4269,tothupoints,expandvar


In [None]:
census_blocks_df_cols_expand = census_blocks_df_cols.reindex(
    census_blocks_df_cols.index.repeat(census_blocks_df_cols['expandvar']))

In [None]:
census_blocks_df_cols_expand.blockid.describe()

count    5.280100e+04
mean     3.715596e+14
std      6.280271e+06
min      3.715596e+14
25%      3.715596e+14
50%      3.715596e+14
75%      3.715596e+14
max      3.715596e+14
Name: blockid, dtype: float64

In [None]:
census_blocks_df_cols_expand.head()

Unnamed: 0,blockid,geometry,rppnt4269,tothupoints,expandvar
0,371559619002028,"POLYGON ((-79.22246 34.458838, -79.222528 34.4...",POINT (-79.22459088452018 34.4587885),14,14
0,371559619002028,"POLYGON ((-79.22246 34.458838, -79.222528 34.4...",POINT (-79.22459088452018 34.4587885),14,14
0,371559619002028,"POLYGON ((-79.22246 34.458838, -79.222528 34.4...",POINT (-79.22459088452018 34.4587885),14,14
0,371559619002028,"POLYGON ((-79.22246 34.458838, -79.222528 34.4...",POINT (-79.22459088452018 34.4587885),14,14
0,371559619002028,"POLYGON ((-79.22246 34.458838, -79.222528 34.4...",POINT (-79.22459088452018 34.4587885),14,14


## Merge Two Address Point Files 
Combing the address points based on building inventory and the address points based on the 2010 Census will create one file that has address points for the entire county. 

The compbined file will show where the building inventory may be missing information within the study community. The combined file will also help to show the populations impacted both inside the study community and in neighboring areas.

To merge the two files need to add a counter to each file by blockid.

In [None]:
# Add counter by block id - use cummulative count method
census_blocks_df_cols_expand['blockidcounter'] = census_blocks_df_cols_expand.groupby('blockid').cumcount()

In [None]:
census_blocks_df_cols_expand.head()

Unnamed: 0,blockid,geometry,rppnt4269,tothupoints,expandvar,blockidcounter
0,371559619002028,"POLYGON ((-79.22246 34.458838, -79.222528 34.4...",POINT (-79.22459088452018 34.4587885),14,14,0
0,371559619002028,"POLYGON ((-79.22246 34.458838, -79.222528 34.4...",POINT (-79.22459088452018 34.4587885),14,14,1
0,371559619002028,"POLYGON ((-79.22246 34.458838, -79.222528 34.4...",POINT (-79.22459088452018 34.4587885),14,14,2
0,371559619002028,"POLYGON ((-79.22246 34.458838, -79.222528 34.4...",POINT (-79.22459088452018 34.4587885),14,14,3
0,371559619002028,"POLYGON ((-79.22246 34.458838, -79.222528 34.4...",POINT (-79.22459088452018 34.4587885),14,14,4


In [None]:
# Add counter by block id - use cummulative count method
building_df_cols_expand['blockidcounter'] = building_df_cols_expand.groupby('BLOCKID10').cumcount()
building_df_cols_expand.head()

Unnamed: 0,guid,BLOCKID10,geometry,residentialAP3v1,expandvar,blockidcounter
0,2d32aeff-7b75-47e6-b7a5-4f4adca4b021,371559613011113,POINT (-78.99633432765292 34.6543576103806),0.0,1.0,0
1,400586f1-a265-4454-ba08-30eda28b974c,371559613011113,POINT (-78.99711777506781 34.65410219535778),6.0,6.0,1
1,400586f1-a265-4454-ba08-30eda28b974c,371559613011113,POINT (-78.99711777506781 34.65410219535778),6.0,6.0,2
1,400586f1-a265-4454-ba08-30eda28b974c,371559613011113,POINT (-78.99711777506781 34.65410219535778),6.0,6.0,3
1,400586f1-a265-4454-ba08-30eda28b974c,371559613011113,POINT (-78.99711777506781 34.65410219535778),6.0,6.0,4


In [None]:
# Merge 2 files based on blockid and blockid counter - keep all observations from both files with full outer join
address_point_inventory = pd.merge(building_df_cols_expand, census_blocks_df_cols_expand,
                                  left_on=['BLOCKID10','blockidcounter'], right_on=['blockid','blockidcounter'], how='outer')

In [None]:
# Check merge - examples were Building Id is missing
displaycols = ['guid','BLOCKID10']
condition = address_point_inventory['guid'].isna()
address_point_inventory[displaycols].loc[condition].head()

Unnamed: 0,guid,BLOCKID10
28370,,
28371,,
28372,,
28373,,
28374,,


In [None]:
# Check merge - examples were there is no census data
displaycols = ['guid','BLOCKID10']
condition = address_point_inventory['tothupoints'].isnull()
address_point_inventory[displaycols].loc[condition].head()

Unnamed: 0,guid,BLOCKID10
84,86823d05-10ce-4073-a8dd-68ee6917e8b0,371559600000000.0
85,86823d05-10ce-4073-a8dd-68ee6917e8b0,371559600000000.0
86,86823d05-10ce-4073-a8dd-68ee6917e8b0,371559600000000.0
87,86823d05-10ce-4073-a8dd-68ee6917e8b0,371559600000000.0
88,86823d05-10ce-4073-a8dd-68ee6917e8b0,371559600000000.0


### Fix issue with missing blockid vs BLOCKID10

In [None]:
address_point_inventory.loc[address_point_inventory.guid == "86823d05-10ce-4073-a8dd-68ee6917e8b0"]

Unnamed: 0,guid,BLOCKID10,geometry_x,residentialAP3v1,expandvar_x,blockidcounter,blockid,geometry_y,rppnt4269,tothupoints,expandvar_y
82,86823d05-10ce-4073-a8dd-68ee6917e8b0,371559600000000.0,POINT (-78.99834381070323 34.65525992502589),8.0,8.0,82,371559600000000.0,"POLYGON ((-78.997281 34.656738, -78.996881 34....",POINT (-78.99912009206406 34.65508),84.0,84.0
83,86823d05-10ce-4073-a8dd-68ee6917e8b0,371559600000000.0,POINT (-78.99834381070323 34.65525992502589),8.0,8.0,83,371559600000000.0,"POLYGON ((-78.997281 34.656738, -78.996881 34....",POINT (-78.99912009206406 34.65508),84.0,84.0
84,86823d05-10ce-4073-a8dd-68ee6917e8b0,371559600000000.0,POINT (-78.99834381070323 34.65525992502589),8.0,8.0,84,,,,,
85,86823d05-10ce-4073-a8dd-68ee6917e8b0,371559600000000.0,POINT (-78.99834381070323 34.65525992502589),8.0,8.0,85,,,,,
86,86823d05-10ce-4073-a8dd-68ee6917e8b0,371559600000000.0,POINT (-78.99834381070323 34.65525992502589),8.0,8.0,86,,,,,
87,86823d05-10ce-4073-a8dd-68ee6917e8b0,371559600000000.0,POINT (-78.99834381070323 34.65525992502589),8.0,8.0,87,,,,,
88,86823d05-10ce-4073-a8dd-68ee6917e8b0,371559600000000.0,POINT (-78.99834381070323 34.65525992502589),8.0,8.0,88,,,,,
89,86823d05-10ce-4073-a8dd-68ee6917e8b0,371559600000000.0,POINT (-78.99834381070323 34.65525992502589),8.0,8.0,89,,,,,


In [None]:
address_point_inventory['blockid'].describe()

count    5.280100e+04
mean     3.715596e+14
std      6.280271e+06
min      3.715596e+14
25%      3.715596e+14
50%      3.715596e+14
75%      3.715596e+14
max      3.715596e+14
Name: blockid, dtype: float64

In [None]:
address_point_inventory['BLOCKID10'].describe()

count    2.837000e+04
mean     3.715596e+14
std      3.536915e+06
min      3.715596e+14
25%      3.715596e+14
50%      3.715596e+14
75%      3.715596e+14
max      3.715596e+14
Name: BLOCKID10, dtype: float64

In [None]:
address_point_inventory.loc[address_point_inventory.blockid.isna(),
                                                                'blockid'] = address_point_inventory['BLOCKID10']

## Create Unique ID for each Address Point
The address point id can be a combination of the block id a counter

In [None]:
cols = [col for col in address_point_inventory]
cols

['guid',
 'BLOCKID10',
 'geometry_x',
 'residentialAP3v1',
 'expandvar_x',
 'blockidcounter',
 'blockid',
 'geometry_y',
 'rppnt4269',
 'tothupoints',
 'expandvar_y']

### The  Address Point ID is based on the building id first then the block id
In the best case scenario every address point is connected to the a building but in cases where the building id is missing then the address point is based on the Census Block ID.

In [None]:
address_point_inventory.loc[(address_point_inventory['guid'].isna()),
                            'strctid'] = address_point_inventory.apply(lambda x: "CB"+ str(x['blockid']).zfill(36), axis=1)
address_point_inventory.loc[(address_point_inventory['guid'].notna()),
                            'strctid'] = address_point_inventory.apply(lambda x: "ST"+ str(x['guid']).zfill(36), axis=1)
# Confirm Primary Key is Unique and Non-Missing
address_point_inventory[['strctid']].head(10)

Unnamed: 0,strctid
0,ST2d32aeff-7b75-47e6-b7a5-4f4adca4b021
1,ST400586f1-a265-4454-ba08-30eda28b974c
2,ST400586f1-a265-4454-ba08-30eda28b974c
3,ST400586f1-a265-4454-ba08-30eda28b974c
4,ST400586f1-a265-4454-ba08-30eda28b974c
5,ST400586f1-a265-4454-ba08-30eda28b974c
6,ST400586f1-a265-4454-ba08-30eda28b974c
7,STb7b5e4ce-431f-4e7d-8a8a-619a9c205571
8,STb7b5e4ce-431f-4e7d-8a8a-619a9c205571
9,STb7b5e4ce-431f-4e7d-8a8a-619a9c205571


In [None]:
# Sort Address Points by The first part of the address point 
address_point_inventory.sort_values(by=['strctid'])
# Add Counter by Building
address_point_inventory['apcounter'] = address_point_inventory.groupby('strctid').cumcount()

# Are there any examples were the block building counter does not equal the blockid counter?
displaycols = ['guid','blockid','tothupoints','blockidcounter','apcounter']
condition = address_point_inventory['blockidcounter']!=address_point_inventory['apcounter']
address_point_inventory[displaycols].loc[condition].head()

Unnamed: 0,guid,blockid,tothupoints,blockidcounter,apcounter
1,400586f1-a265-4454-ba08-30eda28b974c,371559600000000.0,84.0,1,0
2,400586f1-a265-4454-ba08-30eda28b974c,371559600000000.0,84.0,2,1
3,400586f1-a265-4454-ba08-30eda28b974c,371559600000000.0,84.0,3,2
4,400586f1-a265-4454-ba08-30eda28b974c,371559600000000.0,84.0,4,3
5,400586f1-a265-4454-ba08-30eda28b974c,371559600000000.0,84.0,5,4


To make a unique id for the address points need to have a combination of unique values. The first part of the address point id is based on either the building id or the block id.  Within each Building or Census Block the counter variable provides a way to identify address points within a block.

In [None]:
address_point_inventory['addrptid'] = address_point_inventory.apply(lambda x: x['strctid'] + "AP" +
                                                                 str(int(x['apcounter'])).zfill(6), axis=1)
# Move Primary Key Column to first Column
cols = ['addrptid']  + [col for col in address_point_inventory if col != 'addrptid']
address_point_inventory = address_point_inventory[cols]
address_point_inventory[['addrptid','blockid', 'apcounter']].head(6)

Unnamed: 0,addrptid,blockid,apcounter
0,ST2d32aeff-7b75-47e6-b7a5-4f4adca4b021AP000000,371559600000000.0,0
1,ST400586f1-a265-4454-ba08-30eda28b974cAP000000,371559600000000.0,0
2,ST400586f1-a265-4454-ba08-30eda28b974cAP000001,371559600000000.0,1
3,ST400586f1-a265-4454-ba08-30eda28b974cAP000002,371559600000000.0,2
4,ST400586f1-a265-4454-ba08-30eda28b974cAP000003,371559600000000.0,3
5,ST400586f1-a265-4454-ba08-30eda28b974cAP000004,371559600000000.0,4


In [None]:
# Confirm Primary Key is Unique and Non-Missing
address_point_inventory.addrptid.describe()

count                                              61505
unique                                             61505
top       CB0000000000000000000371559618012006.0AP000019
freq                                                   1
Name: addrptid, dtype: object

## Generate Flag Variables 
For the merged dataset identify cases where either building or census data is missing.

In [None]:
# Create Address Poing Flag Variable
address_point_inventory['flag_ap'] = 0
address_point_inventory.loc[(address_point_inventory['tothupoints'].isnull()),'flag_ap'] = 1
address_point_inventory.loc[(address_point_inventory['guid'].isna()),'flag_ap'] = 2
address_point_inventory.loc[(address_point_inventory['blockid'].isnull()),'flag_ap'] = 3
# Check to make sure expand variable was generated correctly
address_point_inventory.groupby(['flag_ap']).count()

Unnamed: 0_level_0,addrptid,guid,BLOCKID10,geometry_x,residentialAP3v1,expandvar_x,blockidcounter,blockid,geometry_y,rppnt4269,tothupoints,expandvar_y,strctid,apcounter
flag_ap,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,19666,19666,19666,19666,19665,19666,19666,19666,19666,19666,19666,19666,19666,19666
1,8704,8704,8704,8704,8704,8704,8704,8704,0,0,0,0,8704,8704
2,33135,0,0,0,0,0,33135,33135,33135,33135,33135,33135,33135,33135


## Set Geometry for Address Points
The location of the address point will be important for identifying the hazard impact. There are two options for the address point location.
1. If there is a building representative point use the building representative point 
2. If there building data is missing use the representative point from the census block

In [None]:
address_point_inventory[['geometry_x','geometry_y','rppnt4269']].head()

Unnamed: 0,geometry_x,geometry_y,rppnt4269
0,POINT (-78.99633432765292 34.6543576103806),"POLYGON ((-78.997281 34.656738, -78.996881 34....",POINT (-78.99912009206406 34.65508)
1,POINT (-78.99711777506781 34.65410219535778),"POLYGON ((-78.997281 34.656738, -78.996881 34....",POINT (-78.99912009206406 34.65508)
2,POINT (-78.99711777506781 34.65410219535778),"POLYGON ((-78.997281 34.656738, -78.996881 34....",POINT (-78.99912009206406 34.65508)
3,POINT (-78.99711777506781 34.65410219535778),"POLYGON ((-78.997281 34.656738, -78.996881 34....",POINT (-78.99912009206406 34.65508)
4,POINT (-78.99711777506781 34.65410219535778),"POLYGON ((-78.997281 34.656738, -78.996881 34....",POINT (-78.99912009206406 34.65508)


In [None]:
# Set Address Point Geometry
# The default geometry is the building representative point
address_point_inventory['geometry'] = address_point_inventory['geometry_x']
# When the building representative point is missing use the Census Block Representative Point
address_point_inventory.loc[(address_point_inventory['geometry'].isnull()),'geometry'] = address_point_inventory['rppnt4269']

## Identify Residential Address Points
For Address Points that have an estimate for the number of housing units, or if the building data is missing then the address point is likely to be a residential address point.

The knowledge that an address point is residential will help prioritize the allocation of housing units to address points.

For address points in buildings with more than housing unit the number of housing units also provides a way to prioritize renters and owners. With renters more likely to be allocated to buildings with greater numbers of housing units.

In [None]:
address_point_inventory['residential'] = 0
# If the building id is missing then the address point is residential
address_point_inventory.loc[(address_point_inventory['guid'].isna()),'residential'] = 1
# The the variable residentialAP3v1 is greater than 0 then the address point is residential
address_point_inventory.loc[(address_point_inventory['residentialAP3v1']>0),'residential'] = 1
# Check new variable
address_point_inventory.groupby(['flag_ap']).count()

Unnamed: 0_level_0,addrptid,guid,BLOCKID10,geometry_x,residentialAP3v1,expandvar_x,blockidcounter,blockid,geometry_y,rppnt4269,tothupoints,expandvar_y,strctid,apcounter,geometry,residential
flag_ap,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,19666,19666,19666,19666,19665,19666,19666,19666,19666,19666,19666,19666,19666,19666,19666,19666
1,8704,8704,8704,8704,8704,8704,8704,8704,0,0,0,0,8704,8704,8704,8704
2,33135,0,0,0,0,0,33135,33135,33135,33135,33135,33135,33135,33135,33135,33135


## Identify observations that represent the primary building
In some future exploration cases it would be of interest to run cross tabulations on just the buildings, instead of all of the address points. To identify the buildings it is possible to use the address point counter (apcounter) and the address point flag (flag_ap). If the counter is 0 and the flag is 0 or 1 then the address point observation is the first address point in a building.

In [None]:
# create a binary variable 0 - not the primary building observation, 1 - use to count buildings
address_point_inventory['bldgobs'] = 0
# If the ap count is 0 and the flag is 0 or 1 then the bldgobs should be 1
address_point_inventory.loc[(address_point_inventory['apcounter'] == 0) &
                            (address_point_inventory['flag_ap'] <= 1), 'bldgobs'] = 1
# Check new variable
address_point_inventory.groupby(['bldgobs']).count()

Unnamed: 0_level_0,addrptid,guid,BLOCKID10,geometry_x,residentialAP3v1,expandvar_x,blockidcounter,blockid,geometry_y,rppnt4269,tothupoints,expandvar_y,strctid,apcounter,flag_ap,geometry,residential
bldgobs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,41414,8279,8279,8279,8279,8279,41414,41414,38572,38572,38572,38572,41414,41414,41414,41414,41414
1,20091,20091,20091,20091,20090,20091,20091,20091,14229,14229,14229,14229,20091,20091,20091,20091,20091


The cross tabulation confirms that the building observations equal the number of building observations in the inventory.

## Keep primary columns
The address point county file has many columns but only a few are needed to generate the address point inventory.

In [None]:
# Rename residentialAP3v1 housing unit estiamte (huestimate)
address_point_inventory['huestimate'] = address_point_inventory['residentialAP3v1']

In [None]:
cols = [col for col in address_point_inventory]
cols

['addrptid',
 'guid',
 'BLOCKID10',
 'geometry_x',
 'residentialAP3v1',
 'expandvar_x',
 'blockidcounter',
 'blockid',
 'geometry_y',
 'rppnt4269',
 'tothupoints',
 'expandvar_y',
 'strctid',
 'apcounter',
 'flag_ap',
 'geometry',
 'residential',
 'bldgobs',
 'huestimate']

In [None]:
select_cols = ['addrptid','strctid','guid','blockid','geometry','huestimate','residential','bldgobs','flag_ap']
address_point_inventory_cols = address_point_inventory[select_cols]
address_point_inventory_cols.head()

Unnamed: 0,addrptid,strctid,guid,blockid,geometry,huestimate,residential,bldgobs,flag_ap
0,ST2d32aeff-7b75-47e6-b7a5-4f4adca4b021AP000000,ST2d32aeff-7b75-47e6-b7a5-4f4adca4b021,2d32aeff-7b75-47e6-b7a5-4f4adca4b021,371559600000000.0,POINT (-78.99633432765292 34.6543576103806),0.0,0,1,0
1,ST400586f1-a265-4454-ba08-30eda28b974cAP000000,ST400586f1-a265-4454-ba08-30eda28b974c,400586f1-a265-4454-ba08-30eda28b974c,371559600000000.0,POINT (-78.99711777506781 34.65410219535778),6.0,1,1,0
2,ST400586f1-a265-4454-ba08-30eda28b974cAP000001,ST400586f1-a265-4454-ba08-30eda28b974c,400586f1-a265-4454-ba08-30eda28b974c,371559600000000.0,POINT (-78.99711777506781 34.65410219535778),6.0,1,0,0
3,ST400586f1-a265-4454-ba08-30eda28b974cAP000002,ST400586f1-a265-4454-ba08-30eda28b974c,400586f1-a265-4454-ba08-30eda28b974c,371559600000000.0,POINT (-78.99711777506781 34.65410219535778),6.0,1,0,0
4,ST400586f1-a265-4454-ba08-30eda28b974cAP000003,ST400586f1-a265-4454-ba08-30eda28b974c,400586f1-a265-4454-ba08-30eda28b974c,371559600000000.0,POINT (-78.99711777506781 34.65410219535778),6.0,1,0,0


## Merge Address Point inventory with Building and Census Data
To analyize the impact of the hazard the address point inventory needs to include building information and census place information. The building information will include building type, year built, and appraised values (when available). The Census information will include city name and count information.

In [None]:
# Keep coloumns for merge
merge_cols = ['guid','archetype']
building_df_merge_cols = building_df[merge_cols]
building_df_merge_cols.head()

Unnamed: 0,guid,archetype
0,2d32aeff-7b75-47e6-b7a5-4f4adca4b021,0
1,400586f1-a265-4454-ba08-30eda28b974c,2
2,b7b5e4ce-431f-4e7d-8a8a-619a9c205571,2
3,47fcaa3f-8590-4f7c-8764-405acb671b19,2
4,63d1d4df-db7b-40f6-bace-a70fbb0466b1,2


In [None]:
# Confirm Primary Key is Unique and Non-Missing
building_df_merge_cols.guid.describe()

count                                    20091
unique                                   20091
top       853ae178-f224-4c96-a127-cf5af89396bd
freq                                         1
Name: guid, dtype: object

In [None]:
# merge selected columns from building inventory to address point inventory
address_point_inventory_cols_bldg = pd.merge(address_point_inventory_cols, building_df_merge_cols,
                                  left_on='guid', right_on='guid', how='left')

### Merge Select Columns from Census Block Data

In [None]:
# For the merge only need a select number of columns
merge_cols = ['blockid','PLCGEOID10','PLCNAME10','COUNTYFP10']
census_blocks_df_merge_cols = census_blocks_df[merge_cols]
census_blocks_df_merge_cols.head()

Unnamed: 0,blockid,PLCGEOID10,PLCNAME10,COUNTYFP10
0,371559619002028,,,155
1,371559619002054,,,155
2,371559617002069,,,155
3,371559617002065,,,155
4,371559617002058,,,155


In [None]:
# Confirm Primary Key is Unique and Non-Missing
census_blocks_df_merge_cols.blockid.describe()

count    5.799000e+03
mean     3.715596e+14
std      6.468066e+06
min      3.715596e+14
25%      3.715596e+14
50%      3.715596e+14
75%      3.715596e+14
max      3.715596e+14
Name: blockid, dtype: float64

In [None]:
# merge selected columns from building inventory to address point inventory
address_point_inventory_cols_bldg_block = pd.merge(address_point_inventory_cols_bldg, census_blocks_df_merge_cols,
                                  left_on='blockid', right_on='blockid', how='left')

### Identify Unicorporated Areas with Place Name
There are many address points that fall just outside of city limits in unincorprated places. For these areas use the county inforamation to label the place names as the County Name.

In [None]:
address_point_inventory_cols_bldg_block.loc[(address_point_inventory_cols_bldg_block['PLCNAME10'].isna()) &
                                            (address_point_inventory_cols_bldg_block['COUNTYFP10'] == '155'),
                                             'PLCNAME10'] = "Unincorporated Robeson County"
# Check new variable
pd.crosstab(address_point_inventory_cols_bldg_block['PLCNAME10'], 
            address_point_inventory_cols_bldg_block['COUNTYFP10'], margins=True, margins_name="Total")

COUNTYFP10,155,Total
PLCNAME10,Unnamed: 1_level_1,Unnamed: 2_level_1
Barker Ten Mile,499,499
Elrod,192,192
Fairmont,1255,1255
Lumber Bridge,51,51
Lumberton,13294,13294
Marietta,79,79
Maxton,1041,1041
McDonald,49,49
Orrum,50,50
Parkton,209,209


## Add X Y variables 
To be consistent with previous address point inventories add X and Y variables

In [None]:
# Convert Data Frame to Geodataframe
address_point_inventory_cols_bldg_block_gdf = gpd.GeoDataFrame(address_point_inventory_cols_bldg_block)

# Use shapely.wkt loads to convert WKT to GeoSeries
from shapely.wkt import loads

address_point_inventory_cols_bldg_block_gdf['geometry'] = address_point_inventory_cols_bldg_block_gdf['geometry'].apply(lambda x: loads(x))

In [None]:
address_point_inventory_cols_bldg_block_gdf['x'] = address_point_inventory_cols_bldg_block_gdf['geometry'].x
address_point_inventory_cols_bldg_block_gdf['y'] = address_point_inventory_cols_bldg_block_gdf['geometry'].y
address_point_inventory_cols_bldg_block_gdf[['geometry','x','y']].head(10)

Unnamed: 0,geometry,x,y
0,POINT (-78.99633 34.65436),-78.996334,34.654358
1,POINT (-78.99712 34.65410),-78.997118,34.654102
2,POINT (-78.99712 34.65410),-78.997118,34.654102
3,POINT (-78.99712 34.65410),-78.997118,34.654102
4,POINT (-78.99712 34.65410),-78.997118,34.654102
5,POINT (-78.99712 34.65410),-78.997118,34.654102
6,POINT (-78.99712 34.65410),-78.997118,34.654102
7,POINT (-78.99646 34.65469),-78.996465,34.654693
8,POINT (-78.99646 34.65469),-78.996465,34.654693
9,POINT (-78.99646 34.65469),-78.996465,34.654693


## Save Work as CSV
A CSV file with the Well Known Text (WKT) geometry provides flexiblity for saving and working with files.

In [None]:
address_point_inventory_cols_bldg_block.loc[address_point_inventory_cols.index, 'ap4326'] = address_point_inventory_cols['geometry']
address_point_inventory_cols_bldg_block['ap4326'].label = "Address Point Location EPSG 4326 (WKT)"
address_point_inventory_cols_bldg_block['ap4326'].notes = "Address Point Location in EPSG 4326"

In [None]:
# Move Foriegn Key Columns Block ID State, County, Tract to first Columns
first_columns = ['addrptid','guid','strctid','blockid','PLCNAME10','PLCGEOID10','COUNTYFP10']
cols = first_columns + [col for col in address_point_inventory_cols_bldg_block if col not in first_columns]
address_point_inventory_cols_bldg_block = address_point_inventory_cols_bldg_block[cols]

In [None]:
# Save Work at this point as CSV
savefile = sys.path[0]+"/"+programname+"/"+programname+"_EPSG4326.csv"
address_point_inventory_cols_bldg_block.to_csv(savefile, index=False)