# National Address Database (NAD)
Release 8 of the NAD, containing 65.4 million records
https://www.transportation.gov/gis/national-address-database/national-address-database-nad-disclaimer
The National Address Database (NAD) is, with few exceptions, aggregated from state data, which is in turn aggregated from local data. 
While it is considered to be authoritative and every reasonable effort is made to ensure the accuracy and completeness of the data, ...

### Schema
https://www.transportation.gov/sites/dot.gov/files/docs/mission/gis/national-address-database/308816/nad-schema-v2.pdf


In [None]:
# nad = gp.read_file('NAD_r8.gdb')  # TOO BIG, can't read in one go
# list(gp.read_file('NAD_template_20200408.gdb')) # list columns

In [35]:
import geopandas as gp
import fiona
import jsonlines
%load_ext autotime

# Two-digit state code (FIPS numbering) -> two-letter postal abbreviation.
# 52 entries: the 50 states plus DC ('11') and PR ('72').
SF52 = {
    '01': 'AL', '02': 'AK', '04': 'AZ', '05': 'AR',
    '06': 'CA', '08': 'CO', '09': 'CT', '10': 'DE',
    '11': 'DC', '12': 'FL', '13': 'GA', '15': 'HI',
    '16': 'ID', '17': 'IL', '18': 'IN', '19': 'IA',
    '20': 'KS', '21': 'KY', '22': 'LA', '23': 'ME',
    '24': 'MD', '25': 'MA', '26': 'MI', '27': 'MN',
    '28': 'MS', '29': 'MO', '30': 'MT', '31': 'NE',
    '32': 'NV', '33': 'NH', '34': 'NJ', '35': 'NM',
    '36': 'NY', '37': 'NC', '38': 'ND', '39': 'OH',
    '40': 'OK', '41': 'OR', '42': 'PA', '44': 'RI',
    '45': 'SC', '46': 'SD', '47': 'TN', '48': 'TX',
    '49': 'UT', '50': 'VT', '51': 'VA', '53': 'WA',
    '54': 'WV', '55': 'WI', '56': 'WY', '72': 'PR',
}
# Reverse lookup: postal abbreviation -> two-digit state code.
SF52R = dict(zip(SF52.values(), SF52.keys()))

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 1.28 ms (started: 2022-01-14 11:48:15 -05:00)


##### Addr_Type: somewhat useful (but a large share is 'Unknown'): AZ, ME
-- States with accurate Addr_Type (can be used for #passings): DC, MA, MT, NJ, OH, RI, VT
##### Placement: useful for measuring building-to-road distance: CT, IN, ME, MA, MT, NJ, NY, NC, VA


# NAD 19: full extraction

In [31]:
def write_features(abbrv, features):
    """Append a batch of features to ``nad19_full/nad_{abbrv}.jsonl``.

    One JSON object is written per feature. Each feature's 'properties' is
    reduced to the columns named in the module-level ``nad_cols`` list to
    keep the output files small.

    Args:
        abbrv: state abbreviation used to build the output file name.
        features: list of feature mappings, each with a 'properties' mapping
            that contains every column in ``nad_cols``.

    Raises:
        KeyError: if a feature lacks 'properties' or one of the ``nad_cols``.
    """
    if not features:
        # An empty batch would still open the file in append mode and leave a
        # stray empty file behind (e.g. "nad_.jsonl" on the very first flush,
        # when no state has been seen yet), so skip it entirely.
        return
    file = f"nad19_full/nad_{abbrv}.jsonl"
    with jsonlines.open(file, 'a') as writer:
        for fea in features:
            props = fea['properties']
            slim_props = {col: props[col] for col in nad_cols}
            # Write a trimmed copy rather than overwriting fea['properties'],
            # so the caller's feature objects are left untouched.
            writer.write({**fea, 'properties': slim_props})
            
# The 19 whole states whose submitted address data has been processed,
# keyed by two-digit state code.
NAD_19 = {
    '04': 'AZ', '09': 'CT', '10': 'DE', '11': 'DC', '18': 'IN',
    '23': 'ME', '25': 'MA', '30': 'MT', '34': 'NJ', '35': 'NM',
    '36': 'NY', '37': 'NC', '39': 'OH', '41': 'OR', '44': 'RI',
    '47': 'TN', '49': 'UT', '50': 'VT', '51': 'VA',
}
# Reverse lookup: abbreviation -> two-digit state code.
NAD_19R = dict(zip(NAD_19.values(), NAD_19.keys()))

# Property columns kept for every feature in the full extraction.
nad_cols = [
    'County',
    # street-name parts
    'StN_PreMod', 'StN_PreDir', 'StN_PreTyp', 'StN_PreSep',
    'StreetName',
    'StN_PosTyp', 'StN_PosDir', 'StN_PosMod',
    # address-number parts
    'AddNum_Pre', 'Add_Number', 'AddNum_Suf',
    # sub-address parts
    'Building', 'Floor', 'Unit', 'Room', 'Addtl_Loc',
    # identifiers and record metadata
    'GUID', 'NatGrid_Coord', 'Addr_Type', 'Placement', 'Source', 'LastUpdate',
]

# Stream every NAD feature once and append those whose 'State' is one of the
# NAD_19 abbreviations to a per-state .jsonl file via write_features().
# Features are buffered in `features` and flushed whenever the state value
# changes or the buffer exceeds 1,000,000 entries.
count = valid_count = output_count = 0
prev_abbrv = ""
features = []

# Built once: a set gives O(1) membership tests instead of scanning
# NAD_19.values() for each of the ~65 million records.
nad19_abbrvs = set(NAD_19.values())

# https://nationaladdressdata.s3.amazonaws.com/NAD_r8.zip
# The context manager guarantees the dataset handle is closed, even on error.
with fiona.open('NAD_r8.gdb') as layer:
    print(f"{layer.name=}, feature counts i.e. {len(layer)=}")
    # len(layer) # 65,460,370
    # layer.bounds # (-17654801.645, 3006814.6655, -7452907.270500001, 9140669.309)
    nad_layer_crs = layer.crs['init']

    for feature in layer:
        count += 1
        abbrv = feature['properties']['State']
        valid_count += (abbrv is not None)
        # NOTE(review): membership is tested case-insensitively, but batching
        # and the output file name use the raw 'State' value, so a lower-case
        # value would end up in its own file — confirm whether that occurs.
        if abbrv and abbrv.upper() in nad19_abbrvs:
            output_count += 1
            # SWITCH STATE or features became too large: write aggregated
            # features into nad_{prev_abbrv}; re-init features
            if prev_abbrv != abbrv or len(features) > 1000000:
                # Skip the flush while the buffer is empty (first matching
                # record), otherwise an empty "nad_.jsonl" would be created.
                if features:
                    write_features(prev_abbrv, features)
                prev_abbrv = abbrv
                features = [feature]
            else:
                features.append(feature)

# last write: every buffered feature carries the state held in prev_abbrv
if features:
    write_features(prev_abbrv, features)

# of 65.4 mil records, only 43.8 mil records have a 'State' field belonging to the NAD_19 group
print(count, valid_count, output_count)  # 65460370 65460327, 43867416
# TIME: 50min

layer.name='NAD', feature counts i.e. len(layer)=65460370
65460370 65460327 43867416
time: 54min 5s (started: 2022-01-13 11:58:20 -05:00)


# NAD6: 6 states with comprehensive and detailed (Addr_Type) NAD data


### Parse and write to disk

In [None]:
# The six states selected for the NAD6 extraction, keyed by the two-digit
# state code looked up in SF52R.
NAD6 = dict((SF52R[abbrv], abbrv) for abbrv in ('DC', 'MA', 'ME', 'MT', 'RI', 'VT'))

# Property columns concatenated (in this order) into the single 'addr' string.
addr_fields = [
    # street-name parts
    'StN_PreMod', 'StN_PreDir', 'StN_PreTyp', 'StN_PreSep',
    'StreetName',
    'StN_PosTyp', 'StN_PosDir', 'StN_PosMod',
    # address-number parts
    'AddNum_Pre', 'Add_Number', 'AddNum_Suf',
    # landmark
    'LandmkPart', 'LandmkName',
    # sub-address parts and milepost
    'Building', 'Floor', 'Unit', 'Room', 'Addtl_Loc', 'Milepost',
]

# Property columns kept in the NAD6 output files.
usng_cols = ['addr', 'GUID', 'NatGrid_Coord', 'Addr_Type']
def write_features(abbrv, features):
    """Append a batch of features to ``nad6/nad_{abbrv}.jsonl``.

    For each feature, the non-empty values of the ``addr_fields`` columns are
    joined with single spaces into an 'addr' string, and 'properties' is then
    reduced to the columns named in ``usng_cols`` to keep the files small.

    Args:
        abbrv: state abbreviation used to build the output file name.
        features: list of feature mappings, each with a 'properties' mapping
            that contains every column in ``addr_fields`` and ``usng_cols``
            (other than 'addr', which is computed here).

    Raises:
        KeyError: if a feature lacks 'properties' or a required column.
    """
    if not features:
        # An empty batch would still open the file in append mode and leave a
        # stray empty file behind (e.g. "nad_.jsonl" on the very first flush,
        # when no state has been seen yet), so skip it entirely.
        return
    file = f"nad6/nad_{abbrv}.jsonl"
    with jsonlines.open(file, 'a') as writer:
        for fea in features:
            props = fea['properties']
            # Concat address fields; falsy values (None, '', 0) are skipped.
            addr = ' '.join(str(props[addr_col])
                            for addr_col in addr_fields if props[addr_col])
            # Work on a copy so the caller's feature objects stay untouched.
            with_addr = {**props, 'addr': addr}
            # select fewer fields from feature (in order to reduce file size)
            slim_props = {col: with_addr[col] for col in usng_cols}
            writer.write({**fea, 'properties': slim_props})


# Stream every NAD feature once and append those whose 'State' is one of the
# NAD6 abbreviations to a per-state .jsonl file via write_features().
# Features are buffered in `features` and flushed whenever the state value
# changes or the buffer exceeds 1,000,000 entries.
count = valid_count = output_count = 0
prev_abbrv = ""
features = []

# Built once: a set gives O(1) membership tests instead of scanning
# NAD6.values() for each of the ~65 million records.
nad6_abbrvs = set(NAD6.values())

# https://nationaladdressdata.s3.amazonaws.com/NAD_r8.zip
# The context manager guarantees the dataset handle is closed, even on error.
with fiona.open('NAD_r8.gdb') as layer:
    nad_layer_crs = layer.crs['init']

    for feature in layer:
        count += 1
        abbrv = feature['properties']['State']
        valid_count += (abbrv is not None)
        # NOTE(review): membership is tested case-insensitively, but batching
        # and the output file name use the raw 'State' value, so a lower-case
        # value would end up in its own file — confirm whether that occurs.
        if abbrv and abbrv.upper() in nad6_abbrvs:
            output_count += 1
            # SWITCH STATE or features became too large: write aggregated
            # features into nad_{prev_abbrv}; re-init features
            if prev_abbrv != abbrv or len(features) > 1000000:
                # Skip the flush while the buffer is empty (first matching
                # record), otherwise an empty "nad_.jsonl" would be created.
                if features:
                    write_features(prev_abbrv, features)
                prev_abbrv = abbrv
                features = [feature]
            else:
                features.append(feature)

# last write: every buffered feature carries the state held in prev_abbrv
if features:
    write_features(prev_abbrv, features)

print(count, valid_count, output_count)  # 65460370 65460327
# TIME: 30min

In [None]:
### Read jsonl files
# Load the VT extract back into a GeoDataFrame, tagged with the CRS string
# captured from the source layer during extraction.
with jsonlines.open(f'nad6/nad_VT.jsonl') as reader:
    vtdf = gp.GeoDataFrame.from_features(reader, crs=nad_layer_crs)

# (rows, columns), number of distinct grid coordinates, number of distinct addresses
vtdf.shape, len(vtdf['NatGrid_Coord'].unique()), len(vtdf['addr'].unique())

# NAD Overview

In [3]:
# For each NAD_19 state, load its extract and print the frame shape followed
# by the value distributions of Addr_Type and Placement.
for sf, abbrv in NAD_19.items():
    with jsonlines.open(f'nadcoords/nad_{abbrv}.jsonl') as reader:
        # NOTE(review): `layer_crs` is not defined in the visible cells (the
        # extraction cells assign `nad_layer_crs`) — confirm where it is set.
        df = gp.GeoDataFrame.from_features(reader, crs=layer_crs)
    # The frame is fully materialised above, so the reader can already be closed.
    print(abbrv, df.shape)
    print(df['Addr_Type'].value_counts())
    print(df['Placement'].value_counts())
        

AZ (3175998, 3)
Unknown        2691647
Residential     403129
Open             45228
Commercial       27938
FLAGGED           7472
Other              531
Government          45
Religious            5
Recreation           2
Educational          1
Name: Addr_Type, dtype: int64
Unknown                 3170825
Structure - Rooftop        4827
Property Access             163
Other                       134
Structure - Interior         27
Parcel - Other               21
Parcel - Centroid             1
Name: Placement, dtype: int64
CT (1156849, 3)
Unknown        1156723
Residential        120
Other                3
Commercial           2
Religious            1
Name: Addr_Type, dtype: int64
Structure - Rooftop          556491
Unknown                      303165
Parcel - Other               296214
Structure - Interior Unit       659
Structure - Entrance            282
Other                            37
Property Access Point             1
Name: Placement, dtype: int64
DE (514654, 3)
Unknown    5