In [1]:
###############################################################################
# File name: BatchAddressMatch_final.ipynb
#
# Description: Takes an exported set of warren group properties from 
#              geocode_fixes.do that have incorrect lat/lon geocoding and 
#              uploaded them to the census's geocoder api website, downloads 
#              the correct lat/lon coordinates and saves it to a new file.
#
# Inputs: T:\boston_zoning\working_paper\data\warren\geocode_fixes\<various>.txt
#
# Outputs: T:\boston_zoning\working_paper\data\warren\geocode_fixes_<date>.csv
#
# Author: Nicholas Chiumenti
#
# Created: 12.01.2020
# Last Updated: 2.10.2022
###############################################################################

In [1]:
import os
import requests
import csv
import time

In [2]:
# defines the file extract function
def geocoder_batch(path):
    "Stores the text files of 'path' as a list for uploading to Census GeoCoder."
    
    # define empty list to store file paths
    filelist = []

    # set working directory
    os.chdir(path)
    print(f"Working directory changed to:\n'{os.getcwd()}'")

    # get contents of working directory
    contents = os.listdir(path)
    
    # for all contents of 'path', use only the .txt files
    for i in range(len(contents)):
        
        # get root and extention as string
        root, ext = os.path.splitext(os.path.abspath(contents[i]))
        
        # if extentension is .txt
        if ext == ".txt":
            
            # store the file path in the list
            file = os.path.abspath(contents[i])
            
            filelist.append(file)

    print(f"{len(filelist)} files will be uploaded to Census GeoCoder...")
    
    return filelist

# defintes the api upload function
def batch_request(filelist):
    "Iterates 'filelist' uploading to Census GeoCoder and returning matches."
    
    ## local cluster proxy set
    # os.environ['http_proxy'] = 'http://p1web4.frb.org:8080'
    # os.environ['https_proxy'] = 'http://p1web4.frb.org:8080'
    
    ## AWS cluster proxy set
    # os.environ['http_proxy'] = 'http://10.121.131.6:8080'
    # os.environ['https_proxy'] = 'http://10.121.131.6:8080'
    
    ## JupyterHub Proxy
    os.environ["https_proxy"] = "http://p1proxy.frb.org:8080"
    os.environ["http_proxy"] = "http://p1proxy.frb.org:8080"
    os.environ["no_proxy"] = "frb.org,frb.pvt,frb.gov"

    ca_bundle_fp = "/etc/pki/tls/certs/ca-bundle.crt"
        
    ## API url for Census GeoCoder
    url = 'https://geocoding.geo.census.gov/geocoder/geographies/addressbatch'
    
    ## Specify parameters for geocode matching
    payload = {'benchmark':'Public_AR_Current','vintage':'Census2010_Current'}
    
    matches = ""
    
    ## Iterates over filelist uploading to GeoCoder
    for i in range(len(filelist)):
        
        start_time = time.time()

        print(f"Uploading {i+1} of {len(filelist)}: '{os.path.basename(filelist[i])}'...")

        files = {'addressFile': (filelist[i], open(filelist[i], 'rb')),}

        # r = requests.post(url, files=files, data = payload) # non-JupyterHub version
        r = requests.post(url, files=files, data = payload, verify = ca_bundle_fp) # JupyterHub version

        matches = matches + r.text
        
        time.sleep(3)
        
        end_time = time.time()
        
        minutes, seconds = divmod(end_time-start_time,60)
        
        print(f" {int(minutes)} minutes {int(seconds)} seconds elapsed")
    
    ## Returns a string of matches from all files
    return matches

# defines the save .csv file function
def csv_export(matches):
    "Saves matches as a date stamped CSV file."
    
    Matches = matches
    
    ## CSV variable headers
    Headers = ["RECORD ID NUMBER","INPUT ADDRESS","TIGER ADDRESS RANGE MATCH INDICATOR", \
               "TIGER MATCH TYPE","TIGER OUTPUT ADDRESS","INTERPOLATED LONGITUDE AND LATITUDE", \
               "TIGERLINE ID","TIGERLINE ID SIDE","STATE CODE", \
               "COUNTY CODE","TRACT CODE","BLOCK CODE"]
        
    DateStamp = time.strftime("%Y%m%d")
    
    Data = [i for i in Matches.split("\n")]
    
    with open(f'/home/a1nfc04/python_projects/census_geocoder_api/geocoder_export_{DateStamp}.csv', 'w',newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        
        csvwriter.writerow(Headers)
        
        csvwriter.writerows(csv.reader(Data))
        
    print("finished!")

# The below cells run each function individually starting with geocoder_batch()

## Listing the upload files using geocoder_batch()

In [4]:
path = "/home/a1nfc04/python_projects/census_geocoder_api/address_files"
filelist = geocoder_batch(path)

for i in range(len(filelist)):
    print(os.path.basename(filelist[i]))

Working directory changed to:
'/home/home03/a1nfc04/python_projects/census_geocoder_api/address_files'
16 files will be uploaded to Census GeoCoder...
address_corrections_12.txt
address_corrections_05.txt
address_corrections_09.txt
address_corrections_16.txt
address_corrections_10.txt
address_corrections_06.txt
address_corrections_08.txt
address_corrections_02.txt
address_corrections_04.txt
address_corrections_01.txt
address_corrections_14.txt
address_corrections_15.txt
address_corrections_13.txt
address_corrections_11.txt
address_corrections_07.txt
address_corrections_03.txt


## Batch upload a bunch of files and return 1 .csv file

In [None]:
%%time

path = "/home/a1nfc04/local_to_aws/Python_Projects/census_geocoder_api/address_files"

filelist = geocoder_batch(path)

matches = batch_request(filelist)

csv_export(matches)

# Running this program on the CH40B properties .csv file 

In [5]:
%%time

## specify the correct file
filelist = ["/home/a1nfc04/python_projects/census_geocoder_api/address_files/chapter40b_geocode_inputs_20211020.csv"]

## upload to get geocodes
matches = batch_request(filelist)

# export matches as a csv
csv_export(matches)

Uploading 1 of 1: 'chapter40b_geocode_inputs_20211020.csv'...
 25 minutes 1 seconds elapsed
finished!
CPU times: user 58.4 ms, sys: 34.7 ms, total: 93.1 ms
Wall time: 25min 1s
