Goal: Geocode hospital addresses to get lat/lon values. Export a combined file for use in further analysis/mapping.

First, we will take a closer look at what the adresses look like and do some geocoding-specific data cleaning. 

In [16]:
import pandas as pd
import re
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import pandas as pd
from tqdm import tqdm
import time
import os
import requests


# Keeping pandas from truncating long strings
pd.set_option('display.max_colwidth', None)

### Load data

In [17]:
file_path = '../data/yearly_hospital_lists/processed/combined_df.csv'
combined_df = pd.read_csv(file_path)

In [18]:
zip_centroids = pd.read_csv('../data/zip_code_centroids/ZIP_Code_Population_Weighted_Centroids_-8037460774014549482.csv', dtype={"STD_ZIP5": str})

### Cleaning

#### Trailing commas

In [19]:
# Remove trailing commas and leading/trailing whitespace from the address field
combined_df['ADDRESS'] = combined_df['ADDRESS'].str.strip().str.rstrip(',')

#### Things in parentheses
This is floor or building info that we can do away with

In [20]:
combined_df['ADDRESS'] = combined_df['ADDRESS'].str.replace(r'\([^)]*\)', '', regex=True).str.strip()

#### PO boxes
There are about 275 rows that contain the word "box", which will be problematic for geocoding. We will clean this data with this approach:
- If the address starts with what we deem to be a valid street address,s strip the PO Box info and just use the address to geocode.
- If the address appears to be a PO Box only with no street address, replace with a fallback of zip code centroids (from a HUD dataset)

The patch_final_box_cases function handles a few edge cases manually.

In [21]:
# --- Fallback for PO Box-only entries ---
def fallback_to_zip_centroid(address):
    zip_match = re.search(r'\b\d{5}\b', address)
    return f"ZIP_CENTER_{zip_match.group()}" if zip_match else "ZIP_CENTER_UNKNOWN"

# --- Final cleaning logic ---
# def clean_extended_box_cases(address):
#     addr = address.strip()
#     addr_upper = addr.upper()

#     # Fallback cases — treat as non-mappable
#     if re.match(r'^\s*(P\.?\s*O\.?|POST\s+OFFICE)\s+BOX', addr_upper):
#         return None
#     if re.match(r'^\s*BOX\s+[A-Z0-9]+', addr_upper):
#         return None
#     if re.search(r'\b(CALLER\s+)?BOX\s+[A-Z0-9]{1,6}', addr_upper):
#         return None

#     # Remove dangling "P O"
#     if re.search(r'\bP\s*O\b', addr_upper):
#         addr = re.sub(r',?\s*\(?\bP\s*O\b\)?', '', addr, flags=re.IGNORECASE)

#     # Remove numeric BOX forms: BOX 123, / BOX 456, BOX#789
#     addr = re.sub(r'[,/]*\s*BOX\s*#?\s*\d+\b', '', addr, flags=re.IGNORECASE)

#     return addr.strip().strip(',')

def clean_extended_box_cases(address):
    addr = address.strip()
    addr_upper = addr.upper()

    # === 1. Handle PO Box and common patterns ===
    if re.match(r'^\s*(P\.?\s*O\.?|POST\s+OFFICE)\s+BOX', addr_upper):
        return None
    if re.match(r'^\s*BOX\s+[A-Z0-9]+', addr_upper):
        return None
    if re.search(r'\b(CALLER\s+)?BOX\s+[A-Z0-9]{1,6}', addr_upper):
        return None

    # Remove dangling "P O"
    addr = re.sub(r',?\s*\(?\bP\s*O\b\)?', '', addr, flags=re.IGNORECASE)

    # Remove BOX #### or BOX A
    addr = re.sub(r'[,/]*\s*BOX\s*#?\s*\w+\b', '', addr, flags=re.IGNORECASE)

    # Normalize "U S HIGHWAY" → "US Hwy"
    # addr = re.sub(r'\bU\s*S\s*HIGHWAY\b', 'US Hwy', addr, flags=re.IGNORECASE)

    # Remove STE ###, SUITE ###, FLOOR ###, BLDG ###
    addr = re.sub(r'\b(STE|STE\.|SUITE|FLOOR|BLDG)[\s#]*\d*[A-Z]*\b', '', addr, flags=re.IGNORECASE)

    # === 🔢 Convert spelled-out numbers at start of address ===
    number_map = {
        'ONE': '1', 'TWO': '2', 'THREE': '3', 'FOUR': '4', 'FIVE': '5',
        'SIX': '6', 'SEVEN': '7', 'EIGHT': '8', 'NINE': '9', 'TEN': '10'
    }
    for word, digit in number_map.items():
        addr = re.sub(rf'^\b{word}\b(?=\s)', digit, addr, flags=re.IGNORECASE)

    # Final cleanup
    addr = re.sub(r'\s{2,}', ' ', addr).strip()
    addr = re.sub(r',\s*,', ',', addr).strip(',')
    return addr

# --- Manual patch for known cases that slipped through ---
def patch_final_box_cases(address):
    address = address.strip()
    if address.upper().startswith("420 34TH ST BOX"):
        return "420 34TH ST"
    elif "GIBSON BOULEVARD" in address.upper():
        return "5400 GIBSON BOULEVARD SE, 4TH FLOOR"
    elif "225 E CHICAGO" in address.upper():
        return "225 E CHICAGO"
    else:
        return address

In [22]:
# Initial clean
combined_df['cleaned_address'] = combined_df['ADDRESS'].apply(clean_extended_box_cases)

# ZIP centroid fallback for unmappable rows
combined_df['cleaned_address'] = combined_df.apply(
    lambda row: fallback_to_zip_centroid(row['ADDRESS']) if pd.isna(row['cleaned_address']) else row['cleaned_address'],
    axis=1
)

# Manual patching
combined_df['cleaned_address'] = combined_df['cleaned_address'].apply(patch_final_box_cases)

In [23]:
combined_df[combined_df['cleaned_address'].str.contains('ZIP')]

Unnamed: 0,ID,NAME,ADDRESS,CITY,STATE,ZIP,FIPS,RUCA,RURAL_STATUS,TOTAL_BEDS,ACUTE_BEDS,YEAR,HOSPITAL_TYPE,TYPE,TOTAL_BEDS.1,cleaned_address
35,10073,CLAY COUNTY HOSPITAL,83825 HIGHWAY 9 P O BOX 1270,ASHLAND,AL,36251,1027,2.0,Rural,53.0,46.0,2023,Acute,,,ZIP_CENTER_83825
83,11304,OCHSNER CHOCTAW GENERAL,"401 VANITY FAIR LANE, PO BOX 618",BUTLER,AL,36904,1023,10.0,Rural,25.0,25.0,2023,Acute,,,ZIP_CENTER_UNKNOWN
91,20018,YUKON KUSKOKWIM DELTA REG HOSPITAL,PO BOX 287,BETHEL,AK,99559,2050,,Rural,50.0,34.0,2023,Acute,,,ZIP_CENTER_UNKNOWN
94,21301,PROVIDENCE VALDEZ MEDICAL CENTER,PO BOX 550,VALDEZ,AK,99686,2063,,Rural,11.0,11.0,2023,Acute,,,ZIP_CENTER_UNKNOWN
95,21302,PROVIDENCE SEWARD MEDICAL CENTER,"417 FIRST AVENUE, PO BOX 365",SEWARD,AK,99664,2122,10.0,Rural,6.0,6.0,2023,Acute,,,ZIP_CENTER_UNKNOWN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54157,494022,POPLAR SPRINGS HOSPITAL,350 POPLAR DRIVE PO BOX 3060,PETERSBURG,VA,23805,51730.0,1.0,Urban,130.0,95.0,2021,Specialty,PSYCH,,ZIP_CENTER_UNKNOWN
54162,501992,SUNRISE HAVEN,PO BOX 6057 24423 100TH AVENUE SOUTH,KENT,WA,98064,53033.0,1.0,Urban,8.0,4.0,2021,Specialty,RELIGIOUS NON-MED,,ZIP_CENTER_24423
54166,503300,SEATTLE CHILDREN'S HOSPITAL,"4800 SAND POINT WAY NE, PO BOX C-5371",SEATTLE,WA,98105,53033.0,1.0,Urban,250.0,343.0,2021,Specialty,CHILD,,ZIP_CENTER_UNKNOWN
54167,503301,MARY BRIDGE CHILDREN'S HOSPITAL,317 MARTIN LUTHER KING JR W BOX 5299,TACOMA,WA,98415,53053.0,1.0,Urban,68.0,,2021,Specialty,CHILD,,ZIP_CENTER_UNKNOWN


### Build full address column

In [24]:
# Build full address 
combined_df['full_address'] = (
    combined_df['cleaned_address'] + ', ' +
    combined_df['CITY'] + ', ' +
    combined_df['STATE'] + ' ' +
    combined_df['ZIP'].astype(str)
)

combined_df.shape

(54262, 17)

In [25]:
# Drop duplicate addresses to save time/bandwidth on geocoding
unique_addresses = combined_df[['full_address']].drop_duplicates().reset_index(drop=True)

unique_addresses.shape

(7384, 1)

### Look for remaining potential issues before geocoding

In [26]:
# See if we have any missing values that we need for geocoding
cols_to_check = ['NAME', 'ADDRESS', 'CITY', 'STATE', 'ZIP']
missing_rows = combined_df[combined_df[cols_to_check].isnull().any(axis=1)]
print(f"Rows with missing values: {len(missing_rows)}")

Rows with missing values: 0


In [27]:
# Null or blank
nulls = unique_addresses[unique_addresses['full_address'].isnull() | (unique_addresses['full_address'].str.strip() == '')]
print(f"Null or blank addresses: {len(nulls)}")

# Addresses that are unusually short
short = unique_addresses[unique_addresses['full_address'].str.len() < 15]
print(f"Suspiciously short addresses (<15 chars): {len(short)}")

# Addresses with repeated commas (may indicate missing fields)
messy = unique_addresses[unique_addresses['full_address'].str.contains(r',\s*,')]
print(f"Addresses with double commas: {len(messy)}")

Null or blank addresses: 0
Suspiciously short addresses (<15 chars): 0
Addresses with double commas: 0


### Geocode

In [30]:
import pandas as pd
import requests
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from tqdm import tqdm

# === Setup ===
geolocator = Nominatim(user_agent="hospital_access_pipeline")
rate_limited_geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# === Safe Nominatim wrapper ===
def safe_nominatim_geocode(x):
    loc = rate_limited_geocode(x)
    if loc:
        return pd.Series([loc.latitude, loc.longitude])
    else:
        return pd.Series([None, None])

# === Census API wrapper with timeout ===
def census_geocode(address):
    try:
        response = requests.get(
            "https://geocoding.geo.census.gov/geocoder/locations/onelineaddress",
            params={"address": address, "benchmark": "Public_AR_Current", "format": "json"},
            timeout=2
        )
        match = response.json()['result']['addressMatches'][0]
        return match['coordinates']['y'], match['coordinates']['x']
    except Exception:
        return None, None

# === Load in-progress file ===
existing = pd.read_csv("../data/yearly_hospital_lists/geocoded/in_progress_geocoding.csv")

# === Filter out completed rows ===
already_done = set(existing['full_address'])

to_geocode = unique_addresses[
    (~unique_addresses['full_address'].str.startswith("ZIP_CENTER_")) &
    (~unique_addresses['full_address'].isin(already_done))
].copy()

zip_center_rows = unique_addresses[
    unique_addresses['full_address'].str.startswith("ZIP_CENTER_")
].copy()

# === Add lat/lon cols if not present ===
for col in ['lat_census', 'lon_census', 'lat_nominatim', 'lon_nominatim']:
    if col not in to_geocode.columns:
        to_geocode[col] = None

# === Chunking + geocoding loop ===
chunks = [to_geocode.iloc[i:i+50].copy() for i in range(0, len(to_geocode), 50)]
geocoded_chunks = []

for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1} of {len(chunks)}")

    # --- Census step ---
    needs_census = chunk[chunk['lat_census'].isna()].copy()
    tqdm.pandas(desc="Census geocoding")
    needs_census[['lat_census', 'lon_census']] = needs_census['full_address'].progress_apply(
        lambda x: pd.Series(census_geocode(x))
    )
    chunk.update(needs_census)

    # --- Nominatim step ---
    needs_nominatim = chunk[
        chunk['lat_census'].isna() & chunk['lat_nominatim'].isna()
    ].copy()
    tqdm.pandas(desc="Nominatim geocoding")
    needs_nominatim[['lat_nominatim', 'lon_nominatim']] = needs_nominatim['full_address'].progress_apply(
        safe_nominatim_geocode
    )
    chunk.update(needs_nominatim)

    # --- Append + save ---
    geocoded_chunks.append(chunk)
    pd.concat([existing] + geocoded_chunks).to_csv(
        "../data/yearly_hospital_lists/geocoded/in_progress_geocoding.csv", index=False
    )

    print(f"✅ Chunk {i+1}/{len(chunks)} complete — saved {len(existing) + len(pd.concat(geocoded_chunks))} rows")


Processing chunk 1 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:51<00:00,  2.24s/it]
Nominatim geocoding:  67%|████████████████        | 4/6 [00:03<00:01,  1.04it/s]RateLimiter caught an error, retrying (0/2 tries). Called with (*('5950 STATE ROUTE 6 WEST, TUNKHANNOCK, PA 18657',), **{}).
Traceback (most recent call last):
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connection.py", line 516, in getresponse
    httplib_response = super().getresponse()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py", line 1349, in getresponse
    response.begin()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py", line 316, i

✅ Chunk 1/49 complete — saved 4800 rows
Processing chunk 2 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.24s/it]
Nominatim geocoding: 100%|██████████████████████| 11/11 [00:10<00:00,  1.02it/s]


✅ Chunk 2/49 complete — saved 4850 rows
Processing chunk 3 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.25s/it]
Nominatim geocoding: 100%|██████████████████████| 10/10 [00:10<00:00,  1.06s/it]


✅ Chunk 3/49 complete — saved 4900 rows
Processing chunk 4 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.26s/it]
Nominatim geocoding:  72%|███████████████▉      | 26/36 [00:30<00:12,  1.24s/it]RateLimiter caught an error, retrying (0/2 tries). Called with (*('ROAD NUMBER 2 KM 173.4 CAIN ALTO, SAN GERMAN, PR 00683',), **{}).
Traceback (most recent call last):
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connection.py", line 516, in getresponse
    httplib_response = super().getresponse()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py", line 1349, in getresponse
    response.begin()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py", lin

✅ Chunk 4/49 complete — saved 4950 rows
Processing chunk 5 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:54<00:00,  2.29s/it]
Nominatim geocoding: 100%|██████████████████████| 16/16 [00:23<00:00,  1.45s/it]


✅ Chunk 5/49 complete — saved 5000 rows
Processing chunk 6 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.25s/it]
Nominatim geocoding: 100%|████████████████████████| 2/2 [00:01<00:00,  1.29it/s]


✅ Chunk 6/49 complete — saved 5050 rows
Processing chunk 7 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:51<00:00,  2.24s/it]
Nominatim geocoding:  62%|██████████████▏        | 8/13 [00:07<00:05,  1.03s/it]RateLimiter caught an error, retrying (0/2 tries). Called with (*('9421 EAST SIDE DRIVE EXTENSION, POB 299, NEWTON, MS 39345',), **{}).
Traceback (most recent call last):
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connection.py", line 516, in getresponse
    httplib_response = super().getresponse()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py", line 1349, in getresponse
    response.begin()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py", 

✅ Chunk 7/49 complete — saved 5100 rows
Processing chunk 8 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:53<00:00,  2.26s/it]
Nominatim geocoding:  90%|████████████████████▋  | 9/10 [00:10<00:01,  1.39s/it]RateLimiter caught an error, retrying (0/2 tries). Called with (*('HIGHWAY 18, MAIN ST., . 159, PINE RIDGE, SD 57770',), **{}).
Traceback (most recent call last):
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connection.py", line 516, in getresponse
    httplib_response = super().getresponse()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py", line 1349, in getresponse
    response.begin()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py", line 316

✅ Chunk 8/49 complete — saved 5150 rows
Processing chunk 9 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:51<00:00,  2.24s/it]
Nominatim geocoding:  20%|████▌                  | 3/15 [00:01<00:07,  1.57it/s]RateLimiter caught an error, retrying (0/2 tries). Called with (*('500 WEST 4TH STREET , 4TH, ODESSA, TX 79761',), **{}).
Traceback (most recent call last):
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connection.py", line 516, in getresponse
    httplib_response = super().getresponse()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py", line 1349, in getresponse
    response.begin()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py", line 316, in b

✅ Chunk 9/49 complete — saved 5200 rows
Processing chunk 10 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.25s/it]
Nominatim geocoding:  16%|███▋                   | 3/19 [00:01<00:11,  1.41it/s]RateLimiter caught an error, retrying (0/2 tries). Called with (*('PONCE DE LEON AVENUE STOP 37 1/2, SAN JUAN, PR 00918',), **{}).
Traceback (most recent call last):
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connection.py", line 516, in getresponse
    httplib_response = super().getresponse()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py", line 1349, in getresponse
    response.begin()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py", line 

✅ Chunk 10/49 complete — saved 5250 rows
Processing chunk 11 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.25s/it]
Nominatim geocoding: 100%|██████████████████████| 24/24 [00:29<00:00,  1.25s/it]


✅ Chunk 11/49 complete — saved 5300 rows
Processing chunk 12 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.26s/it]
Nominatim geocoding:  89%|█████████████████████▎  | 8/9 [00:07<00:01,  1.03s/it]RateLimiter caught an error, retrying (0/2 tries). Called with (*('50 MEDICAL PARK EAST DRIVE 8TH, BIRMINGHAM, AL 35235',), **{}).
Traceback (most recent call last):
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connection.py", line 516, in getresponse
    httplib_response = super().getresponse()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py", line 1349, in getresponse
    response.begin()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py", line 

✅ Chunk 12/49 complete — saved 5350 rows
Processing chunk 13 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:51<00:00,  2.24s/it]
Nominatim geocoding: 100%|████████████████████████| 4/4 [00:04<00:00,  1.14s/it]


✅ Chunk 13/49 complete — saved 5400 rows
Processing chunk 14 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:51<00:00,  2.24s/it]
Nominatim geocoding: 100%|████████████████████████| 6/6 [00:05<00:00,  1.05it/s]


✅ Chunk 14/49 complete — saved 5450 rows
Processing chunk 15 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:51<00:00,  2.24s/it]
Nominatim geocoding: 100%|████████████████████████| 6/6 [00:07<00:00,  1.21s/it]


✅ Chunk 15/49 complete — saved 5500 rows
Processing chunk 16 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.25s/it]
Nominatim geocoding: 100%|████████████████████████| 8/8 [00:09<00:00,  1.15s/it]


✅ Chunk 16/49 complete — saved 5550 rows
Processing chunk 17 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.26s/it]
Nominatim geocoding: 100%|████████████████████████| 2/2 [00:01<00:00,  1.12it/s]


✅ Chunk 17/49 complete — saved 5600 rows
Processing chunk 18 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.25s/it]
Nominatim geocoding: 100%|████████████████████████| 6/6 [00:07<00:00,  1.26s/it]


✅ Chunk 18/49 complete — saved 5650 rows
Processing chunk 19 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.25s/it]
Nominatim geocoding:  57%|█████████████▋          | 4/7 [00:02<00:02,  1.44it/s]RateLimiter caught an error, retrying (0/2 tries). Called with (*('E 65TH ST AT LAKE MICHIGAN, CHICAGO, IL 60649',), **{}).
Traceback (most recent call last):
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connection.py", line 516, in getresponse
    httplib_response = super().getresponse()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py", line 1349, in getresponse
    response.begin()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py", line 316, in

✅ Chunk 19/49 complete — saved 5700 rows
Processing chunk 20 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.25s/it]
Nominatim geocoding: 100%|████████████████████████| 8/8 [00:07<00:00,  1.03it/s]


✅ Chunk 20/49 complete — saved 5750 rows
Processing chunk 21 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.25s/it]
Nominatim geocoding:  57%|█████████████▋          | 4/7 [00:03<00:03,  1.27s/it]RateLimiter caught an error, retrying (0/2 tries). Called with (*('929 NORTH ST FRANCIS, 6TH , NORTH TOWER, WICHITA, KS 67214',), **{}).
Traceback (most recent call last):
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connection.py", line 516, in getresponse
    httplib_response = super().getresponse()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py", line 1349, in getresponse
    response.begin()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py",

✅ Chunk 21/49 complete — saved 5800 rows
Processing chunk 22 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:51<00:00,  2.24s/it]
Nominatim geocoding: 100%|████████████████████████| 7/7 [00:09<00:00,  1.31s/it]


✅ Chunk 22/49 complete — saved 5850 rows
Processing chunk 23 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.25s/it]
Nominatim geocoding: 100%|████████████████████████| 8/8 [00:09<00:00,  1.22s/it]


✅ Chunk 23/49 complete — saved 5900 rows
Processing chunk 24 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:53<00:00,  2.27s/it]
Nominatim geocoding: 100%|████████████████████████| 4/4 [00:05<00:00,  1.31s/it]


✅ Chunk 24/49 complete — saved 5950 rows
Processing chunk 25 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:51<00:00,  2.24s/it]
Nominatim geocoding: 100%|████████████████████████| 2/2 [00:01<00:00,  1.09it/s]


✅ Chunk 25/49 complete — saved 6000 rows
Processing chunk 26 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:51<00:00,  2.23s/it]
Nominatim geocoding: 100%|████████████████████████| 6/6 [00:05<00:00,  1.02it/s]


✅ Chunk 26/49 complete — saved 6050 rows
Processing chunk 27 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.25s/it]
Nominatim geocoding:  22%|█████▎                  | 2/9 [00:00<00:02,  2.58it/s]RateLimiter caught an error, retrying (0/2 tries). Called with (*('218 A SUNSET ROAD, 3RD, WILLINGBORO, NJ 08046',), **{}).
Traceback (most recent call last):
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connection.py", line 516, in getresponse
    httplib_response = super().getresponse()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py", line 1349, in getresponse
    response.begin()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py", line 316, in

✅ Chunk 27/49 complete — saved 6100 rows
Processing chunk 28 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.25s/it]
Nominatim geocoding: 100%|██████████████████████| 10/10 [00:11<00:00,  1.18s/it]


✅ Chunk 28/49 complete — saved 6150 rows
Processing chunk 29 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.24s/it]
Nominatim geocoding: 100%|██████████████████████| 10/10 [00:12<00:00,  1.21s/it]


✅ Chunk 29/49 complete — saved 6200 rows
Processing chunk 30 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.25s/it]
Nominatim geocoding: 100%|████████████████████████| 8/8 [00:10<00:00,  1.27s/it]


✅ Chunk 30/49 complete — saved 6250 rows
Processing chunk 31 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:51<00:00,  2.24s/it]
Nominatim geocoding: 100%|████████████████████████| 9/9 [00:10<00:00,  1.14s/it]


✅ Chunk 31/49 complete — saved 6300 rows
Processing chunk 32 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.26s/it]
Nominatim geocoding:  40%|█████████▏             | 4/10 [00:05<00:10,  1.70s/it]RateLimiter caught an error, retrying (0/2 tries). Called with (*('283 SOUTH BUTLER ROAD, MT GRETNA, PA 17064',), **{}).
Traceback (most recent call last):
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connection.py", line 516, in getresponse
    httplib_response = super().getresponse()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py", line 1349, in getresponse
    response.begin()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py", line 316, in be

✅ Chunk 32/49 complete — saved 6350 rows
Processing chunk 33 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.25s/it]
Nominatim geocoding: 100%|████████████████████████| 5/5 [00:07<00:00,  1.45s/it]


✅ Chunk 33/49 complete — saved 6400 rows
Processing chunk 34 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.25s/it]
Nominatim geocoding: 100%|████████████████████████| 7/7 [00:10<00:00,  1.51s/it]


✅ Chunk 34/49 complete — saved 6450 rows
Processing chunk 35 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:51<00:00,  2.24s/it]
Nominatim geocoding: 100%|████████████████████████| 4/4 [00:03<00:00,  1.06it/s]


✅ Chunk 35/49 complete — saved 6500 rows
Processing chunk 36 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.24s/it]
Nominatim geocoding: 100%|██████████████████████| 11/11 [00:10<00:00,  1.05it/s]


✅ Chunk 36/49 complete — saved 6550 rows
Processing chunk 37 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.25s/it]
Nominatim geocoding: 100%|████████████████████████| 8/8 [00:07<00:00,  1.06it/s]


✅ Chunk 37/49 complete — saved 6600 rows
Processing chunk 38 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.25s/it]
Nominatim geocoding: 100%|████████████████████████| 8/8 [00:08<00:00,  1.12s/it]


✅ Chunk 38/49 complete — saved 6650 rows
Processing chunk 39 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.24s/it]
Nominatim geocoding: 100%|██████████████████████| 10/10 [00:09<00:00,  1.02it/s]


✅ Chunk 39/49 complete — saved 6700 rows
Processing chunk 40 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.25s/it]
Nominatim geocoding: 100%|████████████████████████| 4/4 [00:03<00:00,  1.10it/s]


✅ Chunk 40/49 complete — saved 6750 rows
Processing chunk 41 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.25s/it]
Nominatim geocoding:  50%|████████████            | 3/6 [00:01<00:02,  1.46it/s]RateLimiter caught an error, retrying (0/2 tries). Called with (*('1101 MEDICAL CENTER BLVD 7TH, MARRERO, LA 70072',), **{}).
Traceback (most recent call last):
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connection.py", line 516, in getresponse
    httplib_response = super().getresponse()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py", line 1349, in getresponse
    response.begin()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py", line 316, 

✅ Chunk 41/49 complete — saved 6800 rows
Processing chunk 42 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.26s/it]
Nominatim geocoding: 100%|████████████████████████| 9/9 [00:11<00:00,  1.23s/it]


✅ Chunk 42/49 complete — saved 6850 rows
Processing chunk 43 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.25s/it]
Nominatim geocoding: 100%|████████████████████████| 4/4 [00:05<00:00,  1.25s/it]


✅ Chunk 43/49 complete — saved 6900 rows
Processing chunk 44 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.24s/it]
Nominatim geocoding: 100%|████████████████████████| 9/9 [00:15<00:00,  1.70s/it]


✅ Chunk 44/49 complete — saved 6950 rows
Processing chunk 45 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:51<00:00,  2.24s/it]
Nominatim geocoding: 100%|████████████████████████| 7/7 [00:07<00:00,  1.08s/it]


✅ Chunk 45/49 complete — saved 7000 rows
Processing chunk 46 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.24s/it]
Nominatim geocoding: 100%|████████████████████████| 6/6 [00:06<00:00,  1.10s/it]


✅ Chunk 46/49 complete — saved 7050 rows
Processing chunk 47 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.25s/it]
Nominatim geocoding:   0%|                               | 0/12 [00:00<?, ?it/s]RateLimiter caught an error, retrying (0/2 tries). Called with (*('506 E SAN ANTONIO STREET 3 EAST, VICTORIA, TX 77901',), **{}).
Traceback (most recent call last):
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
  File "/Users/rattnern/.local/share/virtualenvs/scripts-uQiJCpmS/lib/python3.9/site-packages/urllib3/connection.py", line 516, in getresponse
    httplib_response = super().getresponse()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py", line 1349, in getresponse
    response.begin()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/http/client.py", line 3

✅ Chunk 47/49 complete — saved 7100 rows
Processing chunk 48 of 49


Census geocoding: 100%|█████████████████████████| 50/50 [01:52<00:00,  2.26s/it]
Nominatim geocoding: 100%|██████████████████████| 18/18 [00:17<00:00,  1.02it/s]


✅ Chunk 48/49 complete — saved 7150 rows
Processing chunk 49 of 49


Census geocoding: 100%|█████████████████████████| 10/10 [00:22<00:00,  2.24s/it]
Nominatim geocoding: 100%|████████████████████████| 4/4 [00:03<00:00,  1.08it/s]

✅ Chunk 49/49 complete — saved 7160 rows





In [36]:
# Load the in-progress file (fully geocoded at this point)
geocoded_df = pd.read_csv("../data/yearly_hospital_lists/geocoded/in_progress_geocoding.csv")

# ZIP fallback step
geocoded_df['zip5'] = geocoded_df['full_address'].str.extract(r'(\d{5})$')
geocoded_df = geocoded_df.merge(zip_centroids[['STD_ZIP5', 'LATITUDE', 'LONGITUDE']],
              left_on='zip5', right_on='STD_ZIP5', how='left')

geocoded_df['final_latitude'] = geocoded_df['lat_census'].combine_first(geocoded_df['lat_nominatim']).combine_first(geocoded_df['LATITUDE'])
geocoded_df['final_longitude'] = geocoded_df['lon_census'].combine_first(geocoded_df['lon_nominatim']).combine_first(geocoded_df['LONGITUDE'])

geocoded_df['used_zip_fallback'] = (
    geocoded_df['final_latitude'].eq(geocoded_df['LATITUDE']) &
    geocoded_df['lat_census'].isna() &
    geocoded_df['lat_nominatim'].isna()
)

def determine_source(row):
    if pd.notna(row['lat_census']):
        return "Census"
    elif pd.notna(row['lat_nominatim']):
        return "Nominatim"
    elif row['used_zip_fallback']:
        return "ZIP"
    else:
        return "None"

geocoded_df['geocode_source'] = geocoded_df.apply(determine_source, axis=1)


In [38]:
geocoded_df.to_csv("../data/yearly_hospital_lists/geocoded/geocoded.csv", index=False)

In [41]:
geocoded_df[geocoded_df['geocode_source']=='None'].shape

(44, 13)

In [37]:
geocoded_df

Unnamed: 0,full_address,lat_census,lon_census,lat_nominatim,lon_nominatim,zip5,STD_ZIP5,LATITUDE,LONGITUDE,final_latitude,final_longitude,used_zip_fallback,geocode_source
0,"1108 ROSS CLARK CIRCLE, DOTHAN, AL 36301",31.215415,-85.361498,,,36301,36301,31.182941,-85.399582,31.215415,-85.361498,False,Census
1,"2505 U S HIGHWAY 431 NORTH, BOAZ, AL 35957",34.220288,-86.159001,,,35957,35957,34.204288,-86.170510,34.220288,-86.159001,False,Census
2,"1701 VETERANS DRIVE, FLORENCE, AL 35630",,,34.805043,-87.650814,35630,35630,34.825097,-87.664501,34.805043,-87.650814,False,Nominatim
3,"702 N MAIN ST, OPP, AL 36467",31.291599,-86.255377,,,36467,36467,31.282233,-86.255099,31.291599,-86.255377,False,Census
4,"101 HOSPITAL CIRCLE, LUVERNE, AL 36049",,,31.693749,-86.264702,36049,36049,31.741186,-86.277176,31.693749,-86.264702,False,Nominatim
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7155,"2230 JOE BATTLE BLVD, EL PASO, TX 79938",31.781821,-106.267581,,,79938,79938,31.794494,-106.222888,31.781821,-106.267581,False,Census
7156,"267 N STATE HIGHWAY 360, MANSFIELD, TX 76063",,,32.568701,-97.082291,76063,76063,32.578347,-97.122017,32.568701,-97.082291,False,Nominatim
7157,"750 W CENTRAL TEXAS EXPRESSWAY, HARKER HEIGHTS, TX 76548",31.080668,-97.645475,,,76548,76548,31.063386,-97.653123,31.080668,-97.645475,False,Census
7158,"8903 FLOYD CURL DRIVE, SAN ANTONIO, TX 78240",,,29.517506,-98.586115,78240,78240,29.520767,-98.600025,29.517506,-98.586115,False,Nominatim
