### Description
Author: T. Majidzadeh

Date Created: February 26, 2025

Date Updated: February 26, 2025

Purpose: For each address, get the Nominatim address from OSM.

In [1]:
import getpass
import pandas as pd
import numpy as np
import re
import time
import os
from geopy.adapters import AioHTTPAdapter
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
import requests

In [37]:
email = getpass.getpass("Enter your email.")
nominatim_service = Nominatim(
    user_agent=f"{email}"
)
geolocator = nominatim_service.geocode

Enter your email. ········


In [21]:
pm_data = pd.read_csv("..\\data\\pm_data_small_appended.csv").astype(str)

In [23]:
pm_data['building_name_and_address'] = pm_data[['building_name', 'raw_address']] \
    .apply(lambda x: " ".join([re.sub(x.raw_address, "", x.building_name), x.raw_address]), axis=1)
pm_data['building_name_and_address'] = pm_data \
    .apply(lambda x: re.sub("nan", "", x.building_name_and_address).strip(), axis=1)
pm_data['building_name_and_address'] = pm_data['building_name_and_address'].str.replace('\n', ' ', regex=True)
pm_data['building_name_and_address'] = pm_data['building_name_and_address'].str.replace(' +', ' ', regex=True)
pm_data_sample = pm_data[0:5]

In [210]:
addresses = []
retry_attempts = 3  # Max retries before skipping

for i, (name_and_addr, addr) in enumerate(zip(pm_data['building_name_and_address'], pm_data['raw_address'])):
    print(f"Processing {i}: {addr}")
    
    success = False  # Track if we got a valid geocode result

    for attempt in range(retry_attempts):
        try:
            geocode = geolocator(addr)
            if geocode:
                addresses.append(geocode.address)
                success = True
                break  # Stop retrying since we succeeded
        except (GeocoderTimedOut, GeocoderServiceError, requests.exceptions.RequestException, Exception) as e:
            print(f"Error on attempt {attempt + 1}: {type(e).__name__} - {e}")
            time.sleep(1.1 ** attempt)  # Exponential backoff

    if not success:  # If everything fails, append an empty value
        print("Failed to geocode, using missing value")
        addresses.append("")

    time.sleep(1.1)  # Respect Nominatim's rate limit

    if i % 1000 == 0 and i > 0:  # Save progress every 1000 entries
        pd.DataFrame(addresses).to_csv('..\\data\\pm_osm_addresses_temp.csv', index=False)

print("Geocoding completed.")


Processing 0: 4000 Wisconsin Ave NW, Washington, DC 20016
Processing 1: 10511 Strathmore Hall St, North Bethesda, MD 20852
Processing 2: 348 S Hauser Blvd, Los Angeles, CA 90036
Processing 3: 8300 Wisconsin Ave, Bethesda, MD 20814
Processing 4: 901 E Phillips Ln, Centennial, CO 80122
Processing 5: 270 Third Street, Cambridge, MA 02142
Processing 6: 3645 Habersham Rd NE, Atlanta, GA 30305
Processing 7: 33 Rogers Street, Cambridge, MA 02142
Processing 8: 2301 Ostracod Ln, Raleigh, NC 27610
Processing 9: 401 Briar Ridge Drive, San Jose, CA 95123
Processing 10: 22980 Vista Edera Cir, Estero, FL 33928
Failed to geocode, using missing value
Processing 11: 2171 Peachtree Rd NW, Atlanta, GA 30309
Processing 12: 2300 Catalina Cir, Oceanside, CA 92056
Processing 13: 3455 Table Mesa Dr, Boulder, CO 80305
Processing 14: 6220 W 3rd St, Los Angeles, CA 90036
Processing 15: Hudson & Kneeland, Boston, MA 02111
Failed to geocode, using missing value
Processing 16: 1000 Peachtree Park Dr NE, Atlanta, GA

In [212]:
pd.DataFrame(addresses).to_csv('..\\data\\pm_osm_addresses_temp.csv')

In [220]:
output_pm_data = pm_data
output_pm_data['osm_address'] = addresses

In [232]:
output_pm_data.to_csv('..\\data\\pm_osm_addresses.csv')
os.remove('..\\data\\pm_osm_addresses_temp.csv')

In [39]:
cbsa_data = pd.read_csv('..\\data\\cbsa_data.csv').astype(str)
cbsa_data['state zip'] = cbsa_data[['state', 'zipCode']].apply(" ".join, axis=1)
cbsa_data['full_address'] = cbsa_data[['address', 'city', 'state zip']].apply(", ".join, axis=1)
cbsa_data['building_name_and_address'] = cbsa_data[['propertyName', 'full_address']].apply(" ".join, axis=1)

  cbsa_data = pd.read_csv('..\\data\\cbsa_data.csv').astype(str)


In [None]:
addresses = []
retry_attempts = 3  # Max retries before skipping

for i, (name_and_addr, addr) in enumerate(zip(cbsa_data['building_name_and_address'], cbsa_data['full_address'])):
    print(f"Processing {i}: {addr}")
    
    success = False  # Track if we got a valid geocode result

    for attempt in range(retry_attempts):
        try:
            geocode = geolocator(addr)
            if geocode:
                addresses.append(geocode.address)
                success = True
                break  # Stop retrying since we succeeded
        except (GeocoderTimedOut, GeocoderServiceError, requests.exceptions.RequestException, Exception) as e:
            print(f"Error on attempt {attempt + 1}: {type(e).__name__} - {e}")
            time.sleep(1.1 ** attempt)  # Exponential backoff

    if not success:  # If everything fails, append an empty value
        print("Failed to geocode, using missing value")
        addresses.append("")

    time.sleep(1.1)  # Respect Nominatim's rate limit

    if i % 1000 == 0 and i > 0:  # Save progress every 1000 entries
        pd.DataFrame(addresses).to_csv('..\\data\\cbsa_osm_addresses_temp.csv', index=False)

print("Geocoding completed.")

output_cbsa_data = cbsa_data.drop('building_name_and_address')
output_cbsa_data['cbsa_osm_address'] = addresses

Processing 0: 2400 Arrowhead Dr, Abilene, TX 79606
Processing 1: 5249 US-277, Abilene, TX 79605
Processing 2: 5450 Texas Ave, Abilene, TX 79605
Processing 3: 3549 Curry Ln, Abilene, TX 79606
Processing 4: 3549 Cedar Run Rd, Abilene, TX 79606
Processing 5: 3201 S 23rd St, Abilene, TX 79605
Processing 6: 2010 S Clack St, Abilene, TX 79606
Processing 7: 1111 Musken Rd, Abilene, TX 79601
Processing 8: 3602 Rolling Green Dr, Abilene, TX 79606
Processing 9: 4450 Ridgemont Dr, Abilene, TX 79606
Processing 10: 1351 Andy St, Abilene, TX 79605
Processing 11: 1315 Musken Rd, Abilene, TX 79601
Processing 12: 3501 Curry Ln, Abilene, TX 79606
Processing 13: 2701 Southwest Dr, Abilene, TX 79605
Processing 14: 1948 Denton St, Abilene, TX 79605
Processing 15: 5125 Fairmont St, Abilene, TX 79605
Processing 16: 1000 S Clack St, Abilene, TX 79605
Processing 17: 3525 Rolling Green Dr, Abilene, TX 79606
Processing 18: 500 N Judge Ely Blvd, Abilene, TX 79601
Processing 19: 2601 Nonesuch Rd, Abilene, TX 79606

In [None]:
output_cbsa_data.to_csv('..\\data\\cbsa_osm_addresses.csv')
os.remove('..\\data\cbsa_osm_addresses_temp')