For all orgs that made the first cut:
* we want to enrich them with county statistics
* this requires ein-county mapping
* which requires finding the lat-lon coordinates of each org from its address, so that we can use shapefiles to check which county contains those coordinates (the first matching county is selected)
* which requires using Google Maps GeoCoding API

The Geocoding API is expensive, so we want to limit the lookups to the orgs that made the first cut and to their unique set of addresses (multiple orgs could be using the same address).





In [21]:
!pip install googlemaps rtree

Collecting rtree
  Downloading Rtree-1.2.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (535 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m535.2/535.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rtree
Successfully installed rtree-1.2.0


In [41]:
# Native modules
import csv
import time
import zipfile

# External modules
import fiona
import googlemaps
import pandas as pd
import requests
import rtree
import shapely

from google.cloud import bigquery, storage


In [43]:
# Notebook configuration: replace every placeholder before running the cells below.
API_KEY = "INSERT API KEY"  # key handed to googlemaps.Client
location = "INSERT LOCATION"  # location handed to bigquery.Client
project_id = "INSERT PROJECT ID"  # project handed to storage.Client
bucket_name = "INSERT BUCKET NAME"  # bucket that receives the EIN -> county CSV
prefix = "irs-form-990"  # leading path component of the uploaded object name

In [6]:
# Google Maps client used for the address -> coordinates lookups.
gmaps = googlemaps.Client(key=API_KEY)

In [9]:
# BigQuery client shared by every query and load job in this notebook.
client = bigquery.Client(location=location)

In [8]:
# Identify the list of EINs and their address that require a geocoding lookup:
# orgs with no county yet (geo_id IS NULL) whose EIN has not been geocoded before.
#
# The subquery filters out NULL EINs on purpose: `x NOT IN (...)` evaluates to
# NULL (never TRUE) for every row as soon as the list contains a single NULL,
# which would silently make this query return nothing.
query = """
SELECT ein, CONCAT(TRIM(address), ', ', TRIM(city), ', ', TRIM(state), ', ', zip) AS combined_address
FROM analysis.cut_2_v20240212_filtered
WHERE geo_id IS NULL
AND CAST(ein AS STRING) NOT IN (
    SELECT DISTINCT ein
    FROM reference.ein_lat_lon_2022
    WHERE ein IS NOT NULL
)
"""


In [10]:
# Submit the lookup query to BigQuery.
query_job = client.query(query)

In [11]:
# Fetch the query result as a DataFrame with columns ein and combined_address.
orgs = query_job.to_dataframe()

In [12]:
# Preview the orgs that still need coordinates.
orgs.head()

Unnamed: 0,ein,combined_address


In [13]:
# Find the geocodes for each organization and write them to a local CSV.
#
# Every Geocoding API request is billed, so each distinct address is looked up
# only once and the answer is reused for all EINs that share that address.
coordinates_by_address = {}

# Errors raised by the googlemaps client for API-level and network-level failures.
geocoding_errors = (
    googlemaps.exceptions.ApiError,
    googlemaps.exceptions.TransportError,
    googlemaps.exceptions.Timeout,
)

with open('orgs_without_coordinates.csv', 'w', newline='', encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['ein', 'latitude', 'longitude'])

    for index, org in enumerate(orgs.itertuples()):
        # Progress heartbeat; printed first so skipped rows cannot suppress it.
        if index % 10000 == 0:
            print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), index)

        ein = org.ein
        address = org.combined_address

        if pd.isna(address):
            # No usable address to send to the API: record the EIN without coordinates.
            lat, lng = None, None
        elif address in coordinates_by_address:
            lat, lng = coordinates_by_address[address]
        else:
            try:
                geocode = gmaps.geocode(address)
            except geocoding_errors as error:
                # Do not abort the whole (paid) run for one failed request.
                # The EIN is left out of the CSV, so it is not loaded into the
                # reference table and the lookup query selects it again next run.
                print(f"Geocoding failed for EIN {ein}: {error}")
                continue

            if geocode:
                # The first result is taken as the match.
                coords = geocode[0]['geometry']['location']
                lat, lng = coords['lat'], coords['lng']
            else:
                # The API answered but found nothing for this address.
                lat, lng = None, None
            coordinates_by_address[address] = (lat, lng)

        writer.writerow([ein, lat, lng])


In [15]:
# Read the geocoding results back from the CSV written above.
df = pd.read_csv('orgs_without_coordinates.csv')

In [17]:
# Display the geocoding results.
df

Unnamed: 0,ein,latitude,longitude


In [16]:
# Set EIN to string
# NOTE(review): read_csv has already parsed this column, so any leading zeros
# in an EIN are presumably gone before this cast — confirm that this matches
# the CAST(ein AS STRING) values the lookup query compares against.
df['ein'] = df['ein'].astype(str)

In [18]:
# Append the newly geocoded rows to the EIN -> coordinates reference table.
table_id = "reference.ein_lat_lon_2022"
job_config = bigquery.LoadJobConfig()
job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND

job = client.load_table_from_dataframe(df, table_id, job_config=job_config)

# Wait for the load job to complete.
job.result()

LoadJob<project=decent-carving-397313, location=us-central1, id=f070dada-4cbc-4339-8dbd-b2c525410bd1>

Now that we have the coordinates for all EINs, it is time to find their counties using shapefiles.

In [24]:
# Zipped county shapefile (TIGER 2021) on the census.gov server.
shape_file_url = "https://www2.census.gov/geo/tiger/TIGER2021/COUNTY/tl_2021_us_county.zip"

In [25]:
# Download the shape file
# A timeout keeps the cell from hanging forever if the server stops responding.
r = requests.get(shape_file_url, allow_redirects=True, timeout=60)
# Fail loudly on an HTTP error instead of saving an error page as the archive.
r.raise_for_status()
# The context manager guarantees the archive is flushed and closed before it is unzipped.
with open('tl_2021_us_county.zip', 'wb') as zip_file:
    zip_file.write(r.content)

82328531

In [28]:
# Unpack every member of the downloaded archive into its own directory.
with zipfile.ZipFile('tl_2021_us_county.zip', mode='r') as county_archive:
    county_archive.extractall(path='tl_2021_us_county')

In [29]:
# Open the extracted county shapefile; it stays open for the county lookups below.
shapefile = fiona.open('tl_2021_us_county/tl_2021_us_county.shp')

In [30]:
# Inspect the available attributes (GEOID is the one used for the county id).
shapefile.schema

{'properties': {'STATEFP': 'str:2',
  'COUNTYFP': 'str:3',
  'COUNTYNS': 'str:8',
  'GEOID': 'str:5',
  'NAME': 'str:100',
  'NAMELSAD': 'str:100',
  'LSAD': 'str:2',
  'CLASSFP': 'str:2',
  'MTFCC': 'str:5',
  'CSAFP': 'str:3',
  'CBSAFP': 'str:5',
  'METDIVFP': 'str:5',
  'FUNCSTAT': 'str:1',
  'ALAND': 'int:14',
  'AWATER': 'int:14',
  'INTPTLAT': 'str:11',
  'INTPTLON': 'str:12'},
 'geometry': 'Polygon'}

In [31]:
# Build an R-tree over the county bounding boxes, keyed by each county's
# position in the shapefile, so candidate counties can be found quickly.
index = rtree.index.Index()
for position, feature in enumerate(shapefile):
    bounding_box = shapely.geometry.shape(feature['geometry']).bounds
    index.insert(position, bounding_box)

In [32]:
# Get the EINs and their coordinates
# (all columns of the reference table that the geocoding results were appended to).
ein_lat_lon_query = """
SELECT *
FROM reference.ein_lat_lon_2022
"""

In [34]:
# Run the query and pull every EIN with its coordinates into a DataFrame.
org_coordinates = client.query(ein_lat_lon_query).to_dataframe()

In [35]:
# Preview the EIN coordinates.
org_coordinates.head()

Unnamed: 0,ein,latitude,longitude
0,263528160,64.840051,-147.719976
1,61309331,41.414117,-73.303565
2,931294144,45.573373,-122.621178
3,473809177,32.760392,-97.492325
4,363153674,42.225494,-89.111554


In [36]:
# find the county that each org is in
ein_county = []

# Shapefile position -> (GEOID, shapely geometry).  Reading and parsing a
# county polygon is expensive, so each county is materialised at most once
# instead of once per candidate hit.
county_cache = {}

for i, org in enumerate(org_coordinates.itertuples()):
    # Progress heartbeat, counted over all rows (including skipped ones).
    if i % 10000 == 0:
        print(i, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

    # EINs whose address could not be geocoded carry null coordinates.  Check
    # the values directly: a Point built from NaN is not reliably reported as
    # empty, and its NaN bounds are not a valid query for the spatial index.
    if pd.isna(org.latitude) or pd.isna(org.longitude):
        continue

    # shapely points are (x, y), i.e. (longitude, latitude).
    point = shapely.geometry.Point(org.longitude, org.latitude)

    # use the spatial index to narrow down to counties whose bounding box
    # contains the point, then test the real polygons; the first match wins
    for j in index.intersection(point.bounds):
        if j not in county_cache:
            county = shapefile[j]
            county_cache[j] = (
                county["properties"]["GEOID"],
                shapely.geometry.shape(county['geometry']),
            )
        geo_id, geometry = county_cache[j]
        if geometry.contains(point):
            ein_county.append({"ein": org.ein, "geo_id": geo_id})
            break

0 2024-02-15 01:34:53
10000 2024-02-15 01:36:28
20000 2024-02-15 01:38:05
30000 2024-02-15 01:39:42
40000 2024-02-15 01:41:19
50000 2024-02-15 01:42:56
60000 2024-02-15 01:44:30
70000 2024-02-15 01:46:06
80000 2024-02-15 01:47:43
90000 2024-02-15 01:49:20
100000 2024-02-15 01:50:58
110000 2024-02-15 01:52:36


In [37]:
# One row per matched org, with columns ein and geo_id.
ein_county_df = pd.DataFrame(ein_county)

In [38]:
# Preview the EIN -> county mapping.
ein_county_df.head()

Unnamed: 0,ein,geo_id
0,263528160,02090
1,61309331,09001
2,931294144,41051
3,473809177,48439
4,363153674,17201
...,...,...
110736,410824189,27091
110737,411887217,27123
110738,710292138,05131
110739,464342876,29051


In [39]:
# Write the mapping to a local CSV (header row included, no index column).
ein_county_df.to_csv("ein_county_2022.csv", index=False)

In [44]:
# Cloud Storage client used to stage the CSV for the BigQuery load.
storage_client = storage.Client(project=project_id)

In [45]:
# Look up the destination bucket.
bucket = storage_client.get_bucket(bucket_name)

In [46]:
# Object that will hold the CSV; the same path is used as the load URI below.
blob = bucket.blob(f"{prefix}/reference/ein_county_2022.csv")

In [47]:
# Upload the local CSV to the bucket.
blob.upload_from_filename("ein_county_2022.csv")

In [48]:
# Column names and types for the destination table; both columns load as STRING.
# NOTE(review): the second field is named "county" while the CSV column it is
# loaded from is headed "geo_id" — confirm that this rename is intended.
schema = [
    bigquery.SchemaField("ein", "STRING"),
    bigquery.SchemaField("county", "STRING"),
]

In [49]:
# Load settings: CSV input with one header row to skip, explicit schema, and
# the destination table replaced on every run.
job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1
job_config.schema = schema
job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

In [50]:
# Reference to the destination table reference.ein_county_2022.
# Client.dataset() is deprecated, so the DatasetReference is built directly;
# it still resolves against the client's default project, exactly as before.
table_ref = bigquery.DatasetReference(client.project, "reference").table("ein_county_2022")

In [51]:
# Start loading the staged CSV from Cloud Storage into the destination table.
ein_county_uri = f"gs://{bucket_name}/{prefix}/reference/ein_county_2022.csv"
job = client.load_table_from_uri(ein_county_uri, table_ref, job_config=job_config)



In [52]:
# Wait for the load job to complete.
job.result()

LoadJob<project=decent-carving-397313, location=us-central1, id=25c2bf6f-9f83-41c7-aa1b-611f2fdc3cf1>