# Data Imports

In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

# Show up to 70 columns when a DataFrame is displayed, so wide frames are not truncated.
pd.options.display.max_columns = 70

## Importing the 2020 air quality CSV from the [EPA website](https://aqs.epa.gov/aqsweb/airdata/download_files.html)  

The file used below is `annual_conc_by_monitor_2020.csv`, available from https://aqs.epa.gov/aqsweb/airdata/download_files.html

In [2]:
# Load the annual monitor summary CSV into a DataFrame.
air_quality_df = pd.read_csv(filepath_or_buffer="annual_conc_by_monitor_2020.csv")

In [3]:
# Filtering the AQ data: each .loc step narrows the rows kept by the previous one.
air_quality_df = (
    air_quality_df
    # Utah (state code 49)
    .loc[lambda df: df['State Code'] == 49]
    # PM 2.5 measurements only
    .loc[lambda df: df['Parameter Name'] == 'PM2.5 - Local Conditions']
    # PM 2.5 standards effective 2024 (24-hour and annual)
    .loc[lambda df: df['Pollutant Standard'].isin(['PM25 24-hour 2024', 'PM25 Annual 2024'])]
)

In [4]:
# Preview the first five filtered rows.
air_quality_df.head(n=5)

Unnamed: 0,State Code,County Code,Site Num,Parameter Code,POC,Latitude,Longitude,Datum,Parameter Name,Sample Duration,Pollutant Standard,Metric Used,Method Name,Year,Units of Measure,Event Type,Observation Count,Observation Percent,Completeness Indicator,Valid Day Count,Required Day Count,Exceptional Data Count,Null Data Count,Primary Exceedance Count,Secondary Exceedance Count,Certification Indicator,Num Obs Below MDL,Arithmetic Mean,Arithmetic Standard Dev,1st Max Value,1st Max DateTime,2nd Max Value,2nd Max DateTime,3rd Max Value,3rd Max DateTime,4th Max Value,4th Max DateTime,1st Max Non Overlapping Value,1st NO Max DateTime,2nd Max Non Overlapping Value,2nd NO Max DateTime,99th Percentile,98th Percentile,95th Percentile,90th Percentile,75th Percentile,50th Percentile,10th Percentile,Local Site Name,Address,State Name,County Name,City Name,CBSA Name,Date of Last Change
66556,49,5,7,88101,1,41.842649,-111.852199,WGS84,PM2.5 - Local Conditions,24 HOUR,PM25 24-hour 2024,Daily Mean,R & P Model 2025 PM-2.5 Sequential Air Sampler...,2020,Micrograms/cubic meter (LC),Events Excluded,350,96,Y,353,366,0,13,2.0,2.0,Certified,0,7.742,5.847565,39.6,2020-01-21 00:00,35.6,2020-01-20 00:00,31.7,2020-12-31 00:00,30.9,2020-09-07 00:00,,,,,30.9,24.1,19.5,15.0,10.0,6.1,2.2,Smithfield,675 West 220 North,Utah,Cache,Smithfield,"Logan, UT-ID",2024-08-07
66557,49,5,7,88101,1,41.842649,-111.852199,WGS84,PM2.5 - Local Conditions,24 HOUR,PM25 24-hour 2024,Daily Mean,R & P Model 2025 PM-2.5 Sequential Air Sampler...,2020,Micrograms/cubic meter (LC),Events Included,353,96,Y,353,366,3,13,5.0,5.0,Certified,0,8.109065,7.090248,61.6,2020-08-21 00:00,50.2,2020-08-22 00:00,41.0,2020-08-23 00:00,39.6,2020-01-21 00:00,,,,,39.6,28.6,20.3,15.7,10.2,6.3,2.3,Smithfield,675 West 220 North,Utah,Cache,Smithfield,"Logan, UT-ID",2024-08-07
66558,49,5,7,88101,1,41.842649,-111.852199,WGS84,PM2.5 - Local Conditions,24 HOUR,PM25 24-hour 2024,Daily Mean,R & P Model 2025 PM-2.5 Sequential Air Sampler...,2020,Micrograms/cubic meter (LC),Concurred Events Excluded,353,96,Y,353,366,3,13,5.0,5.0,Certified,0,8.109065,7.090248,61.6,2020-08-21 00:00,50.2,2020-08-22 00:00,41.0,2020-08-23 00:00,39.6,2020-01-21 00:00,,,,,39.6,28.6,20.3,15.7,10.2,6.3,2.3,Smithfield,675 West 220 North,Utah,Cache,Smithfield,"Logan, UT-ID",2024-08-07
66568,49,5,7,88101,1,41.842649,-111.852199,WGS84,PM2.5 - Local Conditions,24 HOUR,PM25 Annual 2024,Quarterly Means of Daily Means,R & P Model 2025 PM-2.5 Sequential Air Sampler...,2020,Micrograms/cubic meter (LC),Events Included,353,96,Y,353,366,3,13,,,Certified,0,8.109065,7.090248,61.6,2020-08-21 00:00,50.2,2020-08-22 00:00,41.0,2020-08-23 00:00,39.6,2020-01-21 00:00,,,,,39.6,28.6,20.3,15.7,10.2,6.3,2.3,Smithfield,675 West 220 North,Utah,Cache,Smithfield,"Logan, UT-ID",2024-08-07
66569,49,5,7,88101,1,41.842649,-111.852199,WGS84,PM2.5 - Local Conditions,24 HOUR,PM25 Annual 2024,Quarterly Means of Daily Means,R & P Model 2025 PM-2.5 Sequential Air Sampler...,2020,Micrograms/cubic meter (LC),Events Excluded,350,96,Y,353,366,0,13,,,Certified,0,7.742,5.847565,39.6,2020-01-21 00:00,35.6,2020-01-20 00:00,31.7,2020-12-31 00:00,30.9,2020-09-07 00:00,,,,,30.9,24.1,19.5,15.0,10.0,6.1,2.2,Smithfield,675 West 220 North,Utah,Cache,Smithfield,"Logan, UT-ID",2024-08-07


## Importing the shape file

The shape files can be downloaded from: 
https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.2020.html#list-tab-790442341  

We added only the relevant shapefile components to our GitHub repository, because the full download would be too large. We are using these files:  
* tl_2020_49_tract20.shp
* tl_2020_49_tract20.shx
* tl_2020_49_tract20.dbf
* tl_2020_49_tract20.prj

In [5]:
# The tract shapefile is 'tl_2020_49_tract20.shp' inside the 'tl_2020_49_all' folder.
# A relative path is used so the notebook does not depend on one user's machine.
# NOTE(review): this assumes the notebook is run from the folder that contains
# 'tl_2020_49_all' (the repository root) — adjust file_path if it lives elsewhere.
file_path = "tl_2020_49_all/tl_2020_49_tract20.shp"
census_tracts = gpd.read_file(file_path)

In [6]:
# Preview the first five census tract records.
census_tracts.head(n=5)

Unnamed: 0,STATEFP20,COUNTYFP20,TRACTCE20,GEOID20,NAME20,NAMELSAD20,MTFCC20,FUNCSTAT20,ALAND20,AWATER20,INTPTLAT20,INTPTLON20,geometry
0,49,49,901,49049000901,9.01,Census Tract,G5020,S,1686016,0,40.3051285,-111.6922978,"POLYGON ((-111.70144 40.31183, -111.69972 40.3..."
1,49,49,2500,49049002500,25.0,Census Tract,G5020,S,1034541,0,40.2296149,-111.6497278,"POLYGON ((-111.65689 40.22703, -111.65689 40.2..."
2,49,49,3401,49049003401,34.01,Census Tract,G5020,S,9085765,23788,40.0560227,-111.712257,"POLYGON ((-111.745 40.04366, -111.74455 40.044..."
3,49,49,10210,49049010210,102.1,Census Tract,G5020,S,15810476,0,40.4005884,-111.7306856,"POLYGON ((-111.77339 40.41553, -111.76899 40.4..."
4,49,49,505,49049000505,5.05,Census Tract,G5020,S,3919712,0,40.3700595,-111.7070846,"POLYGON ((-111.72677 40.37546, -111.72662 40.3..."


In [7]:
# Show the CRS the tracts were loaded with.
print(census_tracts.crs)
# Reproject the census tract GeoDataFrame to WGS 84 (EPSG:4326) so it shares
# a coordinate reference system with the air quality points it is joined to.
census_tracts = census_tracts.to_crs("EPSG:4326")
# Show the CRS after reprojection.
print(census_tracts.crs)

EPSG:4269
EPSG:4326


In [8]:
# Creating point geometries from latitude and longitude.
# points_from_xy builds all points in one vectorized call (x = longitude, y = latitude)
# instead of constructing shapely Point objects one at a time in a Python loop.
geometry = gpd.points_from_xy(air_quality_df['Longitude'], air_quality_df['Latitude'])

# The coordinates are treated as WGS 84 (EPSG:4326).
# NOTE(review): the previewed rows report Datum 'WGS84'; confirm no rows use another datum.
geo_df = gpd.GeoDataFrame(air_quality_df, geometry=geometry, crs="EPSG:4326")

In [9]:
# Spatial join: attach to each monitor point the census tract that contains it.
# how='left' keeps every monitor row, including any that fall in no tract.
joined_df = gpd.sjoin(
    left_df=geo_df,
    right_df=census_tracts,
    how='left',
    predicate='within',
)

In [10]:
# Extract the monitor identifiers plus the tract GEOID from the joined GeoDF.
site_columns = ['State Code', 'County Code', 'Site Num', 'Parameter Code', 'POC', 'Latitude', 'Longitude']

# Prefer 'GEOID'; fall back to 'GEOID20'; None when neither column is present.
geoid_column = next((name for name in ('GEOID', 'GEOID20') if name in joined_df.columns), None)

if geoid_column is None:
    # No GEOID information available: keep only the original monitor columns.
    result_df = joined_df[site_columns]
    print("Warning: No GEOID or GEOID20 column found in census tract data.")
else:
    result_df = joined_df[site_columns + [geoid_column]]

You can verify a census tract by entering the monitor's address at https://geomap.ffiec.gov/ffiecgeomap/

In [11]:
# Show the first rows of joined_df, keeping one row per unique TRACTCE20 value
# (the first occurrence of each census tract).
# The de-duplicated frame is used directly: looking its index labels back up with
# .loc would re-introduce duplicate rows whenever an index label repeats in joined_df.
joined_df.drop_duplicates(subset='TRACTCE20').head()

Unnamed: 0,State Code,County Code,Site Num,Parameter Code,POC,Latitude,Longitude,Datum,Parameter Name,Sample Duration,Pollutant Standard,Metric Used,Method Name,Year,Units of Measure,Event Type,Observation Count,Observation Percent,Completeness Indicator,Valid Day Count,Required Day Count,Exceptional Data Count,Null Data Count,Primary Exceedance Count,Secondary Exceedance Count,Certification Indicator,Num Obs Below MDL,Arithmetic Mean,Arithmetic Standard Dev,1st Max Value,1st Max DateTime,2nd Max Value,2nd Max DateTime,3rd Max Value,3rd Max DateTime,4th Max Value,4th Max DateTime,1st Max Non Overlapping Value,1st NO Max DateTime,2nd Max Non Overlapping Value,2nd NO Max DateTime,99th Percentile,98th Percentile,95th Percentile,90th Percentile,75th Percentile,50th Percentile,10th Percentile,Local Site Name,Address,State Name,County Name,City Name,CBSA Name,Date of Last Change,geometry,index_right,STATEFP20,COUNTYFP20,TRACTCE20,GEOID20,NAME20,NAMELSAD20,MTFCC20,FUNCSTAT20,ALAND20,AWATER20,INTPTLAT20,INTPTLON20
66556,49,5,7,88101,1,41.842649,-111.852199,WGS84,PM2.5 - Local Conditions,24 HOUR,PM25 24-hour 2024,Daily Mean,R & P Model 2025 PM-2.5 Sequential Air Sampler...,2020,Micrograms/cubic meter (LC),Events Excluded,350,96,Y,353,366,0,13,2.0,2.0,Certified,0,7.742,5.847565,39.6,2020-01-21 00:00,35.6,2020-01-20 00:00,31.7,2020-12-31 00:00,30.9,2020-09-07 00:00,,,,,30.9,24.1,19.5,15.0,10.0,6.1,2.2,Smithfield,675 West 220 North,Utah,Cache,Smithfield,"Logan, UT-ID",2024-08-07,POINT (-111.8522 41.84265),258,49,5,201,49005000201,2.01,Census Tract,G5020,S,40625920,200123,41.8501637,-111.8559779
66954,49,11,4,88101,1,40.902967,-111.884467,WGS84,PM2.5 - Local Conditions,24 HOUR,PM25 24-hour 2024,Daily Mean,R & P Model 2025 PM-2.5 Sequential Air Sampler...,2020,Micrograms/cubic meter (LC),No Events,325,89,Y,325,366,0,40,0.0,0.0,Certified,0,7.083077,5.383427,32.3,2020-10-07 00:00,28.8,2020-10-06 00:00,27.4,2020-12-10 00:00,26.1,2020-09-07 00:00,,,,,26.1,25.6,18.3,13.8,8.7,5.8,2.2,Bountiful Viewmont,"171 WEST 1370 NORTH, BOUNTIFUL, UTAH",Utah,Davis,Bountiful,"Ogden-Clearfield, UT",2024-05-18,POINT (-111.88447 40.90297),170,49,11,126600,49011126600,1266.0,Census Tract,G5020,S,2447881,0,40.8999263,-111.8826698
67076,49,13,2,88101,3,40.294178,-110.009732,WGS84,PM2.5 - Local Conditions,24-HR BLK AVG,PM25 24-hour 2024,Daily Mean,,2020,Micrograms/cubic meter (LC),No Events,366,100,Y,366,366,0,0,4.0,4.0,Certified,0,7.465301,6.313076,45.3,2020-09-06 00:00,44.6,2020-10-07 00:00,44.3,2020-09-07 00:00,37.0,2020-10-08 00:00,,,,,37.0,23.2,19.9,15.1,8.9,5.5,2.4,Roosevelt,290 S. 1000 W.,Utah,Duchesne,Roosevelt,,2024-05-18,POINT (-110.00973 40.29418),287,49,13,940502,49013940502,9405.02,Census Tract,G5020,S,66640490,51513,40.3115865,-110.0221147
67174,49,21,5,88101,1,37.74743,-113.055525,WGS84,PM2.5 - Local Conditions,24-HR BLK AVG,PM25 24-hour 2024,Daily Mean,,2020,Micrograms/cubic meter (LC),No Events,366,100,Y,366,366,0,0,0.0,0.0,Certified,0,5.38306,3.507251,28.7,2020-09-07 00:00,21.6,2020-10-06 00:00,19.4,2020-10-07 00:00,18.1,2020-08-03 00:00,,,,,18.1,16.5,12.1,9.4,6.5,4.6,2.3,Enoch,3840 North 325 East,Utah,Iron,Enoch,"Cedar City, UT",2024-05-18,POINT (-113.05552 37.74743),87,49,21,110201,49021110201,1102.01,Census Tract,G5020,S,12397351,0,37.7493853,-113.0545971
67216,49,35,2005,88101,4,40.598056,-111.894167,WGS84,PM2.5 - Local Conditions,24-HR BLK AVG,PM25 24-hour 2024,Daily Mean,,2020,Micrograms/cubic meter (LC),Concurred Events Excluded,363,99,Y,363,366,0,0,4.0,4.0,Certified,0,9.264463,7.149932,49.3,2020-07-05 00:00,48.7,2020-08-21 00:00,40.6,2020-09-07 00:00,36.9,2020-08-22 00:00,,,,,36.9,31.2,24.1,17.8,11.2,7.0,3.2,Copper View,8449 S. Monroe St.,Utah,Salt Lake,Midvale,"Salt Lake City, UT",2024-08-07,POINT (-111.89417 40.59806),472,49,35,112402,49035112402,1124.02,Census Tract,G5020,S,2720633,0,40.6002942,-111.8972171


In [12]:
# Saving results: uncomment the line below to write result_df to a CSV file
#result_df.to_csv("census_tract_results.csv", index=False)