In [10]:
import requests
import json
import pandas as pd


### US official EPA AQS air quality dataset --

`https://www.epa.gov/aqs/aqs-code-list`

You need to get your API for AQS and make a txt file `myEmailKey.txt` to store your email and key in the first two lines.

Also, note that sometimes it may not allow you to access the API due to internet and geographical conditions! Try different connection and devices ~

In [11]:
# Read email and API key from the file
with open("myEmailKey.txt", "r") as file:
    email = file.readline().strip()
    api_key = file.readline().strip()

# Define the base API URL
base_url = "https://aqs.epa.gov/data/api"

# Define the endpoint and parameters
endpoint = "/list/countiesByState"
params = {
    "email": email,
    "key": api_key,
    "state": "37"
}

# Make the GET request with SSL/TLS verification disabled (NOT recommended for production)
response = requests.get(base_url + endpoint, params=params)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    data = response.json()
    print(data)
else:
    print(f"Error: {response.status_code}")


{'Header': [{'status': 'Success', 'request_time': '2023-05-08T23:38:29-04:00', 'url': 'https://aqs.epa.gov/data/api/list/countiesByState?email=zhu.tim99%40gmail.com&key=copperswift75&state=37', 'rows': 100}], 'Data': [{'code': '001', 'value_represented': 'Alamance'}, {'code': '003', 'value_represented': 'Alexander'}, {'code': '005', 'value_represented': 'Alleghany'}, {'code': '007', 'value_represented': 'Anson'}, {'code': '009', 'value_represented': 'Ashe'}, {'code': '011', 'value_represented': 'Avery'}, {'code': '013', 'value_represented': 'Beaufort'}, {'code': '015', 'value_represented': 'Bertie'}, {'code': '017', 'value_represented': 'Bladen'}, {'code': '019', 'value_represented': 'Brunswick'}, {'code': '021', 'value_represented': 'Buncombe'}, {'code': '023', 'value_represented': 'Burke'}, {'code': '025', 'value_represented': 'Cabarrus'}, {'code': '027', 'value_represented': 'Caldwell'}, {'code': '029', 'value_represented': 'Camden'}, {'code': '031', 'value_represented': 'Carteret'}

In [3]:
## similarly we can get different data using the same sturcture
# Read email and API key from the file
with open("myEmailKey.txt", "r") as file:
    email = file.readline().strip()
    api_key = file.readline().strip()

# Define the base API URL
base_url = "https://aqs.epa.gov/data/api"


In [None]:
## quarterly summary data by county e.g Example; returns quarterly summary FRM/FEM and non-FRM PM2.5 data for Wake County for 2016: https://aqs.epa.gov/data/api/quarterlyData/byCounty?email=test@aqs.api&key=test&param=88101,88502&bdate=20160101&edate=20160228&state=37&county=183

## we want LA county data for 2019 qualters 1 

endpoint = "/quarterlyData/byCounty"
params = {
    "email": email,
    "key": api_key,
    "param": "88101,88502",
    "bdate": "20190101",
    "edate": "20191231",
    "state": "06",
    "county": "037"
}

# Get the data
data = get_data(base_url, endpoint, params)

# Create a DataFrame from the JSON response

df = pd.DataFrame.from_dict(data["Data"])

df

In [18]:
## see what data is in the df
df.columns




Index(['state_code', 'county_code', 'site_number', 'parameter_code', 'poc',
       'latitude', 'longitude', 'datum', 'parameter', 'sample_duration',
       'sample_duration_code', 'sample_duration_type', 'pollutant_standard',
       'year', 'quarter', 'units_of_measure', 'event_type',
       'observation_count', 'observation_percent', 'arithmetic_mean',
       'minimum_value', 'maximum_value', 'quarterly_criteria_met',
       'actual_days_gt_std', 'estimated_days_gt_std', 'valid_samples',
       'valid_day_count', 'scheduled_samples', 'percent_days',
       'percent_one_value', 'monitoring_agency_code', 'monitoring_agency',
       'local_site_name', 'address', 'state', 'county', 'city', 'tribal_code',
       'tribal_land', 'cbsa_code', 'cbsa', 'date_of_last_change'],
      dtype='object')

In [19]:
## Here are some useful data if we want to plot it on the map
# 'latitude', 'longitude', 'parameter', 'year', 'quarter', 'units_of_measure', 'arithmetic_mean', 'minimum_value', 'maximum_value', 'local_site_name','cbsa'

## make a subset of the data 

df = df[['latitude', 'longitude', 'parameter', 'year', 'quarter', 'units_of_measure', 'arithmetic_mean', 'minimum_value', 'maximum_value', 'local_site_name','cbsa']]
df


Unnamed: 0,latitude,longitude,parameter,year,quarter,units_of_measure,arithmetic_mean,minimum_value,maximum_value,local_site_name,cbsa
0,34.13650,-117.92391,PM2.5 - Local Conditions,2019,1,Micrograms/cubic meter (LC),7.2267,2.4,21.5,Azusa,"Los Angeles-Long Beach-Anaheim, CA"
1,34.13650,-117.92391,PM2.5 - Local Conditions,2019,2,Micrograms/cubic meter (LC),9.8067,4.7,17.4,Azusa,"Los Angeles-Long Beach-Anaheim, CA"
2,34.13650,-117.92391,PM2.5 - Local Conditions,2019,3,Micrograms/cubic meter (LC),12.4429,8.0,16.7,Azusa,"Los Angeles-Long Beach-Anaheim, CA"
3,34.13650,-117.92391,PM2.5 - Local Conditions,2019,3,Micrograms/cubic meter (LC),16.2800,8.0,70.0,Azusa,"Los Angeles-Long Beach-Anaheim, CA"
4,34.13650,-117.92391,PM2.5 - Local Conditions,2019,3,Micrograms/cubic meter (LC),16.2800,8.0,70.0,Azusa,"Los Angeles-Long Beach-Anaheim, CA"
...,...,...,...,...,...,...,...,...,...,...,...
474,34.01029,-118.06850,PM2.5 - Local Conditions,2019,4,Micrograms/cubic meter (LC),12.1857,4.5,27.4,Pico Rivera #2,"Los Angeles-Long Beach-Anaheim, CA"
475,34.01029,-118.06850,PM2.5 - Local Conditions,2019,1,Micrograms/cubic meter (LC),8.2067,2.5,19.2,Pico Rivera #2,"Los Angeles-Long Beach-Anaheim, CA"
476,34.01029,-118.06850,PM2.5 - Local Conditions,2019,2,Micrograms/cubic meter (LC),8.1600,3.6,14.3,Pico Rivera #2,"Los Angeles-Long Beach-Anaheim, CA"
477,34.01029,-118.06850,PM2.5 - Local Conditions,2019,3,Micrograms/cubic meter (LC),11.2750,7.6,15.6,Pico Rivera #2,"Los Angeles-Long Beach-Anaheim, CA"


In [None]:
## use site as index
df.set_index("local_site_name", inplace=True)
df

In [27]:
## take quarter 1 data as an example
df_q1 = df[df["quarter"] == "1"]
df_q1.reset_index(inplace=True)


In [29]:
import plotly.express as px

fig = px.scatter_mapbox(df_q1, lat="latitude", lon="longitude", hover_name="local_site_name", hover_data=["arithmetic_mean", "parameter"],
                        color_discrete_sequence=["fuchsia"], zoom=10, height=300)
fig.update_layout(mapbox_style="carto-positron")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()


### encapsulate the code above 

In [1]:
import air_quality_util as aq


In [2]:
email, api_key = aq.get_api_key("myEmailKey.txt")
base_url = aq.base_url

In [4]:
endpoint=aq.create_endpoint("Sites")
print(endpoint)

list/sitesByCounty


In [5]:
param = aq.param_requirement(email, api_key, "Sites") ## califonia: 06, LA: 037


In [6]:
df = aq.get_data(endpoint,param)

In [7]:
df

Unnamed: 0,code,value_represented
0,0001,
1,0002,Azusa
2,0003,
3,0004,
4,0005,
...,...,...
91,9401,
92,9403,
93,9405,
94,9407,


In [21]:
param = aq.param_requirement(email, api_key, "Quarterly by County") 
## param(PM2.5): 88101, bdate: 20190101, edate: 20191231, state: 06, county: 037

## ozone: 44201
## PM2.5: 88101
## PM10: 81102



In [22]:
endpoint = aq.create_endpoint("Quarterly by County")
df = aq.get_data(endpoint,param)


In [None]:
df

#### sample plot -- need encapsulation and more details later

In [24]:
import plotly.express as px

fig = px.scatter_mapbox(df, lat="latitude", lon="longitude", hover_name="local_site_name", hover_data=["arithmetic_mean", "parameter"],
                        color_discrete_sequence=["fuchsia"], zoom=10, height=300)
fig.update_layout(mapbox_style="carto-positron")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

### Current issues regarding AQI:
1. Popular air quality measures include PM2.5, PM10, Ozone, CO, etc. Yet, NOT every site has all those measures throughout the time periods.
2. The location/geographic distribution of the sites are uneven -- need to think about ways to integrate with Census data based on geographic locations
3. We might want to query all the data into one *database* before analyzing; so we have to decide a time range and an area (i.e latitude and longitude range) and write query functions to collect them into separate catagories based on air quality parameters.

#### ALternative dataset for air quality -- Airnow

`https://docs.airnowapi.org/HistoricalObservationsByZip/docs`

In [26]:
import pandas as pd

In [None]:
## read csv: output.csv
df = pd.read_csv("Output_2.csv")
df

In [32]:
## change column name: coloumn 1 is lat, column 2 is lon

df.columns = ["lat", "lon", "time", "parameter", "value", "unit", "aqi", "type"]
df

Unnamed: 0,lat,lon,time,parameter,value,unit,aqi,type
0,34.210170,-118.870510,2023-04-09T19:00,PM2.5,4.8,UG/M3,20,1
1,34.404280,-118.809980,2023-04-09T19:00,OZONE,34.0,PPB,31,1
2,34.404280,-118.809980,2023-04-09T19:00,PM2.5,5.5,UG/M3,23,1
3,34.276320,-118.683690,2023-04-09T19:00,OZONE,42.0,PPB,39,1
4,34.276320,-118.683690,2023-04-09T19:00,PM2.5,2.5,UG/M3,10,1
...,...,...,...,...,...,...,...,...
21855,33.901400,-118.205000,2023-05-09T19:00,PM2.5,8.7,UG/M3,36,1
21856,33.793713,-118.171019,2023-05-09T19:00,PM2.5,8.7,UG/M3,36,1
21857,33.830586,-117.938509,2023-05-09T19:00,PM2.5,6.0,UG/M3,25,1
21858,34.143900,-117.850800,2023-05-09T19:00,PM2.5,5.1,UG/M3,21,1


In [33]:
## kept only PM2.5 data and  time = 2023-04-09T19:00	

df = df[(df["parameter"] == "PM2.5") & (df["time"] == "2023-04-09T19:00")]

In [34]:
df

Unnamed: 0,lat,lon,time,parameter,value,unit,aqi,type
0,34.21017,-118.87051,2023-04-09T19:00,PM2.5,4.8,UG/M3,20,1
2,34.40428,-118.80998,2023-04-09T19:00,PM2.5,5.5,UG/M3,23,1
4,34.27632,-118.68369,2023-04-09T19:00,PM2.5,2.5,UG/M3,10,1
5,34.1992,-118.5331,2023-04-09T19:00,PM2.5,17.8,UG/M3,63,2
8,34.3833,-118.5283,2023-04-09T19:00,PM2.5,6.7,UG/M3,28,1
10,34.181977,-118.363036,2023-04-09T19:00,PM2.5,17.5,UG/M3,62,2
12,34.066429,-118.226755,2023-04-09T19:00,PM2.5,25.1,UG/M3,78,2
14,33.9014,-118.205,2023-04-09T19:00,PM2.5,17.9,UG/M3,63,2
16,33.793713,-118.171019,2023-04-09T19:00,PM2.5,14.9,UG/M3,57,2
22,34.1439,-117.8508,2023-04-09T19:00,PM2.5,14.7,UG/M3,56,2


In [35]:
fig = px.scatter_mapbox(df, lat="lat", lon="lon", hover_data=["parameter", "aqi"],
                        color_discrete_sequence=["fuchsia"], zoom=10, height=300)
fig.update_layout(mapbox_style="carto-positron")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

In [None]:
## https://www.airnowapi.org/aq/data/?startDate=2013-05-09T19&endDate=2023-05-09T20&parameters=OZONE,PM25&BBOX=-119.857788,33.162553,-116.957397,35.699402&dataType=B&format=text/csv&verbose=0&monitorType=0&includerawconcentrations=0&API_KEY=C19E652A-FC64-4A48-A3B9-A1B0939C9E3A