In [2]:
import requests
import json
import pandas as pd


### US official EPA AQS air quality dataset --

`https://www.epa.gov/aqs/aqs-code-list`

You need to get your API for AQS and make a txt file `myEmailKey.txt` to store your email and key in the first two lines.

Also, note that sometimes it may not allow you to access the API due to internet and geographical conditions! Try different connection and devices ~

In [11]:
# Read email and API key from the file
with open("myEmailKey.txt", "r") as file:
    email = file.readline().strip()
    api_key = file.readline().strip()

# Define the base API URL
base_url = "https://aqs.epa.gov/data/api"

# Define the endpoint and parameters
endpoint = "/list/countiesByState"
params = {
    "email": email,
    "key": api_key,
    "state": "37"
}

# Make the GET request with SSL/TLS verification disabled (NOT recommended for production)
response = requests.get(base_url + endpoint, params=params)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    data = response.json()
    print(data)
else:
    print(f"Error: {response.status_code}")


{'Header': [{'status': 'Success', 'request_time': '2023-05-08T23:38:29-04:00', 'url': 'https://aqs.epa.gov/data/api/list/countiesByState?email=zhu.tim99%40gmail.com&key=copperswift75&state=37', 'rows': 100}], 'Data': [{'code': '001', 'value_represented': 'Alamance'}, {'code': '003', 'value_represented': 'Alexander'}, {'code': '005', 'value_represented': 'Alleghany'}, {'code': '007', 'value_represented': 'Anson'}, {'code': '009', 'value_represented': 'Ashe'}, {'code': '011', 'value_represented': 'Avery'}, {'code': '013', 'value_represented': 'Beaufort'}, {'code': '015', 'value_represented': 'Bertie'}, {'code': '017', 'value_represented': 'Bladen'}, {'code': '019', 'value_represented': 'Brunswick'}, {'code': '021', 'value_represented': 'Buncombe'}, {'code': '023', 'value_represented': 'Burke'}, {'code': '025', 'value_represented': 'Cabarrus'}, {'code': '027', 'value_represented': 'Caldwell'}, {'code': '029', 'value_represented': 'Camden'}, {'code': '031', 'value_represented': 'Carteret'}

In [3]:
## similarly we can get different data using the same sturcture
# Read email and API key from the file
with open("myEmailKey.txt", "r") as file:
    email = file.readline().strip()
    api_key = file.readline().strip()

# Define the base API URL
base_url = "https://aqs.epa.gov/data/api"


In [None]:
## quarterly summary data by county e.g Example; returns quarterly summary FRM/FEM and non-FRM PM2.5 data for Wake County for 2016: https://aqs.epa.gov/data/api/quarterlyData/byCounty?email=test@aqs.api&key=test&param=88101,88502&bdate=20160101&edate=20160228&state=37&county=183

## we want LA county data for 2019 qualters 1 

endpoint = "/quarterlyData/byCounty"
params = {
    "email": email,
    "key": api_key,
    "param": "88101,88502",
    "bdate": "20190101",
    "edate": "20191231",
    "state": "06",
    "county": "037"
}

# Get the data
data = get_data(base_url, endpoint, params)

# Create a DataFrame from the JSON response

df = pd.DataFrame.from_dict(data["Data"])

df

In [18]:
## see what data is in the df
df.columns




Index(['state_code', 'county_code', 'site_number', 'parameter_code', 'poc',
       'latitude', 'longitude', 'datum', 'parameter', 'sample_duration',
       'sample_duration_code', 'sample_duration_type', 'pollutant_standard',
       'year', 'quarter', 'units_of_measure', 'event_type',
       'observation_count', 'observation_percent', 'arithmetic_mean',
       'minimum_value', 'maximum_value', 'quarterly_criteria_met',
       'actual_days_gt_std', 'estimated_days_gt_std', 'valid_samples',
       'valid_day_count', 'scheduled_samples', 'percent_days',
       'percent_one_value', 'monitoring_agency_code', 'monitoring_agency',
       'local_site_name', 'address', 'state', 'county', 'city', 'tribal_code',
       'tribal_land', 'cbsa_code', 'cbsa', 'date_of_last_change'],
      dtype='object')

In [19]:
## Here are some useful data if we want to plot it on the map
# 'latitude', 'longitude', 'parameter', 'year', 'quarter', 'units_of_measure', 'arithmetic_mean', 'minimum_value', 'maximum_value', 'local_site_name','cbsa'

## make a subset of the data 

df = df[['latitude', 'longitude', 'parameter', 'year', 'quarter', 'units_of_measure', 'arithmetic_mean', 'minimum_value', 'maximum_value', 'local_site_name','cbsa']]
df


Unnamed: 0,latitude,longitude,parameter,year,quarter,units_of_measure,arithmetic_mean,minimum_value,maximum_value,local_site_name,cbsa
0,34.13650,-117.92391,PM2.5 - Local Conditions,2019,1,Micrograms/cubic meter (LC),7.2267,2.4,21.5,Azusa,"Los Angeles-Long Beach-Anaheim, CA"
1,34.13650,-117.92391,PM2.5 - Local Conditions,2019,2,Micrograms/cubic meter (LC),9.8067,4.7,17.4,Azusa,"Los Angeles-Long Beach-Anaheim, CA"
2,34.13650,-117.92391,PM2.5 - Local Conditions,2019,3,Micrograms/cubic meter (LC),12.4429,8.0,16.7,Azusa,"Los Angeles-Long Beach-Anaheim, CA"
3,34.13650,-117.92391,PM2.5 - Local Conditions,2019,3,Micrograms/cubic meter (LC),16.2800,8.0,70.0,Azusa,"Los Angeles-Long Beach-Anaheim, CA"
4,34.13650,-117.92391,PM2.5 - Local Conditions,2019,3,Micrograms/cubic meter (LC),16.2800,8.0,70.0,Azusa,"Los Angeles-Long Beach-Anaheim, CA"
...,...,...,...,...,...,...,...,...,...,...,...
474,34.01029,-118.06850,PM2.5 - Local Conditions,2019,4,Micrograms/cubic meter (LC),12.1857,4.5,27.4,Pico Rivera #2,"Los Angeles-Long Beach-Anaheim, CA"
475,34.01029,-118.06850,PM2.5 - Local Conditions,2019,1,Micrograms/cubic meter (LC),8.2067,2.5,19.2,Pico Rivera #2,"Los Angeles-Long Beach-Anaheim, CA"
476,34.01029,-118.06850,PM2.5 - Local Conditions,2019,2,Micrograms/cubic meter (LC),8.1600,3.6,14.3,Pico Rivera #2,"Los Angeles-Long Beach-Anaheim, CA"
477,34.01029,-118.06850,PM2.5 - Local Conditions,2019,3,Micrograms/cubic meter (LC),11.2750,7.6,15.6,Pico Rivera #2,"Los Angeles-Long Beach-Anaheim, CA"


In [None]:
## use site as index
df.set_index("local_site_name", inplace=True)
df

In [27]:
## take quarter 1 data as an example
df_q1 = df[df["quarter"] == "1"]
df_q1.reset_index(inplace=True)


In [29]:
import plotly.express as px

fig = px.scatter_mapbox(df_q1, lat="latitude", lon="longitude", hover_name="local_site_name", hover_data=["arithmetic_mean", "parameter"],
                        color_discrete_sequence=["fuchsia"], zoom=10, height=300)
fig.update_layout(mapbox_style="carto-positron")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()


### encapsulate the code above

- `get_api_key(filename)`
- `base_url`
- `create_endpoint(goal)` --- based on user's serach `goal`, return the *endpoint* for API 
- `param_requirement(email,api_key,goal)` --- based on `goal`, prompt user inputs (e.g. time, area code, etc.) and return the *params* for API (**TO BE MODIFIED**)
- `get_data`(endpoint, params) --- return a dataframe based on user requirement



In [36]:
import air_quality_util as aq


In [37]:
email, api_key = aq.get_api_key("myEmailKey.txt")
base_url = aq.base_url

In [38]:
endpoint=aq.create_endpoint("Sites")
print(endpoint)

list/sitesByCounty


In [5]:
param = aq.param_requirement(email, api_key, "Sites") ## califonia: 06, LA: 037


In [6]:
df = aq.get_data(endpoint,param)

In [None]:
df

In [21]:
param = aq.param_requirement(email, api_key, "Quarterly by County") 
## e.g.  param(PM2.5): 88101, bdate: 20190101, edate: 20191231, state: 06, county: 037

## ozone: 44201
## PM2.5: 88101
## PM10: 81102



In [22]:
endpoint = aq.create_endpoint("Quarterly by County")
df = aq.get_data(endpoint,param)


In [None]:
df

#### sample plot -- need encapsulation and more details later

In [24]:
import plotly.express as px

fig = px.scatter_mapbox(df, lat="latitude", lon="longitude", hover_name="local_site_name", hover_data=["arithmetic_mean", "parameter"],
                        color_discrete_sequence=["fuchsia"], zoom=10, height=300)
fig.update_layout(mapbox_style="carto-positron")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

### Current issues regarding AQI:
1. Popular air quality measures include PM2.5, PM10, Ozone, CO, etc. Yet, NOT every site has all those measures throughout the time periods.
2. The location/geographic distribution of the sites are uneven -- need to think about ways to integrate with Census data based on geographic locations
3. We might want to query all the data into one *database* before analyzing; so we have to decide a time range and an area (i.e latitude and longitude range) and write query functions to collect them into separate catagories based on air quality parameters.

#### ALternative dataset for air quality -- Airnow

`https://docs.airnowapi.org/HistoricalObservationsByZip/docs`

In [26]:
import pandas as pd

In [None]:
## read csv: output.csv
df = pd.read_csv("Output_2.csv")
df

In [32]:
## change column name: coloumn 1 is lat, column 2 is lon

df.columns = ["lat", "lon", "time", "parameter", "value", "unit", "aqi", "type"]
df

Unnamed: 0,lat,lon,time,parameter,value,unit,aqi,type
0,34.210170,-118.870510,2023-04-09T19:00,PM2.5,4.8,UG/M3,20,1
1,34.404280,-118.809980,2023-04-09T19:00,OZONE,34.0,PPB,31,1
2,34.404280,-118.809980,2023-04-09T19:00,PM2.5,5.5,UG/M3,23,1
3,34.276320,-118.683690,2023-04-09T19:00,OZONE,42.0,PPB,39,1
4,34.276320,-118.683690,2023-04-09T19:00,PM2.5,2.5,UG/M3,10,1
...,...,...,...,...,...,...,...,...
21855,33.901400,-118.205000,2023-05-09T19:00,PM2.5,8.7,UG/M3,36,1
21856,33.793713,-118.171019,2023-05-09T19:00,PM2.5,8.7,UG/M3,36,1
21857,33.830586,-117.938509,2023-05-09T19:00,PM2.5,6.0,UG/M3,25,1
21858,34.143900,-117.850800,2023-05-09T19:00,PM2.5,5.1,UG/M3,21,1


In [33]:
## kept only PM2.5 data and  time = 2023-04-09T19:00	

df = df[(df["parameter"] == "PM2.5") & (df["time"] == "2023-04-09T19:00")]

0         2010-02-05
1         2010-02-07
2         2010-02-07
3         2010-02-07
4         2010-02-09
             ...    
240994    2010-02-01
240995    2010-02-13
240996    2010-02-25
240997    2010-03-09
240998    2010-03-21
Name: date_gmt, Length: 240999, dtype: object

In [35]:
fig = px.scatter_mapbox(df, lat="lat", lon="lon", hover_data=["parameter", "aqi"],
                        color_discrete_sequence=["fuchsia"], zoom=10, height=300)
fig.update_layout(mapbox_style="carto-positron")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

In [None]:
## https://www.airnowapi.org/aq/data/?startDate=2013-05-09T19&endDate=2023-05-09T20&parameters=OZONE,PM25&BBOX=-119.857788,33.162553,-116.957397,35.699402&dataType=B&format=text/csv&verbose=0&monitorType=0&includerawconcentrations=0&API_KEY=C19E652A-FC64-4A48-A3B9-A1B0939C9E3A

### Future tasks:
1. We might need geographic data (beside Census and AQI) to visualize county-level population and air quality
`https://chart-studio.plotly.com/~jackp/18273.embed?_ga=2.232249817.779600767.1683598940-14096325.1682002206`
2. We need to think about ways to classify sites based on location (we might directly use location data from the data base, but it is not in the form of *county* ...)


#### p.s. A visualization of 2010-02-07 8am PM2.5 data 

https://aqs.epa.gov/data/api/metaData/fieldsByService?email=test@aqs.api&key=test&service=sampleData

SyntaxError: invalid syntax (2508296525.py, line 1)

In [5]:
df = pd.read_csv("CA_2010_PM2.5.csv")

df.shape

  df = pd.read_csv("CA_2010_PM2.5.csv")


(240999, 29)

In [14]:
df.time_gmt.unique

<bound method Series.unique of 0         13:00
1         13:00
2         14:00
3         22:00
4         18:00
          ...  
240994    08:00
240995    08:00
240996    08:00
240997    08:00
240998    08:00
Name: time_gmt, Length: 240999, dtype: object>

In [4]:
## display what columns we have
df.columns

Index(['state_code', 'county_code', 'site_number', 'parameter_code', 'poc',
       'latitude', 'longitude', 'datum', 'parameter', 'date_local',
       'time_local', 'date_gmt', 'time_gmt', 'sample_measurement',
       'units_of_measure', 'units_of_measure_code', 'sample_duration',
       'sample_duration_code', 'sample_frequency', 'detection_limit',
       'uncertainty', 'qualifier', 'method_type', 'method', 'method_code',
       'state', 'county', 'date_of_last_change', 'cbsa_code'],
      dtype='object')

In [16]:
sub_df = df.loc[df['date_gmt'] == '2010-02-07', ['time_gmt', 'latitude', 'longitude', 'sample_measurement', 'date_local', 'site_number']]

In [18]:
sub_df = sub_df.loc[df['time_gmt'] == '08:00']

In [19]:
sub_df

Unnamed: 0,time_gmt,latitude,longitude,sample_measurement,date_local,site_number
1803,08:00,37.535833,-121.961823,8.0,2010-02-07,1001
5674,08:00,36.487823,-117.871036,-0.2,2010-02-07,1003
14137,08:00,36.634225,-120.382331,2.0,2010-02-07,2009
21538,08:00,40.801780,-124.162100,5.5,2010-02-07,1002
21648,08:00,40.776780,-124.179490,8.5,2010-02-07,1004
...,...,...,...,...,...,...
239898,08:00,37.687526,-121.784217,4.8,2010-02-07,7
240157,08:00,37.936013,-122.026154,3.4,2010-02-07,2
240330,08:00,37.348497,-121.894898,5.4,2010-02-07,5
240496,08:00,38.102507,-122.237976,3.0,2010-02-07,4


In [24]:
import plotly.express as px
fig = px.scatter_mapbox(sub_df, lat="latitude", lon="longitude", hover_data=["sample_measurement", "site_number"],
                        color_discrete_sequence=["fuchsia"], zoom=10, height=300)
fig.update_layout(mapbox_style="carto-positron")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

## make this plot available on the web

import plotly.io as pio
pio.write_html(fig, file='index.html', auto_open=True)



In [22]:
import plotly.express as px

fig = px.density_mapbox(sub_df, lat="latitude", lon="longitude", z="sample_measurement", radius=10,
                        zoom=10, height=500, color_continuous_scale="Viridis",
                        hover_data=["site_number", "sample_measurement"])
fig.update_layout(mapbox_style="carto-positron")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

In [1]:
import pandas as pd

## read csv: CA_2010_PM2.5.csv

df = pd.read_csv(r'C:\Users\tim09\git_repo\Air-Pandas\Air_quality_query\CA_2010_PM2.5.csv')

df.shape

  df = pd.read_csv(r'C:\Users\tim09\git_repo\Air-Pandas\Air_quality_query\CA_2010_PM2.5.csv')


(240999, 29)

In [2]:
## See what columns we have
df.columns

Index(['state_code', 'county_code', 'site_number', 'parameter_code', 'poc',
       'latitude', 'longitude', 'datum', 'parameter', 'date_local',
       'time_local', 'date_gmt', 'time_gmt', 'sample_measurement',
       'units_of_measure', 'units_of_measure_code', 'sample_duration',
       'sample_duration_code', 'sample_frequency', 'detection_limit',
       'uncertainty', 'qualifier', 'method_type', 'method', 'method_code',
       'state', 'county', 'date_of_last_change', 'cbsa_code'],
      dtype='object')

In [25]:
## we want those sub columns: 'site_number', 'latitude', 'longitude', 'date_gmt', 'time_gmt', 'sample_measurement', 'units_of_measure', 'county'

sub_df = df[['site_number', 'latitude', 'longitude', 'date_gmt', 'time_gmt', 'sample_measurement', 'units_of_measure', 'county']]

## and we can remove those where sample_measurement is NAN

sub_df = sub_df[sub_df['sample_measurement'].notna()]

sub_df.shape



(227678, 8)

In [26]:
avg_df = sub_df.groupby(['site_number', 'date_gmt'])['sample_measurement'].mean().reset_index()
avg_df

Unnamed: 0,site_number,date_gmt,sample_measurement
0,1,2010-01-01,18.500000
1,1,2010-01-02,7.557692
2,1,2010-01-03,16.250000
3,1,2010-01-04,33.391304
4,1,2010-01-05,30.291667
...,...,...,...
11618,9033,2010-11-16,6.000000
11619,9033,2010-11-28,2.000000
11620,9033,2010-12-04,14.000000
11621,9033,2010-12-22,2.000000


In [29]:
## do not use site number!
avg_df = sub_df.groupby(['latitude', 'longitude', 'date_gmt'])['sample_measurement'].mean().reset_index()
# Create a reference dataframe with latitude, longitude and corresponding site_number, units_of_measure, and county
ref_df = sub_df[['latitude', 'longitude', 'site_number', 'units_of_measure', 'county']].drop_duplicates()

# Merge the reference dataframe with the average dataframe
final_df = pd.merge(avg_df, ref_df, on=['latitude', 'longitude'], how='left')


In [30]:
final_df

Unnamed: 0,latitude,longitude,date_gmt,sample_measurement,site_number,units_of_measure,county
0,32.631242,-117.059088,2010-01-02,10.2,1,Micrograms/cubic meter (LC),San Diego
1,32.631242,-117.059088,2010-01-08,7.9,1,Micrograms/cubic meter (LC),San Diego
2,32.631242,-117.059088,2010-01-11,9.2,1,Micrograms/cubic meter (LC),San Diego
3,32.631242,-117.059088,2010-01-14,10.4,1,Micrograms/cubic meter (LC),San Diego
4,32.631242,-117.059088,2010-01-20,9.1,1,Micrograms/cubic meter (LC),San Diego
...,...,...,...,...,...,...,...
18552,41.726892,-122.633579,2010-12-04,10.3,2001,Micrograms/cubic meter (LC),Siskiyou
18553,41.726892,-122.633579,2010-12-10,7.9,2001,Micrograms/cubic meter (LC),Siskiyou
18554,41.726892,-122.633579,2010-12-19,4.8,2001,Micrograms/cubic meter (LC),Siskiyou
18555,41.726892,-122.633579,2010-12-22,5.7,2001,Micrograms/cubic meter (LC),Siskiyou


In [15]:
# Group the data by site, date, and other desired columns
grouped_df = sub_df.groupby(['site_number', 'latitude', 'longitude', 'date_gmt', 'units_of_measure', 'county'])

# Calculate the average of 'sample_measurement' for each group
averaged_df = grouped_df['sample_measurement'].mean().reset_index()

averaged_df.shape

(18557, 7)

In [16]:
averaged_df.head()

Unnamed: 0,site_number,latitude,longitude,date_gmt,units_of_measure,county,sample_measurement
0,1,32.631242,-117.059088,2010-01-02,Micrograms/cubic meter (LC),San Diego,10.2
1,1,32.631242,-117.059088,2010-01-08,Micrograms/cubic meter (LC),San Diego,7.9
2,1,32.631242,-117.059088,2010-01-11,Micrograms/cubic meter (LC),San Diego,9.2
3,1,32.631242,-117.059088,2010-01-14,Micrograms/cubic meter (LC),San Diego,10.4
4,1,32.631242,-117.059088,2010-01-20,Micrograms/cubic meter (LC),San Diego,9.1


In [33]:
# Total number of hours measured for each site
total_hours = sub_df.groupby(['latitude', 'longitude']).size().reset_index(name='total_hours')

# Number of hours where measurement exceeds 15.5, 40.5, 65.5
exceed_15_hours = sub_df[sub_df['sample_measurement'] > 15.5].groupby(['latitude', 'longitude']).size().reset_index(name='exceed_15_hours')
exceed_40_hours = sub_df[sub_df['sample_measurement'] > 40.5].groupby(['latitude', 'longitude']).size().reset_index(name='exceed_40_hours')
exceed_65_hours = sub_df[sub_df['sample_measurement'] > 65.5].groupby(['latitude', 'longitude']).size().reset_index(name='exceed_65_hours')

# Merge dataframes
merge_df = pd.merge(total_hours, exceed_15_hours, on=['latitude', 'longitude'], how='left')
merge_df = pd.merge(merge_df, exceed_40_hours, on=['latitude', 'longitude'], how='left')
merge_df = pd.merge(merge_df, exceed_65_hours, on=['latitude', 'longitude'], how='left')

merge_df = merge_df.fillna(0)

# Calculate percentages
merge_df['percent_exceed_15'] = (merge_df['exceed_15_hours'] / merge_df['total_hours']) * 100
merge_df['percent_exceed_40'] = (merge_df['exceed_40_hours'] / merge_df['total_hours']) * 100
merge_df['percent_exceed_65'] = (merge_df['exceed_65_hours'] / merge_df['total_hours']) * 100

merge_df.head()

Unnamed: 0,latitude,longitude,total_hours,exceed_15_hours,exceed_40_hours,exceed_65_hours,percent_exceed_15,percent_exceed_40,percent_exceed_65
0,32.631242,-117.059088,101,7.0,0.0,0.0,6.930693,0.0,0.0
1,32.67618,-115.48307,193,42.0,3.0,0.0,21.761658,1.554404,0.0
2,32.701492,-117.149653,320,39.0,0.0,0.0,12.1875,0.0,0.0
3,32.725226,-116.365203,73,1.0,1.0,1.0,1.369863,1.369863,1.369863
4,32.791194,-116.942092,116,13.0,0.0,0.0,11.206897,0.0,0.0


In [35]:
avg_latitude = merge_df['latitude'].mean()
avg_longitude = merge_df['longitude'].mean()

In [37]:
import folium

# Create a Map instance
m = folium.Map(location=[avg_latitude, avg_longitude], zoom_start=4)

# Add points to the map instance
for idx, row in merge_df.iterrows():
    # Define the color to be used in the marker
    if row['percent_exceed_15'] > 50:
        color = 'red'
    elif row['percent_exceed_15'] > 20:
        color = 'orange'
    else:
        color = 'green'

    # Add marker to the map
    folium.CircleMarker(location=(row['latitude'], row['longitude']),
                        radius=5,
                        color=color,
                        fill=True,
                        fill_color=color).add_to(m)

# Show the map
m

In [4]:
sub_df.head()

Unnamed: 0,site_number,latitude,longitude,date_gmt,time_gmt,sample_measurement,units_of_measure,county
138,1001,37.535833,-121.961823,2010-01-01,08:00,23.0,Micrograms/cubic meter (LC),Alameda
139,1001,37.535833,-121.961823,2010-01-01,09:00,29.0,Micrograms/cubic meter (LC),Alameda
140,1001,37.535833,-121.961823,2010-01-01,10:00,25.0,Micrograms/cubic meter (LC),Alameda
141,1001,37.535833,-121.961823,2010-01-01,11:00,22.0,Micrograms/cubic meter (LC),Alameda
142,1001,37.535833,-121.961823,2010-01-01,12:00,14.0,Micrograms/cubic meter (LC),Alameda


In [24]:
# Filter the data for sample measurements higher than the thresholds
filtered_df = sub_df[sub_df['sample_measurement'] > 15.5]

# Group the filtered data by site, date, and other desired columns
grouped_df = filtered_df.groupby(['site_number', 'latitude', 'longitude', 'date_gmt', 'units_of_measure', 'county'])

#grouped_df.head(5)

# count the number of rows for each group and reset the index
counts_df = grouped_df['sample_measurement'].count().reset_index()

counts_df

Unnamed: 0,site_number,latitude,longitude,date_gmt,units_of_measure,county,sample_measurement
0,1,32.631242,-117.059088,2010-03-21,Micrograms/cubic meter (LC),San Diego,1
1,1,32.631242,-117.059088,2010-04-26,Micrograms/cubic meter (LC),San Diego,1
2,1,32.631242,-117.059088,2010-08-24,Micrograms/cubic meter (LC),San Diego,1
3,1,32.631242,-117.059088,2010-10-11,Micrograms/cubic meter (LC),San Diego,1
4,1,32.631242,-117.059088,2010-12-04,Micrograms/cubic meter (LC),San Diego,1
...,...,...,...,...,...,...,...
6337,9004,34.106678,-117.274063,2010-10-17,Micrograms/cubic meter (LC),San Bernardino,1
6338,9004,34.106678,-117.274063,2010-11-19,Micrograms/cubic meter (LC),San Bernardino,1
6339,9004,34.106678,-117.274063,2010-12-04,Micrograms/cubic meter (LC),San Bernardino,1
6340,9004,34.106678,-117.274063,2010-12-10,Micrograms/cubic meter (LC),San Bernardino,1


In [5]:
## modify the index: each would represent a unique site, i.e. site_number, latitude, longitude, county would be combined to be a unique index

sub_df.set_index(['site_number', 'latitude', 'longitude', 'county'], inplace=True)

sub_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,date_gmt,time_gmt,sample_measurement,units_of_measure
site_number,latitude,longitude,county,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1001,37.535833,-121.961823,Alameda,2010-01-01,08:00,23.0,Micrograms/cubic meter (LC)
1001,37.535833,-121.961823,Alameda,2010-01-01,09:00,29.0,Micrograms/cubic meter (LC)
1001,37.535833,-121.961823,Alameda,2010-01-01,10:00,25.0,Micrograms/cubic meter (LC)
1001,37.535833,-121.961823,Alameda,2010-01-01,11:00,22.0,Micrograms/cubic meter (LC)
1001,37.535833,-121.961823,Alameda,2010-01-01,12:00,14.0,Micrograms/cubic meter (LC)


In [6]:
## we want to calculated the mean of sample_measurement for each site on each date_gmt

sub_df2 = sub_df.groupby(['site_number', 'latitude', 'longitude', 'county', 'date_gmt']).mean()

sub_df2.head()


  sub_df2 = sub_df.groupby(['site_number', 'latitude', 'longitude', 'county', 'date_gmt']).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,sample_measurement
site_number,latitude,longitude,county,date_gmt,Unnamed: 5_level_1
1,32.631242,-117.059088,San Diego,2010-01-02,10.2
1,32.631242,-117.059088,San Diego,2010-01-08,7.9
1,32.631242,-117.059088,San Diego,2010-01-11,9.2
1,32.631242,-117.059088,San Diego,2010-01-14,10.4
1,32.631242,-117.059088,San Diego,2010-01-20,9.1


In [9]:
sub_df2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,sample_measurement
site_number,latitude,longitude,county,date_gmt,Unnamed: 5_level_1
1,32.631242,-117.059088,San Diego,2010-01-02,10.2
1,32.631242,-117.059088,San Diego,2010-01-08,7.9
1,32.631242,-117.059088,San Diego,2010-01-11,9.2
1,32.631242,-117.059088,San Diego,2010-01-14,10.4
1,32.631242,-117.059088,San Diego,2010-01-20,9.1
...,...,...,...,...,...
9033,34.669739,-118.130511,Los Angeles,2010-11-16,6.0
9033,34.669739,-118.130511,Los Angeles,2010-11-28,2.0
9033,34.669739,-118.130511,Los Angeles,2010-12-04,14.0
9033,34.669739,-118.130511,Los Angeles,2010-12-22,2.0


In [11]:
## find those sample_measurement > 15.5

sub_df_danger = sub_df2[sub_df2['sample_measurement'] > 15.5]

In [12]:
sub_df_danger

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,sample_measurement
site_number,latitude,longitude,county,date_gmt,Unnamed: 5_level_1
1,32.631242,-117.059088,San Diego,2010-03-21,16.0
1,32.631242,-117.059088,San Diego,2010-04-26,16.0
1,32.631242,-117.059088,San Diego,2010-08-24,17.8
1,32.631242,-117.059088,San Diego,2010-10-11,19.4
1,32.631242,-117.059088,San Diego,2010-12-04,22.7
...,...,...,...,...,...
9004,34.106678,-117.274063,San Bernardino,2010-10-17,21.6
9004,34.106678,-117.274063,San Bernardino,2010-11-19,39.3
9004,34.106678,-117.274063,San Bernardino,2010-12-04,22.2
9004,34.106678,-117.274063,San Bernardino,2010-12-10,38.3
