Code to read PurpleAir sensor network data and export it as a CSV file.
The latitude and longitude bounds (20° to 50° N, 125° to 65° W) approximately contain the contiguous United States, but may include some locations in Canada / Mexico.

In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Code from https://github.com/ReagentX/purple_air_api#listing-all-useful-sensors
# The counts quoted in that README are not stable: the run recorded below
# initialized 22,385 sensors, of which 15,898 passed is_useful().
from purpleair.network import SensorList
p = SensorList()  # Prints an "Initialized N sensors!" message (N = 22,385 in the recorded run)
useful = [s for s in p.all_sensors if s.is_useful()]  # Sensors passing is_useful() (README: "no defects")
print(len(useful))  # 15,898 in the recorded run

Child 566 lists parent 565, but parent does not exist!
Child 2861 lists parent 2860, but parent does not exist!
Initialized 22,385 sensors!
15898


In [3]:
# Build a DataFrame from the sensor list using the 'useful' filter and the 'parent' channel.
# NOTE(review): presumably 'useful' applies the same is_useful() check as the list above,
# since the row count matches (15,898) — confirm against the purpleair docs.
df = p.to_dataframe('useful', channel = 'parent')

In [4]:
# List the columns available in the sensor table.
df.columns

Index(['parent', 'lat', 'lon', 'name', 'location_type', 'pm_2.5', 'temp_f',
       'temp_c', 'humidity', 'pressure', 'p_0_3_um', 'p_0_5_um', 'p_1_0_um',
       'p_2_5_um', 'p_5_0_um', 'p_10_0_um', 'pm1_0_cf_1', 'pm2_5_cf_1',
       'pm10_0_cf_1', 'pm1_0_atm', 'pm2_5_atm', 'pm10_0_atm', 'last_seen',
       'model', 'adc', 'rssi', 'hidden', 'flagged', 'downgraded', 'age',
       'brightness', 'hardware', 'version', 'last_update_check', 'created',
       'uptime', 'is_owner', '10min_avg', '30min_avg', '1hour_avg',
       '6hour_avg', '1day_avg', '1week_avg'],
      dtype='object')

In [5]:
# Preview the first rows of the sensor table (indexed by sensor id).
df.head()

Unnamed: 0_level_0,parent,lat,lon,name,location_type,pm_2.5,temp_f,temp_c,humidity,pressure,...,last_update_check,created,uptime,is_owner,10min_avg,30min_avg,1hour_avg,6hour_avg,1day_avg,1week_avg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25999,,30.053808,-95.494643,Villages of Bridgestone AQI,outside,1.0,50.0,10.0,62.0,1022.37,...,,,,False,2.47,4.19,8.1,19.66,17.15,15.22
49409,,18.759182,99.017172,"""First's Place""",outside,42.26,73.0,22.777778,43.0,974.24,...,,,,False,42.61,43.03,42.74,39.9,35.09,31.6
42073,,47.185173,-122.176855,#1,outside,7.46,57.0,13.888889,48.0,1003.78,...,,,,False,8.28,8.76,9.58,13.3,13.25,11.98
53069,,47.190197,-122.177992,#2,outside,7.68,71.0,21.666667,33.0,1003.66,...,,,,False,8.07,8.43,9.45,14.39,14.58,13.23
94891,,40.160043,-79.272304,#9,outside,9.28,42.0,5.555556,50.0,971.35,...,,,,False,9.07,7.58,6.71,6.3,7.76,12.12


In [6]:
# Inspect dtypes and non-null counts per column; several columns are entirely empty.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15898 entries, 25999 to 108592
Data columns (total 43 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   parent             0 non-null      object        
 1   lat                15898 non-null  float64       
 2   lon                15898 non-null  float64       
 3   name               15898 non-null  object        
 4   location_type      15862 non-null  object        
 5   pm_2.5             15898 non-null  float64       
 6   temp_f             15898 non-null  float64       
 7   temp_c             15898 non-null  float64       
 8   humidity           15898 non-null  float64       
 9   pressure           15898 non-null  float64       
 10  p_0_3_um           0 non-null      object        
 11  p_0_5_um           0 non-null      object        
 12  p_1_0_um           0 non-null      object        
 13  p_2_5_um           0 non-null      object        
 14  p

In [7]:
# Drop sparsely populated columns: keep only columns with at least
# MIN_NON_NULL non-missing values (df.info() shows several columns with 0).
# Reassign rather than use inplace=True, which pandas discourages and which
# silently mutates a frame that earlier cells have already displayed.
MIN_NON_NULL = 5000
df = df.dropna(axis='columns', thresh=MIN_NON_NULL)

In [8]:
# Bounding box in decimal degrees that approximately contains the United States
# (it may also include some locations in Canada / Mexico). Bounds are exclusive.
LAT_MIN, LAT_MAX = 20, 50
LON_MIN, LON_MAX = -125, -65

# Keep only sensors strictly inside the bounding box; '@' references the
# Python variables defined above from within the query string.
df_us = df.query(
    '(lat > @LAT_MIN) and (lat < @LAT_MAX) and (lon > @LON_MIN) and (lon < @LON_MAX)'
)

In [9]:
# (rows, columns) remaining after the bounding-box filter.
df_us.shape

(14163, 22)

In [10]:
# Count missing values per column in the filtered frame.
df_us.isna().sum()

lat               0
lon               0
name              0
location_type    28
pm_2.5            0
temp_f            0
temp_c            0
humidity          0
pressure          0
last_seen         0
model             0
hidden            0
flagged           0
downgraded        0
age               0
is_owner          0
10min_avg         0
30min_avg         0
1hour_avg         0
6hour_avg         0
1day_avg          0
1week_avg         0
dtype: int64

In [11]:
# Total memory footprint in bytes; deep=True also measures the contents of object-dtype columns.
df_us.memory_usage(deep=True).sum()

4918082

In [12]:
# Export the filtered sensors to CSV; the index (sensor id) is written as the first column.
# Create the output directory first so a fresh checkout does not fail on a missing folder.
output_path = Path('data/original_data/purpleair_original.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_us.to_csv(output_path)