## **Load PurpleAir Data**

- We load the PurpleAir data we need through the project's PurpleAir API wrapper (`PurpleAirAPI` from `purpleair_wrapper`)
- Then we can use the clean_purpleair.py file to clean the data for proper analysis

In [15]:
# Proper imports

from purpleair_wrapper import PurpleAirAPI
import pandas as pd
import os

In [16]:
# Set up the API key and endpoint
# The API key is a secret: supply it through the PURPLE_AIR_API_KEY environment
# variable (exported in the shell or loaded from an untracked file) and never
# write the key itself into the notebook, where it would be committed and shared.
if not os.environ.get('PURPLE_AIR_API_KEY'):
    raise RuntimeError(
        "PURPLE_AIR_API_KEY is not set. Export your PurpleAir API key in the "
        "environment before running this notebook."
    )

# Create an instance of the PurpleAirAPI class
# NOTE(review): presumably PurpleAirAPI() picks the key up from PURPLE_AIR_API_KEY — confirm in purpleair_wrapper.
client = PurpleAirAPI()

In [17]:
# Read the sensor metadata and collect each distinct sensor ID to query
purple_df = pd.read_csv('../data/ASDS 2018-2023 for South San Francisco and San Bruno Metadata.csv')
sensor_ids = purple_df["Site_ID"].drop_duplicates().tolist()

# Measurements requested from the API for every sensor
fields = [
    "pm2.5_atm",
    "humidity",
    "temperature",
    "pressure",
]

In [18]:
# Download the sensor history for every ID in `sensor_ids`
# NOTE(review): `average=1440` is presumably an averaging window in minutes
# (1440 = one day) — confirm against the wrapper / PurpleAir API.
history_options = {
    "fields": fields,
    "start_time": '2024-03-01',
    "average": 1440,
}
df = client.get_sensor_history(sensor_ids, **history_options)

df.head()

  return pd.concat(data_frames, ignore_index=True)


Unnamed: 0,time_stamp,humidity,temperature,pressure,pm2.5_atm,sensor_index
0,1724976000,53.508,72.765,1014.531,6.2695,21427
1,1733788800,39.207,59.633,1023.066,9.686,21427
2,1725667200,55.642,72.662,1010.742,2.0595,21427
3,1737763200,51.23,61.337,1006.646,6.2545,21427
4,1742860800,49.024,76.289,1012.444,4.339,21427


In [13]:
# Check the size of the retrieved dataset as (rows, columns)
df.shape

(9974, 6)

In [21]:
# Add the location name and coordinates of each sensor to the dataframe.
# Build a lookup with exactly one row per Site_ID first: the metadata file can
# list a site more than once, and merging against repeated keys would silently
# duplicate measurement rows.
site_lookup = (
    purple_df[['Site_ID', 'Site_Name', 'Longitude', 'Latitude']]
    .drop_duplicates(subset='Site_ID')
    .rename(columns={'Site_Name': 'sensor_name',
                     'Longitude': 'longitude',
                     'Latitude': 'latitude'})
)

df = (
    # Drop any metadata columns left over from a previous run of this cell so
    # that re-running it does not create suffixed/duplicate columns.
    df.drop(columns=['sensor_name', 'longitude', 'latitude'], errors='ignore')
    # validate= makes pandas raise if the lookup ever contains repeated keys.
    .merge(site_lookup, left_on='sensor_index', right_on='Site_ID',
           how='left', validate='many_to_one')
    .drop(columns=['Site_ID'])
    # Sort values by sensor index and the time stamp
    .sort_values(by=['sensor_index', 'time_stamp'])
)

df.head()

Unnamed: 0,time_stamp,humidity,temperature,pressure,pm2.5_atm,sensor_index,sensor_name,longitude,latitude
0,1709251200,61.451,60.976,1012.084,0.208,21427,Grundy Park,-122.42097,37.622585
1,1709337600,55.048,61.505,1007.616,0.352,21427,Grundy Park,-122.42097,37.622585
2,1709424000,57.3,57.617,1011.476,0.2735,21427,Grundy Park,-122.42097,37.622585
3,1709510400,57.251,59.393,1017.193,0.4165,21427,Grundy Park,-122.42097,37.622585
4,1709596800,58.307,61.629,1015.93,0.423,21427,Grundy Park,-122.42097,37.622585


In [25]:
# Function to calculate AQI for PM2.5
def calculate_pm2_5_aqi(C_p):
    """Convert a PM2.5 concentration into an AQI value.

    The concentration is truncated (not rounded) to one decimal place and then
    linearly interpolated inside the breakpoint interval that contains it.

    Parameters
    ----------
    C_p : float
        PM2.5 concentration. Missing values (NaN/None/NA) are accepted.

    Returns
    -------
    int or None
        The rounded AQI, or None when the input is missing, not finite, or
        falls outside every breakpoint interval.
    """
    import math
    from decimal import Decimal

    if pd.isna(C_p):
        return None

    value = float(C_p)
    if not math.isfinite(value):
        return None

    # Truncate to one decimal place numerically. Slicing the string form of the
    # number is wrong for values printed in scientific notation (e.g. 1.5e-07
    # would be read as 1.5); Decimal(repr(...)) parses those correctly and keeps
    # the truncation exact in base 10.
    C_p = math.trunc(Decimal(repr(value)) * 10) / 10

    # (concentration low, concentration high, AQI low, AQI high)
    # NOTE(review): confirm these breakpoints against the current EPA AQI guidance.
    breakpoints = [
        (0.0,   9.0,   0,   50),
        (9.1,   35.4,  51,  100),
        (35.5,  55.4,  101, 150),
        (55.5,  125.4, 151, 200),
        (125.5, 225.4, 201, 300),
        (225.5, 500.4, 301, 500)
    ]

    for BP_Lo, BP_Hi, I_Lo, I_Hi in breakpoints:
        if BP_Lo <= C_p <= BP_Hi:
            # Linear interpolation of the index within the matching interval
            I_p = ((I_Hi - I_Lo) / (BP_Hi - BP_Lo)) * (C_p - BP_Lo) + I_Lo
            return round(I_p)

    return None

# Process PurpleAir API data
df = df.rename(columns={
    'time_stamp': 'time',
    'sensor_name': 'location_name',
    'sensor_index': 'location_id',
    'pm2.5_atm': 'pm2_5_24h_mean',
    'temperature': 'temp',
    'humidity': 'rh'
})

# Convert time from Unix timestamps (seconds, UTC) to naive US/Pacific local time.
# The guard keeps the cell safe to re-run: converting a second time would treat
# the already-localized values as UTC and shift them by the UTC offset again.
if not pd.api.types.is_datetime64_any_dtype(df['time']):
    df['time'] = (
        pd.to_datetime(df['time'], unit='s', utc=True)
        .dt.tz_convert('US/Pacific')
        .dt.tz_localize(None)
    )

# Round PM2.5 values to 2 decimal places
df['pm2_5_24h_mean'] = df['pm2_5_24h_mean'].round(2)

# AQI Calculations
df['pm2_5_24h_mean_aqi'] = df['pm2_5_24h_mean'].apply(calculate_pm2_5_aqi)

# Column arrangement
df = df[['time', 'location_name', 'location_id', 'latitude', 'longitude',
         'pm2_5_24h_mean', 'pm2_5_24h_mean_aqi',
         'temp', 'rh', 'pressure']]

# Sort
df = df.sort_values('time')


In [26]:
# Preview the processed data after renaming, time conversion, and AQI calculation
df.head()

Unnamed: 0,time,location_name,location_id,latitude,longitude,pm2_5_24h_mean,pm2_5_24h_mean_aqi,temp,rh,pressure
0,2024-02-29 08:00:00,Grundy Park,21427,37.622585,-122.42097,0.21,1.0,60.976,61.451,1012.084
6582,2024-02-29 08:00:00,Shelter Crik,113144,37.62002,-122.42762,0.0,0.0,58.988,60.765,1008.262
5674,2024-02-29 08:00:00,terra,109718,37.669968,-122.42153,0.89,4.0,61.047,59.685,1005.107
5945,2024-02-29 08:00:00,Belle Air,111235,37.631878,-122.409966,1.16,6.0,,,
6159,2024-02-29 08:00:00,Crestmoor III,111498,37.616806,-122.431,1.13,6.0,58.963,64.513,1006.47


In [29]:
# Make sure the longitude and latitude carry the signs expected for this study
# area: longitude is forced negative and latitude positive, which repairs
# records whose sign was dropped or flipped in the metadata.
# Vectorized .abs() replaces the per-row lambda.
df['longitude'] = -df['longitude'].abs()
df['latitude'] = df['latitude'].abs()

In [30]:
# Write the processed dataset to disk, leaving out the DataFrame index
output_csv = '../data/purpleair_2024-03-01.csv'
df.to_csv(output_csv, index=False)