## dataAverages_step3
### Author: Olivia Sablan
The following code applies a correction to the PurpleAir data that was previously quality checked in "qualitycheck_step2" and then computes hourly and daily averages of it.

Last updated: May 19, 2025

In [11]:
import numpy as np
import datetime
import pandas as pd
import warnings
from datetime import datetime, timezone
warnings.filterwarnings("ignore")

def remove_timezone(dt):
    """Return ``dt`` with its timezone information stripped.

    No conversion is performed: the date and clock fields are kept exactly
    as they are and only ``tzinfo`` is cleared through ``dt.replace``.
    """
    naive = dt.replace(tzinfo=None)
    return naive

In [12]:
# Read in data that has gone through QA/QC procedures
df = pd.read_csv('../data/qualitycontrolled.csv')
df['created_at'] = pd.to_datetime(df.created_at, utc = True)

# Convert the timestamps from UTC to Eastern time, then drop the timezone stamp (keeping the Eastern
# wall-clock time) so the column can be merged more easily with other, timezone-naive data.
# tz_localize(None) strips the timezone for the whole column at once instead of row by row.
df['created_at'] = df['created_at'].dt.tz_convert('US/Eastern').dt.tz_localize(None)

# Correct the channel mean of PurpleAir PM2.5 data using the Barkjohn et al. (2021) correction factor:
# a linear fit with a humidity term where the channel mean is below 343, and a quadratic fit
# (no humidity term) otherwise
df['CorrectedPM'] = np.where(df["channelmean"] < 343, 0.524 * df["channelmean"] - 0.0862*df["humidity"] + 5.75,
                             (0.46 * df["channelmean"]) + ((3.93E-4)*(df["channelmean"]**2)) + 2.97)
df = df.drop(columns = ['channelmean'])
# After correction, if there are any concentrations that are negative values, set them equal to zero.
# A single .loc assignment is used rather than chained indexing (df[col][mask] = ...), which may write
# to a temporary copy and leave df unchanged.
df.loc[df['CorrectedPM'] < 0, 'CorrectedPM'] = 0
df = df[['created_at', 'temperature', 'humidity', 'ID', 'CorrectedPM']]

In [6]:
# A replaced PurpleAir sensor gets a new ID number even though it sits at the same location.
# Each replacement ID (key) is relabelled with the ID it is merged into (value) so that every
# location keeps one continuous record.
merged_sensor_ids = {
    226751: 224997,
    225093: 225017,
    225103: 225081,
    225031: 226735,
}
for replacement_id, kept_id in merged_sensor_ids.items():
    is_replacement = df['ID'] == replacement_id
    df['ID'] = np.where(is_replacement, kept_id, df['ID'])

In [16]:
# Sensor metadata (sensor ID and coordinates) is read from a Google Sheet
google = pd.read_csv('https://docs.google.com/spreadsheets/')
# Rows without a sensor ID cannot be matched to any data. They have to be dropped before the
# integer cast because NaN cannot be converted to int.
google = google.dropna(subset = ['Sensor ID for User'])
google['Sensor ID for User'] = google['Sensor ID for User'].astype(int)

senslist=np.array(google['Sensor ID for User'][:],dtype='int')
strsenslist= [str(i) for i in senslist]

lon=np.array(google['Longitude'][:],dtype='float')
lat=np.array(google['Latitude'][:],dtype='float')


def _average_sensor(sensor, freq, latitude, longitude):
    """Average one sensor's records into time bins of width ``freq``.

    Parameters
    ----------
    sensor : pandas.DataFrame
        Records of a single sensor; must contain a datetime ``created_at``
        column and a ``CorrectedPM`` column. It is only read, never modified.
    freq : str
        Bin width passed to ``pd.Grouper`` (e.g. ``"1D"`` or ``"1h"``).
    latitude, longitude : float
        Coordinates stamped on every row of the result.

    Returns
    -------
    pandas.DataFrame
        One row per bin that has a ``CorrectedPM`` mean, with ``Latitude``,
        ``Longitude`` and ``Source`` (always ``'CSU'``) columns appended.
    """
    averaged = sensor.groupby(pd.Grouper(key="created_at", freq=freq)).mean().reset_index()
    # Bins with no observations come out of the mean as NaN; discard them
    averaged.dropna(subset=["CorrectedPM"], inplace=True)
    # Coordinates are written after aggregation so they are the exact sheet values
    # rather than a mean of repeated values
    averaged["Latitude"] = latitude
    averaged["Longitude"] = longitude
    averaged['Source'] = 'CSU'
    return averaged


# Per-sensor results are collected in lists and concatenated once after the loop,
# which avoids re-copying the accumulated frame on every iteration
daily_frames = []
hourly_frames = []

for sens_id, sens_lat, sens_lon in zip(senslist, lat, lon):
    # Read-only selection: no columns are added to this slice of df
    sensor = df[df["ID"] == sens_id]
    if sensor.empty:
        # Sensor listed in the sheet but with no quality-controlled data
        continue

    daily_sens = _average_sensor(sensor, "1D", sens_lat, sens_lon)
    # Daily averages are labelled with a plain date instead of a midnight timestamp
    daily_sens['created_at'] = daily_sens.created_at.dt.date

    # "1h" is the current spelling of the hourly alias; the uppercase "1H" is deprecated
    hourly_sens = _average_sensor(sensor, "1h", sens_lat, sens_lon)

    daily_frames.append(daily_sens)
    hourly_frames.append(hourly_sens)

# pd.concat raises on an empty list, so fall back to an empty frame when no sensor had data
alldaily = pd.concat(daily_frames, sort=False) if daily_frames else pd.DataFrame()
allhourly = pd.concat(hourly_frames, sort=False) if hourly_frames else pd.DataFrame()

In [19]:
# Write the combined hourly and daily averages to disk (hourly first, then daily)
for averaged_frame, csv_path in (
    (allhourly, '../data/averages/allhourlyCSU.csv'),
    (alldaily, '../data/averages/alldailyCSU.csv'),
):
    averaged_frame.to_csv(csv_path)