## Qualitycheck_step2
### Author: Olivia Sablan
The following code is used to clean PurpleAir data previously compiled in "OnlineandSD_step1."

Last updated: May 19, 2025

In [8]:
import pandas as pd
import numpy as np
import glob
import warnings
import os.path
import datetime
from datetime import timedelta
warnings.filterwarnings("ignore")

In [9]:
# Sensor roster: read the sheet, discard rows that have no sensor ID, and store
# the remaining IDs as integers.
google = (
    pd.read_csv('https://docs.google.com/spreadsheets/')
    .dropna(subset=['Sensor ID for User'])
    .astype({'Sensor ID for User': int})
)

# Sensor IDs as an integer array, plus the same IDs rendered as strings.
senslist = np.array(google['Sensor ID for User'], dtype='int')
strsenslist = list(map(str, senslist))

# Combined observations written by the previous step; parse timestamps so they
# can be compared and merged on later.
compileddf = pd.read_csv('../data/raw_combinedSDonline.csv')
compileddf['created_at'] = pd.to_datetime(compileddf['created_at'])

In [25]:
# Three sensors have erroneous temperature/humidity readings; replace those
# observations with the readings from a nearby sensor.
# Maps each faulty sensor ID -> the sensor ID whose temperature/humidity is used instead.
replacement_map = {226755: 225077, 225025: 225077, 226733: 225107}

# Collect one frame per sensor and concatenate once at the end. Calling
# pd.concat inside the loop re-copies every accumulated row on each iteration.
fixed_frames = []

for sensor_id in senslist:
    sensor_rows = compileddf[compileddf['ID'] == sensor_id]
    if sensor_id in replacement_map:
        reference_rows = compileddf[compileddf['ID'] == replacement_map[sensor_id]]
        # Keep this sensor's PM channels and take temperature/humidity from the
        # reference sensor at the same timestamp. pd.merge defaults to an inner
        # join, so timestamps missing from the reference sensor are dropped.
        sensor_rows = pd.merge(
            sensor_rows[['created_at', 'pm2.5_cf_1_a', 'pm2.5_cf_1_b', 'ID']],
            reference_rows[['created_at', 'temperature', 'humidity']],
            on='created_at')
    fixed_frames.append(sensor_rows)

# pd.concat raises on an empty list, so fall back to an empty frame when
# senslist has no entries.
compileddf_fixed = (
    pd.concat(fixed_frames, ignore_index=True) if fixed_frames else pd.DataFrame()
)

In [38]:
# Per-sensor row counts: the total, and how many rows each quality-control step
# flags, so the share of data removed by each step can be reported later.
total_len = []
temp_remove = []
hum_remove = []
chan1_remove = []
pm_remove = []
below_zero = []

# Quality-control thresholds used by the filters below.
TEMP_MIN_F, TEMP_MAX_F = -50, 150   # plausible temperature window (degrees F)
HUMIDITY_MAX = 100                  # relative humidity ceiling (%)
CHANNEL_DIFF_FRACTION = 0.05        # allowed A/B difference as a fraction of the A/B mean
CHANNEL_DIFF_FLOOR = 10             # minimum allowed A/B difference (ug/m3)
PM_MAX = 500                        # PM2.5 ceiling (ug/m3)

# NOTE: each "keep" filter uses strict inequalities, so rows exactly on a
# threshold or with a missing (NaN) value are dropped as well, but they are not
# included in the corresponding *_remove count.

# One tidy frame per sensor; concatenated once after the loop instead of
# growing a DataFrame inside it.
qc_frames = []

for ii in senslist:
    # .copy() gives an independent frame, so adding columns below does not
    # write to a slice of compileddf_fixed.
    df = compileddf_fixed[compileddf_fixed['ID'] == ii].copy()

    # Absolute difference between channel A and channel B, and the allowed
    # difference for each row: 5% of the A/B mean, but never less than 10 ug/m3.
    # NOTE: the data was already averaged to 10 min in pre-processing;
    # if yours was not, take those averages before this step.
    # NOTE(review): the original comment described a 10% criterion, but the
    # factor applied to the A/B mean is 0.05 -- confirm which one is intended.
    df['diff'] = (df['pm2.5_cf_1_a'] - df['pm2.5_cf_1_b']).abs()
    df['percent'] = ((df['pm2.5_cf_1_a'] + df['pm2.5_cf_1_b']) / 2) * CHANNEL_DIFF_FRACTION
    df['percent'] = np.where(df['percent'] < CHANNEL_DIFF_FLOOR, CHANNEL_DIFF_FLOOR, df['percent'])

    # store the total length of this sensor's data to see how much is removed in each step later
    total_len.append(len(df))

    # keep temperatures strictly inside (-50, 150) degrees F; count readings on
    # EITHER side of that window as removed (previously only > 150 was counted,
    # so readings below -50 were dropped without being counted)
    temperature = df['temperature']
    keep_temp = df[(temperature < TEMP_MAX_F) & (temperature > TEMP_MIN_F)]
    temp_remove.append(int(((temperature > TEMP_MAX_F) | (temperature < TEMP_MIN_F)).sum()))

    # drop relative humidity > 100%
    humidity = keep_temp['humidity']
    keep_hum = keep_temp[humidity < HUMIDITY_MAX]
    hum_remove.append(int((humidity > HUMIDITY_MAX).sum()))

    # drop rows where the channels disagree by more than the allowed difference
    keep_chan1 = keep_hum[keep_hum['diff'] < keep_hum['percent']]
    chan1_remove.append(int((keep_hum['diff'] > keep_hum['percent']).sum()))

    # average the two channels; .assign returns a new frame rather than
    # writing into the filtered slice
    keep_chan1 = keep_chan1.assign(
        channelmean=keep_chan1[['pm2.5_cf_1_a', 'pm2.5_cf_1_b']].mean(axis=1))

    # drop pm > 500 ug/m3
    keep_pm = keep_chan1[keep_chan1['channelmean'] < PM_MAX]
    pm_remove.append(int((keep_chan1['channelmean'] > PM_MAX).sum()))

    # drop any concentration < 0 ug/m3 (THIS IS USUALLY NO OBSERVATIONS FOR ME)
    keep_zero = keep_pm[keep_pm['channelmean'] > 0]
    below_zero.append(int((keep_pm['channelmean'] < 0).sum()))

    # trim to the tidy columns and tag every row with its sensor ID
    tidy = keep_zero[['created_at', 'temperature', 'humidity', 'channelmean']].assign(ID=ii)
    qc_frames.append(tidy)

# pd.concat raises on an empty list, so fall back to an empty frame when
# senslist has no entries.
quality_controlled = (
    pd.concat(qc_frames, ignore_index=True) if qc_frames else pd.DataFrame()
)
quality_controlled.to_csv('../data/qualitycontrolled.csv')

In [40]:
# Sum each per-sensor count list into a single total across all sensors.
all_total, all_temp, all_hum, all_chan1, all_pm, all_zero = (
    np.sum(counts)
    for counts in (total_len, temp_remove, hum_remove, chan1_remove, pm_remove, below_zero)
)

In [41]:
# Report each quality-control step's count as a percentage of all rows.
for label, removed in (
    ('Temperature > 150: ', all_temp),
    ('Humidity > 100: ', all_hum),
    ('Channel Disagreement: ', all_chan1),
    ('PM > 500: ', all_pm),
    ('PM < 0: ', all_zero),
):
    print(label, round(removed / all_total * 100, 4), '%')

Temperature > 150:  0.0003 %
Humidity > 100:  0.0 %
Channel Disagreement:  0.282 %
PM > 500:  0.0022 %
PM < 0:  0.0 %
