In [1]:
import pandas as pd
from datetime import datetime

air_zones = ('Southern_Interior', 'Central_Interior', 'Coastal', 'Georgia_Strait', 'Lower_Fraser_Valley', 'Northeast')
pollutants = ['O3', 'CO', 'NO2', 'SO2', 'PM25']
columns = pollutants + ['SID',]
units = ['ppb','ppm','ppb','ppb','ug/m3']
years = (2016, 2017, 2018, 2019, 2020)

#conversion values from EPA 454/B-18-007
aqi_classes = ((0,50), (51,100), (101,150), (151,200), (201,300), (301,400), (401,500))
pm25_classes = ((0.0,12.0), (12.0, 35.4), (35.4,55.4), (55.4,150.4), (150.4,250.4), (250.4,350.4), (350.4,500.4))

def classBreak(val, classes):
    #returns class break index
    for i, c in enumerate(classes):
        if val > c[0] and val < c[1]:
            return i
    return False

# parse the weird date format
def dateParser(date_str):
    #returns date in proper format
    date_str = date_str.replace(" 24", " 12")
    date_list = date_str.split(' ')
    if len(date_list[1]) == 4:
        date_list[1] = '0' + date_list[1]
    return datetime.strptime(' '.join(date_list), '%m/%d/%Y %I:%M %p')

In [3]:
out = pd.DataFrame(columns=['YEAR', 'MIN', 'MAX', 'MEAN', 'ZONE'])
index = 0
for air_zone in air_zones:
    
    #air_zone = 'Southern_Interior'
    df = pd.read_csv('data/bc-covid-airdata/{}.csv'.format(air_zone), header=0, 
                     index_col=['Date Time'], parse_dates=True, date_parser=dateParser, na_values=['--'])

    df = df[[c for c in df.columns if c in columns]]
    df = df.apply(pd.to_numeric, errors='coerce')
    df.sort_index(inplace=True, ascending=True)


    for year in years:

        aqi = []
        
        dates = pd.date_range(start='03/18/'+str(year), end='05/16/'+str(year))

        for i in range(len(dates)):

            day_data = df.loc[dates[i-1]:dates[i]]
            #day_data.between_time('15:00','19:00')
            pm25 = [val for val in day_data['PM25'].tolist() if str(val) !='nan']
            # skip partially empty days
            if len(pm25) < 10:
                continue
            # convert pm25 to AQI
            pm25_min, pm25_max, pm25_mean = min(pm25), max(pm25), sum(pm25)/len(pm25)

            pm25_break = classBreak(pm25_mean, pm25_classes)

            pm25_aqi = round((aqi_classes[pm25_break][1]-aqi_classes[pm25_break][0])/(pm25_max-pm25_min) * (round(pm25_mean, 2)-pm25_min) + aqi_classes[pm25_break][0])

            aqi.append(pm25_aqi)

        aqi_min, aqi_max, aqi_mean = min(aqi), max(aqi), sum(aqi)/len(aqi)
        d = {'YEAR':year, 'MIN': aqi_min, 'MAX': aqi_max, 'MEAN': aqi_mean, 'ZONE':air_zone+'_'+str(years.index(year))}
        out.loc[index] = d
        index += 1

In [4]:
out

Unnamed: 0,YEAR,MIN,MAX,MEAN,ZONE
0,2016,7,73,17.254237,Southern_Interior_0
1,2017,7,30,16.0,Southern_Interior_1
2,2018,3,65,14.59322,Southern_Interior_2
3,2019,4,76,19.050847,Southern_Interior_3
4,2020,4,70,13.288136,Southern_Interior_4
5,2016,3,66,15.576271,Central_Interior_0
6,2017,6,57,12.915254,Central_Interior_1
7,2018,2,68,18.864407,Central_Interior_2
8,2019,5,72,23.79661,Central_Interior_3
9,2020,4,65,13.440678,Central_Interior_4


In [5]:
path_to_data = r'data\\Average_aqi_2016-2020_proper.csv'
out.to_csv(path_to_data, index = False)