In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings

In [2]:
parent_directory = '/Users/pajhaver/Documents/AQI_Research'

In [3]:
pollutant_files = ['CO_illinois', 'NO2_illinois', 'OZ_illinois', 'PM10_illinois'
                  ,'PM25_illinois','SO2_illinois']

pollutant_files = ['CO_illinois']

drop_met_cols = ['dew_point_temperature_set_1_Fahrenheit', 'wind_gust_set_1_Miles/hour', 'weather_cond_code_set_1_code',
                'cloud_layer_3_code_set_1_code', 'precip_accum_one_hour_set_1_Inches', 'precip_accum_three_hour_set_1_Inches', 
                'cloud_layer_1_code_set_1_code', 'cloud_layer_2_code_set_1_code', 'precip_accum_six_hour_set_1_Inches', 
                 'precip_accum_24_hour_set_1_Inches', 'visibility_set_1_Statute miles', 'metar_remark_set_1_text', 'metar_set_1_text',
                'air_temp_high_6_hour_set_1_Fahrenheit', 'ceiling_set_1_Feet', 'air_temp_high_24_hour_set_1_Fahrenheit', 'air_temp_low_24_hour_set_1_Fahrenheit',
                'dew_point_temperature_set_1d_Fahrenheit', 'wind_chill_set_1d_Fahrenheit', 'pressure_set_1d_INHG',
                'sea_level_pressure_set_1d_INHG', 'heat_index_set_1d_Fahrenheit', 'air_temp_low_6_hour_set_1_Fahrenheit']

drop_pol_cols = ['AQS_SITE_ID', 'POC','UNITS', 'DAILY_OBS_COUNT', 'PERCENT_COMPLETE',
       'AQS_PARAMETER_CODE', 'AQS_PARAMETER_DESC', 'CBSA_CODE', 'CBSA_NAME',
       'STATE_CODE', 'STATE', 'COUNTY_CODE', 'COUNTY', 'SITE_LATITUDE',
       'SITE_LONGITUDE']

In [4]:
def get_dataset_per_year(year):
    df_meteorological = pd.DataFrame()
    for i in range(1,13):
        df_met_read = pd.read_csv(parent_directory + '/meterological_data/' + str(year) + '/' + str(i) + '.csv', skiprows=[0,1,2,3,4,5], header = [0,1])
        df_met_read.columns = df_met_read.columns.map('_'.join)
        df_meteorological = pd.concat([df_meteorological, df_met_read])

    df_meteorological['Date_Time_Unnamed: 1_level_1'] = pd.to_datetime(df_meteorological['Date_Time_Unnamed: 1_level_1'])
    df_meteorological.index = df_meteorological['Date_Time_Unnamed: 1_level_1']
    df_meteorological_hourly = df_meteorological.resample('H').mean()
    df_meteorological_hourly = df_meteorological_hourly.drop(columns = drop_met_cols)

    pol_count = 0

    for pollutant in pollutant_files:
        df_pollutant = pd.read_csv(parent_directory + '/pollutant_data/' + str(year) + '/' + pollutant + '.csv')
        df_pollutant = df_pollutant.drop(columns = drop_pol_cols)
        df_pollutant['Date'] = pd.to_datetime(df_pollutant['Date'])
        df_pollutant.index = df_pollutant['Date']
        df_pollutant = df_pollutant.drop(columns = 'Date')
        df_pollutant_15DaysMean = df_pollutant.resample('15D').mean()
        df_pollutant_hourly = df_pollutant_15DaysMean.resample('H').ffill()
        print(df_pollutant_hourly)
        if pol_count == 0:
            df_merged_temp = df_meteorological_hourly.join(df_pollutant_hourly, how = 'left')
        else:
            df_merged_temp = df_merged_temp.join(df_pollutant_hourly, lsuffix='_MAX', rsuffix='_NEW', how = 'left')
            df_merged_temp['DAILY_AQI_VALUE'] = df_merged_temp[['DAILY_AQI_VALUE_MAX', 'DAILY_AQI_VALUE_NEW']].max(axis=1)
            df_merged_temp = df_merged_temp.drop(columns = ['DAILY_AQI_VALUE_MAX', 'DAILY_AQI_VALUE_NEW'])
        pol_count += 1
    

    return df_merged_temp


In [5]:
df_final_dataset = get_dataset_per_year(2009)

                     Daily Max 8-hour CO Concentration  DAILY_AQI_VALUE
Date                                                                   
2009-01-01 00:00:00                           0.335714         3.714286
2009-01-01 01:00:00                           0.335714         3.714286
2009-01-01 02:00:00                           0.335714         3.714286
2009-01-01 03:00:00                           0.335714         3.714286
2009-01-01 04:00:00                           0.335714         3.714286
2009-01-01 05:00:00                           0.335714         3.714286
2009-01-01 06:00:00                           0.335714         3.714286
2009-01-01 07:00:00                           0.335714         3.714286
2009-01-01 08:00:00                           0.335714         3.714286
2009-01-01 09:00:00                           0.335714         3.714286
2009-01-01 10:00:00                           0.335714         3.714286
2009-01-01 11:00:00                           0.335714         3

In [6]:
len(df_final_dataset)

8760

In [7]:
df_final_dataset[df_final_dataset['DAILY_AQI_VALUE'].isnull()]

Unnamed: 0_level_0,altimeter_set_1_INHG,air_temp_set_1_Fahrenheit,relative_humidity_set_1_%,wind_speed_set_1_Miles/hour,wind_direction_set_1_Degrees,Daily Max 8-hour CO Concentration,DAILY_AQI_VALUE
Date_Time_Unnamed: 1_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2009-03-17 00:00:00,29.480000,56.6,39.126667,1.146667,26.666667,,
2009-03-17 01:00:00,29.480000,53.6,46.953333,0.000000,0.000000,,
2009-03-17 02:00:00,29.483333,47.6,65.970000,0.000000,0.000000,,
2009-03-17 03:00:00,29.486667,45.2,72.273333,0.000000,0.000000,,
2009-03-17 04:00:00,29.490000,45.8,79.450000,1.146667,56.666667,,
2009-03-17 05:00:00,29.500000,45.8,87.293333,0.000000,0.000000,,
2009-03-17 06:00:00,29.493333,44.0,89.160000,1.146667,56.666667,,
2009-03-17 07:00:00,29.496667,45.2,85.160000,0.000000,0.000000,,
2009-03-17 08:00:00,29.486667,44.6,87.090000,1.536667,66.666667,,
2009-03-17 09:00:00,29.476667,44.0,87.056667,2.683333,126.666667,,


In [8]:
df_final_dataset.describe()

Unnamed: 0,altimeter_set_1_INHG,air_temp_set_1_Fahrenheit,relative_humidity_set_1_%,wind_speed_set_1_Miles/hour,wind_direction_set_1_Degrees,Daily Max 8-hour CO Concentration,DAILY_AQI_VALUE
count,8705.0,8705.0,8705.0,8659.0,8659.0,7921.0,7921.0
mean,29.358704,49.471631,71.046883,6.918005,138.455846,0.319823,3.488217
std,0.221498,19.445735,17.226614,5.635774,108.402389,0.072907,0.92164
min,28.32,-18.4,22.79,0.0,0.0,0.2,2.0
25%,29.216667,35.6,58.903333,2.293333,30.0,0.271429,2.866667
50%,29.37,51.8,72.6,6.516667,136.666667,0.313333,3.333333
75%,29.486667,64.4,85.503333,10.36,233.333333,0.333333,3.714286
max,30.036667,92.6,100.0,31.846667,360.0,0.58,6.8


In [9]:
df_final_dataset

Unnamed: 0_level_0,altimeter_set_1_INHG,air_temp_set_1_Fahrenheit,relative_humidity_set_1_%,wind_speed_set_1_Miles/hour,wind_direction_set_1_Degrees,Daily Max 8-hour CO Concentration,DAILY_AQI_VALUE
Date_Time_Unnamed: 1_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2009-01-01 00:00:00,29.743333,17.6,61.920000,6.136667,256.666667,0.335714,3.714286
2009-01-01 01:00:00,29.743333,17.6,61.920000,3.440000,273.333333,0.335714,3.714286
2009-01-01 02:00:00,29.733333,15.8,70.750000,3.830000,246.666667,0.335714,3.714286
2009-01-01 03:00:00,29.713333,17.6,67.190000,2.683333,160.000000,0.335714,3.714286
2009-01-01 04:00:00,29.690000,17.6,70.963333,2.683333,106.666667,0.335714,3.714286
2009-01-01 05:00:00,29.670000,17.6,72.850000,4.990000,170.000000,0.335714,3.714286
2009-01-01 06:00:00,29.643333,17.6,74.880000,7.290000,173.333333,0.335714,3.714286
2009-01-01 07:00:00,29.623333,18.8,75.020000,9.590000,176.666667,0.335714,3.714286
2009-01-01 08:00:00,29.613333,19.4,71.180000,11.120000,173.333333,0.335714,3.714286
2009-01-01 09:00:00,29.570000,19.4,75.073333,10.356667,160.000000,0.335714,3.714286
