In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings

In [12]:
# Root folder that holds the input CSV files. File paths in this notebook are
# built from it by plain string concatenation, so the trailing slash matters.
# NOTE(review): this is an absolute, machine-specific path; it has to be edited
# before the notebook can run anywhere else.
parent_directory = '/Users/pajhaver/Documents/AQI_Research/'

# Meteorological Data Processing

In [13]:
# Read one monthly file (file '1') on its own to look at the layout.
# The first six lines of the file are skipped and the following two rows are
# read as a two-level header, which is then flattened to "<level0>_<level1>".
january_csv = parent_directory + 'meterological_data/2009/' + '1' + '.csv'
df_meteorological = pd.read_csv(january_csv, skiprows=[0, 1, 2, 3, 4, 5], header=[0, 1])
df_meteorological.columns = ['_'.join(levels) for levels in df_meteorological.columns]

In [14]:
# Load the twelve monthly files (1.csv .. 12.csv) for 2009 and stack them.
# Every file is parsed the same way: the first six lines are skipped and the
# two header rows are flattened into single "<level0>_<level1>" column names.
# The frames are collected in a list and concatenated once, rather than
# re-concatenating a growing frame (seeded with an empty DataFrame) each pass.
monthly_frames = []
for i in range(1, 13):
    df_temp = pd.read_csv(parent_directory + 'meterological_data/2009/' + str(i) + '.csv', skiprows=[0,1,2,3,4,5], header = [0,1])
    df_temp.columns = df_temp.columns.map('_'.join)
    monthly_frames.append(df_temp)

# Row labels are kept exactly as read from each file (no ignore_index), as in
# the original loop; the index is replaced by the timestamp column later on.
df_meteorological = pd.concat(monthly_frames)

In [15]:
# Number of raw observation rows across all twelve monthly files.
df_meteorological.shape[0]

26052

In [16]:
# Peek at the raw timestamp column before parsing; the flattened two-level
# header gives it the name 'Date_Time_Unnamed: 1_level_1'.
df_meteorological['Date_Time_Unnamed: 1_level_1'].head()

0    01/01/2009 00:00 UTC
1    01/01/2009 00:25 UTC
2    01/01/2009 00:45 UTC
3    01/01/2009 01:05 UTC
4    01/01/2009 01:25 UTC
Name: Date_Time_Unnamed: 1_level_1, dtype: object

In [17]:
# Parse the timestamp strings into datetimes, overwriting the column.
# NOTE(review): the raw strings end in " UTC"; whether the parsed values come
# out tz-aware depends on the pandas version. Confirm they still line up with
# the tz-naive pollutant dates used in the join further down.
timestamp_column = 'Date_Time_Unnamed: 1_level_1'
df_meteorological[timestamp_column] = pd.to_datetime(df_meteorological[timestamp_column])

In [18]:
# Use the parsed timestamps as the row index (needed for time-based
# resampling); the timestamp column itself stays in the frame.
observation_times = df_meteorological['Date_Time_Unnamed: 1_level_1']
df_meteorological.index = observation_times

In [19]:
# Collapse the sub-hourly observations into one row per clock hour by taking
# the mean of each numeric column.
# The numeric columns are selected explicitly: the frame still contains the
# datetime timestamp column (and possibly text columns), and recent pandas
# versions no longer drop non-numeric columns silently inside .mean().
# NOTE(review): a plain arithmetic mean is applied to every numeric column,
# including wind_direction (degrees) and the *_code columns; confirm that is
# acceptable for those fields.
numeric_observations = df_meteorological.select_dtypes(include='number')
df_meteorological_hourly = numeric_observations.resample('H').mean()

In [34]:
# Display the hourly-mean meteorological frame (indexed by hour).
df_meteorological_hourly

Unnamed: 0_level_0,altimeter_set_1_INHG,air_temp_set_1_Fahrenheit,dew_point_temperature_set_1_Fahrenheit,relative_humidity_set_1_%,wind_speed_set_1_Miles/hour,wind_direction_set_1_Degrees,wind_gust_set_1_Miles/hour,weather_cond_code_set_1_code,cloud_layer_3_code_set_1_code,precip_accum_one_hour_set_1_Inches,...,air_temp_high_6_hour_set_1_Fahrenheit,air_temp_low_6_hour_set_1_Fahrenheit,ceiling_set_1_Feet,air_temp_high_24_hour_set_1_Fahrenheit,air_temp_low_24_hour_set_1_Fahrenheit,dew_point_temperature_set_1d_Fahrenheit,wind_chill_set_1d_Fahrenheit,pressure_set_1d_INHG,sea_level_pressure_set_1d_INHG,heat_index_set_1d_Fahrenheit
Date_Time_Unnamed: 1_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-01-01 00:00:00,29.743333,17.6,,61.920000,6.136667,256.666667,,,,,...,,,,,,6.630000,,29.093333,29.623333,
2009-01-01 01:00:00,29.743333,17.6,,61.920000,3.440000,273.333333,,,,,...,,,,,,6.630000,,29.093333,29.623333,
2009-01-01 02:00:00,29.733333,15.8,,70.750000,3.830000,246.666667,,,,,...,,,,,,7.870000,,29.083333,29.616667,
2009-01-01 03:00:00,29.713333,17.6,,67.190000,2.683333,160.000000,,,,,...,,,,,,8.450000,,29.063333,29.583333,
2009-01-01 04:00:00,29.690000,17.6,,70.963333,2.683333,106.666667,,,,,...,,,,,,9.670000,,29.040000,29.553333,
2009-01-01 05:00:00,29.670000,17.6,,72.850000,4.990000,170.000000,,,,,...,,,,,,10.280000,,29.023333,29.533333,
2009-01-01 06:00:00,29.643333,17.6,,74.880000,7.290000,173.333333,,,,,...,,,,,,10.890000,10.160000,29.003333,29.510000,
2009-01-01 07:00:00,29.623333,18.8,,75.020000,9.590000,176.666667,,,,,...,,,,,,12.090000,10.593333,28.983333,29.483333,
2009-01-01 08:00:00,29.613333,19.4,,71.180000,11.120000,173.333333,,,,,...,,,12000.000000,,,11.470000,10.396667,28.973333,29.470000,
2009-01-01 09:00:00,29.570000,19.4,,75.073333,10.356667,160.000000,,,,,...,,,11000.000000,,,12.686667,10.953333,28.930000,29.416667,


In [21]:
# Row count of the hourly frame; a full non-leap year is 365 * 24 = 8760 hours.
df_meteorological_hourly.shape[0]

8760

# Pollutants Data Processing

In [23]:
# Load the 2009 PM10 file for Illinois.
# The leading '/' in the sub-path follows parent_directory's trailing slash,
# so the resulting path contains a doubled separator.
pm10_csv = parent_directory + '/pollutant_data/2009/' + 'PM10_illinois' + '.csv'
df_pollutants = pd.read_csv(pm10_csv)

In [24]:
# Peek at the first rows of the raw pollutant file ('Date' is still a string here).
df_pollutants.head()

Unnamed: 0,Date,AQS_SITE_ID,POC,Daily Mean PM10 Concentration,UNITS,DAILY_AQI_VALUE,DAILY_OBS_COUNT,PERCENT_COMPLETE,AQS_PARAMETER_CODE,AQS_PARAMETER_DESC,CBSA_CODE,CBSA_NAME,STATE_CODE,STATE,COUNTY_CODE,COUNTY,SITE_LATITUDE,SITE_LONGITUDE
0,01/01/2009,170314201,1,12,ug/m3 SC,11,1,100.0,81102,PM10 Total 0-10um STP,16980,"Chicago-Naperville-Elgin, IL-IN-WI",17,Illinois,31,Cook,42.139996,-87.799227
1,01/07/2009,170314201,1,29,ug/m3 SC,27,1,100.0,81102,PM10 Total 0-10um STP,16980,"Chicago-Naperville-Elgin, IL-IN-WI",17,Illinois,31,Cook,42.139996,-87.799227
2,01/13/2009,170314201,1,14,ug/m3 SC,13,1,100.0,81102,PM10 Total 0-10um STP,16980,"Chicago-Naperville-Elgin, IL-IN-WI",17,Illinois,31,Cook,42.139996,-87.799227
3,01/19/2009,170314201,1,4,ug/m3 SC,4,1,100.0,81102,PM10 Total 0-10um STP,16980,"Chicago-Naperville-Elgin, IL-IN-WI",17,Illinois,31,Cook,42.139996,-87.799227
4,01/25/2009,170314201,1,22,ug/m3 SC,20,1,100.0,81102,PM10 Total 0-10um STP,16980,"Chicago-Naperville-Elgin, IL-IN-WI",17,Illinois,31,Cook,42.139996,-87.799227


In [25]:
# Convert the 'Date' strings to datetimes, overwriting the column.
# The raw values are month-first (e.g. '01/13/2009'), so the format is stated
# explicitly instead of being inferred; an unexpected layout then raises
# rather than being silently parsed with day and month swapped.
df_pollutants['Date'] = pd.to_datetime(df_pollutants['Date'], format='%m/%d/%Y')

In [26]:
# Number of PM10 records in the file.
df_pollutants.shape[0]

61

In [27]:
# Display the pollutant frame after date parsing ('Date' is now a datetime).
df_pollutants

Unnamed: 0,Date,AQS_SITE_ID,POC,Daily Mean PM10 Concentration,UNITS,DAILY_AQI_VALUE,DAILY_OBS_COUNT,PERCENT_COMPLETE,AQS_PARAMETER_CODE,AQS_PARAMETER_DESC,CBSA_CODE,CBSA_NAME,STATE_CODE,STATE,COUNTY_CODE,COUNTY,SITE_LATITUDE,SITE_LONGITUDE
0,2009-01-01,170314201,1,12,ug/m3 SC,11,1,100.0,81102,PM10 Total 0-10um STP,16980,"Chicago-Naperville-Elgin, IL-IN-WI",17,Illinois,31,Cook,42.139996,-87.799227
1,2009-01-07,170314201,1,29,ug/m3 SC,27,1,100.0,81102,PM10 Total 0-10um STP,16980,"Chicago-Naperville-Elgin, IL-IN-WI",17,Illinois,31,Cook,42.139996,-87.799227
2,2009-01-13,170314201,1,14,ug/m3 SC,13,1,100.0,81102,PM10 Total 0-10um STP,16980,"Chicago-Naperville-Elgin, IL-IN-WI",17,Illinois,31,Cook,42.139996,-87.799227
3,2009-01-19,170314201,1,4,ug/m3 SC,4,1,100.0,81102,PM10 Total 0-10um STP,16980,"Chicago-Naperville-Elgin, IL-IN-WI",17,Illinois,31,Cook,42.139996,-87.799227
4,2009-01-25,170314201,1,22,ug/m3 SC,20,1,100.0,81102,PM10 Total 0-10um STP,16980,"Chicago-Naperville-Elgin, IL-IN-WI",17,Illinois,31,Cook,42.139996,-87.799227
5,2009-01-31,170314201,1,13,ug/m3 SC,12,1,100.0,81102,PM10 Total 0-10um STP,16980,"Chicago-Naperville-Elgin, IL-IN-WI",17,Illinois,31,Cook,42.139996,-87.799227
6,2009-02-06,170314201,1,41,ug/m3 SC,38,1,100.0,81102,PM10 Total 0-10um STP,16980,"Chicago-Naperville-Elgin, IL-IN-WI",17,Illinois,31,Cook,42.139996,-87.799227
7,2009-02-12,170314201,1,9,ug/m3 SC,8,1,100.0,81102,PM10 Total 0-10um STP,16980,"Chicago-Naperville-Elgin, IL-IN-WI",17,Illinois,31,Cook,42.139996,-87.799227
8,2009-02-18,170314201,1,12,ug/m3 SC,11,1,100.0,81102,PM10 Total 0-10um STP,16980,"Chicago-Naperville-Elgin, IL-IN-WI",17,Illinois,31,Cook,42.139996,-87.799227
9,2009-02-24,170314201,1,36,ug/m3 SC,33,1,100.0,81102,PM10 Total 0-10um STP,16980,"Chicago-Naperville-Elgin, IL-IN-WI",17,Illinois,31,Cook,42.139996,-87.799227


In [28]:
# Use the sample dates as the row index (needed for time-based resampling);
# the 'Date' column itself stays in the frame.
sample_dates = df_pollutants['Date']
df_pollutants.index = sample_dates

In [29]:
# Mean of every numeric column over consecutive 15-day windows.
# The numeric columns are selected explicitly: the frame also holds text
# columns (UNITS, CBSA_NAME, STATE, COUNTY, ...) and the datetime 'Date'
# column, which recent pandas versions no longer drop silently inside .mean().
# NOTE(review): identifier-like columns (site id, codes, latitude/longitude)
# are averaged as well; that is only meaningful if the file covers a single
# monitoring site. Confirm.
numeric_pollutants = df_pollutants.select_dtypes(include='number')
df_pollutants_15DaysMean = numeric_pollutants.resample('15D').mean()

In [56]:
# Mean of every numeric column over consecutive 5-day windows.
# The numeric columns are selected explicitly: the frame also holds text
# columns (UNITS, CBSA_NAME, STATE, COUNTY, ...) and the datetime 'Date'
# column, which recent pandas versions no longer drop silently inside .mean().
# Windows that contain no sample come out as all-NaN rows.
numeric_pollutants = df_pollutants.select_dtypes(include='number')
df_pollutants_5DaysMean = numeric_pollutants.resample('5D').mean()

In [30]:
# Display the 15-day window means (one row per window start date).
df_pollutants_15DaysMean

Unnamed: 0_level_0,AQS_SITE_ID,POC,Daily Mean PM10 Concentration,DAILY_AQI_VALUE,DAILY_OBS_COUNT,PERCENT_COMPLETE,AQS_PARAMETER_CODE,CBSA_CODE,STATE_CODE,COUNTY_CODE,SITE_LATITUDE,SITE_LONGITUDE
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2009-01-01,170314201.0,1.0,18.333333,17.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-01-16,170314201.0,1.0,13.0,12.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-01-31,170314201.0,1.0,21.0,19.333333,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-02-15,170314201.0,1.0,24.0,22.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-03-02,170314201.0,1.0,12.666667,12.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-03-17,170314201.0,1.0,18.0,17.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-04-01,170314201.0,1.0,15.333333,14.333333,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-04-16,170314201.0,1.0,19.0,17.5,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-05-01,170314201.0,1.0,19.0,17.666667,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-05-16,170314201.0,1.0,25.0,23.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227


In [57]:
# Display the 5-day window means (one row per window start date).
df_pollutants_5DaysMean

Unnamed: 0_level_0,AQS_SITE_ID,POC,Daily Mean PM10 Concentration,DAILY_AQI_VALUE,DAILY_OBS_COUNT,PERCENT_COMPLETE,AQS_PARAMETER_CODE,CBSA_CODE,STATE_CODE,COUNTY_CODE,SITE_LATITUDE,SITE_LONGITUDE
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2009-01-01,170314201.0,1.0,12.0,11.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-01-06,170314201.0,1.0,29.0,27.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-01-11,170314201.0,1.0,14.0,13.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-01-16,170314201.0,1.0,4.0,4.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-01-21,170314201.0,1.0,22.0,20.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-01-26,,,,,,,,,,,,
2009-01-31,170314201.0,1.0,13.0,12.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-02-05,170314201.0,1.0,41.0,38.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-02-10,170314201.0,1.0,9.0,8.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-02-15,170314201.0,1.0,12.0,11.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227


In [31]:
# Expand the 15-day means to an hourly grid: each hour takes the value of the
# most recent 15-day window start at or before it (forward fill).
# NOTE(review): the hourly grid only spans from the first to the last window
# start, so it presumably ends before the end of the year. Confirm this
# truncation is acceptable for the join that follows.
hourly_upsampler = df_pollutants_15DaysMean.resample('H')
df_pollutants_hourly = hourly_upsampler.ffill()

In [32]:
# Display the hourly, forward-filled pollutant frame.
df_pollutants_hourly

Unnamed: 0_level_0,AQS_SITE_ID,POC,Daily Mean PM10 Concentration,DAILY_AQI_VALUE,DAILY_OBS_COUNT,PERCENT_COMPLETE,AQS_PARAMETER_CODE,CBSA_CODE,STATE_CODE,COUNTY_CODE,SITE_LATITUDE,SITE_LONGITUDE
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2009-01-01 00:00:00,170314201.0,1.0,18.333333,17.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-01-01 01:00:00,170314201.0,1.0,18.333333,17.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-01-01 02:00:00,170314201.0,1.0,18.333333,17.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-01-01 03:00:00,170314201.0,1.0,18.333333,17.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-01-01 04:00:00,170314201.0,1.0,18.333333,17.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-01-01 05:00:00,170314201.0,1.0,18.333333,17.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-01-01 06:00:00,170314201.0,1.0,18.333333,17.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-01-01 07:00:00,170314201.0,1.0,18.333333,17.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-01-01 08:00:00,170314201.0,1.0,18.333333,17.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-01-01 09:00:00,170314201.0,1.0,18.333333,17.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227


In [33]:
# Number of hourly rows produced by the forward fill.
df_pollutants_hourly.shape[0]

8641

# Merging Pollutant and Meteorological Hourly Data For Year 2009

In [35]:
# Align the two hourly frames on their datetime index. The inner join keeps
# only the hours that are present in both the meteorological and the
# pollutant frame.
df_merged = df_meteorological_hourly.join(
    df_pollutants_hourly,
    how='inner',
)

In [36]:
# Number of hours that survived the inner join.
df_merged.shape[0]

8641

In [37]:
# List the column names of the inner-joined frame (meteorological columns
# followed by the pollutant columns).
df_merged.columns

Index(['altimeter_set_1_INHG', 'air_temp_set_1_Fahrenheit',
       'dew_point_temperature_set_1_Fahrenheit', 'relative_humidity_set_1_%',
       'wind_speed_set_1_Miles/hour', 'wind_direction_set_1_Degrees',
       'wind_gust_set_1_Miles/hour', 'weather_cond_code_set_1_code',
       'cloud_layer_3_code_set_1_code', 'precip_accum_one_hour_set_1_Inches',
       'precip_accum_three_hour_set_1_Inches', 'cloud_layer_1_code_set_1_code',
       'cloud_layer_2_code_set_1_code', 'precip_accum_six_hour_set_1_Inches',
       'precip_accum_24_hour_set_1_Inches', 'visibility_set_1_Statute miles',
       'metar_remark_set_1_text', 'metar_set_1_text',
       'air_temp_high_6_hour_set_1_Fahrenheit',
       'air_temp_low_6_hour_set_1_Fahrenheit', 'ceiling_set_1_Feet',
       'air_temp_high_24_hour_set_1_Fahrenheit',
       'air_temp_low_24_hour_set_1_Fahrenheit',
       'dew_point_temperature_set_1d_Fahrenheit',
       'wind_chill_set_1d_Fahrenheit', 'pressure_set_1d_INHG',
       'sea_level_pressure_s

In [44]:
# Same alignment as the inner join, but as a left join: every hourly
# meteorological row is kept, and hours with no matching pollutant row get
# NaN in the pollutant columns.
df_merged_2 = df_meteorological_hourly.join(
    df_pollutants_hourly,
    how='left',
)

In [45]:
# Display the left-joined frame (all hourly meteorological rows retained).
df_merged_2

Unnamed: 0_level_0,altimeter_set_1_INHG,air_temp_set_1_Fahrenheit,dew_point_temperature_set_1_Fahrenheit,relative_humidity_set_1_%,wind_speed_set_1_Miles/hour,wind_direction_set_1_Degrees,wind_gust_set_1_Miles/hour,weather_cond_code_set_1_code,cloud_layer_3_code_set_1_code,precip_accum_one_hour_set_1_Inches,...,Daily Mean PM10 Concentration,DAILY_AQI_VALUE,DAILY_OBS_COUNT,PERCENT_COMPLETE,AQS_PARAMETER_CODE,CBSA_CODE,STATE_CODE,COUNTY_CODE,SITE_LATITUDE,SITE_LONGITUDE
Date_Time_Unnamed: 1_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-01-01 00:00:00,29.743333,17.6,,61.920000,6.136667,256.666667,,,,,...,18.333333,17.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-01-01 01:00:00,29.743333,17.6,,61.920000,3.440000,273.333333,,,,,...,18.333333,17.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-01-01 02:00:00,29.733333,15.8,,70.750000,3.830000,246.666667,,,,,...,18.333333,17.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-01-01 03:00:00,29.713333,17.6,,67.190000,2.683333,160.000000,,,,,...,18.333333,17.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-01-01 04:00:00,29.690000,17.6,,70.963333,2.683333,106.666667,,,,,...,18.333333,17.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-01-01 05:00:00,29.670000,17.6,,72.850000,4.990000,170.000000,,,,,...,18.333333,17.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-01-01 06:00:00,29.643333,17.6,,74.880000,7.290000,173.333333,,,,,...,18.333333,17.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-01-01 07:00:00,29.623333,18.8,,75.020000,9.590000,176.666667,,,,,...,18.333333,17.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-01-01 08:00:00,29.613333,19.4,,71.180000,11.120000,173.333333,,,,,...,18.333333,17.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227
2009-01-01 09:00:00,29.570000,19.4,,75.073333,10.356667,160.000000,,,,,...,18.333333,17.0,1.0,100.0,81102.0,16980.0,17.0,31.0,42.139996,-87.799227


In [51]:
# Preview an hourly back-fill of the PM10 column of the left-joined frame.
# The result is only displayed, not assigned, so df_merged_2 is unchanged.
# NOTE(review): df_merged_2 already carries an hourly index (it is built from
# the hourly meteorological frame), and the pollutant series has fewer hourly
# rows than the meteorological one, so the missing values are presumably at
# the end of the year where a back-fill has nothing to pull from. Confirm
# whether a forward fill (and an assignment) was intended here.
df_merged_2['Daily Mean PM10 Concentration'].resample('H').bfill()

Date_Time_Unnamed: 1_level_1
2009-01-01 00:00:00    18.333333
2009-01-01 01:00:00    18.333333
2009-01-01 02:00:00    18.333333
2009-01-01 03:00:00    18.333333
2009-01-01 04:00:00    18.333333
2009-01-01 05:00:00    18.333333
2009-01-01 06:00:00    18.333333
2009-01-01 07:00:00    18.333333
2009-01-01 08:00:00    18.333333
2009-01-01 09:00:00    18.333333
2009-01-01 10:00:00    18.333333
2009-01-01 11:00:00    18.333333
2009-01-01 12:00:00    18.333333
2009-01-01 13:00:00    18.333333
2009-01-01 14:00:00    18.333333
2009-01-01 15:00:00    18.333333
2009-01-01 16:00:00    18.333333
2009-01-01 17:00:00    18.333333
2009-01-01 18:00:00    18.333333
2009-01-01 19:00:00    18.333333
2009-01-01 20:00:00    18.333333
2009-01-01 21:00:00    18.333333
2009-01-01 22:00:00    18.333333
2009-01-01 23:00:00    18.333333
2009-01-02 00:00:00    18.333333
2009-01-02 01:00:00    18.333333
2009-01-02 02:00:00    18.333333
2009-01-02 03:00:00    18.333333
2009-01-02 04:00:00    18.333333
2009-01-02 05: