In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings

In [2]:
# Root folder that the data loaders prefix onto the 'meterological_data/' and
# 'pollutant_data/' input paths.
parent_directory = "."

In [3]:
# Base names (without '.csv') of the per-pollutant files read for every year.
# For a quick single-pollutant run, temporarily reduce this list to ['CO_illinois'].
pollutant_files = [
    'CO_illinois',
    'NO2_illinois',
    'OZ_illinois',
    'PM10_illinois',
    'PM25_illinois',
    'SO2_illinois',
]

# Meteorological columns removed after hourly averaging.
drop_met_cols = [
    'dew_point_temperature_set_1_Fahrenheit',
    'wind_gust_set_1_Miles/hour',
    'weather_cond_code_set_1_code',
    'cloud_layer_3_code_set_1_code',
    'precip_accum_one_hour_set_1_Inches',
    'precip_accum_three_hour_set_1_Inches',
    'cloud_layer_1_code_set_1_code',
    'cloud_layer_2_code_set_1_code',
    'precip_accum_six_hour_set_1_Inches',
    'precip_accum_24_hour_set_1_Inches',
    'visibility_set_1_Statute miles',
    'metar_remark_set_1_text',
    'metar_set_1_text',
    'air_temp_high_6_hour_set_1_Fahrenheit',
    'ceiling_set_1_Feet',
    'air_temp_high_24_hour_set_1_Fahrenheit',
    'air_temp_low_24_hour_set_1_Fahrenheit',
    'dew_point_temperature_set_1d_Fahrenheit',
    'wind_chill_set_1d_Fahrenheit',
    'pressure_set_1d_INHG',
    'sea_level_pressure_set_1d_INHG',
    'heat_index_set_1d_Fahrenheit',
    'air_temp_low_6_hour_set_1_Fahrenheit',
]

# Pollutant-file columns removed right after each file is read.
drop_pol_cols = [
    'AQS_SITE_ID',
    'POC',
    'UNITS',
    'DAILY_OBS_COUNT',
    'PERCENT_COMPLETE',
    'AQS_PARAMETER_CODE',
    'AQS_PARAMETER_DESC',
    'CBSA_CODE',
    'CBSA_NAME',
    'STATE_CODE',
    'STATE',
    'COUNTY_CODE',
    'COUNTY',
    'SITE_LATITUDE',
    'SITE_LONGITUDE',
]

In [4]:
def get_dataset_per_year(year):
    """Build the merged hourly meteorological + pollutant frame for one year.

    The monthly meteorological CSVs found under
    ``<parent_directory>/meterological_data/<year>/<month>.csv`` are stacked,
    averaged to hourly resolution, and stripped of ``drop_met_cols``. Each file
    named in ``pollutant_files`` (read from
    ``<parent_directory>/pollutant_data/<year>/``) is averaged over 15-day
    bins, forward-filled to hourly resolution, and inner-joined onto the
    meteorological frame. ``DAILY_AQI_VALUE`` is kept as the running row-wise
    maximum over all pollutants joined so far.

    Parameters
    ----------
    year : int
        Year whose sub-folders are read. Month 12 is skipped when
        ``year == 2018``.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by hourly timestamps holding the remaining
        meteorological columns, the columns contributed by each pollutant
        file, and a single ``DAILY_AQI_VALUE`` column.

    Raises
    ------
    ValueError
        If ``pollutant_files`` is empty, since there would be nothing to join.
    """
    if not pollutant_files:
        # Previously this surfaced as an UnboundLocalError at the return statement.
        raise ValueError('pollutant_files is empty; at least one pollutant file is required')

    # Collect the monthly frames and concatenate once instead of re-copying a
    # growing frame on every iteration.
    monthly_frames = []
    for month in range(1, 13):
        if year == 2018 and month == 12:
            # NOTE(review): presumably no December 2018 file exists -- confirm.
            continue
        print('month ===> ' + str(month))
        # The first six lines are skipped and the next two rows form a
        # two-level header. error_bad_lines=False drops rows with an
        # unexpected field count.
        # NOTE(review): error_bad_lines was deprecated in pandas 1.3 in favour
        # of on_bad_lines='skip' and removed in 2.0; it is kept to match the
        # pandas version this notebook was executed with.
        df_met_read = pd.read_csv(
            parent_directory + '/meterological_data/' + str(year) + '/' + str(month) + '.csv',
            skiprows=[0, 1, 2, 3, 4, 5],
            header=[0, 1],
            error_bad_lines=False,
        )
        # Flatten the two header levels into single '<level0>_<level1>' names.
        df_met_read.columns = df_met_read.columns.map('_'.join)
        monthly_frames.append(df_met_read)
    df_meteorological = pd.concat(monthly_frames)

    # Use the timestamp as the index. set_index also removes the column, so it
    # cannot leak into the hourly means.
    time_col = 'Date_Time_Unnamed: 1_level_1'
    df_meteorological[time_col] = pd.to_datetime(df_meteorological[time_col])
    df_meteorological = df_meteorological.set_index(time_col)
    df_meteorological_hourly = df_meteorological.resample('H').mean()
    df_meteorological_hourly = df_meteorological_hourly.drop(columns=drop_met_cols, errors='ignore')

    df_merged_temp = None
    for pollutant in pollutant_files:
        df_pollutant = pd.read_csv(parent_directory + '/pollutant_data/' + str(year) + '/' + pollutant + '.csv')
        df_pollutant = df_pollutant.drop(columns=drop_pol_cols, errors='ignore')
        df_pollutant['Date'] = pd.to_datetime(df_pollutant['Date'])
        df_pollutant = df_pollutant.set_index('Date')

        # Average over 15-day bins, then repeat each bin value for every hour.
        # NOTE(review): the hourly upsample only spans the first to the last
        # 15-day bin label, so hours after the last label are absent here and
        # are dropped by the inner join below -- confirm this is intended.
        df_pollutant_15DaysMean = df_pollutant.resample('15D').mean()
        df_pollutant_hourly = df_pollutant_15DaysMean.resample('H').ffill()

        if df_merged_temp is None:
            # First pollutant: its DAILY_AQI_VALUE seeds the running maximum.
            df_merged_temp = df_meteorological_hourly.join(df_pollutant_hourly, how='inner')
        else:
            # Later pollutants: the overlapping DAILY_AQI_VALUE columns get
            # suffixes, are collapsed to their row-wise maximum, then removed.
            df_merged_temp = df_merged_temp.join(df_pollutant_hourly, lsuffix='_MAX', rsuffix='_NEW', how='inner')
            df_merged_temp['DAILY_AQI_VALUE'] = df_merged_temp[['DAILY_AQI_VALUE_MAX', 'DAILY_AQI_VALUE_NEW']].max(axis=1)
            df_merged_temp = df_merged_temp.drop(columns=['DAILY_AQI_VALUE_MAX', 'DAILY_AQI_VALUE_NEW'])

    return df_merged_temp


In [5]:
# Build the merged frame for every year from 2009 through 2018, then stack them
# with a single concat (avoids re-copying a growing frame on each iteration).
yearly_frames = []
for year in range(2009, 2019):
    print(year)
    df_temp_dataset = get_dataset_per_year(year)
    yearly_frames.append(df_temp_dataset)
final_dataset = pd.concat(yearly_frames)

2009
month ===> 1
month ===> 2
month ===> 3
month ===> 4
month ===> 5
month ===> 6
month ===> 7
month ===> 8
month ===> 9
month ===> 10
month ===> 11
month ===> 12
2010
month ===> 1
month ===> 2
month ===> 3
month ===> 4
month ===> 5
month ===> 6
month ===> 7
month ===> 8
month ===> 9
month ===> 10
month ===> 11
month ===> 12
2011
month ===> 1
month ===> 2
month ===> 3
month ===> 4
month ===> 5
month ===> 6
month ===> 7
month ===> 8
month ===> 9
month ===> 10


b'Skipping line 1962: expected 32 fields, saw 33\nSkipping line 1963: expected 32 fields, saw 33\n'
b'Skipping line 208: expected 32 fields, saw 33\n'


month ===> 11
month ===> 12
2012
month ===> 1
month ===> 2
month ===> 3
month ===> 4
month ===> 5
month ===> 6
month ===> 7
month ===> 8
month ===> 9
month ===> 10


b'Skipping line 38: expected 32 fields, saw 33\n'
b'Skipping line 547: expected 32 fields, saw 33\n'
b'Skipping line 921: expected 32 fields, saw 33\nSkipping line 922: expected 32 fields, saw 33\nSkipping line 1585: expected 32 fields, saw 33\n'


month ===> 11
month ===> 12
2013
month ===> 1
month ===> 2
month ===> 3
month ===> 4
month ===> 5
month ===> 6
month ===> 7
month ===> 8
month ===> 9
month ===> 10


b'Skipping line 2013: expected 32 fields, saw 33\nSkipping line 2038: expected 32 fields, saw 33\n'
b'Skipping line 1536: expected 32 fields, saw 33\nSkipping line 1538: expected 32 fields, saw 33\n'
b'Skipping line 622: expected 32 fields, saw 33\nSkipping line 1258: expected 32 fields, saw 33\n'
b'Skipping line 1936: expected 32 fields, saw 33\nSkipping line 1937: expected 32 fields, saw 33\n'
b'Skipping line 851: expected 32 fields, saw 33\n'
b'Skipping line 523: expected 32 fields, saw 33\n'
b'Skipping line 2195: expected 32 fields, saw 33\n'
b'Skipping line 1349: expected 32 fields, saw 33\nSkipping line 1351: expected 32 fields, saw 33\nSkipping line 1352: expected 32 fields, saw 33\nSkipping line 1353: expected 32 fields, saw 33\nSkipping line 1354: expected 32 fields, saw 33\n'
b'Skipping line 190: expected 32 fields, saw 33\nSkipping line 197: expected 32 fields, saw 33\n'


month ===> 11
month ===> 12
2014
month ===> 1
month ===> 2
month ===> 3
month ===> 4
month ===> 5
month ===> 6
month ===> 7
month ===> 8
month ===> 9


b'Skipping line 1968: expected 32 fields, saw 33\n'
b'Skipping line 1313: expected 32 fields, saw 33\nSkipping line 1314: expected 32 fields, saw 33\nSkipping line 1315: expected 32 fields, saw 33\n'
b'Skipping line 43: expected 32 fields, saw 33\n'
b'Skipping line 1459: expected 32 fields, saw 33\n'


month ===> 10
month ===> 11
month ===> 12
2015
month ===> 1
month ===> 2
month ===> 3
month ===> 4
month ===> 5
month ===> 6
month ===> 7
month ===> 8
month ===> 9


b'Skipping line 538: expected 32 fields, saw 33\nSkipping line 539: expected 32 fields, saw 33\nSkipping line 540: expected 32 fields, saw 33\nSkipping line 542: expected 32 fields, saw 33\nSkipping line 543: expected 32 fields, saw 33\nSkipping line 547: expected 32 fields, saw 33\nSkipping line 549: expected 32 fields, saw 33\nSkipping line 604: expected 32 fields, saw 33\nSkipping line 618: expected 32 fields, saw 33\nSkipping line 621: expected 32 fields, saw 33\nSkipping line 625: expected 32 fields, saw 33\n'
b'Skipping line 323: expected 32 fields, saw 33\n'


month ===> 10
month ===> 11
month ===> 12
2016
month ===> 1
month ===> 2
month ===> 3
month ===> 4
month ===> 5
month ===> 6
month ===> 7
month ===> 8
month ===> 9


b'Skipping line 157: expected 32 fields, saw 33\nSkipping line 159: expected 32 fields, saw 33\n'
b'Skipping line 82: expected 32 fields, saw 33\nSkipping line 83: expected 32 fields, saw 33\nSkipping line 826: expected 32 fields, saw 33\nSkipping line 829: expected 32 fields, saw 33\n'
b'Skipping line 1257: expected 32 fields, saw 33\nSkipping line 1264: expected 32 fields, saw 33\nSkipping line 1265: expected 32 fields, saw 33\nSkipping line 1266: expected 32 fields, saw 33\nSkipping line 1267: expected 32 fields, saw 33\nSkipping line 1268: expected 32 fields, saw 33\nSkipping line 1274: expected 32 fields, saw 33\nSkipping line 1974: expected 32 fields, saw 33\nSkipping line 2120: expected 32 fields, saw 33\n'


month ===> 10
month ===> 11
month ===> 12
2017
month ===> 1
month ===> 2
month ===> 3
month ===> 4
month ===> 5
month ===> 6
month ===> 7
month ===> 8


b'Skipping line 1157: expected 32 fields, saw 33\n'
b'Skipping line 463: expected 32 fields, saw 33\nSkipping line 464: expected 32 fields, saw 33\nSkipping line 465: expected 32 fields, saw 33\nSkipping line 466: expected 32 fields, saw 33\nSkipping line 1681: expected 32 fields, saw 33\nSkipping line 1684: expected 32 fields, saw 33\nSkipping line 1689: expected 32 fields, saw 33\nSkipping line 1694: expected 32 fields, saw 33\nSkipping line 1695: expected 32 fields, saw 33\nSkipping line 1696: expected 32 fields, saw 33\nSkipping line 1698: expected 32 fields, saw 33\n'
b'Skipping line 1391: expected 32 fields, saw 33\nSkipping line 1392: expected 32 fields, saw 33\n'
b'Skipping line 1398: expected 32 fields, saw 33\nSkipping line 2092: expected 32 fields, saw 33\n'
b'Skipping line 2061: expected 32 fields, saw 33\n'


month ===> 9
month ===> 10
month ===> 11
month ===> 12


b'Skipping line 341: expected 32 fields, saw 33\nSkipping line 342: expected 32 fields, saw 33\nSkipping line 343: expected 32 fields, saw 33\nSkipping line 347: expected 32 fields, saw 33\n'


2018
month ===> 1
month ===> 2
month ===> 3
month ===> 4
month ===> 5
month ===> 6
month ===> 7
month ===> 8
month ===> 9


b'Skipping line 189: expected 32 fields, saw 33\nSkipping line 218: expected 32 fields, saw 33\nSkipping line 220: expected 32 fields, saw 33\nSkipping line 222: expected 32 fields, saw 33\n'
b'Skipping line 823: expected 32 fields, saw 33\nSkipping line 824: expected 32 fields, saw 33\nSkipping line 848: expected 32 fields, saw 33\nSkipping line 849: expected 32 fields, saw 33\nSkipping line 876: expected 32 fields, saw 33\nSkipping line 884: expected 32 fields, saw 33\nSkipping line 1010: expected 32 fields, saw 33\nSkipping line 1013: expected 32 fields, saw 33\nSkipping line 1463: expected 32 fields, saw 33\nSkipping line 1464: expected 32 fields, saw 33\nSkipping line 1465: expected 32 fields, saw 33\n'
b'Skipping line 682: expected 32 fields, saw 33\n'
b'Skipping line 310: expected 32 fields, saw 33\n'
b'Skipping line 1806: expected 32 fields, saw 33\n'
b'Skipping line 373: expected 32 fields, saw 33\n'


month ===> 10
month ===> 11


In [6]:
# Number of rows in the combined dataset.
final_dataset.shape[0]

73042

In [7]:
# Sanity check: list any rows whose DAILY_AQI_VALUE is missing.
final_dataset.loc[final_dataset['DAILY_AQI_VALUE'].isnull()]

Unnamed: 0,altimeter_set_1_INHG,air_temp_set_1_Fahrenheit,relative_humidity_set_1_%,wind_speed_set_1_Miles/hour,wind_direction_set_1_Degrees,Daily Max 8-hour CO Concentration,Daily Max 1-hour NO2 Concentration,Daily Max 8-hour Ozone Concentration,Daily Mean PM10 Concentration,Daily Mean PM2.5 Concentration,Daily Max 1-hour SO2 Concentration,DAILY_AQI_VALUE


In [8]:
# Summary statistics for every numeric column; the per-column `count` row
# shows how many values are non-missing.
final_dataset.describe()

Unnamed: 0,altimeter_set_1_INHG,air_temp_set_1_Fahrenheit,relative_humidity_set_1_%,wind_speed_set_1_Miles/hour,wind_direction_set_1_Degrees,Daily Max 8-hour CO Concentration,Daily Max 1-hour NO2 Concentration,Daily Max 8-hour Ozone Concentration,Daily Mean PM10 Concentration,Daily Mean PM2.5 Concentration,Daily Max 1-hour SO2 Concentration,DAILY_AQI_VALUE
count,72270.0,72209.0,72208.0,72174.0,72174.0,62962.0,70522.0,72322.0,72322.0,72322.0,71962.0,73042.0
mean,29.586266,52.091208,71.43008,7.636475,149.959457,0.315293,24.247657,0.037685,16.325269,9.595453,2.613758,43.823345
std,0.38106,19.782002,17.929367,5.508496,104.283107,0.078669,6.986517,0.010573,7.862494,2.875725,1.609292,10.817572
min,28.32,-18.4,12.463333,0.0,0.0,0.133333,7.4,0.011769,2.5,3.45,0.266667,20.466667
25%,29.3,36.8,58.85,3.83,46.666667,0.266667,19.090909,0.028417,12.666667,7.704545,1.442857,36.0
50%,29.48,53.54,73.32,7.283333,160.0,0.3,22.606667,0.038467,15.5,9.033333,2.314286,42.066667
75%,29.92,68.12,86.243333,10.746667,236.666667,0.346154,29.4,0.0456,19.75,11.109091,3.346667,48.9
max,30.766667,101.6,100.0,37.973333,360.0,0.7,43.84,0.064786,91.0,20.873684,9.433333,88.642857


In [9]:
# Preview the first five rows of the combined dataset.
final_dataset.head()

Unnamed: 0,altimeter_set_1_INHG,air_temp_set_1_Fahrenheit,relative_humidity_set_1_%,wind_speed_set_1_Miles/hour,wind_direction_set_1_Degrees,Daily Max 8-hour CO Concentration,Daily Max 1-hour NO2 Concentration,Daily Max 8-hour Ozone Concentration,Daily Mean PM10 Concentration,Daily Mean PM2.5 Concentration,Daily Max 1-hour SO2 Concentration,DAILY_AQI_VALUE
2009-01-01 00:00:00,29.743333,17.6,61.92,6.136667,256.666667,0.335714,39.482759,0.011769,18.333333,13.63913,3.88,50.956522
2009-01-01 01:00:00,29.743333,17.6,61.92,3.44,273.333333,0.335714,39.482759,0.011769,18.333333,13.63913,3.88,50.956522
2009-01-01 02:00:00,29.733333,15.8,70.75,3.83,246.666667,0.335714,39.482759,0.011769,18.333333,13.63913,3.88,50.956522
2009-01-01 03:00:00,29.713333,17.6,67.19,2.683333,160.0,0.335714,39.482759,0.011769,18.333333,13.63913,3.88,50.956522
2009-01-01 04:00:00,29.69,17.6,70.963333,2.683333,106.666667,0.335714,39.482759,0.011769,18.333333,13.63913,3.88,50.956522


In [10]:
# Persist the combined dataset as a comma-separated file in the working
# directory; the timestamp index is written as the first column (to_csv defaults).
final_dataset.to_csv('AQI_dataset.csv')

In [11]:
# Show a bounded preview instead of displaying the whole frame.
final_dataset.head(10)

Unnamed: 0,altimeter_set_1_INHG,air_temp_set_1_Fahrenheit,relative_humidity_set_1_%,wind_speed_set_1_Miles/hour,wind_direction_set_1_Degrees,Daily Max 8-hour CO Concentration,Daily Max 1-hour NO2 Concentration,Daily Max 8-hour Ozone Concentration,Daily Mean PM10 Concentration,Daily Mean PM2.5 Concentration,Daily Max 1-hour SO2 Concentration,DAILY_AQI_VALUE
2009-01-01 00:00:00,29.743333,17.60,61.920000,6.136667,256.666667,0.335714,39.482759,0.011769,18.333333,13.639130,3.880000,50.956522
2009-01-01 01:00:00,29.743333,17.60,61.920000,3.440000,273.333333,0.335714,39.482759,0.011769,18.333333,13.639130,3.880000,50.956522
2009-01-01 02:00:00,29.733333,15.80,70.750000,3.830000,246.666667,0.335714,39.482759,0.011769,18.333333,13.639130,3.880000,50.956522
2009-01-01 03:00:00,29.713333,17.60,67.190000,2.683333,160.000000,0.335714,39.482759,0.011769,18.333333,13.639130,3.880000,50.956522
2009-01-01 04:00:00,29.690000,17.60,70.963333,2.683333,106.666667,0.335714,39.482759,0.011769,18.333333,13.639130,3.880000,50.956522
2009-01-01 05:00:00,29.670000,17.60,72.850000,4.990000,170.000000,0.335714,39.482759,0.011769,18.333333,13.639130,3.880000,50.956522
2009-01-01 06:00:00,29.643333,17.60,74.880000,7.290000,173.333333,0.335714,39.482759,0.011769,18.333333,13.639130,3.880000,50.956522
2009-01-01 07:00:00,29.623333,18.80,75.020000,9.590000,176.666667,0.335714,39.482759,0.011769,18.333333,13.639130,3.880000,50.956522
2009-01-01 08:00:00,29.613333,19.40,71.180000,11.120000,173.333333,0.335714,39.482759,0.011769,18.333333,13.639130,3.880000,50.956522
2009-01-01 09:00:00,29.570000,19.40,75.073333,10.356667,160.000000,0.335714,39.482759,0.011769,18.333333,13.639130,3.880000,50.956522
