In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings

In [2]:
# Root folder for every input path built below: the per-year pollutant CSVs
# and the meteorological CSV are both located relative to it.
parent_directory = '.'

In [3]:
# File stem of the pollutant to process; a CSV with this name is read from
# each year's folder. Stems listed by the original author:
# CO_illinois, NO2_illinois, OZ_illinois, PM10_illinois, PM25_illinois, SO2_illinois
pollutant = 'CO_illinois'

# Meteorological columns earmarked for removal.
# NOTE(review): not referenced by any cell visible in this notebook -- confirm
# whether it is still needed.
drop_met_cols = [
    'dew_point_temperature_set_1_Fahrenheit',
    'wind_gust_set_1_Miles/hour',
    'weather_cond_code_set_1_code',
    'cloud_layer_3_code_set_1_code',
    'precip_accum_one_hour_set_1_Inches',
    'precip_accum_three_hour_set_1_Inches',
    'cloud_layer_1_code_set_1_code',
    'cloud_layer_2_code_set_1_code',
    'precip_accum_six_hour_set_1_Inches',
    'precip_accum_24_hour_set_1_Inches',
    'visibility_set_1_Statute miles',
    'metar_remark_set_1_text',
    'metar_set_1_text',
    'air_temp_high_6_hour_set_1_Fahrenheit',
    'ceiling_set_1_Feet',
    'air_temp_high_24_hour_set_1_Fahrenheit',
    'air_temp_low_24_hour_set_1_Fahrenheit',
    'dew_point_temperature_set_1d_Fahrenheit',
    'wind_chill_set_1d_Fahrenheit',
    'pressure_set_1d_INHG',
    'sea_level_pressure_set_1d_INHG',
    'heat_index_set_1d_Fahrenheit',
    'air_temp_low_6_hour_set_1_Fahrenheit',
]

# Pollutant-file metadata columns removed right after each yearly CSV is read.
drop_pol_cols = [
    'AQS_SITE_ID',
    'POC',
    'UNITS',
    'DAILY_OBS_COUNT',
    'PERCENT_COMPLETE',
    'AQS_PARAMETER_CODE',
    'AQS_PARAMETER_DESC',
    'CBSA_CODE',
    'CBSA_NAME',
    'STATE_CODE',
    'STATE',
    'COUNTY_CODE',
    'COUNTY',
    'SITE_LATITUDE',
    'SITE_LONGITUDE',
]

In [4]:
def get_dataset_per_year(year, pollutant_name=None, base_dir=None, drop_cols=None):
    """Load one year of pollutant readings and average them per calendar day.

    Reads ``<base_dir>/pollutant_data/<year>/<pollutant_name>.csv``, removes
    the unwanted metadata columns, indexes the rows by their parsed ``Date``
    and resamples to a one-day frequency, averaging all rows that fall on the
    same day. Days inside the file's date range that have no rows appear as
    NaN rows.

    Parameters
    ----------
    year : int or str
        Name of the sub-folder under ``pollutant_data`` to read from.
    pollutant_name : str, optional
        File stem of the CSV. Defaults to the notebook-level ``pollutant``.
    base_dir : str, optional
        Folder containing ``pollutant_data``. Defaults to the notebook-level
        ``parent_directory``.
    drop_cols : list of str, optional
        Columns to discard before averaging; names missing from the file are
        ignored. Defaults to the notebook-level ``drop_pol_cols``.

    Returns
    -------
    pandas.DataFrame
        Daily means of the remaining numeric columns, indexed by ``Date``.
    """
    # Fall back to the notebook configuration so existing one-argument calls
    # keep working unchanged.
    if pollutant_name is None:
        pollutant_name = pollutant
    if base_dir is None:
        base_dir = parent_directory
    if drop_cols is None:
        drop_cols = drop_pol_cols

    csv_path = base_dir + '/pollutant_data/' + str(year) + '/' + pollutant_name + '.csv'
    df_pollutant = pd.read_csv(csv_path)
    df_pollutant = df_pollutant.drop(columns=drop_cols, errors='ignore')
    df_pollutant['Date'] = pd.to_datetime(df_pollutant['Date'])
    df_pollutant = df_pollutant.set_index('Date')

    # Any text column that survives the drop cannot be averaged. Older pandas
    # silently discarded such columns in mean(); pandas >= 2.0 raises instead,
    # so restrict to numeric/bool columns explicitly before resampling.
    df_numeric = df_pollutant.select_dtypes(include=['number', 'bool'])
    df_pollutant_Daily = df_numeric.resample('1D').mean()
    return df_pollutant_Daily

In [5]:
# Load every year from 2009 through 2018 and stack the daily frames in
# chronological order. The frames are collected in a list and concatenated
# once: growing a DataFrame with concat inside the loop re-copies all earlier
# rows on each pass, and seeding it with an empty frame relies on concat
# behaviour that newer pandas warns about.
yearly_frames = []
for year in range(2009, 2019):
    print(year)
    yearly_frames.append(get_dataset_per_year(year))
final_dataset_pol = pd.concat(yearly_frames)

2009
2010
2011
2012
2013
2014
2015
2016
2017
2018


In [6]:
# Preview the first rows only instead of dumping the whole frame.
final_dataset_pol.head(10)

Unnamed: 0_level_0,Daily Max 8-hour CO Concentration,DAILY_AQI_VALUE
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2009-01-01,0.3,3.0
2009-01-02,0.3,3.0
2009-01-03,0.3,3.0
2009-01-04,0.3,3.0
2009-01-05,0.3,3.0
2009-01-06,0.5,6.0
2009-01-07,0.4,5.0
2009-01-08,0.3,3.0
2009-01-09,0.4,5.0
2009-01-10,0.2,2.0


In [7]:
# Fill gaps in the CO concentration with the mean of the closest valid reading
# before the gap (forward fill) and after it (backward fill). The row-wise
# mean skips NaN, so a gap at the very start or end of the series takes the
# single neighbour that exists. The two helper series live in a scratch frame
# rather than as temporary columns on `final_dataset_pol`, so this cell can be
# re-run safely. DAILY_AQI_VALUE is not imputed here.
co_col = 'Daily Max 8-hour CO Concentration'
co_observed = final_dataset_pol[co_col]
co_neighbours = pd.DataFrame({
    'ffill': co_observed.ffill(),
    'bfill': co_observed.bfill(),
})
final_dataset_pol[co_col] = co_neighbours.mean(axis=1)


In [11]:
# Preview the first rows after gap filling instead of dumping the whole frame.
final_dataset_pol.head(10)

Unnamed: 0_level_0,Daily Max 8-hour CO Concentration,DAILY_AQI_VALUE
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2009-01-01,0.3,3.0
2009-01-02,0.3,3.0
2009-01-03,0.3,3.0
2009-01-04,0.3,3.0
2009-01-05,0.3,3.0
2009-01-06,0.5,6.0
2009-01-07,0.4,5.0
2009-01-08,0.3,3.0
2009-01-09,0.4,5.0
2009-01-10,0.2,2.0


In [12]:
# Meteorological table (median / high / low value per variable and date),
# stored as a single CSV directly under the data root.
met_csv_path = parent_directory + '/meteorological_dataset_high_low.csv'
final_dataset_met = pd.read_csv(met_csv_path)


In [13]:
# (rows, columns) of the meteorological table as read from disk; the date is
# still an ordinary column at this point.
final_dataset_met.shape

(3611, 16)

In [14]:
# (rows, columns) of the daily pollutant table, for comparison with the
# meteorological table before the two are joined.
final_dataset_pol.shape

(3520, 2)

In [15]:
# Missing values per meteorological column.
final_dataset_met.isna().sum()

Date_Time_Unnamed: 1_level_1           0
altimeter_set_1_INHG_Median            0
air_temp_set_1_Fahrenheit_Median       0
relative_humidity_set_1_%_Median       0
wind_speed_set_1_Miles/hour_Median     0
wind_direction_set_1_Degrees_Median    0
altimeter_set_1_INHG_High              0
air_temp_set_1_Fahrenheit_High         0
relative_humidity_set_1_%_High         0
wind_speed_set_1_Miles/hour_High       0
wind_direction_set_1_Degrees_High      0
altimeter_set_1_INHG_Low               0
air_temp_set_1_Fahrenheit_Low          0
relative_humidity_set_1_%_Low          0
wind_speed_set_1_Miles/hour_Low        0
wind_direction_set_1_Degrees_Low       0
dtype: int64

In [16]:
# Missing values per pollutant column; only the concentration column was
# gap-filled above, so DAILY_AQI_VALUE can still contain NaN.
final_dataset_pol.isna().sum()

Daily Max 8-hour CO Concentration      0
DAILY_AQI_VALUE                      761
dtype: int64

In [17]:
# Preview the first rows only instead of dumping the whole frame.
final_dataset_met.head(10)

Unnamed: 0,Date_Time_Unnamed: 1_level_1,altimeter_set_1_INHG_Median,air_temp_set_1_Fahrenheit_Median,relative_humidity_set_1_%_Median,wind_speed_set_1_Miles/hour_Median,wind_direction_set_1_Degrees_Median,altimeter_set_1_INHG_High,air_temp_set_1_Fahrenheit_High,relative_humidity_set_1_%_High,wind_speed_set_1_Miles/hour_High,wind_direction_set_1_Degrees_High,altimeter_set_1_INHG_Low,air_temp_set_1_Fahrenheit_Low,relative_humidity_set_1_%_Low,wind_speed_set_1_Miles/hour_Low,wind_direction_set_1_Degrees_Low
0,2009-01-01,29.510,21.20,68.340,11.500,180.0,29.75,30.20,79.26,21.85,280.0,29.18,15.80,58.68,0.00,0.0
1,2009-01-02,29.160,28.40,68.780,10.360,240.0,29.32,32.00,79.74,17.27,300.0,29.10,24.80,39.61,0.00,0.0
2,2009-01-03,29.370,23.00,68.110,6.910,90.0,29.43,35.60,85.59,18.41,280.0,29.24,17.60,47.45,0.00,0.0
3,2009-01-04,29.240,37.40,83.390,12.660,160.0,29.55,39.20,100.00,21.85,320.0,29.13,26.60,59.74,0.00,0.0
4,2009-01-05,29.605,20.30,61.790,6.910,265.0,29.63,28.40,78.44,12.66,310.0,29.39,12.20,50.09,0.00,0.0
5,2009-01-06,29.150,21.20,78.940,3.440,90.0,29.39,30.20,92.85,10.36,170.0,28.87,15.80,67.42,0.00,0.0
6,2009-01-07,28.685,28.40,92.850,4.610,250.0,28.89,28.40,100.00,24.16,280.0,28.64,19.40,79.26,0.00,0.0
7,2009-01-08,29.050,15.80,78.440,11.500,270.0,29.31,21.20,85.59,19.57,280.0,28.79,10.40,62.44,5.75,240.0
8,2009-01-09,29.360,17.60,78.940,5.750,110.0,29.41,30.20,92.61,14.97,270.0,29.31,8.60,68.56,0.00,0.0
9,2009-01-10,29.370,28.40,86.150,10.360,50.0,29.41,30.20,92.85,16.11,360.0,29.33,26.60,74.03,5.75,10.0


In [18]:
# Parse the date column and promote it to the index so the meteorological
# table can be aligned with the pollutant table by date. `set_index` removes
# the column from the frame and keeps its name as the index name.
met_date_col = 'Date_Time_Unnamed: 1_level_1'
final_dataset_met[met_date_col] = pd.to_datetime(final_dataset_met[met_date_col])
final_dataset_met = final_dataset_met.set_index(met_date_col)

In [19]:
# Confirm the date is now the index and only the measurement columns remain.
final_dataset_met.head(5)

Unnamed: 0_level_0,altimeter_set_1_INHG_Median,air_temp_set_1_Fahrenheit_Median,relative_humidity_set_1_%_Median,wind_speed_set_1_Miles/hour_Median,wind_direction_set_1_Degrees_Median,altimeter_set_1_INHG_High,air_temp_set_1_Fahrenheit_High,relative_humidity_set_1_%_High,wind_speed_set_1_Miles/hour_High,wind_direction_set_1_Degrees_High,altimeter_set_1_INHG_Low,air_temp_set_1_Fahrenheit_Low,relative_humidity_set_1_%_Low,wind_speed_set_1_Miles/hour_Low,wind_direction_set_1_Degrees_Low
Date_Time_Unnamed: 1_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2009-01-01,29.51,21.2,68.34,11.5,180.0,29.75,30.2,79.26,21.85,280.0,29.18,15.8,58.68,0.0,0.0
2009-01-02,29.16,28.4,68.78,10.36,240.0,29.32,32.0,79.74,17.27,300.0,29.1,24.8,39.61,0.0,0.0
2009-01-03,29.37,23.0,68.11,6.91,90.0,29.43,35.6,85.59,18.41,280.0,29.24,17.6,47.45,0.0,0.0
2009-01-04,29.24,37.4,83.39,12.66,160.0,29.55,39.2,100.0,21.85,320.0,29.13,26.6,59.74,0.0,0.0
2009-01-05,29.605,20.3,61.79,6.91,265.0,29.63,28.4,78.44,12.66,310.0,29.39,12.2,50.09,0.0,0.0


In [20]:
# Align the two tables on their datetime indexes. The inner join keeps only
# the dates that are present in both the pollutant and meteorological tables.
final_dataset = final_dataset_pol.join(final_dataset_met, how = 'inner')

In [21]:
# (rows, columns) after the join; a row count equal to the pollutant table's
# means no pollutant dates were lost to the inner join.
final_dataset.shape

(3520, 17)

In [22]:
# Preview the first rows of the joined table instead of dumping the whole frame.
final_dataset.head(10)

Unnamed: 0,Daily Max 8-hour CO Concentration,DAILY_AQI_VALUE,altimeter_set_1_INHG_Median,air_temp_set_1_Fahrenheit_Median,relative_humidity_set_1_%_Median,wind_speed_set_1_Miles/hour_Median,wind_direction_set_1_Degrees_Median,altimeter_set_1_INHG_High,air_temp_set_1_Fahrenheit_High,relative_humidity_set_1_%_High,wind_speed_set_1_Miles/hour_High,wind_direction_set_1_Degrees_High,altimeter_set_1_INHG_Low,air_temp_set_1_Fahrenheit_Low,relative_humidity_set_1_%_Low,wind_speed_set_1_Miles/hour_Low,wind_direction_set_1_Degrees_Low
2009-01-01,0.3,3.0,29.510,21.20,68.340,11.500,180.0,29.75,30.20,79.26,21.85,280.0,29.18,15.80,58.68,0.00,0.0
2009-01-02,0.3,3.0,29.160,28.40,68.780,10.360,240.0,29.32,32.00,79.74,17.27,300.0,29.10,24.80,39.61,0.00,0.0
2009-01-03,0.3,3.0,29.370,23.00,68.110,6.910,90.0,29.43,35.60,85.59,18.41,280.0,29.24,17.60,47.45,0.00,0.0
2009-01-04,0.3,3.0,29.240,37.40,83.390,12.660,160.0,29.55,39.20,100.00,21.85,320.0,29.13,26.60,59.74,0.00,0.0
2009-01-05,0.3,3.0,29.605,20.30,61.790,6.910,265.0,29.63,28.40,78.44,12.66,310.0,29.39,12.20,50.09,0.00,0.0
2009-01-06,0.5,6.0,29.150,21.20,78.940,3.440,90.0,29.39,30.20,92.85,10.36,170.0,28.87,15.80,67.42,0.00,0.0
2009-01-07,0.4,5.0,28.685,28.40,92.850,4.610,250.0,28.89,28.40,100.00,24.16,280.0,28.64,19.40,79.26,0.00,0.0
2009-01-08,0.3,3.0,29.050,15.80,78.440,11.500,270.0,29.31,21.20,85.59,19.57,280.0,28.79,10.40,62.44,5.75,240.0
2009-01-09,0.4,5.0,29.360,17.60,78.940,5.750,110.0,29.41,30.20,92.61,14.97,270.0,29.31,8.60,68.56,0.00,0.0
2009-01-10,0.2,2.0,29.370,28.40,86.150,10.360,50.0,29.41,30.20,92.85,16.11,360.0,29.33,26.60,74.03,5.75,10.0


In [23]:
# Persist the joined table. `to_csv` defaults to a comma separator and writes
# the index (the dates) as the first column.
output_csv = 'AQI_dataset_CO.csv'
final_dataset.to_csv(output_csv)