In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt  # Matlab-style plotting

In [2]:
original_dataset = pd.read_csv('AQI_dataset_CO_time_series.csv')
dataset = original_dataset.copy()
all_cols = list(dataset.columns)
y_name = 'Daily Max 8-hour CO Concentration'
drop_cols = ['date', 'Unnamed: 0', 'DAILY_AQI_VALUE'] + [y_name]
categorical_cols = [x for x in all_cols if x.startswith('wind_direction_set_1_Compass')]
categorical_cols = categorical_cols + ['weekend']
numerical_features = [x for x in all_cols if x not in categorical_cols + drop_cols]

In [3]:
numerical_features

['altimeter_set_1_INHG_Median',
 'air_temp_set_1_Fahrenheit_Median',
 'relative_humidity_set_1_%_Median',
 'wind_speed_set_1_Miles/hour_Median',
 'altimeter_set_1_INHG_High',
 'air_temp_set_1_Fahrenheit_High',
 'relative_humidity_set_1_%_High',
 'wind_speed_set_1_Miles/hour_High',
 'altimeter_set_1_INHG_Low',
 'air_temp_set_1_Fahrenheit_Low',
 'relative_humidity_set_1_%_Low',
 'wind_speed_set_1_Miles/hour_Low',
 'Daily Max 8-hour CO Concentration_(t-1)',
 'altimeter_set_1_INHG_Median_(t-1)',
 'air_temp_set_1_Fahrenheit_Median_(t-1)',
 'relative_humidity_set_1_%_Median_(t-1)',
 'wind_speed_set_1_Miles/hour_Median_(t-1)',
 'altimeter_set_1_INHG_High_(t-1)',
 'air_temp_set_1_Fahrenheit_High_(t-1)',
 'relative_humidity_set_1_%_High_(t-1)',
 'wind_speed_set_1_Miles/hour_High_(t-1)',
 'altimeter_set_1_INHG_Low_(t-1)',
 'air_temp_set_1_Fahrenheit_Low_(t-1)',
 'relative_humidity_set_1_%_Low_(t-1)',
 'wind_speed_set_1_Miles/hour_Low_(t-1)']

In [4]:
categorical_cols = categorical_cols + ['weekend']

In [5]:
categorical_cols

['wind_direction_set_1_Compass_Median',
 'wind_direction_set_1_Compass_Median_(t-1)',
 'weekend',
 'weekend']

In [6]:
y = dataset[y_name]

In [7]:
dataset = dataset.drop(drop_cols, axis = 1)

In [8]:
dataset.dtypes

altimeter_set_1_INHG_Median                  float64
air_temp_set_1_Fahrenheit_Median             float64
relative_humidity_set_1_%_Median             float64
wind_speed_set_1_Miles/hour_Median           float64
altimeter_set_1_INHG_High                    float64
air_temp_set_1_Fahrenheit_High               float64
relative_humidity_set_1_%_High               float64
wind_speed_set_1_Miles/hour_High             float64
altimeter_set_1_INHG_Low                     float64
air_temp_set_1_Fahrenheit_Low                float64
relative_humidity_set_1_%_Low                float64
wind_speed_set_1_Miles/hour_Low              float64
weekend                                         bool
wind_direction_set_1_Compass_Median           object
Daily Max 8-hour CO Concentration_(t-1)      float64
altimeter_set_1_INHG_Median_(t-1)            float64
air_temp_set_1_Fahrenheit_Median_(t-1)       float64
relative_humidity_set_1_%_Median_(t-1)       float64
wind_speed_set_1_Miles/hour_Median_(t-1)     f

In [9]:
dataset.head()

Unnamed: 0,altimeter_set_1_INHG_Median,air_temp_set_1_Fahrenheit_Median,relative_humidity_set_1_%_Median,wind_speed_set_1_Miles/hour_Median,altimeter_set_1_INHG_High,air_temp_set_1_Fahrenheit_High,relative_humidity_set_1_%_High,wind_speed_set_1_Miles/hour_High,altimeter_set_1_INHG_Low,air_temp_set_1_Fahrenheit_Low,...,wind_speed_set_1_Miles/hour_Median_(t-1),wind_direction_set_1_Compass_Median_(t-1),altimeter_set_1_INHG_High_(t-1),air_temp_set_1_Fahrenheit_High_(t-1),relative_humidity_set_1_%_High_(t-1),wind_speed_set_1_Miles/hour_High_(t-1),altimeter_set_1_INHG_Low_(t-1),air_temp_set_1_Fahrenheit_Low_(t-1),relative_humidity_set_1_%_Low_(t-1),wind_speed_set_1_Miles/hour_Low_(t-1)
0,29.16,28.4,68.78,10.36,29.32,32.0,79.74,17.27,29.1,24.8,...,11.5,S,29.75,30.2,79.26,21.85,29.18,15.8,58.68,0.0
1,29.37,23.0,68.11,6.91,29.43,35.6,85.59,18.41,29.24,17.6,...,10.36,WSW,29.32,32.0,79.74,17.27,29.1,24.8,39.61,0.0
2,29.24,37.4,83.39,12.66,29.55,39.2,100.0,21.85,29.13,26.6,...,6.91,E,29.43,35.6,85.59,18.41,29.24,17.6,47.45,0.0
3,29.605,20.3,61.79,6.91,29.63,28.4,78.44,12.66,29.39,12.2,...,12.66,SSE,29.55,39.2,100.0,21.85,29.13,26.6,59.74,0.0
4,29.15,21.2,78.94,3.44,29.39,30.2,92.85,10.36,28.87,15.8,...,6.91,W,29.63,28.4,78.44,12.66,29.39,12.2,50.09,0.0


In [10]:
x = dataset.copy()

In [11]:
dataset.isnull().sum()

altimeter_set_1_INHG_Median                  0
air_temp_set_1_Fahrenheit_Median             0
relative_humidity_set_1_%_Median             0
wind_speed_set_1_Miles/hour_Median           0
altimeter_set_1_INHG_High                    0
air_temp_set_1_Fahrenheit_High               0
relative_humidity_set_1_%_High               0
wind_speed_set_1_Miles/hour_High             0
altimeter_set_1_INHG_Low                     0
air_temp_set_1_Fahrenheit_Low                0
relative_humidity_set_1_%_Low                0
wind_speed_set_1_Miles/hour_Low              0
weekend                                      0
wind_direction_set_1_Compass_Median          0
Daily Max 8-hour CO Concentration_(t-1)      0
altimeter_set_1_INHG_Median_(t-1)            0
air_temp_set_1_Fahrenheit_Median_(t-1)       0
relative_humidity_set_1_%_Median_(t-1)       0
wind_speed_set_1_Miles/hour_Median_(t-1)     0
wind_direction_set_1_Compass_Median_(t-1)    0
altimeter_set_1_INHG_High_(t-1)              0
air_temp_set_

In [12]:
x.head()

Unnamed: 0,altimeter_set_1_INHG_Median,air_temp_set_1_Fahrenheit_Median,relative_humidity_set_1_%_Median,wind_speed_set_1_Miles/hour_Median,altimeter_set_1_INHG_High,air_temp_set_1_Fahrenheit_High,relative_humidity_set_1_%_High,wind_speed_set_1_Miles/hour_High,altimeter_set_1_INHG_Low,air_temp_set_1_Fahrenheit_Low,...,wind_speed_set_1_Miles/hour_Median_(t-1),wind_direction_set_1_Compass_Median_(t-1),altimeter_set_1_INHG_High_(t-1),air_temp_set_1_Fahrenheit_High_(t-1),relative_humidity_set_1_%_High_(t-1),wind_speed_set_1_Miles/hour_High_(t-1),altimeter_set_1_INHG_Low_(t-1),air_temp_set_1_Fahrenheit_Low_(t-1),relative_humidity_set_1_%_Low_(t-1),wind_speed_set_1_Miles/hour_Low_(t-1)
0,29.16,28.4,68.78,10.36,29.32,32.0,79.74,17.27,29.1,24.8,...,11.5,S,29.75,30.2,79.26,21.85,29.18,15.8,58.68,0.0
1,29.37,23.0,68.11,6.91,29.43,35.6,85.59,18.41,29.24,17.6,...,10.36,WSW,29.32,32.0,79.74,17.27,29.1,24.8,39.61,0.0
2,29.24,37.4,83.39,12.66,29.55,39.2,100.0,21.85,29.13,26.6,...,6.91,E,29.43,35.6,85.59,18.41,29.24,17.6,47.45,0.0
3,29.605,20.3,61.79,6.91,29.63,28.4,78.44,12.66,29.39,12.2,...,12.66,SSE,29.55,39.2,100.0,21.85,29.13,26.6,59.74,0.0
4,29.15,21.2,78.94,3.44,29.39,30.2,92.85,10.36,28.87,15.8,...,6.91,W,29.63,28.4,78.44,12.66,29.39,12.2,50.09,0.0


In [13]:
x = pd.get_dummies(x, drop_first=True)
print(x.shape)

(3519, 56)


In [14]:
x.head()

Unnamed: 0,altimeter_set_1_INHG_Median,air_temp_set_1_Fahrenheit_Median,relative_humidity_set_1_%_Median,wind_speed_set_1_Miles/hour_Median,altimeter_set_1_INHG_High,air_temp_set_1_Fahrenheit_High,relative_humidity_set_1_%_High,wind_speed_set_1_Miles/hour_High,altimeter_set_1_INHG_Low,air_temp_set_1_Fahrenheit_Low,...,wind_direction_set_1_Compass_Median_(t-1)_NNW,wind_direction_set_1_Compass_Median_(t-1)_NW,wind_direction_set_1_Compass_Median_(t-1)_S,wind_direction_set_1_Compass_Median_(t-1)_SE,wind_direction_set_1_Compass_Median_(t-1)_SSE,wind_direction_set_1_Compass_Median_(t-1)_SSW,wind_direction_set_1_Compass_Median_(t-1)_SW,wind_direction_set_1_Compass_Median_(t-1)_W,wind_direction_set_1_Compass_Median_(t-1)_WNW,wind_direction_set_1_Compass_Median_(t-1)_WSW
0,29.16,28.4,68.78,10.36,29.32,32.0,79.74,17.27,29.1,24.8,...,0,0,1,0,0,0,0,0,0,0
1,29.37,23.0,68.11,6.91,29.43,35.6,85.59,18.41,29.24,17.6,...,0,0,0,0,0,0,0,0,0,1
2,29.24,37.4,83.39,12.66,29.55,39.2,100.0,21.85,29.13,26.6,...,0,0,0,0,0,0,0,0,0,0
3,29.605,20.3,61.79,6.91,29.63,28.4,78.44,12.66,29.39,12.2,...,0,0,0,0,1,0,0,0,0,0
4,29.15,21.2,78.94,3.44,29.39,30.2,92.85,10.36,28.87,15.8,...,0,0,0,0,0,0,0,1,0,0


In [15]:
# Partition the dataset in train + validation sets
from sklearn.model_selection import cross_val_score, train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)
print("X_train : " + str(X_train.shape))
print("X_test : " + str(X_test.shape))
print("y_train : " + str(y_train.shape))
print("y_test : " + str(y_test.shape))

X_train : (2815, 56)
X_test : (704, 56)
y_train : (2815,)
y_test : (704,)


In [16]:
# Standardize numerical features
from sklearn.preprocessing import StandardScaler

stdSc = StandardScaler()
X_train.loc[:, numerical_features] = stdSc.fit_transform(X_train.loc[:, numerical_features])
X_test.loc[:, numerical_features] = stdSc.transform(X_test.loc[:, numerical_features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [17]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor

In [18]:
# Define error measure for official scoring : RMSE
scorer = make_scorer(mean_squared_error, greater_is_better = False)

def rmse_cv_train(model):
    rmse= np.sqrt(-cross_val_score(model, X_train, y_train, scoring = scorer, cv = 20))
    return(rmse)

def rmse_cv_test(model):
    rmse= np.sqrt(-cross_val_score(model, X_test, y_test, scoring = scorer, cv = 20))
    return(rmse)

In [19]:
#Random Forest Regressor

from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(max_features=17)

regressor.fit(X_train, y_train)

y_train_pred = regressor.predict(X_train)
y_test_pred = regressor.predict(X_test)

In [20]:
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

In [21]:
print(rmsle(y_test,y_test_pred))

0.09809350971255576


In [22]:
test_dataset = y_test.to_frame()

In [23]:
test_dataset['predicted_CO'] = y_test_pred

In [24]:
test_dataset

Unnamed: 0,Daily Max 8-hour CO Concentration,predicted_CO
496,0.30,0.240
118,0.30,0.270
1928,0.50,0.360
1829,0.30,0.320
944,0.20,0.250
867,0.20,0.405
3307,0.40,0.320
2740,0.30,0.290
3207,0.50,0.380
1052,0.20,0.200


In [25]:
original_dataset.iloc[496]

Unnamed: 0                                          497
Daily Max 8-hour CO Concentration                   0.3
DAILY_AQI_VALUE                                       3
altimeter_set_1_INHG_Median                      29.255
air_temp_set_1_Fahrenheit_Median                   53.6
relative_humidity_set_1_%_Median                  87.57
wind_speed_set_1_Miles/hour_Median                 11.5
altimeter_set_1_INHG_High                          29.5
air_temp_set_1_Fahrenheit_High                     80.6
relative_humidity_set_1_%_High                      100
wind_speed_set_1_Miles/hour_High                  24.16
altimeter_set_1_INHG_Low                          29.17
air_temp_set_1_Fahrenheit_Low                        50
relative_humidity_set_1_%_Low                     42.06
wind_speed_set_1_Miles/hour_Low                       0
date                                         2010-05-13
weekend                                           False
wind_direction_set_1_Compass_Median             