## This file tests different regressors for the data

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
df = pd.read_excel('./data/Wunderground_West_Lafayette_2010_Clean.xlsx')
df.head(3)

Unnamed: 0,month,day,temp_F_max,temp_F_avg,temp_F_min,dew_point_F_max,dew_point_F_avg,humidity_percent_max,humidity_percent_avg,humidity_percent_min,wind_speed_mph_max,wind_speed_mph_avg,wind_speed_mph_min,pressure_Hg_max,pressure_Hg_avg,pressure_Hg_min
0,Jan,1,15,10.2,6,7,3.6,80,74.9,68,14,10.1,6,29.7,29.7,29.6
1,Jan,2,12,6.9,0,3,-0.2,80,72.9,64,17,8.7,0,29.8,29.8,29.7
2,Jan,3,17,9.4,-1,7,1.6,80,71.0,59,13,7.3,0,29.8,29.8,29.7


In [3]:
df.dtypes

month                    object
day                       int64
temp_F_max                int64
temp_F_avg              float64
temp_F_min                int64
dew_point_F_max           int64
dew_point_F_avg         float64
humidity_percent_max      int64
humidity_percent_avg    float64
humidity_percent_min      int64
wind_speed_mph_max        int64
wind_speed_mph_avg      float64
wind_speed_mph_min        int64
pressure_Hg_max         float64
pressure_Hg_avg         float64
pressure_Hg_min         float64
dtype: object

In [4]:
df.month = df.month.astype('category')
df.day = df.day.astype('category')

In [5]:
baseline_guess = df.temp_F_avg.mean()

In [6]:
X = df.loc[:,['month','day','temp_F_max','temp_F_min','humidity_percent_max','humidity_percent_min','wind_speed_mph_max','wind_speed_mph_min','pressure_Hg_max','pressure_Hg_min']]

In [7]:
y = df.temp_F_avg.shift(1,axis=0)
y[0] = df.temp_F_avg[364]
y

0      55.5
1      10.2
2       6.9
3       9.4
4      11.7
       ... 
360    25.7
361    20.2
362    22.8
363    26.7
364    45.1
Name: temp_F_avg, Length: 365, dtype: float64

In [8]:
df.temp_F_avg.tail(3)

362    26.7
363    45.1
364    55.5
Name: temp_F_avg, dtype: float64

The idea with this is to use today's weather to predict tomorrow's

### Simple Bias Regressor

In [9]:
def MSE(y, b):
    sum = 0
    for v in y:
        sum += (v - b)**2
    return sum / len(y)
sbr_mse = MSE(y, baseline_guess)
print("Baseline error: ", sbr_mse)

Baseline error:  421.1785757928315


### Linear Regressor

In [10]:
X_num = X.select_dtypes(include='number')
X_cat = X.select_dtypes(exclude='number')
std_mean = X_num.mean()
std_std = X_num.std()
X_num = (X_num - X_num.mean()) / X_num.std()
X_cat = pd.get_dummies(X_cat)
X_std = pd.concat([X_num,X_cat],axis=1)
X_std.head(3)

Unnamed: 0,temp_F_max,temp_F_min,humidity_percent_max,humidity_percent_min,wind_speed_mph_max,wind_speed_mph_min,pressure_Hg_max,pressure_Hg_min,month_Apr,month_Aug,...,day_22,day_23,day_24,day_25,day_26,day_27,day_28,day_29,day_30,day_31
0,-2.121575,-1.881364,-1.09521,1.048581,-0.031661,2.079206,1.427757,1.547318,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-2.254804,-2.180101,-1.09521,0.801537,0.546156,-0.477482,1.96556,2.021517,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-2.032755,-2.229891,-1.09521,0.492733,-0.224267,-0.477482,1.96556,2.021517,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
tmp = pd.concat([X_std,y],axis=1)
(train,test) = train_test_split(tmp)
X_train_std = train.drop('temp_F_avg',axis=1)
y_train = train.temp_F_avg

In [12]:
lr = LinearRegression()
lr.fit(X_train_std,y_train)

LinearRegression()

In [13]:
print("Train Score: ",lr.score(X_train_std,y_train))

Train Score:  0.9331384534644594


In [14]:
X_test = test.drop('temp_F_avg',axis=1)
y_test = test.temp_F_avg
print("Validation Score: ",lr.score(X_test,y_test))

Validation Score:  0.9332430738699304


### Decision Tree

In [15]:
T = DecisionTreeRegressor()
grid={'max_depth':range(1,20)}
TCV = GridSearchCV(T,param_grid=grid,return_train_score=True)
TCV.fit(X_std,y)

GridSearchCV(estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': range(1, 20)}, return_train_score=True)

In [16]:
TCV.best_params_

{'max_depth': 10}

In [17]:
TCV.best_score_

-0.05178016870479778

### Random Forest

In [21]:
rf = RandomForestRegressor()
grid={'max_depth':range(10,200,10)}
rf_CV = GridSearchCV(rf,param_grid=grid,return_train_score=True,n_jobs=-1)
rf_CV.fit(X_std,y)

GridSearchCV(estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'max_depth': range(10, 200, 10)},
             return_train_score=True)

In [22]:
rf_CV.best_params_

{'max_depth': 30}

In [23]:
rf_CV.best_score_

0.29233330500156385

By far the best regressor besides the Simple Bias Regressor was the Linear Regressor, getting a substantial validation score. This makes sense for the kinds of trends we're looking at, since we're trying to predict trends.