In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split, cross_validate

In [2]:
df = pd.read_hdf('./data/big_data.h5')
df.head(3)

Unnamed: 0,mean_temperature_3,mean_temperature_2,mean_temperature_1,day_of_year,temperature_0_12,temperature_0_13,temperature_0_14,temperature_0_15,temperature_0_16,temperature_0_17,...,wind_speed_0_2,wind_speed_0_3,wind_speed_0_4,wind_speed_0_5,wind_speed_0_6,wind_speed_0_7,wind_speed_0_8,wind_speed_0_9,wind_speed_0_10,wind_speed_0_11
32,300.634792,302.856458,302.795433,275,309.1,310.58,310.495769,310.411538,310.327308,310.243077,...,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0
37,288.131042,288.033958,286.062292,276,282.272771,282.281385,282.29,282.51,284.44,287.86,...,4.0,3.0,3.0,4.0,4.0,3.0,1.0,4.0,4.0,4.0
38,289.920417,292.629167,296.770417,276,289.158749,289.144375,289.13,290.73,293.02,296.18,...,8.0,5.0,4.0,2.0,2.0,1.0,4.0,0.0,0.0,0.0


In [3]:
df = pd.read_csv('./data/wunderground_scraped_data.csv')
df.head(3)

Unnamed: 0,Temp_max,Temp_avg,Temp_min,Dew_max,Dew_avg,Dew_min,Hum_max,Hum_avg,Hum_min,Wind_max,Wind_avg,Wind_min,Pres_max,Pres_avg,Pres_min,Precipitation,Date
0,21,15.3,6,17,8.8,-6,93,76.0,58,22,12.2,0,29.8,29.7,29.6,0.0,1999-1-1
1,26,22.4,18,25,20.8,16,100,93.5,86,31,19.9,7,29.6,29.1,28.8,0.0,1999-1-2
2,25,14.0,5,24,9.4,0,96,81.8,71,21,17.4,14,29.3,29.0,28.7,0.0,1999-1-3


In [4]:
df.shape

(7672, 17)

In [5]:
df.dtypes

Temp_max           int64
Temp_avg         float64
Temp_min           int64
Dew_max            int64
Dew_avg          float64
Dew_min            int64
Hum_max            int64
Hum_avg          float64
Hum_min            int64
Wind_max           int64
Wind_avg         float64
Wind_min           int64
Pres_max         float64
Pres_avg         float64
Pres_min         float64
Precipitation    float64
Date              object
dtype: object

In [6]:
df.Date = pd.to_datetime(df.Date, format="%Y-%M-%d")

In [7]:
df['Month'] = pd.DatetimeIndex(df.Date).month

In [8]:
df.Month = df.Month.astype('category')

In [9]:
y = df.Temp_avg.shift(-3,axis=0)

In [10]:
y.iloc[7669] = df.Temp_avg.iloc[2]
y.iloc[7670] = df.Temp_avg.iloc[1]
y.iloc[7671] = df.Temp_avg.iloc[0]

In [11]:
y.tail()

7667    32.1
7668    25.8
7669    14.0
7670    22.4
7671    15.3
Name: Temp_avg, dtype: float64

In [33]:
X_num = df.select_dtypes(include='number')
X_num = X_num.drop('Temp_avg',axis=1)
X_cat = df.Month
X_num = (X_num - X_num.mean()) / X_num.std()
X_cat = pd.get_dummies(X_cat)
# X = pd.concat([X_num,X_cat],axis=1)
X = X_num
X.head(3)

Unnamed: 0,Temp_max,Temp_min,Dew_max,Dew_avg,Dew_min,Hum_max,Hum_avg,Hum_min,Wind_max,Wind_avg,Wind_min,Pres_max,Pres_avg,Pres_min,Precipitation
0,-1.790709,-1.90725,-1.552186,-1.625805,-1.639072,0.591588,0.532052,0.499409,0.914458,0.720427,-0.855672,2.070572,1.555088,0.299399,-0.188746
1,-1.553707,-1.27823,-1.113554,-0.989699,-0.653243,1.28101,1.912986,2.320044,2.52283,2.870251,1.244749,1.043353,-0.726904,-0.186968,-0.188746
2,-1.601107,-1.959669,-1.168383,-1.594,-1.370209,0.887054,0.989733,1.344704,0.73575,2.172256,3.345169,-0.497476,-1.107236,-0.247764,-0.188746


In [34]:
lr = LinearRegression()
lr.fit(X, y)
R2_train = lr.score(X,y)
print("Train R2: " ,R2_train.round(2))

Train R2:  0.78


In [35]:
coef = lr.coef_
coef = pd.Series(coef, index=X.columns)
coef.sort_values(ascending=False)

Temp_min         9.078037
Dew_avg          6.030160
Temp_max         2.389275
Pres_max         1.544878
Pres_avg         1.026290
Hum_max          0.968637
Dew_min          0.731006
Precipitation    0.369653
Wind_avg        -0.303195
Wind_min        -0.406403
Wind_max        -0.661110
Pres_min        -0.670038
Dew_max         -0.913595
Hum_min         -1.112979
Hum_avg         -2.940849
dtype: float64

In [15]:
scores = cross_validate(lr, X, y, return_train_score=True)
scores

{'fit_time': array([0.00603628, 0.00860691, 0.00590754, 0.00963187, 0.        ]),
 'score_time': array([0.        , 0.        , 0.        , 0.00439334, 0.        ]),
 'test_score': array([0.78096683, 0.75358353, 0.80164486, 0.76749444, 0.77100735]),
 'train_score': array([0.77784808, 0.78440141, 0.77280141, 0.78084697, 0.78103078])}

In [16]:
R2_test = scores['test_score'].mean()
R2_train = scores['train_score'].mean()
print("Train R2:", R2_train.round(2))
print("Valid R2:", R2_test.round(2))

Train R2: 0.78
Valid R2: 0.77


## Testing Forests

In [17]:
rf = RandomForestRegressor()
grid = {'n_estimators': range(1,2000,300)}
rfCV = GridSearchCV(rf,
                   param_grid=grid,
                   return_train_score=True,
                   n_jobs=-1)
rfCV.fit(X,y)

GridSearchCV(estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'n_estimators': range(1, 2000, 300)},
             return_train_score=True)

In [18]:
rfCV.best_params_

{'n_estimators': 1201}

In [19]:
rfCV.best_score_

0.7813234378989897

In [20]:
rf = RandomForestRegressor()
grid = {'n_estimators': range(900, 1500, 100)}
rfCV = GridSearchCV(rf,
                   param_grid=grid,
                   return_train_score=True,
                   n_jobs=-1)
rfCV.fit(X,y)

GridSearchCV(estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'n_estimators': range(900, 1500, 100)},
             return_train_score=True)

In [21]:
rfCV.best_params_

{'n_estimators': 1000}

In [22]:
rfCV.best_score_

0.7813968425552182

In [23]:
rf = RandomForestRegressor()
grid = {'n_estimators': range(900, 1100, 10)}
rfCV = GridSearchCV(rf,
                   param_grid=grid,
                   return_train_score=True,
                   n_jobs=-1)
rfCV.fit(X,y)

GridSearchCV(estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'n_estimators': range(900, 1100, 10)},
             return_train_score=True)

In [24]:
rfCV.best_params_

{'n_estimators': 900}

In [25]:
rfCV.best_score_

0.7815380574942452

In [29]:
gb = GradientBoostingRegressor()
grid = {'n_estimators':range(800,1200,100),
        'learning_rate':np.arange(0.01,0.3,0.05)}
gbCV = GridSearchCV(gb,param_grid=grid,return_train_score=True,n_jobs=-1)

In [30]:
gbCV.fit(X,y)

GridSearchCV(estimator=GradientBoostingRegressor(), n_jobs=-1,
             param_grid={'learning_rate': array([0.01, 0.06, 0.11, 0.16, 0.21, 0.26]),
                         'n_estimators': range(800, 1200, 100)},
             return_train_score=True)

In [31]:
gbCV.best_params_

{'learning_rate': 0.01, 'n_estimators': 800}

In [32]:
gbCV.best_score_

0.7847019580595382