In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split,cross_val_score
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from grid_search_helper import EstimatorSelectionHelper
%matplotlib inline

In [2]:
# Load the raw table; later cells read its `point` and `surge` columns.
DATA_PATH = 'mydata.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
def build_dataframes_for_points(df, point_list=None, max_rows=31184):
    """
    Split ``df`` into one dataframe per surge point.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; must contain a ``point`` column.
    point_list : iterable, optional
        Point ids to extract, in the order the result should be returned.
        Defaults to the sixteen ids that were originally hard-coded here.
    max_rows : int or None, optional
        Keep at most this many leading rows per point (default 31184, the
        value that was originally hard-coded). ``None`` keeps every row.

    Returns
    -------
    list of pandas.DataFrame
        One frame per entry of ``point_list``, in the same order, each with
        a fresh 0-based index. A point id that does not occur in ``df``
        yields an empty frame.
    """
    if point_list is None:
        point_list = [0, 1, 2, 3, 14, 15, 16, 12, 13, 24, 25, 26, 27, 28, 17, 29]
    # .iloc makes the positional row cut explicit; iloc[:None] keeps all rows.
    return [
        df[df['point'] == point].reset_index(drop=True).iloc[:max_rows]
        for point in point_list
    ]

In [4]:
# Unpack the per-point frames. The variable suffixes follow the order of the
# point list used by build_dataframes_for_points (0, 1, 2, 3, 14, 15, 16, ...).
(df_0, df_1, df_2, df_3,
 df_14, df_15, df_16, df_12,
 df_13, df_24, df_25, df_26,
 df_27, df_28, df_17, df_29) = build_dataframes_for_points(df)

In [5]:
# Turn the point-0 surge series into overlapping fixed-length windows and
# split them into a trailing hold-out set and a train/test pool.
WINDOW_SIZE = 60     # samples per sliding window
HOLD_OUT_ROWS = 500  # trailing windows reserved for the hold-out plot
FORECAST_LAG = 3     # the target is FORECAST_LAG steps after the newest feature

# Slice a plain NumPy array rather than the pandas Series: same values, far
# less per-window overhead. `range` (not `xrange`) runs on Python 2 and 3.
surge_0 = df_0.surge.values
# NOTE: range(len - WINDOW_SIZE) stops one short of the final full window;
# the row count is kept exactly as it was so downstream results are unchanged.
array_0 = np.array([surge_0[i:i + WINDOW_SIZE]
                    for i in range(len(surge_0) - WINDOW_SIZE)])
# Reverse each window so that column 0 holds its newest sample.
array_0 = array_0[:, ::-1]

# make a small hold out sample to conserve the timeseries nature of this, I'll graph it later
hold_out_array = array_0[-HOLD_OUT_ROWS:, :]
y_hold_out = hold_out_array[:, 0]              # newest sample of each window = target
X_hold_out = hold_out_array[:, FORECAST_LAG:]  # skip the samples closest to the target

# this is for the classical model training and testing
training_testing_array = array_0[:-HOLD_OUT_ROWS, :]
y_0 = training_testing_array[:, 0]
X_0 = training_testing_array[:, FORECAST_LAG:]

In [6]:
# Random 80/20 split of the windowed rows. The seed is fixed so that the split,
# and every score computed from it, is reproducible after a kernel restart.
# NOTE(review): rows of X_0 are built from overlapping windows, so a shuffled
# split may place near-duplicate rows on both sides; the test-set error is
# probably optimistic compared with the chronological hold-out set. Confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X_0, y_0, test_size=0.2, random_state=42)

In [None]:
# ran_forest = RandomForestRegressor(n_estimators=100)
# ran_forest.fit(X_train, y_train)

In [None]:
# 'Random Forest:',np.mean((ran_forest.predict(X_test) - y_test) ** 2)

In [None]:
# plt.figure(figsize=(20,5))
# plt.plot(ran_forest.predict(X_hold_out)[:300],label='3 min forecast')
# plt.plot(y_hold_out[:300],label='true 0')
# plt.legend()
# plt.show()

In [None]:
# Fit a 100-tree Extra Trees regressor on the training split.
# random_state pins the randomised tree construction so that the reported
# error and the hold-out plot can be reproduced on a fresh kernel.
ex_trees = ExtraTreesRegressor(n_estimators=100, random_state=42)
ex_trees.fit(X_train, y_train)

In [None]:
# Mean squared error of the Extra Trees model on the held-back test split,
# shown as a (label, value) tuple via the cell's last expression.
test_residuals = ex_trees.predict(X_test) - y_test
'Extra Trees:', np.mean(test_residuals ** 2)

In [None]:
# Compare the model's predictions with the true targets on the first 300
# hold-out windows. Uses the explicit fig/ax interface and labels the axes and
# title so the figure can be read on its own.
fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(ex_trees.predict(X_hold_out)[:300], label='3 min forecast')
ax.plot(y_hold_out[:300], label='true 0')
ax.set_xlabel('hold-out sample index')
ax.set_ylabel('surge')
ax.set_title('Extra Trees: forecast vs. actual on the hold-out windows')
ax.legend()
plt.show()

# doing grid searches

In [7]:
# Candidate regressors and their hyper-parameter grids for the grid search.
# Both dicts use the same keys; each grid is searched for the estimator that
# shares its key.
# The randomised ensemble estimators get a fixed random_state so that the
# ranking produced by the search is reproducible across kernel restarts.
models1 = {'LinearRegression': LinearRegression(),
           'Ridge': Ridge(),
           'Lasso': Lasso(),
           'ExtraTreesRegressor': ExtraTreesRegressor(random_state=42),
           'RandomForestRegressor': RandomForestRegressor(random_state=42),
           'AdaBoostRegressor': AdaBoostRegressor(random_state=42),
           'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42)}

# Shared grids, named once; each use below takes its own copy.
alpha_grid = [0.001, 0.01, 0.1, 1.0]
n_estimators_grid = [8, 16, 32, 64, 128]
learning_rate_grid = [0.6, 0.8, 1.0]

params1 = {'LinearRegression': {},
           'Ridge': {'alpha': list(alpha_grid)},
           'Lasso': {'alpha': list(alpha_grid)},
           'ExtraTreesRegressor': {'n_estimators': list(n_estimators_grid)},
           'RandomForestRegressor': {'n_estimators': list(n_estimators_grid)},
           'AdaBoostRegressor': {'n_estimators': list(n_estimators_grid),
                                 'learning_rate': list(learning_rate_grid)},
           'GradientBoostingRegressor': {'n_estimators': list(n_estimators_grid),
                                         'learning_rate': list(learning_rate_grid)}}

In [8]:
# Run the hyper-parameter search for every estimator in models1 using its grid
# from params1. The captured log shows one "Running GridSearchCV for <name>"
# pass per estimator with 5 folds per candidate; n_jobs=-1 is passed through
# to the parallel backend (see the "[Parallel(n_jobs=-1)]" lines).
# NOTE(review): EstimatorSelectionHelper comes from the local
# grid_search_helper module, which is not visible here -- confirm its defaults
# (scoring, number of folds) there.
helper1 = EstimatorSelectionHelper(models1, params1)
helper1.fit(X_train, y_train, n_jobs=-1)

Running GridSearchCV for GradientBoostingRegressor.
Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.3min


Running GridSearchCV for Ridge.
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:  2.7min finished


Running GridSearchCV for LinearRegression.
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    1.0s finished


Running GridSearchCV for AdaBoostRegressor.
Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min


Running GridSearchCV for RandomForestRegressor.
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:  2.0min finished


Running GridSearchCV for ExtraTreesRegressor.
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  2.5min finished


Running GridSearchCV for Lasso.
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  3.8min finished
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    2.2s finished


In [9]:
# Show the per-candidate score table, ordered by the min_score column. The
# echoed helper line sorts with ascending=False, so the highest min_score
# comes first.
# NOTE(review): min/mean/max/std are presumably taken over the CV folds --
# confirm in grid_search_helper.
helper1.score_summary(sort_by='min_score')

  df = pd.concat(rows, axis=1).T.sort([sort_by], ascending=False)


Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,alpha,learning_rate,n_estimators
44,ExtraTreesRegressor,0.848211,0.862127,0.87351,0.0104309,,,128.0
42,ExtraTreesRegressor,0.845945,0.858913,0.869744,0.0101,,,32.0
43,ExtraTreesRegressor,0.84448,0.860318,0.872867,0.0117674,,,64.0
39,RandomForestRegressor,0.841134,0.854674,0.869205,0.0121925,,,128.0
38,RandomForestRegressor,0.838715,0.854059,0.868475,0.0118229,,,64.0
41,ExtraTreesRegressor,0.837991,0.852692,0.868398,0.0124164,,,16.0
37,RandomForestRegressor,0.83582,0.85235,0.868462,0.0129344,,,32.0
36,RandomForestRegressor,0.831924,0.847105,0.860638,0.0120372,,,16.0
4,GradientBoostingRegressor,0.82505,0.830992,0.839034,0.00517329,,0.6,128.0
2,GradientBoostingRegressor,0.823254,0.831537,0.839264,0.00555201,,0.6,32.0
