In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from grid_search_helper import EstimatorSelectionHelper
from prepare_data import prepare_data
%matplotlib inline

In [2]:
df = pd.read_csv('mydata.csv')
data_prep = prepare_data(df)
data_prep.build_dataframes_for_points()
data_prep.time_series_to_regression()

In [None]:
# X_0 = data_prep.get_point_data('point_0')['X']
# y_0 = data_prep.get_point_data('point_0')['y']

# X_0_hold_out = data_prep.get_point_data('point_0')['X_hold_out']
# y_0_hold_out = data_prep.get_point_data('point_0')['y_hold_out']

# X_train = data_prep.get_point_data('point_0')['X_train']
# X_test = data_prep.get_point_data('point_0')['X_test']

# y_train = data_prep.get_point_data('point_0')['y_train']
# y_test = data_prep.get_point_data('point_0')['y_test']

In [3]:
from joblib import Parallel, delayed

class train_models:
    def __init__(self,data_dict):
        self.data_dict=data_dict
    
    def fit(self):
        def to_train(self,key, X, y):
            return (key, ExtraTreesRegressor(n_estimators=128).fit(X, y))
        
        self.result = Parallel(n_jobs=16)(delayed(to_train)(key, value['X_train'], value['y_train'])\
                                                                        for key, value in self.data_dict.iteritems())
        
    def predict_on_test_sets(self):
        self.predict_test_dict = {}
        for point, model in self.result:
            self.predict_test_dict[point] = model.predict(self.data_dict[point]['X_test'])
        return self.predict_test_dict
    
    def predict_on_hold_out_sets(self):
        self.predict_hold_out_dict = {}
        for point, model in self.result:
            self.predict_hold_out_dict[point] = model.predict(self.data_dict[point]['X_hold_out'])
        return self.predict_hold_out_dict
    
    def mse_on_test_sets(self):
        return np.mean([np.mean((prediction - self.data_dict[point]['y_test'])**2) \
                                                        for point, prediction in self.predict_test_dict.iteritems()])

    def mse_on_hold_out_sets(self):
        return np.mean([np.mean((prediction - self.data_dict[point]['y_hold_out'])**2) \
                                                    for point, prediction in self.predict_hold_out_dict.iteritems()])

In [4]:
x = data_prep.get_point_data()

In [5]:
train_my_models = train_models(x)

In [6]:
train_my_models.fit()

TypeError: can't pickle function objects

In [None]:
from joblib import Parallel, delayed


def train_model(key, X, y):
    return (key, ExtraTreesRegressor(n_estimators=128).fit(X, y))

result = Parallel(n_jobs=16)(delayed(train_model)(key, value['X_train'], value['y_train']) \
                                                             for key, value in data_prep.get_point_data().iteritems())

In [None]:
for key, value in data_prep.get_point_data().iteritems():
    print key

In [None]:
for key, value in data_prep.get_point_data()['point_0'].iteritems():
    print key

In [None]:
ex_trees = ExtraTreesRegressor(n_estimators=128)
ex_trees.fit(data_prep.get_point_data()['point_0']['X_train'], data_prep.get_point_data()['point_0']['y_train'])

In [None]:
'Extra Trees:',np.mean((ex_trees.predict(data_prep.get_point_data()['point_0']['X_test']) - \
                                                                data_prep.get_point_data()['point_0']['y_test']) ** 2)
#  0.014418507785463306

In [None]:
'Extra Trees:',np.mean((ex_trees.predict(data_prep.get_point_data()['point_0']['X_hold_out']) - \
                                                            data_prep.get_point_data()['point_0']['y_hold_out']) ** 2)


In [None]:
plt.figure(figsize=(20,5))
plt.plot(ex_trees.predict(X_0_hold_out)[:300],label='3 min forecast')
plt.plot(y_0_hold_out[:300],label='true 0')
plt.legend()
plt.show()

# doing grid searches

In [None]:
'''
These are the models I used grid search cross validation on. I import a class called EstimatorSelectionHelper that
runs through all of the different models their parameters and fits everyone using all possible combinations of
parameters. It then prints out a summary of the results ordered by best performing model from the grid search in
a pandas dataframe.
'''

models1 = {'LinearRegression':LinearRegression(),
           'Ridge':Ridge(),
           'Lasso':Lasso(),
           'ExtraTreesRegressor':ExtraTreesRegressor(),
           'RandomForestRegressor':RandomForestRegressor(),
           'AdaBoostRegressor':AdaBoostRegressor(),
           'GradientBoostingRegressor':GradientBoostingRegressor()}

params1 = {'LinearRegression':{},
           'Ridge':{'alpha':[0.001, 0.01, 0.1, 1.0]},
           'Lasso':{'alpha':[0.001, 0.01, 0.1, 1.0]},
           'ExtraTreesRegressor':{'n_estimators':[8,16,32,64,128]},
           'RandomForestRegressor':{'n_estimators':[8,16,32,64,128]},
           'AdaBoostRegressor':{'n_estimators':[8,16,32,64,128],'learning_rate':[0.6,0.8,1.0]},
           'GradientBoostingRegressor':{'n_estimators':[8,16,32,64,128],'learning_rate':[0.6,0.8,1.0]}}

In [None]:
helper1 = EstimatorSelectionHelper(models1, params1)
helper1.fit(X_train, y_train, n_jobs=-1)

In [None]:
helper1.score_summary(sort_by='min_score')