In [1]:
import pandas as pd
import numpy as np

In [2]:
from bokeh.plotting import figure, output_notebook, show
from bokeh.layouts import gridplot
output_notebook()

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
from glob import glob

In [4]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict, cross_val_score, GridSearchCV, StratifiedShuffleSplit, ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.externals.six import StringIO  
from IPython.display import Image  
import pydotplus

<font color = 'purple'><font size = 5>
    A few functions used to shortcut displaying results<br>
    ================================================<br>

In [5]:
def output_CV(cv_array):     ## for use with bokeh.plotting imported
    pout = figure(plot_width = 300, plot_height = 200, y_range=(0,1)) 
    pout.circle(list(range(1, len(cv_array) +1)), cv_array, size=5)
    print('Mean CV score:  ', round(cv_array.mean(), 3))
    print('Std of CV scores:  ', round(cv_array.std(), 3))
    show(pout)

In [6]:
def give_gridsearch_results(ingridsrch_instance):
    print('Best score: ', round(ingridsrch_instance.best_score_, 3))
    print('Best params: ', ingridsrch_instance.best_params_)

In [7]:
def make_plotgrid(totalrange, ncols):
    """for feeding into bokeh griplot, returns array of rows/columns"""
    nrows = int(np.ceil(totalrange/ncols))
    feeder = iter(range(totalrange))
    plotgrid = [[]]*nrows
    for r in range(nrows):
        thisrow = []
        for c in range(ncols):
            try:
                thisrow.append(next(feeder))
            except:
                thisrow.append(None)
        plotgrid[r] = thisrow
    return plotgrid

In [8]:
make_plotgrid(12,3)

[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]

In [9]:
def draw_gridplot(indf, columns='first', ncols=3, total_width=900, each_height=300, incolours = ['blue','red','yellow','green','purple','orange']):
    """Grid plots by month in bokeh
    Requires:
    > indf has datetime index
    > if columns are left unspecified (should be a list of col names), then just first column is plotted
    > only takes up to 6 columns, then runs out of colours to plot unless you put more into incolours"""
    if columns == 'first':
        columns = [indf.columns[0]]
    else:
        pass
    indf['Month'] = indf.index.month
    indf['mthname'] = indf.index.month_name()
    
    each_width = int(total_width/ncols)
    
    clrcols = [(col, clr) for col, clr in zip(columns, incolours)]
    dictmonth = {}
    
    for no,mth in enumerate(indf.mthname.unique()):
        dictmonth[no] = figure(plot_width=each_width, plot_height=each_height, title=mth, x_axis_type='datetime')
        for col, clr in clrcols:
            dictmonth[no].line(indf.index[indf.Month == no+1], indf[col][indf.Month == no+1], line_width=1, color = clr)
    
    plotgrid = make_plotgrid(len(indf.mthname.unique()), ncols=ncols)
    for row in range(len(plotgrid)):
        for p in range(len(plotgrid[row])):
            plotgrid[row][p] = dictmonth[plotgrid[row][p]]
    
    the_grid = gridplot(plotgrid)
    show(the_grid)
    

<font color = 'purple'><font size = 5>
    getting data in<br>
    ================================================<br>

In [10]:
feat_files = glob('../data/ready-for-model/*.csv')
feat_files

['../data/ready-for-model/2009-18_NEMtotaldemand.csv',
 '../data/ready-for-model/20190226_SAdf_features.csv',
 '../data/ready-for-model/20190226_TASdf_features.csv',
 '../data/ready-for-model/20190226_NSWdf_features.csv',
 '../data/ready-for-model/20190226_VICdf_features.csv',
 '../data/ready-for-model/20190226_QLDdf_features.csv']

In [32]:
fvic = feat_files[-2]
fvic

'../data/ready-for-model/20190226_VICdf_features.csv'

In [33]:
ftarget = feat_files[0]
ftarget

'../data/ready-for-model/2009-18_NEMtotaldemand.csv'

In [34]:
dfvic = pd.read_csv(fvic, index_col=0, parse_dates=[0])
dfvic.head()

Unnamed: 0_level_0,Date,Hour_of_day,Year,shoulder,summer,winter,workdayVIC,MILDURA-AIRPORT_MinT_76031,CAPE-NELSON_MaxT_90184,MORWELL_MaxT_85280,MELBOURNE-AIRPORT_MinT_86282,CAPE-NELSON_MinT_90184,MILDURA-AIRPORT_MaxT_76031,MELBOURNE-AIRPORT_MaxT_86282,MORWELL_MinT_85280
SETTLEMENTDATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2009-01-01 00:30:00,2009-01-01,0.5,2009,0,1,0,0.0,13.8,15.9,20.1,11.2,12.7,27.4,19.9,9.4
2009-01-01 01:00:00,2009-01-01,1.0,2009,0,1,0,0.0,13.8,15.9,20.1,11.2,12.7,27.4,19.9,9.4
2009-01-01 01:30:00,2009-01-01,1.5,2009,0,1,0,0.0,13.8,15.9,20.1,11.2,12.7,27.4,19.9,9.4
2009-01-01 02:00:00,2009-01-01,2.0,2009,0,1,0,0.0,13.8,15.9,20.1,11.2,12.7,27.4,19.9,9.4
2009-01-01 02:30:00,2009-01-01,2.5,2009,0,1,0,0.0,13.8,15.9,20.1,11.2,12.7,27.4,19.9,9.4


In [35]:
dfvic.isnull().sum().sum()

577

In [36]:
dfvic.isnull().sum()

Date                              0
Hour_of_day                       0
Year                              0
shoulder                          0
summer                            0
winter                            0
workdayVIC                        1
MILDURA-AIRPORT_MinT_76031        0
CAPE-NELSON_MaxT_90184          144
MORWELL_MaxT_85280              144
MELBOURNE-AIRPORT_MinT_86282      0
CAPE-NELSON_MinT_90184          144
MILDURA-AIRPORT_MaxT_76031        0
MELBOURNE-AIRPORT_MaxT_86282      0
MORWELL_MinT_85280              144
dtype: int64

In [37]:
dfvic.dropna(inplace=True) # just for now, will skip where I don't have temp data, will try and get better data later

In [38]:
dftarget = pd.read_csv(ftarget, index_col=0, parse_dates=[0])
dftarget.head()

Unnamed: 0_level_0,NSW1,QLD1,SA1,TAS1,VIC1,NEMtotal
SETTLEMENTDATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2009-01-01 00:30:00,7535.0,5611.54,1310.89,909.71,4799.87,20167.01
2009-01-01 01:00:00,7229.24,5457.34,1272.69,896.63,4646.21,19502.11
2009-01-01 01:30:00,6857.62,5294.12,1178.87,897.52,4950.16,19178.29
2009-01-01 02:00:00,6535.05,5153.47,1130.78,906.22,4755.46,18480.98
2009-01-01 02:30:00,6287.88,5060.33,1059.53,893.19,4545.67,17846.6


In [39]:
dfvic = pd.merge(dfvic, dftarget[['VIC1']], how='inner', left_index=True, right_index=True)
dfvic.head(3)

Unnamed: 0_level_0,Date,Hour_of_day,Year,shoulder,summer,winter,workdayVIC,MILDURA-AIRPORT_MinT_76031,CAPE-NELSON_MaxT_90184,MORWELL_MaxT_85280,MELBOURNE-AIRPORT_MinT_86282,CAPE-NELSON_MinT_90184,MILDURA-AIRPORT_MaxT_76031,MELBOURNE-AIRPORT_MaxT_86282,MORWELL_MinT_85280,VIC1
SETTLEMENTDATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2009-01-01 00:30:00,2009-01-01,0.5,2009,0,1,0,0.0,13.8,15.9,20.1,11.2,12.7,27.4,19.9,9.4,4799.87
2009-01-01 01:00:00,2009-01-01,1.0,2009,0,1,0,0.0,13.8,15.9,20.1,11.2,12.7,27.4,19.9,9.4,4646.21
2009-01-01 01:30:00,2009-01-01,1.5,2009,0,1,0,0.0,13.8,15.9,20.1,11.2,12.7,27.4,19.9,9.4,4950.16


<font color = 'purple'><font size = 4>
    Creating training and holdout test sets - pre-2018 and 2018 data respectively<br>
    =====================================================<br>

In [40]:
Yvic = dfvic[['VIC1']].copy()
Xvic = dfvic.drop(columns=['Date','VIC1'])

In [41]:
# am going to round up to nearest multiple of 5 to reduce regression complexity
Yvic['rVIC'] = np.ceil(Yvic.VIC1/5.0)*5.0
Yvic.head()

Unnamed: 0_level_0,VIC1,rVIC
SETTLEMENTDATE,Unnamed: 1_level_1,Unnamed: 2_level_1
2009-01-01 00:30:00,4799.87,4800.0
2009-01-01 01:00:00,4646.21,4650.0
2009-01-01 01:30:00,4950.16,4955.0
2009-01-01 02:00:00,4755.46,4760.0
2009-01-01 02:30:00,4545.67,4550.0


In [42]:
print(len(Yvic.VIC1.unique()))
print(len(Yvic.rVIC.unique()))
print(f'reduced by factor of: {round(len(Yvic.VIC1.unique())/len(Yvic.rVIC.unique()))} !')

136957
1337
reduced by factor of: 102 !


In [43]:
hoXvic = Xvic[Xvic.Year == 2018].astype(float)
hoYvic = Yvic[Yvic.index.year == 2018].astype(float)
print(len(hoXvic))
print(len(hoYvic))
hoYvic.head()

17472
17472


Unnamed: 0_level_0,VIC1,rVIC
SETTLEMENTDATE,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01 00:00:00,4445.07,4450.0
2018-01-01 00:30:00,4251.18,4255.0
2018-01-01 01:00:00,4092.53,4095.0
2018-01-01 01:30:00,3958.95,3960.0
2018-01-01 02:00:00,3785.27,3790.0


In [44]:
Xvic = Xvic[Xvic.Year != 2018].astype(float)
Yvic = Yvic[Yvic.index.year != 2018].astype(float)
Xvic.tail()

Unnamed: 0_level_0,Hour_of_day,Year,shoulder,summer,winter,workdayVIC,MILDURA-AIRPORT_MinT_76031,CAPE-NELSON_MaxT_90184,MORWELL_MaxT_85280,MELBOURNE-AIRPORT_MinT_86282,CAPE-NELSON_MinT_90184,MILDURA-AIRPORT_MaxT_76031,MELBOURNE-AIRPORT_MaxT_86282,MORWELL_MinT_85280
SETTLEMENTDATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2017-12-31 21:30:00,21.5,2017.0,0.0,1.0,0.0,0.0,12.3,19.9,25.9,9.0,14.7,31.1,26.2,8.7
2017-12-31 22:00:00,22.0,2017.0,0.0,1.0,0.0,0.0,12.3,19.9,25.9,9.0,14.7,31.1,26.2,8.7
2017-12-31 22:30:00,22.5,2017.0,0.0,1.0,0.0,0.0,12.3,19.9,25.9,9.0,14.7,31.1,26.2,8.7
2017-12-31 23:00:00,23.0,2017.0,0.0,1.0,0.0,0.0,12.3,19.9,25.9,9.0,14.7,31.1,26.2,8.7
2017-12-31 23:30:00,23.5,2017.0,0.0,1.0,0.0,0.0,12.3,19.9,25.9,9.0,14.7,31.1,26.2,8.7


In [24]:
# shufs = ShuffleSplit(test_size=0.3, random_state=307132)

In [45]:
stscaler = StandardScaler()

<font color = 'purple'><font size = 5>
    K-Nearest Neighbours<br>
    =====================================================<br>

In [37]:
kpipe = Pipeline(steps=[('stsc', StandardScaler()), ('knn', KNeighborsRegressor(n_jobs=-1))])

In [None]:
# knvic = KNeighborsRegressor(n_jobs=-1)

In [None]:
# stscaler.fit(Xvic)                      # So I am first fitting to just the CV (non-holdout) X dataset,
# sXvic = stscaler.transform(Xvic)        # then using the same StandardScaler object to transform the non-holdout
# shoXvic = stscaler.transform(hoXvic)    # and holdout datasets, using the same parameters

In [39]:
%%time
kscore = cross_val_score(kpipe, Xvic, Yvic.rVIC, cv=9, verbose=1)
# kscore = cross_val_score(knvic, sXvic, Yvic, cv=9)
# kpred = cross_val_predict(knvic, Xvic, Yvic, cv=shufs)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


CPU times: user 4.54 s, sys: 432 ms, total: 4.97 s
Wall time: 17.2 s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   17.2s finished


In [40]:
output_CV(kscore)

Mean CV score:   0.781
Std of CV scores:   0.029


In [41]:
kgrid_params = {'knn__n_neighbors' : [2,3,5,7],#list(range(6,25,6)),
               'knn__weights' : ['distance'],     # in previous gridsearches, distance and minkowski
               'knn__metric' : ['minkowski']}     # consistently came out as best

In [42]:
kngrid_search = GridSearchCV(kpipe, param_grid=kgrid_params, n_jobs=-1, cv = 9)

In [43]:
%%time
kngrid_search.fit(Xvic, Yvic.rVIC)



CPU times: user 674 ms, sys: 166 ms, total: 840 ms
Wall time: 1min 48s


GridSearchCV(cv=9, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('stsc', StandardScaler(copy=True, with_mean=True, with_std=True)), ('knn', KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
          weights='uniform'))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'knn__n_neighbors': [2, 3, 5, 7], 'knn__weights': ['distance'], 'knn__metric': ['minkowski']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [44]:
give_gridsearch_results(kngrid_search)

Best score:  0.781
Best params:  {'knn__metric': 'minkowski', 'knn__n_neighbors': 5, 'knn__weights': 'distance'}


In [45]:
bstknn = kngrid_search.best_estimator_

In [48]:
%%time
bknscore = cross_val_score(bstknn, Xvic, Yvic.rVIC, n_jobs= -1, cv= 9)

CPU times: user 65.3 ms, sys: 63.1 ms, total: 128 ms
Wall time: 14.6 s


In [49]:
output_CV(bknscore)

Mean CV score:   0.781
Std of CV scores:   0.029


In [50]:
bstknn.score(hoXvic, hoYvic.rVIC)

0.7781782801226463

In [51]:
hoYvic['bestKNN'] = bstknn.predict(hoXvic)
# hoYvic['Month'] = hoYvic.index.month
# hoYvic['mthname'] = hoYvic.index.month_name()
hoYvic.head(2)

Unnamed: 0_level_0,VIC1,rVIC,bestKNN
SETTLEMENTDATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01 00:00:00,4445.07,4450.0,4411.048719
2018-01-01 00:30:00,4251.18,4255.0,4234.881296


In [52]:
draw_gridplot(hoYvic, columns=['VIC1','bestKNN'])

<font color = 'purple'><font size = 5>
    Decision Tree<br>
    =====================================================<br>

In [53]:
treebeard = DecisionTreeRegressor()

In [54]:
%%time
dtscore = cross_val_score(treebeard, Xvic, Yvic.rVIC, n_jobs=-1, cv=9)

CPU times: user 76.5 ms, sys: 75.6 ms, total: 152 ms
Wall time: 3.78 s


In [55]:
output_CV(dtscore)

Mean CV score:   0.755
Std of CV scores:   0.04


In [56]:
dtparams = {'max_depth' : [None, 15, 30, 50],
           'min_samples_split' : [2, 6, 12, 24],
           'min_samples_leaf' : [1, 5, 10],
           }

In [57]:
dtgridsrch = GridSearchCV(treebeard, param_grid=dtparams, cv=9, n_jobs=-1)

In [58]:
%%time
dtgridsrch.fit(Xvic, Yvic.rVIC)

CPU times: user 2.46 s, sys: 742 ms, total: 3.2 s
Wall time: 2min 7s


GridSearchCV(cv=9, error_score='raise-deprecating',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [None, 15, 30, 50], 'min_samples_split': [2, 6, 12, 24], 'min_samples_leaf': [1, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [59]:
give_gridsearch_results(dtgridsrch)

Best score:  0.791
Best params:  {'max_depth': 15, 'min_samples_leaf': 10, 'min_samples_split': 24}


In [60]:
trbest = dtgridsrch.best_estimator_

In [61]:
%%time
dtcvscore = cross_val_score(trbest, Xvic, Yvic.rVIC, cv=9, n_jobs=-1)

CPU times: user 57.8 ms, sys: 50.4 ms, total: 108 ms
Wall time: 2.83 s


In [62]:
output_CV(dtcvscore)

Mean CV score:   0.788
Std of CV scores:   0.036


In [63]:
hoYvic['bestDtree'] = trbest.predict(hoXvic)
hoYvic.head(3)

Unnamed: 0_level_0,VIC1,rVIC,bestKNN,Month,mthname,bestDtree
SETTLEMENTDATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-01 00:00:00,4445.07,4450.0,4411.048719,1,January,4295.75
2018-01-01 00:30:00,4251.18,4255.0,4234.881296,1,January,4234.166667
2018-01-01 01:00:00,4092.53,4095.0,4232.807674,1,January,4009.117647


In [64]:
draw_gridplot(hoYvic, columns=['VIC1','bestDtree'])

In [None]:
# treebeard.fit(Xvic, Yvic)
# treebeard.score(hoXvic, hoYvic.VIC1)

In [None]:
# hoYvic['dtree_pred'] = treebeard.predict(hoXvic)

In [None]:
# draw_gridplot(hoYvic, columns=['VIC1','dtree_pred'])

In [82]:
# dot_data = StringIO()    # this keeps on giving massive files, I think regression tree is too complex?

# export_graphviz(trbest, out_file='treebeard.dot',  
#                 filled=True, rounded=True,
#                 special_characters=True, feature_names = Xvic.columns)  

# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
# Image(graph.create_png())

In [81]:
trbest

DecisionTreeRegressor(criterion='mse', max_depth=15, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=10,
           min_samples_split=24, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [77]:
# from subprocess import call

In [79]:
# %%time          ## also doesn't seem to work properly
# call(['dot', '-Tpng', 'treebeard.dot', '-o', 'treebeard.png', '-Gdpi=600']) 

CPU times: user 3.46 ms, sys: 10.1 ms, total: 13.6 ms
Wall time: 1min 34s


0

<font color = 'purple'><font size = 5>
    Random Forest<br>
    =====================================================<br>

In [46]:
entmoot = RandomForestRegressor(n_jobs=-1)

In [47]:
%%time
rfscore1 = cross_val_score(entmoot, Xvic, Yvic.rVIC, cv=9, n_jobs=-1)

CPU times: user 69.7 ms, sys: 70.6 ms, total: 140 ms
Wall time: 22.3 s


In [48]:
output_CV(rfscore1)

Mean CV score:   0.841
Std of CV scores:   0.037


In [51]:
rfscore1

array([0.87556386, 0.89683791, 0.84796394, 0.84651089, 0.76449471,
       0.80471648, 0.86051995, 0.83520085, 0.83432091])

In [50]:
pout = figure(plot_width = 600, plot_height = 400, y_range=(0,1)) 
pout.circle(list(range(1, len(rfscore1) +1)), rfscore1, size=5)
show(pout)

In [68]:
rfparams = {'n_estimators' : [50,60],
           'max_depth' : [25,30,35],
           'min_samples_leaf' : [3,5],
           }

In [69]:
rfgridsrch = GridSearchCV(entmoot, param_grid=rfparams, cv=9, n_jobs=-1)

In [70]:
%%time
rfgridsrch.fit(Xvic, Yvic.rVIC)

CPU times: user 46.5 s, sys: 627 ms, total: 47.1 s
Wall time: 21min 28s


GridSearchCV(cv=9, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [50, 60], 'max_depth': [25, 30, 35], 'min_samples_leaf': [3, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [71]:
give_gridsearch_results(rfgridsrch)

Best score:  0.847
Best params:  {'max_depth': 35, 'min_samples_leaf': 3, 'n_estimators': 60}


In [72]:
bestforest = rfgridsrch.best_estimator_

In [73]:
bestforest.fit(Xvic, Yvic.rVIC)
bestforest.score(hoXvic, hoYvic.rVIC)

0.8559771419446153

In [74]:
hoYvic['bestRForest'] = bestforest.predict(hoXvic)
hoYvic.head(3)

Unnamed: 0_level_0,VIC1,rVIC,bestKNN,Month,mthname,bestDtree,bestRForest
SETTLEMENTDATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-01-01 00:00:00,4445.07,4450.0,4411.048719,1,January,4295.75,4413.583201
2018-01-01 00:30:00,4251.18,4255.0,4234.881296,1,January,4234.166667,4174.095746
2018-01-01 01:00:00,4092.53,4095.0,4232.807674,1,January,4009.117647,4038.849422


In [75]:
draw_gridplot(hoYvic, columns=['VIC1', 'bestRForest'])

In [None]:
entmoot.fit(Xvic, Yvic)
entmoot.score(hoXvic, hoYvic.VIC1)

In [None]:
hoYvic['entpredict'] = entmoot.predict(hoXvic)

In [None]:
draw_gridplot(hoYvic, columns=['VIC1','entpredict'])

In [None]:
Yvic.hist(by='VIC1', bins=20)
plt.show()

KeyboardInterrupt: 

In [None]:
Yvic.shape