In [1]:
import pandas as pd
import numpy as np

In [2]:
from bokeh.plotting import figure, output_notebook, show
from bokeh.layouts import gridplot
output_notebook()

In [3]:
from glob import glob

In [4]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict, cross_val_score, GridSearchCV, StratifiedShuffleSplit, ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.externals.six import StringIO  
from IPython.display import Image  
import pydotplus

<font color = 'purple'><font size = 5>
    A few functions used to shortcut displaying results<br>
    ================================================<br>

In [5]:
def output_CV(cv_array):     ## for use with bokeh.plotting imported
    pout = figure(plot_width = 300, plot_height = 200, y_range=(0,1)) 
    pout.circle(list(range(1, len(cv_array) +1)), cv_array, size=5)
    print('Mean CV score:  ', round(cv_array.mean(), 3))
    print('Std of CV scores:  ', round(cv_array.std(), 3))
    show(pout)

In [6]:
def give_gridsearch_results(ingridsrch_instance):
    print('Best score: ', round(ingridsrch_instance.best_score_, 3))
    print('Best params: ', ingridsrch_instance.best_params_)

In [7]:
def make_plotgrid(totalrange, ncols):
    """for feeding into bokeh griplot, returns array of rows/columns"""
    nrows = int(np.ceil(totalrange/ncols))
    feeder = iter(range(totalrange))
    plotgrid = [[]]*nrows
    for r in range(nrows):
        thisrow = []
        for c in range(ncols):
            try:
                thisrow.append(next(feeder))
            except:
                thisrow.append(None)
        plotgrid[r] = thisrow
    return plotgrid

In [8]:
make_plotgrid(12,3)

[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]

In [9]:
def draw_gridplot(indf, columns='first', ncols=3, total_width=900, each_height=300, incolours = ['blue','red','yellow','green','purple','orange']):
    """Grid plots by month in bokeh
    Requires:
    > indf has datetime index
    > if columns are left unspecified (should be a list of col names), then just first column is plotted
    > only takes up to 6 columns, then runs out of colours to plot unless you put more into incolours"""
    if columns == 'first':
        columns = [indf.columns[0]]
    else:
        pass
    indf['Month'] = indf.index.month
    indf['mthname'] = indf.index.month_name()
    
    each_width = int(total_width/ncols)
    
    clrcols = [(col, clr) for col, clr in zip(columns, incolours)]
    dictmonth = {}
    
    for no,mth in enumerate(indf.mthname.unique()):
        dictmonth[no] = figure(plot_width=each_width, plot_height=each_height, title=mth, x_axis_type='datetime')
        for col, clr in clrcols:
            dictmonth[no].line(indf.index[indf.Month == no+1], indf[col][indf.Month == no+1], line_width=1, color = clr)
    
    plotgrid = make_plotgrid(len(indf.mthname.unique()), ncols=ncols)
    for row in range(len(plotgrid)):
        for p in range(len(plotgrid[row])):
            plotgrid[row][p] = dictmonth[plotgrid[row][p]]
    
    the_grid = gridplot(plotgrid)
    show(the_grid)
    

<font color = 'purple'><font size = 5>
    getting data in<br>
    ================================================<br>

In [10]:
feat_files = glob('../data/ready-for-model/*.csv')
feat_files

['../data/ready-for-model/2009-18_NEMtotaldemand.csv',
 '../data/ready-for-model/20190223_SAdf_features.csv',
 '../data/ready-for-model/20190223_NSWdf_features.csv',
 '../data/ready-for-model/20190223_TASdf_features.csv',
 '../data/ready-for-model/20190223_QLDdf_features.csv',
 '../data/ready-for-model/20190223_VICdf_features.csv']

In [11]:
fvic = feat_files[-1]
fvic

'../data/ready-for-model/20190223_VICdf_features.csv'

In [12]:
ftarget = feat_files[0]
ftarget

'../data/ready-for-model/2009-18_NEMtotaldemand.csv'

In [13]:
dfvic = pd.read_csv(fvic, index_col=0, parse_dates=[0])
dfvic.head()

Unnamed: 0_level_0,Date,Hour_of_day,Year,shoulder,summer,winter,workdayNSW,workdayQLD,workdaySA,workdayTAS,workdayVIC,CAPE-NELSON_MaxT_90184,MORWELL_MaxT_85280,MELBOURNE-AIRPORT_MinT_86282,CAPE-NELSON_MinT_90184,MILDURA-AIRPORT_MaxT_76031,MELBOURNE-AIRPORT_MaxT_86282,MORWELL_MinT_85280
SETTLEMENTDATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2009-01-01 00:30:00,2009-01-01,0.5,2009,0,1,0,0.0,0.0,0.0,0.0,0.0,15.9,20.1,11.2,12.7,27.4,19.9,9.4
2009-01-01 01:00:00,2009-01-01,1.0,2009,0,1,0,0.0,0.0,0.0,0.0,0.0,15.9,20.1,11.2,12.7,27.4,19.9,9.4
2009-01-01 01:30:00,2009-01-01,1.5,2009,0,1,0,0.0,0.0,0.0,0.0,0.0,15.9,20.1,11.2,12.7,27.4,19.9,9.4
2009-01-01 02:00:00,2009-01-01,2.0,2009,0,1,0,0.0,0.0,0.0,0.0,0.0,15.9,20.1,11.2,12.7,27.4,19.9,9.4
2009-01-01 02:30:00,2009-01-01,2.5,2009,0,1,0,0.0,0.0,0.0,0.0,0.0,15.9,20.1,11.2,12.7,27.4,19.9,9.4


In [14]:
dfvic.isnull().sum().sum()

581

In [15]:
dfvic.isnull().sum()

Date                              0
Hour_of_day                       0
Year                              0
shoulder                          0
summer                            0
winter                            0
workdayNSW                        1
workdayQLD                        1
workdaySA                         1
workdayTAS                        1
workdayVIC                        1
CAPE-NELSON_MaxT_90184          144
MORWELL_MaxT_85280              144
MELBOURNE-AIRPORT_MinT_86282      0
CAPE-NELSON_MinT_90184          144
MILDURA-AIRPORT_MaxT_76031        0
MELBOURNE-AIRPORT_MaxT_86282      0
MORWELL_MinT_85280              144
dtype: int64

In [16]:
dfvic.dropna(inplace=True) # just for now, will skip where I don't have temp data, will try and get better data later

In [17]:
dftarget = pd.read_csv(ftarget, index_col=0, parse_dates=[0])
dftarget.head()

Unnamed: 0_level_0,NSW1,QLD1,SA1,TAS1,VIC1,NEMtotal
SETTLEMENTDATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2009-01-01 00:30:00,7535.0,5611.54,1310.89,909.71,4799.87,20167.01
2009-01-01 01:00:00,7229.24,5457.34,1272.69,896.63,4646.21,19502.11
2009-01-01 01:30:00,6857.62,5294.12,1178.87,897.52,4950.16,19178.29
2009-01-01 02:00:00,6535.05,5153.47,1130.78,906.22,4755.46,18480.98
2009-01-01 02:30:00,6287.88,5060.33,1059.53,893.19,4545.67,17846.6


In [18]:
dfvic = pd.merge(dfvic, dftarget[['VIC1']], how='inner', left_index=True, right_index=True)
dfvic.head(3)

Unnamed: 0_level_0,Date,Hour_of_day,Year,shoulder,summer,winter,workdayNSW,workdayQLD,workdaySA,workdayTAS,workdayVIC,CAPE-NELSON_MaxT_90184,MORWELL_MaxT_85280,MELBOURNE-AIRPORT_MinT_86282,CAPE-NELSON_MinT_90184,MILDURA-AIRPORT_MaxT_76031,MELBOURNE-AIRPORT_MaxT_86282,MORWELL_MinT_85280,VIC1
SETTLEMENTDATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2009-01-01 00:30:00,2009-01-01,0.5,2009,0,1,0,0.0,0.0,0.0,0.0,0.0,15.9,20.1,11.2,12.7,27.4,19.9,9.4,4799.87
2009-01-01 01:00:00,2009-01-01,1.0,2009,0,1,0,0.0,0.0,0.0,0.0,0.0,15.9,20.1,11.2,12.7,27.4,19.9,9.4,4646.21
2009-01-01 01:30:00,2009-01-01,1.5,2009,0,1,0,0.0,0.0,0.0,0.0,0.0,15.9,20.1,11.2,12.7,27.4,19.9,9.4,4950.16


<font color = 'purple'><font size = 4>
    Creating training and holdout test sets - pre-2018 and 2018 data respectively<br>
    =====================================================<br>

In [71]:
Yvic = dfvic[['VIC1']]
Xvic = dfvic.drop(columns=['Date','VIC1'])

In [72]:
hoXvic = Xvic[Xvic.Year == 2018].astype(float)
hoYvic = Yvic[Yvic.index.year == 2018].astype(float)
print(len(hoXvic))
print(len(hoYvic))
hoYvic.head()

17472
17472


Unnamed: 0_level_0,VIC1
SETTLEMENTDATE,Unnamed: 1_level_1
2018-01-01 00:00:00,4445.07
2018-01-01 00:30:00,4251.18
2018-01-01 01:00:00,4092.53
2018-01-01 01:30:00,3958.95
2018-01-01 02:00:00,3785.27


In [73]:
Xvic = Xvic[Xvic.Year != 2018].astype(float)
Yvic = Yvic[Yvic.index.year != 2018].astype(float)
Xvic.tail()

Unnamed: 0_level_0,Hour_of_day,Year,shoulder,summer,winter,workdayNSW,workdayQLD,workdaySA,workdayTAS,workdayVIC,CAPE-NELSON_MaxT_90184,MORWELL_MaxT_85280,MELBOURNE-AIRPORT_MinT_86282,CAPE-NELSON_MinT_90184,MILDURA-AIRPORT_MaxT_76031,MELBOURNE-AIRPORT_MaxT_86282,MORWELL_MinT_85280
SETTLEMENTDATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2017-12-31 21:30:00,21.5,2017.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,19.9,25.9,9.0,14.7,31.1,26.2,8.7
2017-12-31 22:00:00,22.0,2017.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,19.9,25.9,9.0,14.7,31.1,26.2,8.7
2017-12-31 22:30:00,22.5,2017.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,19.9,25.9,9.0,14.7,31.1,26.2,8.7
2017-12-31 23:00:00,23.0,2017.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,19.9,25.9,9.0,14.7,31.1,26.2,8.7
2017-12-31 23:30:00,23.5,2017.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,19.9,25.9,9.0,14.7,31.1,26.2,8.7


In [22]:
# shufs = ShuffleSplit(test_size=0.3, random_state=307132)

In [23]:
stscaler = StandardScaler()

<font color = 'purple'><font size = 5>
    K-Nearest Neighbours<br>
    =====================================================<br>

In [25]:
kpipe = Pipeline(steps=[('stsc', StandardScaler()), ('knn', KNeighborsRegressor(n_jobs=-1))])

In [80]:
# knvic = KNeighborsRegressor(n_jobs=-1)

In [32]:
# stscaler.fit(Xvic)                      # So I am first fitting to just the CV (non-holdout) X dataset,
# sXvic = stscaler.transform(Xvic)        # then using the same StandardScaler object to transform the non-holdout
# shoXvic = stscaler.transform(hoXvic)    # and holdout datasets, using the same parameters

In [33]:
%%time
kscore = cross_val_score(kpipe, Xvic, Yvic, cv=9, verbose=1)
# kscore = cross_val_score(knvic, sXvic, Yvic, cv=9)
# kpred = cross_val_predict(knvic, Xvic, Yvic, cv=shufs)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


CPU times: user 2.46 s, sys: 382 ms, total: 2.85 s
Wall time: 16.8 s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   16.8s finished


In [34]:
output_CV(kscore)

Mean CV score:   0.781
Std of CV scores:   0.029


In [35]:
kgrid_params = {'knn__n_neighbors' : [2,3,5,7],#list(range(6,25,6)),
               'knn__weights' : ['distance'],     # in previous gridsearches, distance and minkowski
               'knn__metric' : ['minkowski']}     # consistently came out as best

In [36]:
kngrid_search = GridSearchCV(kpipe, param_grid=kgrid_params, n_jobs=-1, cv = 9)

In [38]:
%%time
kngrid_search.fit(Xvic, Yvic)



CPU times: user 465 ms, sys: 129 ms, total: 594 ms
Wall time: 1min 34s


GridSearchCV(cv=9, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('stsc', StandardScaler(copy=True, with_mean=True, with_std=True)), ('knn', KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
          weights='uniform'))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'knn__n_neighbors': [2, 3, 5, 7], 'knn__weights': ['distance'], 'knn__metric': ['minkowski']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [39]:
give_gridsearch_results(kngrid_search)

Best score:  0.781
Best params:  {'knn__metric': 'minkowski', 'knn__n_neighbors': 5, 'knn__weights': 'distance'}


In [40]:
%%time
bknscore = cross_val_score(kngrid_search.best_estimator_, Xvic, Yvic, n_jobs= -1, cv= 9)

CPU times: user 70 ms, sys: 52.3 ms, total: 122 ms
Wall time: 13.6 s


In [41]:
output_CV(bknscore)

Mean CV score:   0.781
Std of CV scores:   0.029


In [42]:
bstknn = kngrid_search.best_estimator_

In [43]:
bstknn.score(hoXvic, hoYvic.VIC1)

  Xt = transform.transform(Xt)


0.7781869793689156

In [74]:
hoYvic['bestKNN'] = bstknn.predict(hoXvic)
# hoYvic['Month'] = hoYvic.index.month
# hoYvic['mthname'] = hoYvic.index.month_name()
hoYvic.head(2)

Unnamed: 0_level_0,VIC1,bestKNN
SETTLEMENTDATE,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01 00:00:00,4445.07,4407.826539
2018-01-01 00:30:00,4251.18,4231.508702


In [75]:
draw_gridplot(hoYvic, columns=['VIC1','bestKNN'])

<font color = 'purple'><font size = 5>
    Decision Tree<br>
    =====================================================<br>

In [49]:
treebeard = DecisionTreeRegressor()

In [50]:
%%time
dtscore = cross_val_score(treebeard, Xvic, Yvic, n_jobs=-1, cv=9)

CPU times: user 57 ms, sys: 52.3 ms, total: 109 ms
Wall time: 3.73 s


In [51]:
output_CV(dtscore)

Mean CV score:   0.753
Std of CV scores:   0.039


In [52]:
dtparams = {'max_depth' : [None, 15, 30, 50],
           'min_samples_split' : [2, 6, 12, 24],
           'min_samples_leaf' : [1, 5, 10],
           }

In [54]:
dtgridsrch = GridSearchCV(treebeard, param_grid=dtparams, cv=9, n_jobs=-1)

In [55]:
%%time
dtgridsrch.fit(Xvic, Yvic)

CPU times: user 2.02 s, sys: 162 ms, total: 2.18 s
Wall time: 1min 54s


GridSearchCV(cv=9, error_score='raise-deprecating',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [None, 15, 30, 50], 'min_samples_split': [2, 6, 12, 24], 'min_samples_leaf': [1, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [56]:
give_gridsearch_results(dtgridsrch)

Best score:  0.788
Best params:  {'max_depth': 15, 'min_samples_leaf': 10, 'min_samples_split': 24}


In [57]:
trbest = dtgridsrch.best_estimator_

In [59]:
%%time
dtcvscore = cross_val_score(trbest, Xvic, Yvic, cv=9, n_jobs=-1)

CPU times: user 62.1 ms, sys: 58.5 ms, total: 121 ms
Wall time: 2.5 s


In [60]:
output_CV(dtcvscore)

Mean CV score:   0.791
Std of CV scores:   0.037


In [76]:
hoYvic['bestDtree'] = trbest.predict(hoXvic)
hoYvic.head(3)

Unnamed: 0_level_0,VIC1,bestKNN,Month,mthname,bestDtree
SETTLEMENTDATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-01 00:00:00,4445.07,4407.826539,1,January,4293.183
2018-01-01 00:30:00,4251.18,4231.508702,1,January,4232.3175
2018-01-01 01:00:00,4092.53,4229.437022,1,January,4006.117059


In [78]:
draw_gridplot(hoYvic, columns=['VIC1','bestDtree'])

In [61]:
# treebeard.fit(Xvic, Yvic)
# treebeard.score(hoXvic, hoYvic.VIC1)

In [62]:
# hoYvic['dtree_pred'] = treebeard.predict(hoXvic)

In [63]:
# draw_gridplot(hoYvic, columns=['VIC1','dtree_pred'])

In [None]:
# dot_data = StringIO()  

# export_graphviz(dtgridsrch.best_estimator_, out_file='treebeard.dot',  
#                 filled=True, rounded=True,
#                 special_characters=True, feature_names = Xvic.columns)  

# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
# Image(graph.create_png())

<font color = 'purple'><font size = 5>
    Random Forest<br>
    =====================================================<br>

In [79]:
entmoot = RandomForestRegressor(n_jobs=-1)

In [80]:
%%time
rfscore1 = cross_val_score(entmoot, Xvic, Yvic, cv=9, n_jobs=-1)

CPU times: user 87.2 ms, sys: 132 ms, total: 219 ms
Wall time: 26.8 s


In [81]:
output_CV(rfscore1)

Mean CV score:   0.838
Std of CV scores:   0.037


In [82]:
rfparams = {'n_estimators' : [60,70],
           'max_depth' : [30,40],
           'min_samples_leaf' : [3,5],
           }

In [83]:
rfgridsrch = GridSearchCV(entmoot, param_grid=rfparams, cv=9, n_jobs=-1)

In [84]:
%%time
rfgridsrch.fit(Xvic, Yvic)

  self.best_estimator_.fit(X, y, **fit_params)


CPU times: user 53.4 s, sys: 678 ms, total: 54 s
Wall time: 17min 49s


GridSearchCV(cv=9, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [60, 70], 'max_depth': [30, 40], 'min_samples_leaf': [3, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [85]:
give_gridsearch_results(rfgridsrch)

Best score:  0.847
Best params:  {'max_depth': 30, 'min_samples_leaf': 3, 'n_estimators': 70}


In [86]:
bestforest = rfgridsrch.best_estimator_

In [88]:
bestforest.fit(Xvic, Yvic.VIC1)
bestforest.score(hoXvic, hoYvic.VIC1)

0.8534859350024705

In [90]:
hoYvic['bestRForest'] = bestforest.predict(hoXvic)
hoYvic.head(3)

Unnamed: 0_level_0,VIC1,bestKNN,Month,mthname,bestDtree,bestRForest
SETTLEMENTDATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-01 00:00:00,4445.07,4407.826539,1,January,4293.183,4397.718265
2018-01-01 00:30:00,4251.18,4231.508702,1,January,4232.3175,4173.128336
2018-01-01 01:00:00,4092.53,4229.437022,1,January,4006.117059,4033.619608


In [91]:
draw_gridplot(hoYvic, columns=['VIC1', 'bestRForest'])

In [302]:
entmoot.fit(Xvic, Yvic)
entmoot.score(hoXvic, hoYvic.VIC1)

  """Entry point for launching an IPython kernel.


0.8436430091242383

In [304]:
hoYvic['entpredict'] = entmoot.predict(hoXvic)

In [307]:
draw_gridplot(hoYvic, columns=['VIC1','entpredict'])