In [1]:
import warnings

# Silence every warning for the rest of the session to keep cell output compact.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import BaseCrossValidator
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

from scipy.stats import spearmanr

In [3]:
# Relative path of the HDF5 store that holds the 'model_data' frame loaded below.
dataset_path = '../data/model_data.h5'

# Load Data

In [4]:
# Open the store read-only. The default mode ('a') opens the file for appending,
# which silently creates an empty file when the path is wrong and requires
# write access to the data directory even though we only read here.
with pd.HDFStore(dataset_path, mode='r') as store:
    # The raw OHLC price columns are dropped right after loading.
    data = (store['model_data']
            .drop(['Open', 'Close', 'Low', 'High'], axis=1))

In [5]:
# Remove every column whose name contains 'lag', then display the result.
data = data.drop(columns=[name for name in data.columns if 'lag' in name])
data

Unnamed: 0_level_0,Volume,Consumption in mcf,Storage in mcf,US Gross Withdrawal in mcf,Other Gross Withdrawal in mcf,RSI,ATR,MACD,return_1d,return_5d,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-04-09,108772.0,1953071.0,2478.0,2417498.0,93400.0,34.901425,0.089792,-0.081938,0.008617,-0.001794,...,0,1,0,0,0,0,0,0,0,0
2012-04-10,120126.0,1953071.0,2478.0,2417498.0,93400.0,30.556017,0.090521,-0.086858,-0.036070,-0.011507,...,0,1,0,0,0,0,0,0,0,0
2012-04-12,188668.0,1953071.0,2478.0,2417498.0,93400.0,28.170441,0.090913,-0.093349,-0.023634,-0.019393,...,0,1,0,0,0,0,0,0,0,0
2012-04-13,111947.0,1953071.0,2503.0,2417498.0,93400.0,28.072097,0.087276,-0.097552,-0.001009,-0.015414,...,0,1,0,0,0,0,0,0,0,0
2012-04-16,115321.0,1953071.0,2503.0,2417498.0,93400.0,32.512260,0.084828,-0.097134,0.017668,-0.007089,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-07-26,47082.0,2218011.0,2714.0,3396062.0,32500.0,79.676063,0.129331,0.175313,0.010345,0.016538,...,0,0,0,0,1,0,0,0,0,0
2021-07-27,40472.0,2218011.0,2714.0,3396062.0,32500.0,67.674236,0.132950,0.170944,-0.031936,0.004855,...,0,0,0,0,1,0,0,0,0,0
2021-07-28,3877.0,2218011.0,2714.0,3396062.0,32500.0,70.354143,0.137025,0.171060,0.018383,0.004258,...,0,0,0,0,1,0,0,0,0,0
2021-07-29,113681.0,2218011.0,2714.0,3396062.0,32500.0,70.888206,0.139381,0.170352,0.003709,0.002782,...,0,0,0,0,1,0,0,0,0,0


In [6]:
# Summarise column dtypes and non-null counts to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2330 entries, 2012-04-09 to 2021-07-30
Data columns (total 40 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Volume                         2330 non-null   float64
 1   Consumption in mcf             2330 non-null   float64
 2   Storage in mcf                 2330 non-null   float64
 3   US Gross Withdrawal in mcf     2330 non-null   float64
 4   Other Gross Withdrawal in mcf  2330 non-null   float64
 5   RSI                            2330 non-null   float64
 6   ATR                            2330 non-null   float64
 7   MACD                           2330 non-null   float64
 8   return_1d                      2330 non-null   float64
 9   return_5d                      2330 non-null   float64
 10  return_10d                     2330 non-null   float64
 11  return_21d                     2330 non-null   float64
 12  return_42d                    

In [7]:
# Targets: every column whose name contains 'target'.
y = data.filter(like='target')
# Features: everything that is not a target column, with 'Volume' excluded too.
X = data.drop(columns=list(y.columns) + ['Volume'])

In [8]:

class MultipleTimeSeriesCV(BaseCrossValidator):
    """Walk-backwards time-series cross-validator keyed on a 'Date' index level.

    Generates ``(train_idx, test_idx)`` pairs of positional row indices.
    Splits are laid out from the most recent date backwards: split 0 tests on
    the latest ``test_period_length`` dates, split 1 on the block of dates
    before that, and so on. Each training window precedes its test window and
    is separated from it by ``lookahead - 1`` dates.

    Parameters
    ----------
    n_splits : int, default 3
        Number of train/test pairs to generate.
    train_period_length : int, default 126
        Base length of the training window in unique dates; the window
        actually spans ``train_period_length + lookahead - 1`` dates.
    test_period_length : int, default 21
        Number of unique dates in each test window.
    lookahead : int
        Forecast horizon in dates. Must be set before calling ``split``;
        the default ``None`` is rejected there with a ``TypeError``.
    shuffle : bool, default False
        If True, the training indices of every split are returned in random
        order (drawn from NumPy's global random state).
    """

    def __init__(self,
                 n_splits=3,
                 train_period_length=126,
                 test_period_length=21,
                 lookahead=None,
                 shuffle=False):
        self.n_splits = n_splits
        self.lookahead = lookahead
        self.test_length = test_period_length
        self.train_length = train_period_length
        self.shuffle = shuffle

    def split(self, X, y=None, groups=None):
        """Yield ``(train_idx, test_idx)`` for each of the ``n_splits`` folds.

        ``X`` must have an index (or index level) named 'Date'. ``y`` and
        ``groups`` are accepted for API compatibility and ignored.

        Raises
        ------
        TypeError
            If ``lookahead`` was left at ``None``.
        """
        if self.lookahead is None:
            # Fail with an explicit message instead of the opaque
            # "unsupported operand" error the index arithmetic would raise.
            raise TypeError("lookahead must be an integer; "
                            "pass lookahead=... when constructing the splitter")

        unique_dates = X.index.get_level_values('Date').unique()
        # Most recent date first, so position 0 is the latest observation.
        days = sorted(unique_dates, reverse=True)

        split_idx = []
        for i in range(self.n_splits):
            # Positions index into `days`; a larger position is an older date.
            test_end_idx = i * self.test_length
            test_start_idx = test_end_idx + self.test_length
            train_end_idx = test_start_idx + self.lookahead - 1
            train_start_idx = train_end_idx + self.train_length + self.lookahead - 1
            split_idx.append([train_start_idx, train_end_idx,
                              test_start_idx, test_end_idx])

        # After reset_index the frame index is positional, so the filtered
        # `.index` values are row positions into X.
        dates = X.reset_index()[['Date']]
        for train_start, train_end, test_start, test_end in split_idx:
            train_idx = dates[(dates.Date > days[train_start])
                              & (dates.Date <= days[train_end])].index
            test_idx = dates[(dates.Date > days[test_start])
                             & (dates.Date <= days[test_end])].index
            if self.shuffle:
                # Reorder the training indices themselves; shuffling a
                # temporary list copy would leave train_idx untouched.
                train_idx = train_idx[np.random.permutation(len(train_idx))]
            yield train_idx, test_idx

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the configured number of splits; all arguments are ignored."""
        return self.n_splits

In [9]:
# Cross-validation configuration (all lengths are counted in unique dates).
n_splits = 20  # number of train/test folds
train_period_length = 300  # base training window; the splitter extends it by lookahead - 1
test_period_length = 100  # dates per test fold
lookahead = 5  # leaves lookahead - 1 = 4 dates between each train and test window

# Splitter shared by every model evaluated below.
cv = MultipleTimeSeriesCV(n_splits=n_splits,
                          train_period_length=train_period_length,
                          test_period_length=test_period_length,
                          lookahead=lookahead)

In [10]:
# Utility functions

def display_score(scores):
    """Print the per-fold scores, then their mean and standard deviation.

    Parameters
    ----------
    scores : array-like exposing ``mean()`` and ``std()``
        Typically the array returned by ``cross_val_score``.
    """
    print('scores: ', scores)
    average = scores.mean()
    print('mean: ', average)
    spread = scores.std()
    print('standard deviation: ', spread)


def rank_correl(y, y_pred):
    """Return the Spearman rank correlation between ``y`` and ``y_pred``.

    Both inputs are passed to ``spearmanr`` with ``axis=None``; only the
    correlation coefficient is returned, the p-value is discarded.
    """
    rho, _p_value = spearmanr(y, y_pred, axis=None)
    return rho

# Wrap rank_correl as a scikit-learn scorer so it can be passed as `scoring`
# to cross_val_score.
ic = make_scorer(rank_correl)


def get_cross_val_score(model, X, y, score_fun, cv):
    """Cross-validate ``model`` and print a summary of the fold scores.

    Parameters
    ----------
    model : estimator passed to ``cross_val_score``
    X, y : features and target(s)
    score_fun : value forwarded as ``scoring``
    cv : cross-validation splitter

    Returns
    -------
    None
        The scores are only printed via ``display_score``.
    """
    # Folds are evaluated in parallel on all available workers.
    options = dict(scoring=score_fun, cv=cv, n_jobs=-1, verbose=1)
    fold_scores = cross_val_score(model, X, y, **options)
    display_score(fold_scores)

# Decision Tree Regressor

In [11]:
# Fully grown regression tree (no depth or leaf-size limits).
# max_features=None considers every feature at each split, which is what
# 'auto' meant for regressors; unlike 'auto' it is still accepted by current
# scikit-learn releases. The fixed seed makes the fold scores repeatable.
dt_reg = DecisionTreeRegressor(max_depth=None,
                               min_samples_split=2,
                               min_samples_leaf=1,
                               max_features=None,
                               random_state=42)


In [12]:
# Score the decision tree with the `ic` scorer on every fold produced by `cv`.
get_cross_val_score(dt_reg, X, y, ic, cv)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 3 concurrent workers.


scores:  [ 0.01329095  0.12477587  0.2137844   0.18141166 -0.13071338  0.0379064
  0.16386958 -0.09528192  0.17224216  0.08703313  0.03817422  0.20064852
  0.01276272  0.0691651   0.01216503  0.04083768 -0.00575801  0.12423133
  0.05150286 -0.12109944]
mean:  0.05954744340205091
standard deviation:  0.09909648335620777


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    1.6s finished


# Random Forest Regressor

In [13]:
# Random forest of 100 fully grown trees fitted on bootstrap samples.
# Changes relative to the original call, none of which alter the model:
#   * max_features=None uses all features per split, which is what 'auto'
#     meant for regressors; 'auto' is rejected by current scikit-learn.
#   * min_impurity_split is omitted: it was passed as None (no effect) and is
#     no longer a constructor argument in current scikit-learn.
# random_state is fixed so the fold scores are repeatable across runs.
rf_reg = RandomForestRegressor(n_estimators=100,
                                max_depth=None,
                                min_samples_split=2,
                                min_samples_leaf=1,
                                min_weight_fraction_leaf=0.0,
                                max_features=None,
                                max_leaf_nodes=None,
                                min_impurity_decrease=0.0,
                                bootstrap=True,
                                oob_score=False,
                                n_jobs=-1,
                                random_state=42,
                                verbose=0,
                                warm_start=False)

In [14]:
# Score the random forest with the `ic` scorer on every fold produced by `cv`.
get_cross_val_score(rf_reg, X, y, ic, cv)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 3 concurrent workers.


scores:  [ 0.13429678  0.11285658  0.19380196  0.109211   -0.12217483  0.31564849
  0.29530436 -0.05807699  0.13123657  0.16527891  0.11920887  0.14896687
  0.0683365   0.12769027 -0.0161251   0.13850808  0.0574783   0.1315662
  0.13594397 -0.00191814]
mean:  0.10935193210987773
standard deviation:  0.10166093617620925


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    4.0s finished
