## Create Decision Tree Model to Forecast Returns

#### Import Packages

In [159]:
import pandas as pd
import datetime
import gc
from sklearn.ensemble import (
BaggingRegressor, RandomForestRegressor, AdaBoostRegressor)
from sklearn.metrics import mean_squared_error
from technical_indicators import * # import all function
from sklearn.model_selection import TimeSeriesSplit
#import parfit as pf
from sklearn.metrics import r2_score
import numpy as np
from sklearn.model_selection import ParameterGrid
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
from sklearn import linear_model

#### Read in Data via GitHub URL

In [143]:
url = "https://raw.githubusercontent.com/meenmo/Stat479_Project/master/Data/IBM.csv"
df_ORIGINAL = pd.read_csv(url)


***
## Clean Data & Create Technical Indicator Variables

- Create Deep copy of dataframe
- Use Adjusted Close Data
- Drop Close 
- Rename "Adj. Close" as "Close"

In [144]:
df_features = df_ORIGINAL.copy(deep=True) # Create Deep
df_features.drop(['Close'], axis = 1, inplace = True) # drop close column
df_features.columns = ['Date', 'High', 'Low', 'Open', 'Volume', 'Close'] # Close is actually Adj. Close

df_features['Date'] = pd.to_datetime(df_features['Date'])
df_features.head() # sanity check


Unnamed: 0,Date,High,Low,Open,Volume,Close
0,2002-01-02,121.5,119.800003,120.599999,6862800,84.677422
1,2002-01-03,124.220001,120.25,121.5,8621700,86.1828
2,2002-01-04,125.599999,123.980003,124.050003,8405200,87.534859
3,2002-01-07,126.190002,123.699997,125.0,5939600,86.454575
4,2002-01-08,125.199997,123.730003,124.25,5311800,86.9076


#### Function: Create Lagged Returns

In [145]:
"""
Creates Lagged Returns 
- given OHLCV dataframe
- numer of lagged days
"""
def create_lag_features(df, lag_days):
    df_ret = df.copy()
    
    # iterate through the lag days to generate lag values up to lag_days + 1
    for i in range(1,lag_days + 2):
        df_lag = df_ret[['Date', 'Close']].copy()
        # generate dataframe to shift index by i day.
        df_lag['Date'] = df_lag['Date'].shift(-i)
        df_lag.columns = ['Date', 'value_lag' + str(i)]
        # combine the valuelag
        df_ret = pd.merge(df_ret, df_lag, how = 'left', left_on = ['Date'], right_on = ['Date'])
    
    #frees memory
    del df_lag
    
    # calculate today's percentage lag
    df_ret['Today'] = (df_ret['Close'] - df_ret['value_lag1'])/(df_ret['value_lag1']) * 100.0 
    
    # calculate percentage lag
    for i in range(1, lag_days + 1):
        df_ret['lag' + str(i)] = (df_ret['value_lag'+ str(i)] - df_ret['value_lag'+ str(i+1)])/(df_ret['value_lag'+str(i+1)]) * 100.0
    
    # drop unneeded columns which are value_lags
    for i in range(1, lag_days + 2):
        df_ret.drop(['value_lag' + str(i)], axis = 1, inplace = True)
                                                                                                                                                                                                                                                                                                                                     
    return df_ret


### Run Function
df_features = create_lag_features(df_features, 5) # 5 lag features
df_features.head(7)

Unnamed: 0,Date,High,Low,Open,Volume,Close,Today,lag1,lag2,lag3,lag4,lag5
0,2002-01-02,121.5,119.800003,120.599999,6862800,84.677422,,,,,,
1,2002-01-03,124.220001,120.25,121.5,8621700,86.1828,1.777781,,,,,
2,2002-01-04,125.599999,123.980003,124.050003,8405200,87.534859,1.568826,1.777781,,,,
3,2002-01-07,126.190002,123.699997,125.0,5939600,86.454575,-1.234119,1.568826,1.777781,,,
4,2002-01-08,125.199997,123.730003,124.25,5311800,86.9076,0.524004,-1.234119,1.568826,1.777781,,
5,2002-01-09,126.389999,124.150001,124.699997,6839900,86.761269,-0.168376,0.524004,-1.234119,1.568826,1.777781,
6,2002-01-10,124.0,121.419998,123.75,8926300,85.123428,-1.887755,-0.168376,0.524004,-1.234119,1.568826,1.777781


#### Drop Rows with NaN

In [146]:
# drop earlier data with missing lag features
df_features.dropna(inplace=True)
# reset index
df_features.reset_index(drop = True, inplace = True)


#### Create Technical Indicators

- Create Technical Indicators
- Call functions from `technical_inicators.py`
- Drop Rows with NaN
  

In [147]:

#### GENERATE TECHNICAL INDICATORS FEATURES
df_features = standard_deviation(df_features, 14)

df_features = relative_strength_index(df_features, 14) # periods
df_features = average_directional_movement_index(df_features, 14, 13) # n, n_ADX
df_features = moving_average(df_features, 21) # periods
df_features = exponential_moving_average(df_features, 21) # periods
df_features = momentum(df_features, 14) # 

df_features = average_true_range(df_features, 14)
df_features = bollinger_bands(df_features, 21)
df_features = ppsr(df_features)
df_features = stochastic_oscillator_k(df_features)
df_features = stochastic_oscillator_d(df_features, 14)
df_features = trix(df_features, 14)
df_features = macd(df_features, 26, 12)
df_features = mass_index(df_features)
df_features = vortex_indicator(df_features, 14)

df_features = kst_oscillator(df_features, 10, 10, 10, 15, 10, 15, 20, 30)
df_features = true_strength_index(df_features, 25, 13)

#df_features = accumulation_distribution(df_features, 14) # Causes Problems, apparently
df_features = chaikin_oscillator(df_features)
df_features = money_flow_index(df_features, 14)
df_features = on_balance_volume(df_features, 14)
df_features = force_index(df_features, 14)
df_features = ease_of_movement(df_features, 14)
df_features = commodity_channel_index(df_features, 14)
df_features = keltner_channel(df_features, 14)
df_features = ultimate_oscillator(df_features)
df_features = donchian_channel(df_features, 14)
    
# drop earlier data with missing lag features
df_features.dropna(inplace=True)
df_features = df_features.reset_index(drop = True)


In [148]:
### Sanity Check
df_features.head(10)


Unnamed: 0,Date,High,Low,Open,Volume,Close,Today,lag1,lag2,lag3,...,MFI_14,OBV_14,Force_14,EoM_14,CCI_14,KelChM_14,KelChU_14,KelChD_14,Ultimate_Osc,Donchian_14
0,2002-03-14,107.949997,106.589996,107.019997,5335500,74.391068,-0.541176,-1.21658,3.097695,0.142742,...,0.714286,1962464.0,-29211570.0,2.286739e-07,0.95996,93.032764,95.832764,90.232765,0.034641,13.389999
1,2002-03-15,107.449997,105.589996,106.550003,10864100,74.523666,0.178246,-0.541176,-1.21658,3.097695,...,0.642857,3445814.0,5695471.0,2.086708e-07,0.713277,93.570735,96.351449,90.79002,0.031385,13.389999
2,2002-03-18,108.639999,106.230003,107.099999,5301200,74.216637,-0.41199,0.178246,-0.541176,-1.21658,...,0.714286,3898193.0,-40661560.0,2.500726e-07,0.779588,94.192408,96.916693,91.468123,0.058735,13.389999
3,2002-03-19,108.050003,106.489998,106.849999,4614800,75.012154,1.071885,-0.41199,0.178246,-0.541176,...,0.714286,3372200.0,-49641770.0,2.26392e-07,0.762796,94.770295,97.348152,92.192438,0.057835,13.389999
4,2002-03-20,106.900001,105.489998,106.900001,4844100,73.623466,-1.851283,1.071885,-0.41199,0.178246,...,0.642857,2459121.0,-15939280.0,2.007584e-07,0.060704,95.249347,97.800775,92.697918,0.026201,13.389999
5,2002-03-21,106.779999,104.699997,105.699997,5113100,74.516685,1.213226,-1.851283,1.071885,-0.41199,...,0.571429,2074850.0,-14116110.0,1.220918e-07,-0.248791,95.547059,97.918488,93.17563,-0.008129,13.389999
6,2002-03-22,106.699997,105.07,106.5,5507900,73.693192,-1.105114,1.213226,-1.851283,1.071885,...,0.5,912307.1,1101343.0,3.426329e-08,-0.533297,95.592311,97.837311,93.347311,-0.02132,13.389999
7,2002-03-25,106.660004,103.5,105.800003,6277900,72.269615,-1.931761,-1.105114,1.213226,-1.851283,...,0.428571,1004036.0,1890940.0,-5.730833e-09,-1.582313,95.545348,97.84892,93.241776,-0.045311,13.389999
8,2002-03-26,105.699997,102.300003,103.57,8144000,71.809052,-0.637285,-1.931761,-1.105114,1.213226,...,0.357143,-126778.6,-1083363.0,-5.076478e-08,-1.978289,95.397427,97.776713,93.018141,-0.061006,11.739998
9,2002-03-27,103.790001,102.5,103.75,5602400,72.150978,0.476161,-0.637285,-1.931761,-1.105114,...,0.357143,1057100.0,1198994.0,-4.957376e-08,-1.969496,95.286158,97.453301,93.119015,-0.040421,11.739998


***
## Train-Test Split

#### (Subject to change depending)
-  Train     : 2002 - 2017
    -  Validation: 2015 - 2017
    - OR         : Time Series Validation from the website
-  Test      : 2018 - Oct 31, 2018

#### In this Notebook
- Train : 2002 - 2017
- Test  : 2018 - Oct 31, 2018


#### Note
- y is Todays Daily Return.
- We wan't to PREDICT Tomorrows Daily Return
- Hence, we train the model on data __[start, today]__ and __predict y[today + 1]__ and NOT predict y[today]




In [149]:

# do time series split DOING NUMBER 2
splits = TimeSeriesSplit(n_splits=5)

# stores all labels
y = df_features['Today']

# ### Do not need to drop. Just don't use tomorrows data to predict tomorrows prices.
# ### Hence, only use todays data to predict tomorrows prices
# # drop unneded columns
#df_features.drop(['Open','Close','Low','High','Today'], axis = 1, inplace = True)

# # all training data
X_train_all = df_features.loc[(df_features['Date'] < '2018-01-01')]
y_train_all = y[X_train_all.index]

# # creates all test data which is all after January 2018
X_test = df_features.loc[(df_features['Date'] >= '2018-01-01'),:]
y_test = y[X_test.index]


### Train-Validation Split & Train Models

##### Run Models
- Random Forest
- Bagging
- Boosting
- Linear Regression

##### NOTE
- Need to retrain the model for each prediction. Suppose 200 Days total.
- Hence,
   - Train_1[start date, 100] --> Predict y[101]
   - Train_2[start date, 101] --> Predict y[102]
   - Train_3[start date, 103] --> Predict y[103]
   - ...
   - Train_199[start date, 198] --> Predict y[199]
   - Train_200[start date, 199] --> Predict y[200]
- Thus, to forecast 200 daily returns we need to train 200 different models!!!
   


#### Jonathans Code

In [100]:
# Initialize empty arrays
trainsamples  = [] # [664, 1327, 1990, 2653, 3316] # sizes of training samples
model1metrics = [] # Random Forest
model2metrics = [] # Bagging
model3metrics = [] # Boosting
model4metrics = [] # Linear Regression


for train_index, test_index in splits.split(X_train_all.loc[:,'Date']):
    
    # do the split for train, Take row 0 to last element of train index
    X_train = X_train_all.loc[0:train_index[len(train_index) - 1],:].copy(deep=True)
    y_train = y[X_train.index]
        
    # do the split for validation, Take first element of test index to last element of test index
    X_val = X_train_all.loc[test_index[0]:test_index[len(test_index) - 1],:].copy(deep=True)
    y_val = y[X_val.index]
        
    # Print Statements
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print('Observations: ', (X_train.shape[0] + X_test.shape[0]))
    print('Cutoff date, or first date in validation data: ', X_val.iloc[0,0])
    print('Training Observations: ', (X_train.shape[0]))
    print('Testing Observations: ', (X_test.shape[0]))
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    
    # drop date
    X_train.drop(['Date'], axis = 1, inplace = True)
    X_val.drop(['Date'], axis = 1, inplace = True)
    
    # Append current size of training sample
    trainsamples.append(X_train.shape[0]) # .shape[0] --> num rows
    
### Random Forest
    # random forest regression based on default parameter
    rf = RandomForestRegressor(random_state = 0)
    rf.fit(X_train, y_train)
    model1metrics.append([mean_squared_error(y_val, rf.predict(X_val)),r2_score(y_val, rf.predict(X_val))])
    
# ### Bagging
#     # bagging based on default parameter
#     bag = BaggingRegressor(DecisionTreeRegressor())
#     bag.fit(X_train, y_train)
#     model2metrics.append([mean_squared_error(y_val, bag.predict(X_val)),r2_score(y_val, bag.predict(X_val))])

# ### Boosting
#     # boosting on default parameter
#     boost = AdaBoostRegressor(DecisionTreeRegressor(), random_state = 0, learning_rate=0.01)
#     boost.fit(X_train, y_train)
#     model3metrics.append([mean_squared_error(y_val, boost.predict(X_val)),r2_score(y_val, boost.predict(X_val))])
    
# ### Linear Regression
#     # linear regression on default parameter
#     lr = linear_model.LinearRegression()
#     lr.fit(X_train, y_train)
#     model4metrics.append([mean_squared_error(y_val, lr.predict(X_val)),r2_score(y_val, lr.predict(X_val))])

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Observations:  875
Cutoff date, or first date in validation data:  2004-11-01 00:00:00
Training Observations:  664
Testing Observations:  211
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Observations:  1538
Cutoff date, or first date in validation data:  2007-06-21 00:00:00
Training Observations:  1327
Testing Observations:  211
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Observations:  2201
Cutoff date, or first date in validation data:  2010-02-08 00:00:00
Training Observations:  1990
Testing Observations:  211
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Observations:  2864
Cutoff date, or first date in validation data:  2012-09-24 00:00:00
Training Observations:  2653
Testing Observations:  211
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Observations:  3527
Cutoff date, or first date in validation data:  2015-05-15 00:00:00
Training Observations:  3316
Testing Observations:  211
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




In [110]:
model1metrics

[[0.0045937039913561935, 0.9954591784110058],
 [0.07477435108321685, 0.9794299412602301],
 [0.0004523796920241774, 0.9996845945462203],
 [0.0012142172454971058, 0.9990749091061193],
 [0.000794832356181995, 0.9994342788419209]]

#### Sams Code

In [None]:
# Initialize empty arrays
trainsamples  = [] # [664, 1327, 1990, 2653, 3316] # sizes of training samples
model1metrics = [] # Random Forest
model2metrics = [] # Bagging
model3metrics = [] # Boosting
model4metrics = [] # Linear Regression


### Random Forest - Train-Validate model
# in this case we can consider validation set as a test set

# Set the number of predictions we want to make
# Note - Num Predictions == Num models we'll need to train
trainSize = 3960 # 3316 # chosen arbitrarily
numPreds = X_train_all.shape[0] - trainSize # Number of Predictions = numRows - training size

# drop date
X_train_all_temp = X_train_all.copy(deep=True)
X_train_all_temp.drop(['Date'], axis = 1, inplace = True) # expensive so just do once here

# for each prediction # 662 predictions
for i in range(numPreds): # i = 0,1,2,...,661
    trainSize += 1
    # set temp train/test sets
    
    # do the split for train, Take row 0 to last element of train index
    X_train = X_train_all_temp.loc[0:trainSize,:]
    y_train = y[X_train.index]
    #y_train = y_train.iloc[1:] # 
    
    # do the split for validation, Take first element of test index to last element of test index
    X_val = X_train_all_temp.loc[trainSize:,:]
    y_val = y[X_val.index]
    
    trainsamples.append(X_train.shape[0])

    # random forest regression based on default parameter
    #rf = RandomForestRegressor(random_state = 0)
    #rf.fit(X_train, y_train)
    #rf.predict(X_val)
    #model1metrics.append([mean_squared_error(y_val, rf.predict(X_val)),r2_score(y_val, rf.predict(X_val))])
    






In [136]:
X_val.head()

Unnamed: 0,Date,High,Low,Open,Volume,Close,Today,lag1,lag2,lag3,...,MFI_14,OBV_14,Force_14,EoM_14,CCI_14,KelChM_14,KelChU_14,KelChD_14,Ultimate_Osc,Donchian_14
3969,2017-12-15,153.800003,152.029999,153.610001,11279900,145.900604,-0.974037,0.058476,-1.805529,0.855812,...,0.571429,1085450.0,3933165.0,2.16421e-08,-1.347368,152.202559,153.968988,150.43613,-0.091807,6.719986
3970,2017-12-18,154.179993,153.210007,153.589996,5092800,146.694687,0.544263,-0.974037,0.058476,-1.805529,...,0.571429,1165386.0,920761.0,2.650518e-08,-0.851382,152.287863,154.03572,150.540006,0.013931,7.399994
3971,2017-12-19,154.169998,153.089996,154.050003,4116400,146.599014,-0.065219,0.544263,-0.974037,0.058476,...,0.5,507164.3,300733.4,1.36873e-08,-0.97677,152.315812,154.037955,150.593669,-0.103907,8.199997
3972,2017-12-20,153.889999,152.779999,153.649994,3785700,146.331131,-0.182732,-0.065219,0.544263,-0.974037,...,0.428571,-227585.7,2649600.0,-4.985668e-09,-1.160609,152.273053,153.979481,150.566624,-0.18284,8.809998
3973,2017-12-21,153.460007,151.490005,153.169998,4153900,144.943878,-0.948023,-0.182732,-0.065219,0.544263,...,0.357143,-922000.0,4410163.0,-4.007323e-08,-1.718888,152.12784,153.824269,150.431412,-0.439012,10.589996


In [107]:
y_val.head()
rf.predict(X_val).head()

3316   -0.453896
3317   -0.115463
3318    0.242721
3319    0.161402
3320   -0.241713
Name: Today, dtype: float64

In [112]:
X_train_all
X_train_all.shape[0]

3979

In [117]:
train_index[len(train_index) - 1]

3315

In [157]:
X_val

Unnamed: 0,High,Low,Open,Volume,Close,Today,lag1,lag2,lag3,lag4,...,MFI_14,OBV_14,Force_14,EoM_14,CCI_14,KelChM_14,KelChU_14,KelChD_14,Ultimate_Osc,Donchian_14
3969,153.800003,152.029999,153.610001,11279900,145.900604,-0.974037,0.058476,-1.805529,0.855812,0.387562,...,0.571429,1085450.0,3933165.0,2.16421e-08,-1.347368,152.202559,153.968988,150.43613,-0.091807,6.719986
3970,154.179993,153.210007,153.589996,5092800,146.694687,0.544263,-0.974037,0.058476,-1.805529,0.855812,...,0.571429,1165386.0,920761.0,2.650518e-08,-0.851382,152.287863,154.03572,150.540006,0.013931,7.399994
3971,154.169998,153.089996,154.050003,4116400,146.599014,-0.065219,0.544263,-0.974037,0.058476,-1.805529,...,0.5,507164.3,300733.4,1.36873e-08,-0.97677,152.315812,154.037955,150.593669,-0.103907,8.199997
3972,153.889999,152.779999,153.649994,3785700,146.331131,-0.182732,-0.065219,0.544263,-0.974037,0.058476,...,0.428571,-227585.7,2649600.0,-4.985668e-09,-1.160609,152.273053,153.979481,150.566624,-0.18284,8.809998
3973,153.460007,151.490005,153.169998,4153900,144.943878,-0.948023,-0.182732,-0.065219,0.544263,-0.974037,...,0.357143,-922000.0,4410163.0,-4.007323e-08,-1.718888,152.12784,153.824269,150.431412,-0.439012,10.589996
3974,153.0,151.5,151.820007,2990600,145.900604,0.660067,-0.948023,-0.182732,-0.065219,0.544263,...,0.357143,-1041550.0,6341039.0,-1.003256e-07,-1.373146,151.862158,153.542159,150.182158,-0.155801,10.589996
3975,153.860001,152.5,152.509995,2479000,146.216324,0.216394,0.660067,-0.948023,-0.182732,-0.065219,...,0.428571,-502478.6,6241988.0,-5.734949e-08,-0.683644,151.684278,153.314278,150.054279,-0.11259,9.300003
3976,153.179993,152.610001,152.949997,2149300,146.503357,0.196307,0.216394,0.660067,-0.948023,-0.182732,...,0.428571,-105335.7,1170610.0,-3.799101e-08,-0.657686,151.554563,153.073133,150.035993,0.054648,7.970001
3977,154.119995,153.199997,153.199997,2687600,147.373947,0.594246,0.196307,0.216394,0.660067,-0.948023,...,0.5,356000.0,-487174.9,1.001581e-08,0.007211,151.555983,153.055267,150.056699,0.082128,6.900009
3978,154.720001,153.419998,154.169998,3327100,146.780777,-0.402493,0.594246,0.196307,0.216394,0.660067,...,0.5,-133100.0,256929.8,8.395639e-09,0.106578,151.513843,153.000271,150.027416,0.008701,5.600006


In [134]:
trainSize

216194