## Create Decision Tree Model to Forecast Returns

#### Import Packages

In [18]:
import pandas as pd
import datetime
import gc
from sklearn.ensemble import (
BaggingRegressor, RandomForestRegressor, AdaBoostRegressor)
from sklearn.metrics import mean_squared_error
from technical_indicators import * # import all function
from sklearn.model_selection import TimeSeriesSplit
#import parfit as pf
from sklearn.metrics import r2_score
import numpy as np
from sklearn.model_selection import ParameterGrid
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
from sklearn import linear_model

#### Read in Data via GitHub URL

In [19]:
url = "https://raw.githubusercontent.com/meenmo/Stat479_Project/master/Data/IBM.csv"
df_ORIGINAL = pd.read_csv(url)


***
## Clean Data & Create Technical Indicator Variables

- Create Deep copy of dataframe
- Use Adjusted Close Data
- Drop Close 
- Rename "Adj. Close" as "Close"

In [20]:
df_features = df_ORIGINAL.copy(deep=True) # Create Deep
df_features.drop(['Close'], axis = 1, inplace = True) # drop close column
df_features.columns = ['Date', 'High', 'Low', 'Open', 'Volume', 'Close'] # Close is actually Adj. Close

df_features['Date'] = pd.to_datetime(df_features['Date'])
df_features.head() # sanity check


Unnamed: 0,Date,High,Low,Open,Volume,Close
0,2002-01-02,121.5,119.800003,120.599999,6862800,84.677422
1,2002-01-03,124.220001,120.25,121.5,8621700,86.1828
2,2002-01-04,125.599999,123.980003,124.050003,8405200,87.534859
3,2002-01-07,126.190002,123.699997,125.0,5939600,86.454575
4,2002-01-08,125.199997,123.730003,124.25,5311800,86.9076


#### Function: Create Lagged Returns

In [21]:
"""
Creates Lagged Returns 
- given OHLCV dataframe
- numer of lagged days
"""
def create_lag_features(df, lag_days):
    df_ret = df.copy()
    
    # iterate through the lag days to generate lag values up to lag_days + 1
    for i in range(1,lag_days + 2):
        df_lag = df_ret[['Date', 'Close']].copy()
        # generate dataframe to shift index by i day.
        df_lag['Date'] = df_lag['Date'].shift(-i)
        df_lag.columns = ['Date', 'value_lag' + str(i)]
        # combine the valuelag
        df_ret = pd.merge(df_ret, df_lag, how = 'left', left_on = ['Date'], right_on = ['Date'])
    
    #frees memory
    del df_lag
    
    # calculate today's percentage lag
    df_ret['Today'] = (df_ret['Close'] - df_ret['value_lag1'])/(df_ret['value_lag1']) * 100.0 
    
    # calculate percentage lag
    for i in range(1, lag_days + 1):
        df_ret['lag' + str(i)] = (df_ret['value_lag'+ str(i)] - df_ret['value_lag'+ str(i+1)])/(df_ret['value_lag'+str(i+1)]) * 100.0
    
    # drop unneeded columns which are value_lags
    for i in range(1, lag_days + 2):
        df_ret.drop(['value_lag' + str(i)], axis = 1, inplace = True)
                                                                                                                                                                                                                                                                                                                                     
    return df_ret


### Run Function
df_features = create_lag_features(df_features, 30) # 3 lag features
df_features.head(7)

Unnamed: 0,Date,High,Low,Open,Volume,Close,Today,lag1,lag2,lag3,...,lag21,lag22,lag23,lag24,lag25,lag26,lag27,lag28,lag29,lag30
0,2002-01-02,121.5,119.800003,120.599999,6862800,84.677422,,,,,...,,,,,,,,,,
1,2002-01-03,124.220001,120.25,121.5,8621700,86.1828,1.777781,,,,...,,,,,,,,,,
2,2002-01-04,125.599999,123.980003,124.050003,8405200,87.534859,1.568826,1.777781,,,...,,,,,,,,,,
3,2002-01-07,126.190002,123.699997,125.0,5939600,86.454575,-1.234119,1.568826,1.777781,,...,,,,,,,,,,
4,2002-01-08,125.199997,123.730003,124.25,5311800,86.9076,0.524004,-1.234119,1.568826,1.777781,...,,,,,,,,,,
5,2002-01-09,126.389999,124.150001,124.699997,6839900,86.761269,-0.168376,0.524004,-1.234119,1.568826,...,,,,,,,,,,
6,2002-01-10,124.0,121.419998,123.75,8926300,85.123428,-1.887755,-0.168376,0.524004,-1.234119,...,,,,,,,,,,


#### Drop Rows with NaN

In [22]:
# drop earlier data with missing lag features
df_features.dropna(inplace=True)
# reset index
df_features.reset_index(drop = True, inplace = True)


#### Create Technical Indicators

- Create Technical Indicators
- Call functions from `technical_inicators.py`
- Drop Rows with NaN
  

In [23]:

#### GENERATE TECHNICAL INDICATORS FEATURES
df_features = standard_deviation(df_features, 14)

df_features = relative_strength_index(df_features, 14) # periods
df_features = average_directional_movement_index(df_features, 14, 13) # n, n_ADX
df_features = moving_average(df_features, 21) # periods
df_features = exponential_moving_average(df_features, 21) # periods
df_features = momentum(df_features, 14) # 

df_features = average_true_range(df_features, 14)
df_features = bollinger_bands(df_features, 21)
df_features = ppsr(df_features)
df_features = stochastic_oscillator_k(df_features)
df_features = stochastic_oscillator_d(df_features, 14)
df_features = trix(df_features, 14)
df_features = macd(df_features, 26, 12)
df_features = mass_index(df_features)
df_features = vortex_indicator(df_features, 14)

df_features = kst_oscillator(df_features, 10, 10, 10, 15, 10, 15, 20, 30)
df_features = true_strength_index(df_features, 25, 13)

#df_features = accumulation_distribution(df_features, 14) # Causes Problems, apparently
df_features = chaikin_oscillator(df_features)
df_features = money_flow_index(df_features, 14)
df_features = on_balance_volume(df_features, 14)
df_features = force_index(df_features, 14)
df_features = ease_of_movement(df_features, 14)
df_features = commodity_channel_index(df_features, 14)
df_features = keltner_channel(df_features, 14)
df_features = ultimate_oscillator(df_features)
df_features = donchian_channel(df_features, 14)
    
# drop earlier data with missing lag features
df_features.dropna(inplace=True)
df_features = df_features.reset_index(drop = True)


In [24]:
### Sanity Check
df_features.head(10)

Unnamed: 0,Date,High,Low,Open,Volume,Close,Today,lag1,lag2,lag3,...,MFI_14,OBV_14,Force_14,EoM_14,CCI_14,KelChM_14,KelChU_14,KelChD_14,Ultimate_Osc,Donchian_14
0,2002-04-19,90.029999,87.599998,89.900002,9273200,62.108883,0.056187,4.881574,-1.612569,0.995846,...,0.5,-1787864.0,-35653630.0,-1.540823e-07,-0.296547,81.59287,84.155727,79.030014,-0.062308,8.139999
1,2002-04-22,88.949997,87.360001,88.949997,6757700,61.362175,-1.202256,0.056187,4.881574,-1.612569,...,0.5,-1743257.0,5674225.0,-1.370178e-07,-0.305867,80.752489,83.321773,78.183204,-0.009782,9.739998
2,2002-04-23,89.080002,87.239998,88.349998,5144100,60.943462,-0.682363,-1.202256,0.056187,4.881574,...,0.5,-1564329.0,22078700.0,-1.111541e-07,-0.208521,79.969302,82.484301,77.454302,-0.036853,9.739998
3,2002-04-24,88.25,86.269997,88.199997,6520500,60.364246,-0.950415,-0.682363,-1.202256,0.056187,...,0.428571,-2453543.0,-5924266.0,-1.330461e-07,-0.271156,79.10913,81.648415,76.569845,-0.052266,11.389999
4,2002-04-25,86.919998,85.769997,86.050003,6870000,60.51078,0.24275,-0.950415,-0.682363,-1.202256,...,0.428571,-1262700.0,21564450.0,-1.127969e-07,-0.516442,78.319478,80.655192,75.983764,-0.048195,22.700005
5,2002-04-26,86.959999,84.699997,86.699997,6684200,59.115082,-2.306529,0.24275,-0.950415,-0.682363,...,0.428571,1203236.0,65048460.0,-5.626763e-08,-1.084705,78.220331,80.486045,75.954616,-0.029546,21.550003
6,2002-04-29,85.099998,83.550003,84.709999,6535600,58.542847,-0.968002,-2.306529,0.24275,-0.950415,...,0.357143,-505278.6,29145560.0,-8.666365e-08,-1.681848,77.990409,80.265409,75.715409,-0.108974,21.43
7,2002-04-30,85.230003,83.529999,83.910004,8883900,58.452141,-0.154939,-0.968002,-2.306529,0.24275,...,0.357143,-2078693.0,15607430.0,-9.794218e-08,-1.467087,77.679606,79.951035,75.408178,-0.115874,23.360001
8,2002-05-01,84.650002,81.669998,83.849998,12203800,58.828953,0.64465,-0.154939,-0.968002,-2.306529,...,0.357143,460414.3,-854963.8,-6.938986e-08,-1.679569,77.549767,79.664767,75.434767,-0.073329,23.320007
9,2002-05-02,84.800003,83.129997,83.75,6582900,58.521915,-0.521915,0.64465,-0.154939,-0.968002,...,0.357143,-1139743.0,11215360.0,-5.890175e-08,-1.216196,77.41538,79.395381,75.435379,-0.077745,22.360001


***
## Train-Test Split

#### (Subject to change depending)
-  Train     : 2002 - 2017
    -  Validation: 2015 - 2017
    - OR         : Time Series Validation from the website
-  Test      : 2018 - Oct 31, 2018

#### In this Notebook
- Train : 2002 - 2017
- Test  : 2018 - Oct 31, 2018


#### Note
- y is Todays Daily Return.
- We wan't to PREDICT Tomorrows Daily Return
- Hence, we train the model on data __[start, today]__ and __predict y[today + 1]__ and NOT predict y[today]




In [25]:
df_features['Future Returns'] = df_features['Today'].shift(-1) 

# drop earlier data with missing lag features
df_features.dropna(inplace=True)

df_features.head()

Unnamed: 0,Date,High,Low,Open,Volume,Close,Today,lag1,lag2,lag3,...,OBV_14,Force_14,EoM_14,CCI_14,KelChM_14,KelChU_14,KelChD_14,Ultimate_Osc,Donchian_14,Future Returns
0,2002-04-19,90.029999,87.599998,89.900002,9273200,62.108883,0.056187,4.881574,-1.612569,0.995846,...,-1787864.0,-35653630.0,-1.540823e-07,-0.296547,81.59287,84.155727,79.030014,-0.062308,8.139999,-1.202256
1,2002-04-22,88.949997,87.360001,88.949997,6757700,61.362175,-1.202256,0.056187,4.881574,-1.612569,...,-1743257.0,5674225.0,-1.370178e-07,-0.305867,80.752489,83.321773,78.183204,-0.009782,9.739998,-0.682363
2,2002-04-23,89.080002,87.239998,88.349998,5144100,60.943462,-0.682363,-1.202256,0.056187,4.881574,...,-1564329.0,22078700.0,-1.111541e-07,-0.208521,79.969302,82.484301,77.454302,-0.036853,9.739998,-0.950415
3,2002-04-24,88.25,86.269997,88.199997,6520500,60.364246,-0.950415,-0.682363,-1.202256,0.056187,...,-2453543.0,-5924266.0,-1.330461e-07,-0.271156,79.10913,81.648415,76.569845,-0.052266,11.389999,0.24275
4,2002-04-25,86.919998,85.769997,86.050003,6870000,60.51078,0.24275,-0.950415,-0.682363,-1.202256,...,-1262700.0,21564450.0,-1.127969e-07,-0.516442,78.319478,80.655192,75.983764,-0.048195,22.700005,-2.306529


In [26]:
# ### Do not need to drop. Just don't use tomorrows data to predict tomorrows prices.
# ### Hence, only use todays data to predict tomorrows prices
# # drop unneded columns
#df_features.drop(['Open','Close','Low','High','Today'], axis = 1, inplace = True)

df_features = df_features.set_index(['Date'])

# stores all labels
y = df_features['Future Returns']

# drop labels
df_features.drop(['Future Returns'], axis = 1, inplace = True)

# # all training data
X_train_all = df_features.loc[df_features.index < '2018-01-01']
y_train_all = y[X_train_all.index]

# # creates all test data which is all after January 2018
X_test = df_features.loc[(df_features.index >= '2018-01-01'),:]
y_test = y[X_test.index]

In [27]:
print(X_train_all.shape)

(3954, 73)


In [43]:
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import LinearRegression

predict = []
actual = []
for count in range(1,500):
    X_train_i = X_train_all.iloc[0:(3454+count-1), :]
    y_train_i = y_train_all.iloc[0:(3454+count-1)]
     
    X_val_i = X_train_all.iloc[(3454+count),:].reshape(1,-1)
    y_val_i = y_train_all.iloc[(3454+count)]
        
#     rf = RandomForestRegressor(random_state = 0)
#     rf.fit(X_train_i, y_train_i)
    model = BayesianRidge(fit_intercept = True)
    model.fit(X_train_i, y_train_i)
    
    predict.append(model.predict(X_val_i)[0])
    actual.append(y_val_i)        

  # Remove the CWD from sys.path while we load stuff.


In [44]:
mean_squared_error(actual, predict)

1.268343637390248

In [45]:
r2_score(actual, predict)

-0.004875944857932746

In [33]:
print(r2_score(y_test, model.predict(X_test)))

-0.022087377128748642


### Train-Validation Split & Train Models

##### Run Models
- Random Forest
- Bagging
- Boosting
- Linear Regression

##### NOTE
- Need to retrain the model for each prediction. Suppose 200 Days total.
- Hence,
   - Train_1[start date, 100] --> Predict y[101]
   - Train_2[start date, 101] --> Predict y[102]
   - Train_3[start date, 103] --> Predict y[103]
   - ...
   - Train_199[start date, 198] --> Predict y[199]
   - Train_200[start date, 199] --> Predict y[200]
- Thus, to forecast 200 daily returns we need to train 200 different models!!!
   


#### Jonathans Code

In [100]:
# do time series split DOING NUMBER 2
splits = TimeSeriesSplit(n_splits=5)

# Initialize empty arrays
trainsamples  = [] # [664, 1327, 1990, 2653, 3316] # sizes of training samples
model1metrics = [] # Random Forest
model2metrics = [] # Bagging
model3metrics = [] # Boosting
model4metrics = [] # Linear Regression


for train_index, test_index in splits.split(X_train_all.loc[:,'Date']):
    
    # do the split for train, Take row 0 to last element of train index
    X_train = X_train_all.loc[0:train_index[len(train_index) - 1],:].copy(deep=True)
    y_train = y[X_train.index]
        
    # do the split for validation, Take first element of test index to last element of test index
    X_val = X_train_all.loc[test_index[0]:test_index[len(test_index) - 1],:].copy(deep=True)
    y_val = y[X_val.index]
        
    # Print Statements
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print('Observations: ', (X_train.shape[0] + X_test.shape[0]))
    print('Cutoff date, or first date in validation data: ', X_val.iloc[0,0])
    print('Training Observations: ', (X_train.shape[0]))
    print('Testing Observations: ', (X_test.shape[0]))
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    
    # drop date
    X_train.drop(['Date'], axis = 1, inplace = True)
    X_val.drop(['Date'], axis = 1, inplace = True)
    
    # Append current size of training sample
    trainsamples.append(X_train.shape[0]) # .shape[0] --> num rows
    
### Random Forest
    # random forest regression based on default parameter
    rf = RandomForestRegressor(random_state = 0)
    rf.fit(X_train, y_train)
    model1metrics.append([mean_squared_error(y_val, rf.predict(X_val)),r2_score(y_val, rf.predict(X_val))])
    
# ### Bagging
#     # bagging based on default parameter
#     bag = BaggingRegressor(DecisionTreeRegressor())
#     bag.fit(X_train, y_train)
#     model2metrics.append([mean_squared_error(y_val, bag.predict(X_val)),r2_score(y_val, bag.predict(X_val))])

# ### Boosting
#     # boosting on default parameter
#     boost = AdaBoostRegressor(DecisionTreeRegressor(), random_state = 0, learning_rate=0.01)
#     boost.fit(X_train, y_train)
#     model3metrics.append([mean_squared_error(y_val, boost.predict(X_val)),r2_score(y_val, boost.predict(X_val))])
    
# ### Linear Regression
#     # linear regression on default parameter
#     lr = linear_model.LinearRegression()
#     lr.fit(X_train, y_train)
#     model4metrics.append([mean_squared_error(y_val, lr.predict(X_val)),r2_score(y_val, lr.predict(X_val))])

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Observations:  875
Cutoff date, or first date in validation data:  2004-11-01 00:00:00
Training Observations:  664
Testing Observations:  211
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Observations:  1538
Cutoff date, or first date in validation data:  2007-06-21 00:00:00
Training Observations:  1327
Testing Observations:  211
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Observations:  2201
Cutoff date, or first date in validation data:  2010-02-08 00:00:00
Training Observations:  1990
Testing Observations:  211
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Observations:  2864
Cutoff date, or first date in validation data:  2012-09-24 00:00:00
Training Observations:  2653
Testing Observations:  211
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Observations:  3527
Cutoff date, or first date in validation data:  2015-05-15 00:00:00
Training Observations:  3316
Testing Observations:  211
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




In [110]:
model1metrics

[[0.0045937039913561935, 0.9954591784110058],
 [0.07477435108321685, 0.9794299412602301],
 [0.0004523796920241774, 0.9996845945462203],
 [0.0012142172454971058, 0.9990749091061193],
 [0.000794832356181995, 0.9994342788419209]]

#### Sams Code

In [118]:

### Random Forest - Train-Validate model
# in this case we can consider validation set as a test set

# Set the number of predictions we want to make
# Note - Num Predictions == Num models we'll need to train
trainSize = 3316 # chosen arbitrarily
numPreds = X_train_all.shape[0] - trainSize # Number of Predictions = numRows - training size

# for each prediction # 662 predictions
for i in range(numPreds): # i = 0,1,2,...,661
    trainSize += i
    # set temp train/test sets
    
    # do the split for train, Take row 0 to last element of train index
    X_train = X_train_all.loc[0:trainSize,:]
    y_train = y[X_train.index]
    





In [106]:
X_val.head()

Unnamed: 0,High,Low,Open,Volume,Close,Today,lag1,lag2,lag3,lag4,...,MFI_14,OBV_14,Force_14,EoM_14,CCI_14,KelChM_14,KelChU_14,KelChD_14,Ultimate_Osc,Donchian_14
3316,174.410004,172.600006,173.910004,2916600,149.597687,-0.453896,1.027401,1.014373,-0.333112,-0.903399,...,0.571429,523378.571429,-1111363.0,-1.267987e-07,1.040867,164.342709,166.943423,161.741994,0.129086,14.660004
3317,173.490005,172.300003,173.440002,1923600,149.424957,-0.115463,-0.453896,1.027401,1.014373,-0.333112,...,0.5,-29500.0,-5935885.0,-2.374802e-07,0.574141,164.404726,166.756869,162.052583,0.06595,15.100006
3318,173.75,171.929993,172.970001,2523000,149.787643,0.242721,-0.115463,-0.453896,1.027401,1.014373,...,0.5,-172028.571429,-2951631.0,-2.996166e-07,0.671242,164.386374,166.702089,162.070658,0.040401,15.100006
3319,174.440002,172.460007,173.330002,2300300,150.029404,0.161402,0.242721,-0.115463,-0.453896,1.027401,...,0.571429,353914.285714,-12060620.0,-1.680592e-07,0.949665,164.545557,166.718415,162.372698,0.093207,15.100006
3320,174.139999,173.039993,173.320007,2295600,149.666763,-0.241713,0.161402,0.242721,-0.115463,-0.453896,...,0.5,-46635.714286,-2011766.0,-1.946159e-07,0.844002,164.610773,166.749346,162.472201,0.082987,16.270004


In [107]:
y_val.head()
rf.predict(X_val).head()

3316   -0.453896
3317   -0.115463
3318    0.242721
3319    0.161402
3320   -0.241713
Name: Today, dtype: float64

In [112]:
X_train_all
X_train_all.shape[0]

3979

In [117]:
train_index[len(train_index) - 1]

3315