In [21]:
#Ref - https://towardsdatascience.com/random-forest-in-python-24d0893d51c0
import pandas as pd
import numpy as np
import datetime
import talib as ta
import matplotlib.pyplot as plt
import seaborn as sns
import talib as ta

In [22]:
#Read data
data = pd.read_csv('data/analysis/D1_XAUUSD_Returns.csv', parse_dates=[0], index_col='Date')
#print(data.head())

              Open    High     Low   Close  Volume    Weekday  Yearday  \
Date                                                                     
2003-05-05  340.35  342.59  339.73  341.31   11948     Monday      125   
2003-05-06  341.53  344.21  340.28  343.28   12161    Tuesday      126   
2003-05-07  343.45  344.19  339.00  341.23   11588  Wednesday      127   
2003-05-08  341.28  348.13  339.96  347.38   12108   Thursday      128   
2003-05-09  347.33  348.58  345.82  347.92   11539     Friday      129   

            Return_per_day  Return_log  
Date                                    
2003-05-05        0.282063         NaN  
2003-05-06        0.512400    0.005755  
2003-05-07       -0.646382   -0.005990  
2003-05-08        1.787389    0.017863  
2003-05-09        0.169867    0.001553  


In [23]:
#Date related features
data['Week'] = data.index.week
data['Day_Week'] = data.index.dayofweek
data['daily_return'] = data.Close.pct_change().mul(100)
data['day_week_sin'] = np.sin(data.Day_Week*(2.*np.pi/7))
data['day_week_cos'] = np.cos(data.Day_Week*(2.*np.pi/7))
data['week_sin'] = np.sin((data.Week-1)*(2.*np.pi/52))
data['week_cos'] = np.cos((data.Week-1)*(2.*np.pi/52))
data['yearday_sin'] = np.sin((data.Yearday)*(2.*np.pi/365.25))
data['yearday_cos'] = np.cos((data.Yearday)*(2.*np.pi/365.25))
#Simple Moving Average - 50 day
data['SMA_50'] = ta.SMA(data.Close, timeperiod = 50)
#Simple Moving Average - 200 day
data['SMA_200'] = ta.SMA(data.Close, timeperiod = 200)
# Exponential Moving Average - 9 day
data['EMA_9'] = ta.EMA(data.Close, timeperiod = 9)
# Exponential Moving Average - 21 day
data['EMA_21'] = ta.EMA(data.Close, timeperiod = 21)
# MACD
data['macd'], data['macdsignal'], data['macdhist'] = ta.MACD(data.Close, fastperiod=12, slowperiod=200, signalperiod=9)
# RSI
data['RSI'] = ta.RSI(data.Close, timeperiod=14)
#Define relative indicators
data['SMA_Delta'] = (data['SMA_50'] - data['SMA_200']) / data['Close']
#Define relative indicators
data['EMA_Delta'] = (data['EMA_9'] - data['EMA_21']) / data['Close']

In [24]:
print(data.tail())

               Open     High      Low    Close  Volume    Weekday  Yearday  \
Date                                                                         
2019-04-03  1291.70  1294.49  1288.34  1291.76   94534  Wednesday       93   
2019-04-04  1291.76  1294.34  1280.85  1292.10   77631   Thursday       94   
2019-04-05  1292.10  1293.15  1284.33  1291.18  102223     Friday       95   
2019-04-07  1291.31  1293.63  1291.20  1291.67    2133     Sunday       97   
2019-04-08  1291.73  1303.65  1291.58  1297.94   73029     Monday       98   

            Return_per_day  Return_log  Week  ...     SMA_50     SMA_200  \
Date                                          ...                          
2019-04-03        0.004645    0.000046    14  ...  1308.0620  1252.10065   
2019-04-04        0.026321    0.000263    14  ...  1307.6060  1252.59180   
2019-04-05       -0.071202   -0.000712    14  ...  1307.2982  1253.17900   
2019-04-07        0.027879    0.000379    14  ...  1306.9242  1253.76085 

In [5]:
#Choosing target variable
#data['Large_return'] = np.where(data['daily_return']>=0.5, 1, (np.where(data['daily_return']<=-0.5, -1, 0)))

In [25]:
#Drop rows with nan values
data.dropna(inplace=True)

In [26]:
#Pick features
features_list = ['Volume','day_week_sin','day_week_cos','week_sin','week_cos','yearday_sin','yearday_cos',
                 'macdhist', 'RSI', 'SMA_Delta','EMA_Delta']
features = data[features_list]
features.head()

Unnamed: 0_level_0,Volume,day_week_sin,day_week_cos,week_sin,week_cos,yearday_sin,yearday_cos,macdhist,RSI,SMA_Delta,EMA_Delta
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2004-01-12,20736,0.0,1.0,0.239316,0.970942,0.204966,0.978769,2.239954,70.847411,0.071611,0.011534
2004-01-13,21473,0.781831,0.62349,0.239316,0.970942,0.221772,0.975099,1.835514,68.227339,0.072198,0.010996
2004-01-14,20415,0.974928,-0.222521,0.239316,0.970942,0.238513,0.971139,0.945729,57.107955,0.073547,0.009352
2004-01-15,22449,0.433884,-0.900969,0.239316,0.970942,0.255182,0.966893,-0.982156,39.857679,0.075568,0.005214
2004-01-16,19571,-0.433884,-0.900969,0.239316,0.970942,0.271777,0.96236,-2.539018,37.444023,0.076086,0.001362


In [27]:
#Pick target variable 
target_unshifted = data['daily_return']
#Add shift so we can predict target on time t=1 with features from time t
target = target_unshifted.shift(-1)
#Drop last value from features and target (Nan)
features = features[:-1]
target = target[:-1]
print(features.tail())
print(target.tail())

            Volume  day_week_sin  day_week_cos  week_sin      week_cos  \
Date                                                                     
2019-04-02   84862      0.781831      0.623490       1.0  6.123234e-17   
2019-04-03   94534      0.974928     -0.222521       1.0  6.123234e-17   
2019-04-04   77631      0.433884     -0.900969       1.0  6.123234e-17   
2019-04-05  102223     -0.433884     -0.900969       1.0  6.123234e-17   
2019-04-07    2133     -0.781831      0.623490       1.0  6.123234e-17   

            yearday_sin  yearday_cos  macdhist        RSI  SMA_Delta  \
Date                                                                   
2019-04-02     0.999930    -0.011826 -5.395313  42.392384   0.044015   
2019-04-03     0.999579    -0.029025 -5.344675  42.440840   0.043322   
2019-04-04     0.998932    -0.046215 -5.125334  42.734791   0.042577   
2019-04-05     0.997989    -0.063391 -4.943795  42.108149   0.041915   
2019-04-07     0.995218    -0.097683 -4.629876  4

In [29]:
def split_train_test(data, train_ratio):
    indices = np.arange(len(data))
    train_set_size = int(len(data) * train_ratio)
    train_indices = indices[:train_set_size]
    test_indices = indices[train_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

X_train_set, X_test_set = split_train_test(features, 0.8)

In [30]:
print(len(X_train_set))
print(len(X_test_set))
print("Training Series:", "\n", X_train_set.head(), "\n")
print("Training Series:", "\n", X_train_set.tail(), "\n")
print("Testing Series:", "\n", X_test_set.head())
print("Testing Series:", "\n", X_test_set.tail())

3802
951
Training Series: 
             Volume  day_week_sin  day_week_cos  week_sin  week_cos  \
Date                                                                 
2004-01-12   20736      0.000000      1.000000  0.239316  0.970942   
2004-01-13   21473      0.781831      0.623490  0.239316  0.970942   
2004-01-14   20415      0.974928     -0.222521  0.239316  0.970942   
2004-01-15   22449      0.433884     -0.900969  0.239316  0.970942   
2004-01-16   19571     -0.433884     -0.900969  0.239316  0.970942   

            yearday_sin  yearday_cos  macdhist        RSI  SMA_Delta  \
Date                                                                   
2004-01-12     0.204966     0.978769  2.239954  70.847411   0.071611   
2004-01-13     0.221772     0.975099  1.835514  68.227339   0.072198   
2004-01-14     0.238513     0.971139  0.945729  57.107955   0.073547   
2004-01-15     0.255182     0.966893 -0.982156  39.857679   0.075568   
2004-01-16     0.271777     0.962360 -2.539018  3

In [31]:
y_train_set, y_test_set = split_train_test(target, 0.8)

In [32]:
print(len(y_train_set))
print(len(y_test_set))
print("Training Series:", "\n", y_train_set.head(), "\n")
print("Training Series:", "\n", y_train_set.tail(), "\n")
print("Testing Series:", "\n", y_test_set.head())
print("Testing Series:", "\n", y_test_set.tail())

3802
951
Training Series: 
 Date
2004-01-12   -0.211964
2004-01-13   -1.038471
2004-01-14   -2.587646
2004-01-15   -0.526380
2004-01-16   -0.135368
Name: daily_return, dtype: float64 

Training Series: 
 Date
2016-03-10   -1.625428
2016-03-11    0.191140
2016-03-13   -1.329044
2016-03-14   -0.266153
2016-03-15    2.102463
Name: daily_return, dtype: float64 

Testing Series: 
 Date
2016-03-16    0.017478
2016-03-17   -0.332809
2016-03-18   -0.133090
2016-03-20   -0.760502
2016-03-21    0.382763
Name: daily_return, dtype: float64
Testing Series: 
 Date
2019-04-02    0.004645
2019-04-03    0.026321
2019-04-04   -0.071202
2019-04-05    0.037950
2019-04-07    0.485418
Name: daily_return, dtype: float64


In [66]:
from sklearn.ensemble import RandomForestRegressor

best_error = 100
best_accuracy = 0
nr_estimators = 0

for i in range(1,101):
    model = RandomForestRegressor(n_estimators = i, random_state = 40)
    model.fit(X_train_set, y_train_set.values.ravel())
    y_model = model.predict(X_test_set)
    errors = abs(y_model - y_test_set)
    # Calculate mean absolute percentage error (MAPE)
    mape = 100 * (errors / y_test_set)
    # Calculate and display accuracy
    accuracy = 100 - np.mean(mape)
    if accuracy > best_accuracy:
        nr_estimators = i
        best_accuracy = accuracy
        best_error = round(np.mean(errors), 2)
    
print('Best Nr. estimators: ', nr_estimators)
print('Mean Absolute Error:', round(best_error, 2), 'percent')
print('Accuracy:', round(best_accuracy, 2), '%.')

Best Nr. estimators:  50
Mean Absolute Error: 0.55 percent
Accuracy: 66.41 %.


In [70]:
# Get numerical feature importances
importances = list(model.feature_importances_)
feature_list = list(features.columns)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: SMA_Delta            Importance: 0.16
Variable: EMA_Delta            Importance: 0.16
Variable: Volume               Importance: 0.13
Variable: macdhist             Importance: 0.12
Variable: RSI                  Importance: 0.12
Variable: yearday_sin          Importance: 0.09
Variable: yearday_cos          Importance: 0.09
Variable: day_week_sin         Importance: 0.04
Variable: day_week_cos         Importance: 0.03
Variable: week_sin             Importance: 0.03
Variable: week_cos             Importance: 0.03
