## ML Random Forest Model: Trading Signals

---

In [185]:
# Initial imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from pathlib import Path
from datetime import datetime

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [186]:
# Set path to CSV and read in CSV
csv_path = Path("signals.csv")
trading_signals_df=pd.read_csv(csv_path, index_col='Date', parse_dates=True, infer_datetime_format=True)

In [187]:
trading_signals_df["Daily_Returns"] = trading_signals_df["Close"].pct_change()
trading_signals_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,SP_Signal,V_Signal,Option_Signal,AAII_Signal,News_Signal,Daily_Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-04-13,2040.0,2180.0,2038.0,2168.87,6716709,-1.0,0,0.0,0.0,-1.0,
2020-04-09,2044.3,2053.0,2017.66,2042.76,4612716,0.0,0,0.0,0.0,1.0,-0.058145
2020-04-08,2021.0,2044.0,2011.15,2043.0,3943414,0.0,0,0.0,0.0,1.0,0.000117
2020-04-07,2017.11,2035.72,1997.62,2011.6,5082216,0.0,0,0.0,0.0,-1.0,-0.01537
2020-04-06,1936.0,1998.52,1930.02,1997.59,5723143,0.0,0,0.0,0.0,1.0,-0.006965


In [190]:
# Set x variable list of features
#["SP_Signal", "V_Signal",'Option_Signal', 'AAII_Signal','News_Signal']

x_var_list = ["SP_Signal", "V_Signal",'Option_Signal', 'AAII_Signal','News_Signal']

In [191]:
# Shift DataFrame values by 1
trading_signals_df[x_var_list] = trading_signals_df[x_var_list].shift(1)
trading_signals_df[x_var_list].head()

Unnamed: 0_level_0,SP_Signal,V_Signal,Option_Signal,AAII_Signal,News_Signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-04-13,,,,,
2020-04-09,-1.0,,0.0,0.0,-1.0
2020-04-08,0.0,0.0,0.0,0.0,1.0
2020-04-07,0.0,0.0,0.0,0.0,1.0
2020-04-06,0.0,0.0,0.0,0.0,-1.0


In [192]:
# Drop NAs and replace positive/negative infinity values
trading_signals_df.dropna(subset=x_var_list, inplace=True)
trading_signals_df.dropna(subset=['Daily_Returns'], inplace=True)
trading_signals_df = trading_signals_df.replace([np.inf, -np.inf], np.nan)
trading_signals_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,SP_Signal,V_Signal,Option_Signal,AAII_Signal,News_Signal,Daily_Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-04-08,2021.0,2044.0,2011.15,2043.0,3943414,0.0,0.0,0.0,0.0,1.0,0.000117
2020-04-07,2017.11,2035.72,1997.62,2011.6,5082216,0.0,0.0,0.0,0.0,1.0,-0.01537
2020-04-06,1936.0,1998.52,1930.02,1997.59,5723143,0.0,0.0,0.0,0.0,-1.0,-0.006965
2020-04-03,1911.15,1926.33,1889.15,1906.59,3609870,0.0,0.0,0.0,0.0,1.0,-0.045555
2020-04-02,1901.64,1927.53,1890.0,1918.83,4305149,0.0,0.0,0.0,0.0,1.0,0.00642


In [193]:
# Create signals for  trading days (1:buy, 0:hold, -1:sell) as the dependent variable
trading_signals_df["Actual_Signal"] = np.where(trading_signals_df["Daily_Returns"]>0,1,np.where(trading_signals_df["Daily_Returns"]<0,-1,0))

In [194]:
# Set training start and end dates
#training_start = trading_signals_df.index.min().strftime(format= '%Y-%m-%d')
training_start = trading_signals_df.index.min().strftime(format= '%Y-%m-%d')
#training_end = '2016-04-15'
training_end = '2016-04-15'

# set testing start and end dates
#testing_start =  '2016-04-16'
testing_start =  '2016-04-16'
#testing_end = trading_signals_df.index.max().strftime(format= '%Y-%m-%d')
testing_end = trading_signals_df.index.max().strftime(format= '%Y-%m-%d')

# Print training and testing start/end dates
print(f"Training Start: {training_start}")
print(f"Training End: {training_end}")
print(f"Testing Start: {testing_start}")
print(f"Testing End: {testing_end}")

Training Start: 1997-05-16
Training End: 2016-04-15
Testing Start: 2016-04-16
Testing End: 2020-04-08


In [195]:
# Construct the x train and y train datasets
x_train = trading_signals_df[x_var_list][training_end:]
y_train = trading_signals_df["Actual_Signal"][training_end:]
x_train.head()

Unnamed: 0_level_0,SP_Signal,V_Signal,Option_Signal,AAII_Signal,News_Signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-04-15,-1.0,0.0,0.0,1.0,-1.0
2016-04-14,0.0,0.0,0.0,1.0,-1.0
2016-04-13,0.0,0.0,0.0,0.0,-1.0
2016-04-12,0.0,0.0,0.0,0.0,-1.0
2016-04-11,0.0,0.0,0.0,0.0,-1.0


In [196]:
# Construct the x test and y test datasets
x_test = trading_signals_df[x_var_list][:testing_start]
y_test = trading_signals_df["Actual_Signal"][:testing_start]
y_test.tail()

Date
2016-04-22   -1
2016-04-21    1
2016-04-20    1
2016-04-19   -1
2016-04-18    1
Name: Actual_Signal, dtype: int32

### Model Testing

In [197]:
# Import SKLearn Library and Classes
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [198]:
# Fit a SKLearn linear regression using just the training set (X_train, Y_train):
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)
model.fit(x_train, y_train)

# Make a prediction of "y" values from the x test dataset
predictions = model.predict(x_test)

# Assemble actual y data (Y_test) with predicted y data (from just above) into two columns in a dataframe: 
Results = y_test.to_frame()
Results["Predicted Value"] = predictions

# Rename the y-test column and reoder df columns
Results.rename(columns={"Actual_Signal": "Actual Value"}, inplace=True)
Results_df = Results[["Predicted Value", "Actual Value"]]
Results_df.head(10)

Unnamed: 0_level_0,Predicted Value,Actual Value
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-04-08,1,1
2020-04-07,1,-1
2020-04-06,1,-1
2020-04-03,1,-1
2020-04-02,1,1
2020-04-01,1,-1
2020-03-31,1,1
2020-03-30,1,1
2020-03-27,1,-1
2020-03-26,1,1


In [200]:
# Save the pre-trained model
from joblib import dump, load
dump(model, 'random_forest_model(ls).joblib')

['random_forest_model(ls).joblib']

### Model Evaluation

In [201]:
# Model evaluation reports
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Needed for decision tree visualization
import pydotplus
from IPython.display import Image

In [202]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 1", "Actual 0", "Actual -1" ], columns=["Predicted 1", "Predicted 0","Predicted -1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

### Displaying results

In [203]:
# Confusion Matrix
print("Confusion Matrix")
display(cm_df)

Confusion Matrix


Unnamed: 0,Predicted 1,Predicted 0,Predicted -1
Actual 1,144,0,411
Actual 0,1,0,0
Actual -1,81,0,365


In [204]:
# Accuracy Score
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.5079840319361277


In [205]:
# Classification Report
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

          -1       0.64      0.26      0.37       555
           0       0.00      0.00      0.00         1
           1       0.47      0.82      0.60       446

    accuracy                           0.51      1002
   macro avg       0.37      0.36      0.32      1002
weighted avg       0.56      0.51      0.47      1002

