In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import hvplot.pandas
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from pandas.tseries.offsets import DateOffset
from sklearn.metrics import classification_report
from finta import TA
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import sklearn.ensemble
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
import pydotplus
from IPython.display import Image

## Stock Data Import and Cleaning (AMZN, META, TSLA)

In [2]:
# Read the csv file 
amzn_df = pd.read_csv(Path("Resources/amzn.csv"))

# Convert 'time' column from timestamp (seconds since epoch) to acutal time
amzn_df ['time'] = pd.to_datetime(
    amzn_df['time'],
    unit = 's',
    infer_datetime_format=True,
    utc=True
)    
# Convert timezone from UTC to Eastern Time
amzn_df['time'] = amzn_df['time'].dt.tz_convert('US/Eastern')

# Set 'time' column as the index 
amzn_df.set_index('time', inplace=True)

# Review df
amzn_df.head()          

Unnamed: 0_level_0,open,high,low,close,VWAP,Upper Band #1,Lower Band #1,Upper Band #2,Lower Band #2,Upper Band #3,...,Volume,Volume MA,EMA,Smoothing Line,Developing Poc,Developing VA High,Developing VA Low,Developing Poc.1,Developing VA High.1,Developing VA Low.1
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-03-14 09:30:00-04:00,9.1825,9.215995,9.057,9.112999,9.128665,9.128665,9.128665,9.128665,9.128665,9.128665,...,73994680,107206355.0,9.59791,9.60661,,,,,,
2012-03-15 09:30:00-04:00,9.101,9.2215,9.015,9.2215,9.152667,9.152667,9.152667,9.152667,9.152667,9.152667,...,83212340,103564913.0,9.594164,9.602464,,,,,,
2012-03-16 09:30:00-04:00,9.164,9.284,9.117501,9.2525,9.218,9.218,9.218,9.218,9.218,9.218,...,98696480,95934268.0,9.590765,9.598431,,,,,,
2012-03-19 09:30:00-04:00,9.1725,9.334,9.15,9.276,9.253333,9.253333,9.253333,9.253333,9.253333,9.253333,...,78080380,92419724.0,9.587633,9.594651,,,,,,
2012-03-20 09:30:00-04:00,9.244,9.7205,9.144,9.6165,9.493667,9.493667,9.493667,9.493667,9.493667,9.493667,...,183346180,94828072.0,9.58792,9.591678,,,,,,


In [3]:
# Filter column and only keep ones needed
amzn_df = amzn_df[['open','high','low','close','VWAP','Volume','Volume MA','EMA']]

#Rename EMA to 200EMA
amzn_df = amzn_df.rename(columns={'EMA':'200EMA'})

#Dropping NAN Values 
amzn_df = amzn_df.dropna()

# Review the DataFrame
amzn_df.head()

Unnamed: 0_level_0,open,high,low,close,VWAP,Volume,Volume MA,200EMA
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2012-03-14 09:30:00-04:00,9.1825,9.215995,9.057,9.112999,9.128665,73994680,107206355.0,9.59791
2012-03-15 09:30:00-04:00,9.101,9.2215,9.015,9.2215,9.152667,83212340,103564913.0,9.594164
2012-03-16 09:30:00-04:00,9.164,9.284,9.117501,9.2525,9.218,98696480,95934268.0,9.590765
2012-03-19 09:30:00-04:00,9.1725,9.334,9.15,9.276,9.253333,78080380,92419724.0,9.587633
2012-03-20 09:30:00-04:00,9.244,9.7205,9.144,9.6165,9.493667,183346180,94828072.0,9.58792


In [4]:
# Read the csv file 
meta_df = pd.read_csv(Path("Resources/meta.csv"))

# Convert 'time' column from timestamp (seconds since epoch) to acutal time
meta_df ['time'] = pd.to_datetime(
    meta_df['time'],
    unit = 's',
    infer_datetime_format=True,
    utc=True
)    
# Convert timezone from UTC to Eastern Time
meta_df['time'] = meta_df['time'].dt.tz_convert('US/Eastern')

# Set 'time' column as the index 
meta_df.set_index('time', inplace=True)

# Review df
meta_df.head()

Unnamed: 0_level_0,open,high,low,close,VWAP,Upper Band #1,Lower Band #1,Upper Band #2,Lower Band #2,Upper Band #3,...,Volume,Volume MA,EMA,Smoothing Line,Developing Poc,Developing VA High,Developing VA Low,Developing Poc.1,Developing VA High.1,Developing VA Low.1
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-05-18 09:30:00-04:00,42.05,45.0,38.0,38.2318,40.4106,40.4106,40.4106,40.4106,40.4106,40.4106,...,580587776,,,,,,,,,
2012-05-21 09:30:00-04:00,36.53,36.66,33.0,34.03,34.563333,34.563333,34.563333,34.563333,34.563333,34.563333,...,168309808,,,,,,,,,
2012-05-22 09:30:00-04:00,32.61,33.59,30.94,31.0,31.843333,31.843333,31.843333,31.843333,31.843333,31.843333,...,102053808,,,,,,,,,
2012-05-23 09:30:00-04:00,31.37,32.5,31.36,32.0,31.953333,31.953333,31.953333,31.953333,31.953333,31.953333,...,73721120,,,,,,,,,
2012-05-24 09:30:00-04:00,32.95,33.21001,31.77,33.03,32.670003,32.670003,32.670003,32.670003,32.670003,32.670003,...,50275872,,,,,,,,,


In [5]:
# Filter column and only keep ones needed
meta_df = meta_df[['open','high','low','close','VWAP','Volume','Volume MA','EMA']]

#Rename EMA to 200EMA
meta_df = meta_df.rename(columns={'EMA':'200EMA'})

#Dropping NAN Values 
meta_df = meta_df.dropna()

# Review the DataFrame
meta_df.head()

Unnamed: 0_level_0,open,high,low,close,VWAP,Volume,Volume MA,200EMA
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013-03-07 09:30:00-05:00,27.57001,28.675,27.47,28.578,28.241,74540128,46413816.0,25.669904
2013-03-08 09:30:00-05:00,28.425,28.47,27.73,27.96001,28.053337,44198832,46896756.8,25.692691
2013-03-11 09:30:00-04:00,28.00999,28.64,27.83,28.14,28.203333,35642064,46793423.2,25.717042
2013-03-12 09:30:00-04:00,28.10001,28.32001,27.60001,27.83,27.916673,27569584,46303815.2,25.738067
2013-03-13 09:30:00-04:00,27.62,27.64999,26.92,27.08,27.216663,39619440,43610005.6,25.751419


In [6]:
# Read the csv file 
tsla_df = pd.read_csv(Path("./Resources/tsla.csv"))

# Convert 'time' column from timestamp (seconds since epoch) to acutal time
tsla_df ['time'] = pd.to_datetime(
    tsla_df['time'],
    unit = 's',
    infer_datetime_format=True,
    utc=True
)    
# Convert timezone from UTC to Eastern Time
tsla_df['time'] = tsla_df['time'].dt.tz_convert('US/Eastern')

# Set 'time' column as the index 
tsla_df.set_index('time', inplace=True)
                      
# Review df
tsla_df.head()   

Unnamed: 0_level_0,open,high,low,close,VWAP,Upper Band #1,Lower Band #1,Upper Band #2,Lower Band #2,Upper Band #3,...,Volume,Volume MA,EMA,Smoothing Line,Developing Poc,Developing VA High,Developing VA Low,Developing Poc.1,Developing VA High.1,Developing VA Low.1
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-03-12 09:30:00-04:00,2.312664,2.41933,2.306665,2.400664,2.375553,2.375553,2.375553,2.375553,2.375553,2.375553,...,29467829,16848705.0,1.929784,1.921918,,,,,,
2012-03-13 09:30:00-04:00,2.433997,2.439331,2.366664,2.405998,2.403998,2.403998,2.403998,2.403998,2.403998,2.403998,...,15024660,16731543.65,1.934522,1.92575,,,,,,
2012-03-14 09:30:00-04:00,2.399998,2.399998,2.319998,2.352664,2.357553,2.357553,2.357553,2.357553,2.357553,2.357553,...,12771972,16011617.15,1.938683,1.929832,,,,,,
2012-03-15 09:30:00-04:00,2.351998,2.365331,2.318664,2.333331,2.339109,2.339109,2.339109,2.339109,2.339109,2.339109,...,8573948,14366955.5,1.94261,1.93413,,,,,,
2012-03-16 09:30:00-04:00,2.326664,2.392664,2.321998,2.354665,2.356442,2.356442,2.356442,2.356442,2.356442,2.356442,...,10938550,13249154.35,1.94671,1.938462,,,,,,


In [7]:
# Filter column and only keep ones needed
tsla_df = tsla_df[['open','high','low','close','VWAP','Volume','Volume MA','EMA']]

#Rename EMA to 200EMA
tsla_df = tsla_df.rename(columns={'EMA':'200EMA'})

#Dropping NAN Values 
tsla_df = tsla_df.dropna()

# Review the DataFrame
tsla_df.head()

Unnamed: 0_level_0,open,high,low,close,VWAP,Volume,Volume MA,200EMA
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2012-03-12 09:30:00-04:00,2.312664,2.41933,2.306665,2.400664,2.375553,29467829,16848705.0,1.929784
2012-03-13 09:30:00-04:00,2.433997,2.439331,2.366664,2.405998,2.403998,15024660,16731543.65,1.934522
2012-03-14 09:30:00-04:00,2.399998,2.399998,2.319998,2.352664,2.357553,12771972,16011617.15,1.938683
2012-03-15 09:30:00-04:00,2.351998,2.365331,2.318664,2.333331,2.339109,8573948,14366955.5,1.94261
2012-03-16 09:30:00-04:00,2.326664,2.392664,2.321998,2.354665,2.356442,10938550,13249154.35,1.94671


## Importing Additional Data for ML Features Set (VIX, Search Trends)

In [8]:
# Read the csv file 
vix_df = pd.read_csv(Path("Resources/vix.csv"))

# Convert 'time' column from timestamp (seconds since epoch) to acutal time
vix_df ['time'] = pd.to_datetime(
    vix_df['time'],
    unit = 's',
    infer_datetime_format=True,
    utc=True
)    
# Convert timezone from UTC to Eastern Time
vix_df['time'] = vix_df['time'].dt.tz_convert('US/Eastern')

# Set 'time' column as the index 
vix_df.set_index('time', inplace=True)

# Calculate VIX EMA 
vix_df['VIX_40_EMA'] = TA.EMA(vix_df,40)

#Remove unnecessary columns
vix_df = vix_df[['close','VIX_40_EMA']]

#Rename close column
vix_df = vix_df.rename(columns={'close':'VIX_close'})

# Review df
vix_df.head()    

Unnamed: 0_level_0,VIX_close,VIX_40_EMA
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2009-10-13 09:30:00-04:00,22.98999,22.98999
2009-10-14 09:30:00-04:00,22.85999,22.923365
2009-10-15 09:30:00-04:00,21.71999,22.502021
2009-10-16 09:30:00-04:00,21.42999,22.213586
2009-10-19 09:30:00-04:00,21.48999,22.054043


In [9]:
# Add VIX data to stock dfs
AMZN = pd.concat([amzn_df,vix_df],axis=1)
META = pd.concat([meta_df,vix_df],axis=1)
TSLA = pd.concat([tsla_df,vix_df],axis=1)


In [10]:
AMZN.head(3)

Unnamed: 0_level_0,open,high,low,close,VWAP,Volume,Volume MA,200EMA,VIX_close,VIX_40_EMA
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2009-10-13 09:30:00-04:00,,,,,,,,,22.98999,22.98999
2009-10-14 09:30:00-04:00,,,,,,,,,22.85999,22.923365
2009-10-15 09:30:00-04:00,,,,,,,,,21.71999,22.502021


In [11]:
amzn_trend = pd.read_csv(Path('Search_Trends/Amazon.csv'))
amzn_trend = amzn_trend.set_index('Month')
amzn_trend = amzn_trend.rename(columns={'Amazon':'AMZN_search_trends'})
#display(amzn_trend.head())

meta_trend = pd.read_csv(Path('Search_Trends/Meta.csv'))
meta_trend = meta_trend.set_index('Month')
meta_trend = meta_trend.rename(columns={'Meta':'META_search_trends'})
#display(meta_trend.head())


tsla_trend = pd.read_csv(Path('Search_Trends/Tesla.csv'))
tsla_trend = tsla_trend.set_index('Month')
tsla_trend = tsla_trend.rename(columns={'tesla':'TSLA_search_trends'})
#display(tsla_trend.head())


In [12]:
# Create Month and Year cols based on index 
AMZN['Month'] = AMZN.index.month
AMZN['Year'] = AMZN.index.year

# Create Month and Year cols based on index (after formatting index as datetime)
amzn_trend.index = pd.to_datetime(amzn_trend.index)
amzn_trend['Month'] = amzn_trend.index.month
amzn_trend['Year'] = amzn_trend.index.year

#Create placeholder column for search trend data
AMZN['AMZN_search_trends'] = 0

# Create a dictionary from df2 where col1 and col2 are the keys, and value_df2 is the value
mapping_dict = amzn_trend.set_index(['Month', 'Year'])['AMZN_search_trends'].to_dict()

# Map the values from df2 to df1 based on col1 and col2
AMZN['AMZN_search_trends'] = AMZN.apply(lambda x: mapping_dict.get((x['Month'], x['Year']), x['AMZN_search_trends']), axis=1)

# Drop month & year columns
AMZN = AMZN.drop(columns=['Month','Year'])

# The df1 will now have values from df2 mapped based on matching col1 and col2.
AMZN

Unnamed: 0_level_0,open,high,low,close,VWAP,Volume,Volume MA,200EMA,VIX_close,VIX_40_EMA,AMZN_search_trends
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2009-10-13 09:30:00-04:00,,,,,,,,,22.98999,22.989990,0.0
2009-10-14 09:30:00-04:00,,,,,,,,,22.85999,22.923365,0.0
2009-10-15 09:30:00-04:00,,,,,,,,,21.71999,22.502021,0.0
2009-10-16 09:30:00-04:00,,,,,,,,,21.42999,22.213586,0.0
2009-10-19 09:30:00-04:00,,,,,,,,,21.48999,22.054043,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2023-07-18 09:30:00-04:00,132.71,133.86,131.355,132.83,132.681667,54969133.0,52859640.65,113.656816,13.30000,14.809797,0.0
2023-07-19 09:30:00-04:00,,,,,,,,,13.76000,14.758588,0.0
2023-07-20 09:30:00-04:00,,,,,,,,,13.99000,14.721096,0.0
2023-07-21 09:30:00-04:00,,,,,,,,,13.60000,14.666408,0.0


In [13]:
# Create Month and Year cols based on index 
META['Month'] = META.index.month
META['Year'] = META.index.year

# Create Month and Year cols based on index (after formatting index as datetime)
meta_trend.index = pd.to_datetime(meta_trend.index)
meta_trend['Month'] = meta_trend.index.month
meta_trend['Year'] = meta_trend.index.year

#Create placeholder column for search trend data
META['META_search_trends'] = 0

# Create a dictionary from df2 where col1 and col2 are the keys, and value_df2 is the value
mapping_dict = meta_trend.set_index(['Month', 'Year'])['META_search_trends'].to_dict()

# Map the values from df2 to df1 based on col1 and col2
META['META_search_trends'] = META.apply(lambda x: mapping_dict.get((x['Month'], x['Year']), x['META_search_trends']), axis=1)

# Drop month & year columns
META = META.drop(columns=['Month','Year'])

# The df1 will now have values from df2 mapped based on matching col1 and col2.
META

Unnamed: 0_level_0,open,high,low,close,VWAP,Volume,Volume MA,200EMA,VIX_close,VIX_40_EMA,META_search_trends
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2009-10-13 09:30:00-04:00,,,,,,,,,22.98999,22.989990,0.0
2009-10-14 09:30:00-04:00,,,,,,,,,22.85999,22.923365,0.0
2009-10-15 09:30:00-04:00,,,,,,,,,21.71999,22.502021,0.0
2009-10-16 09:30:00-04:00,,,,,,,,,21.42999,22.213586,0.0
2009-10-19 09:30:00-04:00,,,,,,,,,21.48999,22.054043,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2023-07-18 09:30:00-04:00,310.875,314.2,307.6188,312.05,311.2896,20757323.0,26430892.9,216.492856,13.30000,14.809797,0.0
2023-07-19 09:30:00-04:00,,,,,,,,,13.76000,14.758588,0.0
2023-07-20 09:30:00-04:00,,,,,,,,,13.99000,14.721096,0.0
2023-07-21 09:30:00-04:00,,,,,,,,,13.60000,14.666408,0.0


In [14]:
# Create Month and Year cols based on index 
TSLA['Month'] = TSLA.index.month
TSLA['Year'] = TSLA.index.year

# Create Month and Year cols based on index (after formatting index as datetime)
tsla_trend.index = pd.to_datetime(tsla_trend.index)
tsla_trend['Month'] = tsla_trend.index.month
tsla_trend['Year'] = tsla_trend.index.year

#Create placeholder column for search trend data
TSLA['TSLA_search_trends'] = 0

# Create a dictionary from df2 where col1 and col2 are the keys, and value_df2 is the value
mapping_dict = tsla_trend.set_index(['Month', 'Year'])['TSLA_search_trends'].to_dict()

# Map the values from df2 to df1 based on col1 and col2
TSLA['TSLA_search_trends'] = TSLA.apply(lambda x: mapping_dict.get((x['Month'], x['Year']), x['TSLA_search_trends']), axis=1)

# Drop month & year columns
TSLA = TSLA.drop(columns=['Month','Year'])

# The df1 will now have values from df2 mapped based on matching col1 and col2.
TSLA

Unnamed: 0_level_0,open,high,low,close,VWAP,Volume,Volume MA,200EMA,VIX_close,VIX_40_EMA,TSLA_search_trends
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2009-10-13 09:30:00-04:00,,,,,,,,,22.98999,22.989990,0.0
2009-10-14 09:30:00-04:00,,,,,,,,,22.85999,22.923365,0.0
2009-10-15 09:30:00-04:00,,,,,,,,,21.71999,22.502021,0.0
2009-10-16 09:30:00-04:00,,,,,,,,,21.42999,22.213586,0.0
2009-10-19 09:30:00-04:00,,,,,,,,,21.48999,22.054043,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2023-07-18 09:30:00-04:00,290.15,295.26,286.01,293.34,291.536667,112434713.0,1.370000e+08,214.699154,13.30000,14.809797,0.0
2023-07-19 09:30:00-04:00,,,,,,,,,13.76000,14.758588,0.0
2023-07-20 09:30:00-04:00,,,,,,,,,13.99000,14.721096,0.0
2023-07-21 09:30:00-04:00,,,,,,,,,13.60000,14.666408,0.0


In [15]:
# Filter each dataframe to 10yr period
AMZN = AMZN.loc['05-2012':'05-2022']
META = META.loc['05-2012':'05-2022']
TSLA = TSLA.loc['05-2012':'05-2022']

#drop nan
AMZN = AMZN.dropna()
META = META.dropna()
TSLA = TSLA.dropna()


## Trading Algo (Signal Generation)

In [16]:
#set some options for display and troubleshooting
pd.set_option("display.max_rows", 2000)
pd.set_option("display.max_columns", 2000)
pd.set_option("display.width", 1000)

In [17]:
# Define periods for fast and slow EMAs (triggers)
fast_ema = 9
slow_ema = 40

# Initialize list of ticker dfs for use in for loop
ticker_df = [AMZN,META,TSLA]

# For each ticker dataframe in the list:
for df in ticker_df:
    
    # Calculate % returns for later use
    df['pct_returns'] = df['close'].pct_change()
    
    # Calculate Fast EMA 
    df['Fast_EMA'] = TA.EMA(df,fast_ema)
    
    # Calculate Slow EMA
    df['Slow_EMA'] = TA.EMA(df,slow_ema)

    # Initialize empty Signal column with 0.0 default value
    df['Signal'] = 0.0

    # Generate Signal value ("long if fast_ema > slow_ema", otherwise flip short)
    df['Signal'] = np.where(
        df['Slow_EMA'] < df['Fast_EMA'], 1.0, -1.0)
    
#This loop should not generate Entry/Exit - this will not be needed until after ML predictions    
    # Create Entry/Exit column and fill with the differences (trades) from Signal column
    #df['Entry/Exit'] = df['Signal'].diff()
    
    #Drop any NA values
    df.dropna()


In [18]:
#drop nan
AMZN = AMZN.dropna()
META = META.dropna()
TSLA = TSLA.dropna()

# Review complete dfs (with signal)
display(AMZN.head(3))
display(META.head(3))
display(TSLA.head(3))

Unnamed: 0_level_0,open,high,low,close,VWAP,Volume,Volume MA,200EMA,VIX_close,VIX_40_EMA,AMZN_search_trends,pct_returns,Fast_EMA,Slow_EMA,Signal
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2012-05-02 09:30:00-04:00,11.391001,11.572,11.37,11.5125,11.484833,91879460.0,110478210.0,9.684092,16.88,17.494492,40.0,0.000913,11.507833,11.507381,1.0
2012-05-03 09:30:00-04:00,11.487001,11.6265,11.4015,11.4725,11.500167,81157180.0,109076585.0,9.701887,17.56,17.497688,40.0,-0.003474,11.493352,11.495168,-1.0
2012-05-04 09:30:00-04:00,11.39,11.485995,11.187001,11.199501,11.290832,91757900.0,110446703.0,9.716789,19.16,17.578776,40.0,-0.023796,11.393809,11.415618,-1.0


Unnamed: 0_level_0,open,high,low,close,VWAP,Volume,Volume MA,200EMA,VIX_close,VIX_40_EMA,META_search_trends,pct_returns,Fast_EMA,Slow_EMA,Signal
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2013-03-08 09:30:00-05:00,28.425,28.47,27.73,27.96001,28.053337,44198832.0,46896756.8,25.692691,12.59,14.26904,91.0,-0.021625,28.234672,28.26128,-1.0
2013-03-11 09:30:00-04:00,28.00999,28.64,27.83,28.14,28.203333,35642064.0,46793423.2,25.717042,11.56,14.136891,91.0,0.006437,28.195872,28.218816,-1.0
2013-03-12 09:30:00-04:00,28.10001,28.32001,27.60001,27.83,27.916673,27569584.0,46303815.2,25.738067,12.27,14.045823,91.0,-0.011016,28.071932,28.114203,-1.0


Unnamed: 0_level_0,open,high,low,close,VWAP,Volume,Volume MA,200EMA,VIX_close,VIX_40_EMA,TSLA_search_trends,pct_returns,Fast_EMA,Slow_EMA,Signal
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2012-05-02 09:30:00-04:00,2.233331,2.292664,2.225998,2.262664,2.260442,7458772.0,16338969.05,2.038579,16.88,17.494492,8.0,0.004736,2.257924,2.257464,1.0
2012-05-03 09:30:00-04:00,2.260664,2.266664,2.141998,2.163999,2.190887,12631617.0,13608434.55,2.039827,17.56,17.497688,8.0,-0.043606,2.21943,2.224739,-1.0
2012-05-04 09:30:00-04:00,2.154665,2.163999,2.093331,2.121998,2.126443,18712308.0,13411940.35,2.040645,19.16,17.578776,8.0,-0.019409,2.186425,2.197096,-1.0


## RANDOM FOREST

In [19]:
from pandas.tseries.offsets import DateOffset

#INPUTS
data_dfs = [AMZN,META,TSLA]

#OUTPUTS (predictions, eval metrics, and setting up dfs for comparative backtesting)

confusion_dfs = {}
confusion_names = {
    1:'amzn_cm',
    2:'meta_cm',
    3:'tsla_cm'
}

accuracy_dfs = {}
accuracy_names = {
    1:'amzn_acc',
    2:'meta_acc',
    3:'tsla_acc'
}

y_test_dfs = {}
y_test_names = {
    1:'amzn_cr',
    2:'meta_cr',
    3:'tsla_cr'
}

importance_dfs = {}
importance_names = {
    1:'amzn_imp',
    2:'meta_imp',
    3:'tsla_imp'
}

prediction_dfs = {}
prediction_names = {
    1:'pred_amzn_df',
    2:'pred_meta_df',
    3:'pred_tsla_df'
}

raw_prediction_dfs = {}
raw_prediction_names = {
    1:'amzn_pred',
    2:'meta_pred',
    3:'tsla_pred'
}

#WORK
for i, df in enumerate(data_dfs):
# Random Forest Model Setup    
    #Define feature set
    X = df.drop(columns='Signal')
    
    #Define target set
    y = df['Signal'].astype('int8')
    
    # Select the start of the training period
    training_begin = X.index.min()
    
    # Select the ending period for the training data with an offset of 2 years
    training_end = X.index.min() + DateOffset(years=2)
    
    # Generate the X_train and y_train DataFrames
    X_train = X.loc[training_begin:training_end]
    y_train = y.loc[training_begin:training_end]
    
    # Generate the X_test and y_test DataFrames
    X_test = X.loc[training_end:]
    y_test = y.loc[training_end:]
    
    # Creating StandardScaler instance
    scaler = StandardScaler()

    # Fitting Standard Scaller
    X_scaler = scaler.fit(X_train)

    # Scaling data
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    # Create a random forest classifier
    rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

    # Fitting the model
    rf_model = rf_model.fit(X_train_scaled, y_train)

    # Making predictions using the testing data
    predictions = rf_model.predict(X_test_scaled)
    
# Number loop iterations
    df_num = i + 1
    
    ## Store the predictions in dictionary
    raw_prediction_dfs[raw_prediction_names[df_num]] = predictions
    
# Model Evaluations    
    # Calculating the confusion matrix
    cm = confusion_matrix(y_test, predictions)
    cm_df = pd.DataFrame(
        cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
    )
    
    #Store confusion matrix
    confusion_dfs[confusion_names[df_num]] = cm_df

    # Calculating the accuracy score
    acc_score = accuracy_score(y_test, predictions)
    
    #Store accuracy
    accuracy_dfs[accuracy_names[df_num]] = acc_score
    
    # Store y_test for later use with classification report
    y_test_dfs[y_test_names[df_num]] = y_test

    # Determine, sort, and store feature importances
    importance_dfs[importance_names[df_num]] = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
    
# Setup df for comparative backtest results
    # Create df using same index as X_test
    prediction_dfs[prediction_names[df_num]] = pd.DataFrame(index=X_test.index)
    
    # Store random forest predictions in df as rf_signal 
    prediction_dfs[prediction_names[df_num]]["rf_signal"] = predictions
    
    # Bring basic (non-ML) signal in from data dfs for comp
    prediction_dfs[prediction_names[df_num]]["basic_signal"] = df['Signal']
    
    #Bring in close price from data dfs for calculations
    prediction_dfs[prediction_names[df_num]]["close"] = df['close']

### Random Forest Model Confusion Matrices

In [20]:
# Extract confusion matrices from dictionary
AMZN_rf_confusion = confusion_dfs['amzn_cm']
META_rf_confusion = confusion_dfs['meta_cm']
TSLA_rf_confusion = confusion_dfs['tsla_cm']

display(AMZN_rf_confusion)
display(META_rf_confusion)
display(TSLA_rf_confusion)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,80,605
Actual 1,58,1292


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,174,427
Actual 1,260,961


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,460,425
Actual 1,648,502


### Random Forest Model Accuracy Scores and Classification Reports

In [21]:
# Extract accuracy scores from dictionary filled in previous for loop
AMZN_rf_accuracy = round(accuracy_dfs['amzn_acc'],3)
META_rf_accuracy = round(accuracy_dfs['meta_acc'],3)
TSLA_rf_accuracy = round(accuracy_dfs['tsla_acc'],3)

# Generate Classification Reports (using stored y_test and predictions from dictionaries filled in previous for loop)
print('Random Forest Classification Report: AMZN')
print(f'AMZN Random Forest Model Accuracy: {AMZN_rf_accuracy}')
print(classification_report(y_test_dfs['amzn_cr'],raw_prediction_dfs['amzn_pred']))
print('Random Forest Classification Report: META')
print(f'META Random Forest Model Accuracy: {META_rf_accuracy}')
print(classification_report(y_test_dfs['meta_cr'],raw_prediction_dfs['meta_pred']))
print('Random Forest Classification Report: TSLA')
print(f'TSLA Random Forest Model Accuracy: {TSLA_rf_accuracy}')
print(classification_report(y_test_dfs['tsla_cr'],raw_prediction_dfs['tsla_pred']))

Random Forest Classification Report: AMZN
AMZN Random Forest Model Accuracy: 0.674
              precision    recall  f1-score   support

          -1       0.58      0.12      0.19       685
           1       0.68      0.96      0.80      1350

    accuracy                           0.67      2035
   macro avg       0.63      0.54      0.50      2035
weighted avg       0.65      0.67      0.59      2035

Random Forest Classification Report: META
META Random Forest Model Accuracy: 0.623
              precision    recall  f1-score   support

          -1       0.40      0.29      0.34       601
           1       0.69      0.79      0.74      1221

    accuracy                           0.62      1822
   macro avg       0.55      0.54      0.54      1822
weighted avg       0.60      0.62      0.60      1822

Random Forest Classification Report: TSLA
TSLA Random Forest Model Accuracy: 0.473
              precision    recall  f1-score   support

          -1       0.42      0.52      0.4

### Random Forest Model Feature Importance

In [22]:
importance_dfs

{'amzn_imp': [(0.18658560956339373, '200EMA'),
  (0.1859867556371003, 'Volume MA'),
  (0.08486531422694422, 'Fast_EMA'),
  (0.07377606212067153, 'Slow_EMA'),
  (0.07284268955876638, 'VIX_40_EMA'),
  (0.06963374311797446, 'open'),
  (0.060169777117187066, 'high'),
  (0.058292155760864064, 'VWAP'),
  (0.05299330982877774, 'low'),
  (0.05281532744994819, 'close'),
  (0.033137564629769904, 'AMZN_search_trends'),
  (0.031289691960863564, 'VIX_close'),
  (0.026380581280623067, 'Volume'),
  (0.01123141774711563, 'pct_returns')],
 'meta_imp': [(0.13288213238363797, '200EMA'),
  (0.10230785778855934, 'VIX_40_EMA'),
  (0.09347326809250507, 'VWAP'),
  (0.09231136574284636, 'Fast_EMA'),
  (0.0871900183008184, 'close'),
  (0.0865811157513733, 'Slow_EMA'),
  (0.0853374287681326, 'high'),
  (0.07915139110481623, 'low'),
  (0.07262536556687021, 'open'),
  (0.05253331365871752, 'Volume MA'),
  (0.04788417476404313, 'META_search_trends'),
  (0.031626242388349095, 'VIX_close'),
  (0.025132988537799206, '

## Comparative Backtest Results

In [23]:
#Extract backtests dfs
pred_amzn_df = prediction_dfs['pred_amzn_df']
pred_meta_df = prediction_dfs['pred_meta_df']
pred_tsla_df = prediction_dfs['pred_tsla_df']

In [24]:
# Set the initial capital
initial_capital = float(100000)

# Set position size to 500 shares 
share_size = 200

#initialize list of prediction_dfs for backtesting
backtest_dfs = [pred_amzn_df,pred_meta_df,pred_tsla_df]

In [25]:
# Use for loop to iterate through backtest_dfs and calculate trade performance for both basic (non-ML) and random forest signals

for df in backtest_dfs:

    # Take a share_size position (Basic)
    df["basic_position"] = share_size * df["basic_signal"]
    # Take a share_size position (RF)
    df["rf_position"] = share_size * df["rf_signal"]

    # Find the points in time where a trade occurs (Basic)
    df["basic_Entry_Exit_Position"] = df["basic_position"].diff()
    # Find the points in time where a trade occurs (RF)
    df["rf_Entry_Exit_Position"] = df["rf_position"].diff()


    # Multiply share price by entry/exit positions and get the cumulative sum (Basic)
    df["basic_port_holdings"] = (
        df["close"] * df["basic_Entry_Exit_Position"].cumsum()
    )
    # Multiply share price by entry/exit positions and get the cumulative sum (RF)
    df["rf_port_holdings"] = (
        df["close"] * df["rf_Entry_Exit_Position"].cumsum()
    )

    # Subtract the initial capital by the portfolio holdings to get the amount of liquid cash in the portfolio (Basic)
    df["basic_port_cash"] = (
        initial_capital - (df["close"] * df["basic_Entry_Exit_Position"]).cumsum()
    )
    # Subtract the initial capital by the portfolio holdings to get the amount of liquid cash in the portfolio (RF)    
    df["rf_port_cash"] = (
        initial_capital - (df["close"] * df["rf_Entry_Exit_Position"]).cumsum()
    )

    # Get the total portfolio value by adding the cash amount by the portfolio holdings (Basic)
    df["Portfolio Value (Basic Algo)"] = (
        df["basic_port_cash"] + df["basic_port_holdings"]
    )
    # Get the total portfolio value by adding the cash amount by the portfolio holdings (RF)    
    df["Portfolio Value (ML Algo)"] = (
        df["rf_port_cash"] + df["rf_port_holdings"]
    )

    # Calculate the portfolio daily returns (Basic)
    df["basic_port_daily_returns"] = df["Portfolio Value (Basic Algo)"].pct_change()
    # Calculate the portfolio daily returns (RF)
    df["rf_port_daily_returns"] = df["Portfolio Value (ML Algo)"].pct_change()


    # Calculate the cumulative returns (Basic)
    df["basic_port_cumulative_returns"] = (
        1 + df["basic_port_daily_returns"]
    ).cumprod() - 1
    # Calculate the cumulative returns (RF)    
    df["rf_port_cumulative_returns"] = (
        1 + df["rf_port_daily_returns"]
    ).cumprod() - 1




In [26]:
# Review completed dataframes
#display(pred_amzn_df.tail(3))
#display(pred_meta_df.tail(3))
#display(pred_tsla_df.tail(3))

In [28]:
# Calculate and plot the cumulative returns for the `actual_returns` and the `trading_algorithm_returns`
display(pred_amzn_df[["Portfolio Value (Basic Algo)", "Portfolio Value (ML Algo)"]].hvplot(
    height=300, width=700, yformatter='%.0f', title='AMZN Portfolio Value: Trading Algo Returns (Basic and ML) from 2014-2022'))
display(pred_meta_df[["Portfolio Value (Basic Algo)", "Portfolio Value (ML Algo)"]].hvplot(
    height=300, width=700, yformatter='%.0f', title='META Portfolio Value: Trading Algo Returns (Basic and ML) from 2014-2022'))
display(pred_tsla_df[["Portfolio Value (Basic Algo)", "Portfolio Value (ML Algo)"]].hvplot(
    height=300, width=700, yformatter='%.0f', title='TSLA Portfolio Value: Trading Algo Returns (Basic and ML) from 2014-2022'))

### Trading Algo Statistics

In [29]:
# Create the list of the metric names
metrics = [
    'Annualized Return (Basic)',
    'Annualized Return (RF)',
    'Cumulative Returns (Basic)',
    'Cumulative Returns (RF)',
    'Annual Volatility (Basic)',
    'Annual Volatility (RF)',
    'Sharpe Ratio (Basic)',
    'Sharpe Ratio (RF)',
    'Sortino Ratio (Basic)',
    'Sortino Ratio (RF)'
]

# Create df to store performance stats
algo_eval_df = pd.DataFrame(
    index = metrics,
    columns = ['AMZN','META','TSLA']
)

# Review the DataFrame
algo_eval_df

Unnamed: 0,AMZN,META,TSLA
Annualized Return (Basic),,,
Annualized Return (RF),,,
Cumulative Returns (Basic),,,
Cumulative Returns (RF),,,
Annual Volatility (Basic),,,
Annual Volatility (RF),,,
Sharpe Ratio (Basic),,,
Sharpe Ratio (RF),,,
Sortino Ratio (Basic),,,
Sortino Ratio (RF),,,


In [30]:
# Define function to run on each of the tested dfs
def evaluate_performance(df, trading_days_per_year=252,col=0):

    # Calculate Annualized Return (Basic)
    annualized_return_basic = df['basic_port_daily_returns'].mean() * trading_days_per_year
    # Calculate Annualized Return (RF)
    annualized_return_rf = df['rf_port_daily_returns'].mean() * trading_days_per_year
    
    # Calculate Cumulative Return (Basic)
    cumulative_return_basic = df['basic_port_cumulative_returns'].iloc[-1]
    # Calculate Cumulative Return (RF)
    cumulative_return_rf = df['rf_port_cumulative_returns'].iloc[-1]
    
    # Calculate the Annual volatility metric (Basic)
    annual_volatility_basic = (df['basic_port_daily_returns'].std() * np.sqrt(252))
    # Calculate the Annual volatility metric (RF)
    annual_volatility_rf = (df['rf_port_daily_returns'].std() * np.sqrt(252))
    
    # Calculate the Sharpe ratio (Basic)
    sharpe_basic = (df['basic_port_daily_returns'].mean() * 252) / (
        df['basic_port_daily_returns'].std() * np.sqrt(252))
    # Calculate the Sharpe ratio (RF)
    sharpe_rf = (df['rf_port_daily_returns'].mean() * 252) / (
        df['rf_port_daily_returns'].std() * np.sqrt(252))
    
    
    # Calculate the Sortino ratio (Basic)
    # Start by calculating the downside return values
    # Create a DataFrame that contains the Portfolio Daily Returns column
    sortino_ratio_df_basic = df[['basic_port_daily_returns']].copy()

    # Create a column to hold downside return values
    sortino_ratio_df_basic.loc[:,'basic_downside_returns'] = 0

    # Find Portfolio Daily Returns values less than 0, 
    # square those values, and add them to the Downside Returns column
    sortino_ratio_df_basic.loc[sortino_ratio_df_basic['basic_port_daily_returns'] < 0, 
                             'basic_downside_returns'] = sortino_ratio_df_basic['basic_port_daily_returns']**2

    ### Annualized return already Calculated ###
    #annualized_return = sortino_ratio_df['Portfolio Daily Returns'].mean() * 252

    # Calculate the annualized downside standard deviation value
    downside_standard_deviation_basic = np.sqrt(sortino_ratio_df_basic['basic_downside_returns'].mean()) * np.sqrt(252)

    # Divide the annualized return value by the downside standard deviation value
    sortino_ratio_basic = annualized_return_basic/downside_standard_deviation_basic
    
    
    # Calculate the Sortino ratio (RF)
    # Start by calculating the downside return values
    # Create a DataFrame that contains the Portfolio Daily Returns column
    sortino_ratio_df_rf = df[['rf_port_daily_returns']].copy()

    # Create a column to hold downside return values
    sortino_ratio_df_rf.loc[:,'rf_downside_returns'] = 0

    # Find Portfolio Daily Returns values less than 0, 
    # square those values, and add them to the Downside Returns column
    sortino_ratio_df_rf.loc[sortino_ratio_df_rf['rf_port_daily_returns'] < 0, 
                             'rf_downside_returns'] = sortino_ratio_df_rf['rf_port_daily_returns']**2

    ### Annualized return already Calculated ###
    #annualized_return = sortino_ratio_df['Portfolio Daily Returns'].mean() * 252

    # Calculate the annualized downside standard deviation value
    downside_standard_deviation_rf = np.sqrt(sortino_ratio_df_rf['rf_downside_returns'].mean()) * np.sqrt(252)

    # Divide the annualized return value by the downside standard deviation value
    sortino_ratio_rf = annualized_return_rf/downside_standard_deviation_rf
    
    # Assign the values to the rows in algo_eval_df
    algo_eval_df.loc['Annualized Return (Basic)'][col] = annualized_return_basic
    algo_eval_df.loc['Cumulative Returns (Basic)'][col] = cumulative_return_basic
    algo_eval_df.loc['Annual Volatility (Basic)'][col] = annual_volatility_basic
    algo_eval_df.loc['Sharpe Ratio (Basic)'][col] = sharpe_basic
    algo_eval_df.loc['Sortino Ratio (Basic)'][col] = sortino_ratio_basic
    
    algo_eval_df.loc['Annualized Return (RF)'][col] = annualized_return_rf
    algo_eval_df.loc['Cumulative Returns (RF)'][col] = cumulative_return_rf
    algo_eval_df.loc['Annual Volatility (RF)'][col] = annual_volatility_rf
    algo_eval_df.loc['Sharpe Ratio (RF)'][col] = sharpe_rf
    algo_eval_df.loc['Sortino Ratio (RF)'][col] = sortino_ratio_rf  
    
    return annualized_return_basic, cumulative_return_basic, annual_volatility_basic, sharpe_basic, sortino_ratio_basic, annualized_return_rf, cumulative_return_rf, annual_volatility_rf,sharpe_rf, sortino_ratio_rf


In [31]:
# Evaluate performance using the defined function
# Set destination column in algo_eval_df
evaluate_performance(pred_amzn_df,col=0)
evaluate_performance(pred_meta_df,col=1)
evaluate_performance(pred_tsla_df,col=2)

# Review dataframe
algo_eval_df

Unnamed: 0,AMZN,META,TSLA
Annualized Return (Basic),0.023517,0.057526,0.077749
Annualized Return (RF),0.041783,0.049459,0.072484
Cumulative Returns (Basic),0.188482,-0.02804,0.727271
Cumulative Returns (RF),0.357577,0.16668,0.568983
Annual Volatility (Basic),0.064985,0.385704,0.141371
Annual Volatility (RF),0.088136,0.245157,0.182714
Sharpe Ratio (Basic),0.361884,0.149145,0.549962
Sharpe Ratio (RF),0.474069,0.201743,0.396706
Sortino Ratio (Basic),0.510096,0.319749,0.807372
Sortino Ratio (RF),0.669241,0.356729,0.584467
