In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import hvplot.pandas
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from pandas.tseries.offsets import DateOffset
from sklearn.metrics import classification_report

### Initial Data Import and Cleaning

In [2]:
# Read the csv file 
amzn_df = pd.read_csv(Path("Resources/amzn.csv"))

# Convert 'time' column from timestamp (seconds since epoch) to acutal time
amzn_df ['time'] = pd.to_datetime(
    amzn_df['time'],
    unit = 's',
    infer_datetime_format=True,
    utc=True
)    
# Convert timezone from UTC to Eastern Time
amzn_df['time'] = amzn_df['time'].dt.tz_convert('US/Eastern')

# Set 'time' column as the index 
amzn_df.set_index('time', inplace=True)

# Review df
amzn_df.head()          

Unnamed: 0_level_0,open,high,low,close,VWAP,Upper Band #1,Lower Band #1,Upper Band #2,Lower Band #2,Upper Band #3,...,Volume,Volume MA,EMA,Smoothing Line,Developing Poc,Developing VA High,Developing VA Low,Developing Poc.1,Developing VA High.1,Developing VA Low.1
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-03-14 09:30:00-04:00,9.1825,9.215995,9.057,9.112999,9.128665,9.128665,9.128665,9.128665,9.128665,9.128665,...,73994680,107206355.0,9.59791,9.60661,,,,,,
2012-03-15 09:30:00-04:00,9.101,9.2215,9.015,9.2215,9.152667,9.152667,9.152667,9.152667,9.152667,9.152667,...,83212340,103564913.0,9.594164,9.602464,,,,,,
2012-03-16 09:30:00-04:00,9.164,9.284,9.117501,9.2525,9.218,9.218,9.218,9.218,9.218,9.218,...,98696480,95934268.0,9.590765,9.598431,,,,,,
2012-03-19 09:30:00-04:00,9.1725,9.334,9.15,9.276,9.253333,9.253333,9.253333,9.253333,9.253333,9.253333,...,78080380,92419724.0,9.587633,9.594651,,,,,,
2012-03-20 09:30:00-04:00,9.244,9.7205,9.144,9.6165,9.493667,9.493667,9.493667,9.493667,9.493667,9.493667,...,183346180,94828072.0,9.58792,9.591678,,,,,,


In [3]:
# Filter column and only keep ones needed
amzn_df = amzn_df[['open','high','low','close','VWAP','Volume','Volume MA','EMA']]

#Rename EMA to 200EMA
amzn_df = amzn_df.rename(columns={'EMA':'200EMA'})

#Dropping NAN Values 
amzn_df = amzn_df.dropna()

# Review the DataFrame
amzn_df.head()

Unnamed: 0_level_0,open,high,low,close,VWAP,Volume,Volume MA,200EMA
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2012-03-14 09:30:00-04:00,9.1825,9.215995,9.057,9.112999,9.128665,73994680,107206355.0,9.59791
2012-03-15 09:30:00-04:00,9.101,9.2215,9.015,9.2215,9.152667,83212340,103564913.0,9.594164
2012-03-16 09:30:00-04:00,9.164,9.284,9.117501,9.2525,9.218,98696480,95934268.0,9.590765
2012-03-19 09:30:00-04:00,9.1725,9.334,9.15,9.276,9.253333,78080380,92419724.0,9.587633
2012-03-20 09:30:00-04:00,9.244,9.7205,9.144,9.6165,9.493667,183346180,94828072.0,9.58792


In [4]:
# Read the csv file 
meta_df = pd.read_csv(Path("Resources/meta.csv"))

# Convert 'time' column from timestamp (seconds since epoch) to acutal time
meta_df ['time'] = pd.to_datetime(
    meta_df['time'],
    unit = 's',
    infer_datetime_format=True,
    utc=True
)    
# Convert timezone from UTC to Eastern Time
meta_df['time'] = meta_df['time'].dt.tz_convert('US/Eastern')

# Set 'time' column as the index 
meta_df.set_index('time', inplace=True)

# Review df
meta_df.head()

Unnamed: 0_level_0,open,high,low,close,VWAP,Upper Band #1,Lower Band #1,Upper Band #2,Lower Band #2,Upper Band #3,...,Volume,Volume MA,EMA,Smoothing Line,Developing Poc,Developing VA High,Developing VA Low,Developing Poc.1,Developing VA High.1,Developing VA Low.1
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-05-18 09:30:00-04:00,42.05,45.0,38.0,38.2318,40.4106,40.4106,40.4106,40.4106,40.4106,40.4106,...,580587776,,,,,,,,,
2012-05-21 09:30:00-04:00,36.53,36.66,33.0,34.03,34.563333,34.563333,34.563333,34.563333,34.563333,34.563333,...,168309808,,,,,,,,,
2012-05-22 09:30:00-04:00,32.61,33.59,30.94,31.0,31.843333,31.843333,31.843333,31.843333,31.843333,31.843333,...,102053808,,,,,,,,,
2012-05-23 09:30:00-04:00,31.37,32.5,31.36,32.0,31.953333,31.953333,31.953333,31.953333,31.953333,31.953333,...,73721120,,,,,,,,,
2012-05-24 09:30:00-04:00,32.95,33.21001,31.77,33.03,32.670003,32.670003,32.670003,32.670003,32.670003,32.670003,...,50275872,,,,,,,,,


In [5]:
# Filter column and only keep ones needed
meta_df = meta_df[['open','high','low','close','VWAP','Volume','Volume MA','EMA']]

#Rename EMA to 200EMA
meta_df = meta_df.rename(columns={'EMA':'200EMA'})

#Dropping NAN Values 
meta_df = meta_df.dropna()

# Review the DataFrame
meta_df.head()

Unnamed: 0_level_0,open,high,low,close,VWAP,Volume,Volume MA,200EMA
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013-03-07 09:30:00-05:00,27.57001,28.675,27.47,28.578,28.241,74540128,46413816.0,25.669904
2013-03-08 09:30:00-05:00,28.425,28.47,27.73,27.96001,28.053337,44198832,46896756.8,25.692691
2013-03-11 09:30:00-04:00,28.00999,28.64,27.83,28.14,28.203333,35642064,46793423.2,25.717042
2013-03-12 09:30:00-04:00,28.10001,28.32001,27.60001,27.83,27.916673,27569584,46303815.2,25.738067
2013-03-13 09:30:00-04:00,27.62,27.64999,26.92,27.08,27.216663,39619440,43610005.6,25.751419


In [6]:
# Read the csv file 
tsla_df = pd.read_csv(Path("./Resources/tsla.csv"))

# Convert 'time' column from timestamp (seconds since epoch) to acutal time
tsla_df ['time'] = pd.to_datetime(
    tsla_df['time'],
    unit = 's',
    infer_datetime_format=True,
    utc=True
)    
# Convert timezone from UTC to Eastern Time
tsla_df['time'] = tsla_df['time'].dt.tz_convert('US/Eastern')

# Set 'time' column as the index 
tsla_df.set_index('time', inplace=True)
                      
# Review df
tsla_df.head()   

Unnamed: 0_level_0,open,high,low,close,VWAP,Upper Band #1,Lower Band #1,Upper Band #2,Lower Band #2,Upper Band #3,...,Volume,Volume MA,EMA,Smoothing Line,Developing Poc,Developing VA High,Developing VA Low,Developing Poc.1,Developing VA High.1,Developing VA Low.1
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-03-12 09:30:00-04:00,2.312664,2.41933,2.306665,2.400664,2.375553,2.375553,2.375553,2.375553,2.375553,2.375553,...,29467829,16848705.0,1.929784,1.921918,,,,,,
2012-03-13 09:30:00-04:00,2.433997,2.439331,2.366664,2.405998,2.403998,2.403998,2.403998,2.403998,2.403998,2.403998,...,15024660,16731543.65,1.934522,1.92575,,,,,,
2012-03-14 09:30:00-04:00,2.399998,2.399998,2.319998,2.352664,2.357553,2.357553,2.357553,2.357553,2.357553,2.357553,...,12771972,16011617.15,1.938683,1.929832,,,,,,
2012-03-15 09:30:00-04:00,2.351998,2.365331,2.318664,2.333331,2.339109,2.339109,2.339109,2.339109,2.339109,2.339109,...,8573948,14366955.5,1.94261,1.93413,,,,,,
2012-03-16 09:30:00-04:00,2.326664,2.392664,2.321998,2.354665,2.356442,2.356442,2.356442,2.356442,2.356442,2.356442,...,10938550,13249154.35,1.94671,1.938462,,,,,,


In [7]:
# Filter column and only keep ones needed
tsla_df = tsla_df[['open','high','low','close','VWAP','Volume','Volume MA','EMA']]

#Rename EMA to 200EMA
tsla_df = tsla_df.rename(columns={'EMA':'200EMA'})

#Dropping NAN Values 
tsla_df = tsla_df.dropna()

# Review the DataFrame
tsla_df.head()

Unnamed: 0_level_0,open,high,low,close,VWAP,Volume,Volume MA,200EMA
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2012-03-12 09:30:00-04:00,2.312664,2.41933,2.306665,2.400664,2.375553,29467829,16848705.0,1.929784
2012-03-13 09:30:00-04:00,2.433997,2.439331,2.366664,2.405998,2.403998,15024660,16731543.65,1.934522
2012-03-14 09:30:00-04:00,2.399998,2.399998,2.319998,2.352664,2.357553,12771972,16011617.15,1.938683
2012-03-15 09:30:00-04:00,2.351998,2.365331,2.318664,2.333331,2.339109,8573948,14366955.5,1.94261
2012-03-16 09:30:00-04:00,2.326664,2.392664,2.321998,2.354665,2.356442,10938550,13249154.35,1.94671


### Trading Algo (Signal Generation)

In [8]:
# Import the finta library
from finta import TA

#set some options for display and troubleshooting
pd.set_option("display.max_rows", 2000)
pd.set_option("display.max_columns", 2000)
pd.set_option("display.width", 1000)

In [9]:
# Define periods for fast and slow EMAs (triggers)
fast_ema = 9
slow_ema = 40

# Initialize list of ticker dfs for use in for loop
ticker_df = [amzn_df,meta_df,tsla_df]

# For each ticker dataframe in the list:
for df in ticker_df:
    
    # Calculate % returns for later use
    df['pct_returns'] = df['close'].pct_change()
    
    # Calculate Fast EMA 
    df['Fast_EMA'] = TA.EMA(df,fast_ema)
    
    # Calculate Slow EMA
    df['Slow_EMA'] = TA.EMA(df,slow_ema)

    # Initialize empty Signal column with 0.0 default value
    df['Signal'] = 0.0

    # Generate Signal value ("long if fast_ema > slow_ema", otherwise flip short)
    df['Signal'] = np.where(
        df['Slow_EMA'] < df['Fast_EMA'], 1.0, -1.0)
    
    # Create Entry/Exit column and fill with the differences (trades) from Signal column
    df['Entry/Exit'] = df['Signal'].diff()
    
    #Drop any NA values
    df.dropna()


In [10]:
# Filter each dataframe
# We will train the model on two years of data (05-2012 to 05-2014)
# For purposes of comparison, these should be filtered to what will be
# the eventual testing dataset (05-2014 and on)
tsla_df_filt = tsla_df.loc['05-2014':'05-2022']
meta_df_filt = meta_df.loc['05-2014':'05-2022']
amzn_df_filt = amzn_df.loc['05-2014':'05-2022']

In [11]:
tsla_df_filt.head(3)

Unnamed: 0_level_0,open,high,low,close,VWAP,Volume,Volume MA,200EMA,pct_returns,Fast_EMA,Slow_EMA,Signal,Entry/Exit
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2014-05-01 09:30:00-04:00,13.80532,14.267986,13.712653,13.848653,13.943097,81670131,114412700.0,11.3913,-0.00077,13.724193,14.004311,-1.0,0.0
2014-05-02 09:30:00-04:00,13.906653,14.090653,13.767986,14.060653,13.973097,61370971,109288400.0,11.417861,0.015308,13.791485,14.00706,-1.0,0.0
2014-05-05 09:30:00-04:00,13.965319,14.512652,13.901319,14.440652,14.284874,77285222,104643500.0,11.447938,0.027026,13.921319,14.028211,-1.0,0.0


### Evaluate Simple (non-ML) Trading Strategy

In [12]:
# Set the initial capital
initial_capital = float(100000)

# Set position size to 500 shares 
share_size = 200

In [13]:
filt_dfs = [amzn_df_filt,meta_df_filt,tsla_df_filt]
backtest_dfs = {}  # Create an empty dictionary to store the backtested dataframes
tested_df_names = {
    1:'amzn_tested',
    2:'meta_tested',
    3:'tsla_tested'
}

for i, df in enumerate(filt_dfs):
    # Filter from May 2012 to May 2022
    test_df = df.copy()

    # Take a share_size position 
    test_df["Position"] = share_size * test_df["Signal"]

    # Find the points in time where a trade occurs
    test_df["Entry/Exit Position"] = test_df["Position"].diff()

    # Multiply share price by entry/exit positions and get the cumulatively sum
    test_df["Portfolio Holdings"] = (
        test_df["close"] * test_df["Entry/Exit Position"].cumsum()
    )

    # Subtract the initial capital by the portfolio holdings to get the amount of liquid cash in the portfolio
    test_df["Portfolio Cash"] = (
        initial_capital - (test_df["close"] * test_df["Entry/Exit Position"]).cumsum()
    )

    # Get the total portfolio value by adding the cash amount by the portfolio holdings (or investments)
    test_df["Portfolio Total"] = (
        test_df["Portfolio Cash"] + test_df["Portfolio Holdings"]
    )

    # Calculate the portfolio daily returns
    test_df["Portfolio Daily Returns"] = test_df["Portfolio Total"].pct_change()

    # Calculate the cumulative returns
    test_df["Portfolio Cumulative Returns"] = (
        1 + test_df["Portfolio Daily Returns"]
    ).cumprod() - 1
    
    # Store the filtered dataframe in the dictionary
    df_num = i + 1
    backtest_dfs[tested_df_names[df_num]] = test_df    


In [14]:
# Extract tested dfs from backtest_df dictionary
AMZN = backtest_dfs['amzn_tested']
META = backtest_dfs['meta_tested']
TSLA = backtest_dfs['tsla_tested']

# Create list of tested dfs
tested_dfs = [AMZN,META,TSLA]

names = {
    1:'AMZN',
    2:'META',
    3:'TSLA'
}

In [25]:
# For each of the tested dfs:
for i,df in enumerate(tested_dfs):

    # Visualize exit positions relative to total portfolio value
    long = df[df["Entry/Exit"] == 2.0]["Portfolio Total"].hvplot.scatter(
        color='blue',
        marker='^',
        legend=False, 
        ylabel="Total Portfolio Value", 
        width=600, 
        height=300
    )

    # Visualize entry positions relative to total portfolio value
    short = df[df["Entry/Exit"] == -2.0]["Portfolio Total"].hvplot.scatter(
        color='red',
        marker='v',
        legend=False, 
        ylabel="Total Portfolio Value", 
        width=600, 
        height=300
    )

    # Visualize the total portoflio value for the investment
    total_portfolio_value = df[['Portfolio Total']].hvplot(
        line_color='lightgray',
        ylabel='Total Portfolio Value',
        width=600,
        height=300
    )
    
    df_num = i + 1
    
    # Overlay the plots
    portfolio_entry_exit_plot = total_portfolio_value * long * short
    portfolio_entry_exit_plot.opts(
        title=f"{names[df_num]} DMAC (non-ML) Algorithm - Total Portfolio Value",
        yformatter='%.0f'
    )
    
    tested_df_names[df_num] = portfolio_entry_exit_plot 

In [26]:
#Assign plots to variables and display them
AMZN_plot = tested_df_names[1]
META_plot = tested_df_names[2]
TSLA_plot = tested_df_names[3]

display(AMZN_plot)
display(META_plot)
display(TSLA_plot)


### Trading Algo Statistics

In [17]:
# Create the list of the metric names
metrics = [
    'Annualized Return',
    'Cumulative Returns',
    'Annual Volatility',
    'Sharpe Ratio',
    'Sortino Ratio'
]

# Create df to store performance stats
algo_eval_df = pd.DataFrame(
    index = metrics,
    columns = ['AMZN','META','TSLA']
)

# Review the DataFrame
algo_eval_df

Unnamed: 0,AMZN,META,TSLA
Annualized Return,,,
Cumulative Returns,,,
Annual Volatility,,,
Sharpe Ratio,,,
Sortino Ratio,,,


In [18]:
# Define function to run on each of the tested dfs
def evaluate_performance(df, trading_days_per_year=252,col=0):

    # Calculate Annualized Return
    annualized_return = df['Portfolio Daily Returns'].mean() * trading_days_per_year
    
    # Calculate Cumulative Return
    cumulative_return = df['Portfolio Cumulative Returns'].iloc[-1]
    
    # Calculate the Annual volatility metric
    annual_volatility = (df['Portfolio Daily Returns'].std() * np.sqrt(252))
    
    # Calculate the Sharpe ratio
    sharpe = (df['Portfolio Daily Returns'].mean() * 252) / (
        df['Portfolio Daily Returns'].std() * np.sqrt(252))
    
    # Calculate the Sortino ratio
    # Start by calculating the downside return values
    # Create a DataFrame that contains the Portfolio Daily Returns column
    sortino_ratio_df = df[['Portfolio Daily Returns']].copy()

    # Create a column to hold downside return values
    sortino_ratio_df.loc[:,'Downside Returns'] = 0

    # Find Portfolio Daily Returns values less than 0, 
    # square those values, and add them to the Downside Returns column
    sortino_ratio_df.loc[sortino_ratio_df['Portfolio Daily Returns'] < 0, 
                             'Downside Returns'] = sortino_ratio_df['Portfolio Daily Returns']**2

    ### Annualized return already Calculated ###
    #annualized_return = sortino_ratio_df['Portfolio Daily Returns'].mean() * 252

    # Calculate the annualized downside standard deviation value
    downside_standard_deviation = np.sqrt(sortino_ratio_df['Downside Returns'].mean()) * np.sqrt(252)

    # Divide the annualized return value by the downside standard deviation value
    sortino_ratio = annualized_return/downside_standard_deviation
    
    # Assign the values to the rows in algo_eval_df
    algo_eval_df.loc['Annualized Return'][col] = annualized_return
    algo_eval_df.loc['Cumulative Returns'][col] = cumulative_return
    algo_eval_df.loc['Annual Volatility'][col] = annual_volatility
    algo_eval_df.loc['Sharpe Ratio'][col] = sharpe
    algo_eval_df.loc['Sortino Ratio'][col] = sortino_ratio    
    
    return annualized_return, cumulative_return, annual_volatility, sharpe, sortino_ratio


In [19]:
# Evaluate performance using the defined function
# Set destination column in algo_eval_df
evaluate_performance(AMZN,col=0)
evaluate_performance(META,col=1)
evaluate_performance(TSLA,col=2)

# Review dataframe
algo_eval_df

Unnamed: 0,AMZN,META,TSLA
Annualized Return,0.023505,0.055694,0.077711
Cumulative Returns,0.188482,0.44588,0.727271
Annual Volatility,0.064969,0.141004,0.141337
Sharpe Ratio,0.361795,0.394982,0.549826
Sortino Ratio,0.509971,0.549567,0.807173
