In [None]:
import pandas as pd
import numpy as np
import pandas_datareader.data as web
import datetime as dt
import plotly.graph_objects as go
from arch.unitroot import VarianceRatio
from plotly.subplots import make_subplots

sp500_stock_data = pd.read_csv(
    "../Data/sp500_stocks.csv"
)
sp500_stock_data["Date"] = sp500_stock_data["Date"].astype("datetime64[ns]")

sp500_stock_names = pd.read_csv(
    "../Data/sp500_companies.csv"
)
sp500_stock_names.loc[len(sp500_stock_names.index)] = [
    "SPY",
    "SPY",
    "Index",
]
sp500_stock_names = sp500_stock_names.set_index("Symbol")

spy_data = pd.read_csv(
    "../Data/spy_daily.csv"
)
spy_data["Date"] = spy_data["Date"].astype("datetime64[ns]")
spy_data["Open"] = spy_data["Open"].astype("float")
spy_data["High"] = spy_data["High"].astype("float")
spy_data["Low"] = spy_data["Low"].astype("float")
spy_data["Close"] = spy_data["Close"].astype("float")
spy_data["Volume"] = spy_data["Volume"].astype("float")
spy_data["Symbol"] = "SPY"

# combine all three dataframes
# sp500_stock_data = pd.concat([sp500_stock_data, spy_data])
sp500_stock_data = sp500_stock_data.reset_index()
sp500_stock_data = sp500_stock_data.join(sp500_stock_names, on="Symbol", how="left")
# reindex with index as stock symbols
sp500_stock_data = sp500_stock_data.set_index(["Symbol", "Sector", "Date"])

                                index   Adj Close       Close        High  \
Symbol Sector      Date                                                     
A      Health Care 2018-01-02       0   64.625580   67.599998   67.889999   
                   2018-01-03       1   66.269882   69.320000   69.489998   
                   2018-01-04       2   65.772766   68.800003   69.820000   
                   2018-01-05       3   66.824364   69.900002   70.099998   
                   2018-01-08       4   66.967758   70.050003   70.330002   
...                               ...         ...         ...         ...   
ZTS    NaN         2023-12-22  747188  194.538773  194.979996  195.910004   
                   2023-12-26  747189  195.057587  195.500000  196.339996   
                   2023-12-27  747190  196.454422  196.899994  197.009995   
                   2023-12-28  747191  196.713837  197.160004  198.600006   
                   2023-12-29  747192  196.923355  197.369995  198.009995   

# Backtesting Practice on Financial Data
## Goal:
- creating a simple strategy that predicts price movements based on passed price chart data
    - stocks used based on stationarity
- in depth analysis of important metrics to evaluate the efficiency of the strategy
- possible variations to further improve the strategy
## Notebook Structure
- determine Stock selection method
- select features to predict target variable "price movement"
- select strategy, indicators, and necessary components
- backtest strategy
- evaluate strategy based on proven metrics

### Calculate Hurst exponent and Variance Ration Test

In [1]:
# measure momentum of a stock, 
# use hurst exponent and variance ratio test to rule out random walk, 
lags = range(2, 100)

def hurst_exponent(ts, lags = range(2, 20)):
    """
    Calculate the Hurst Exponent of a time series
    """
    tau = [np.sqrt(np.std(np.subtract(ts[lag:], ts[:-lag]))) for lag in lags]
    log_lags = np.log(lags)
    log_tau = np.log(tau)
    hurst = np.polyfit(log_lags, log_tau, 1)
    hurst_exponent = hurst[0] * 2
    return hurst_exponent

def variance_ratio_test(ts, lags = range(2, 20)):
    """
    Calculate the Variance Ratio of a time series for multiple lags and return the lag with the lowest p-value
    """
    vr = pd.DataFrame([], columns = ["lag", "pvalue"])
    for lag in lags:
        vr.loc[len(vr.index)] = [lag, VarianceRatio(ts, lag).pvalue]
    vr = vr.sort_values(by = "pvalue")
    return vr.iloc[0]

# prove correlation between the returns of the stock with correlation coefficient
def prove_correlation(ts, lags = range(2, 20)):
    """
    Prove correlation between the returns of the stock
    """
    hurst = hurst_exponent(ts)
    vr = variance_ratio_test(ts)
    vrPvalue = vr.loc["pvalue"]
    lag = vr.loc["lag"]
    return hurst, vr, lag

### Decide which stock should be tested

In [2]:
# calculate the hurst exponent and variance ratio test for each stock
def calculate_hurst_and_variance_ratio(sp500_stock_data):
    hurst = pd.DataFrame([], columns = ["Hurst", "VarianceRatio", "Lag"])
    for stock in sp500_stock_data.index.levels[0]:
        stock_data = sp500_stock_data.loc[stock]
        stock_data = stock_data.sort_values(by = "Date")
        stock_data = stock_data.set_index("Date")
        stock_data = stock_data["Close"]
        stock_data = stock_data.pct_change()
        hurst.loc[len(hurst.index)] = [stock, *prove_correlation(stock_data)]
    return hurst

# check which stocks fulfill the criteria for momentum
def check_momentum(hurst):
    momentum = hurst[hurst["Hurst"] > 0.6]
    momentum = momentum[momentum["VarianceRatio"] < 0.05]
    return momentum

# check which stocks fulfill the criteria for mean reversion
# def check_mean_reversion(hurst):
#     mean_reversion = hurst[hurst["Hurst"] < 0.5]
#     mean_reversion = mean_reversion[mean_reversion["VarianceRatio"] < 0.05]
#     return mean_reversion

### Set the target and feature variables and choose model
target variable: 
- future pct change in prices or future close price

independent variables:
- current price
- momentum indicator -> RSI
- stock name / symbol
- Sector
- Volume


Models I want to try out:
- Long Short-Term Memory
- Recurrent Neural Networks
- Hidden Markov Model

In [None]:
# fit long short-term memory model

In [None]:
# fit recurrent neural network model

In [None]:
# fit hidden markov model

### Evaluation metrics of models

- MAE
- RMSE
- MAPE
- Forecast Bias