In [7]:
import numpy as np
import pandas as pd
import os
import zipfile
import matplotlib.pyplot as plt
import glob

from config.config import *
#from analysis.analysis_functions import *
import re

In [10]:
# Project root on the author's machine. After the chdir below, relative paths
# used later in the notebook (e.g. "results") are resolved against this folder.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
abspath = r"C:\Users\Andy\PycharmProjects\finrlpaper2\MT-DRL-Pytorch"
os.chdir(abspath)
cwd_ = os.getcwd() # read the working directory back to confirm the chdir took effect
cwd_

'C:\\Users\\Andy\\PycharmProjects\\finrlpaper2\\MT-DRL-Pytorch'

In [98]:
# Location of the run to analyse (relative to the current working directory).
results_path = "results"
run_path = os.path.join(results_path, "07-06-2021_17-43-02_ppoCustomBase_fm2_st")
seed = 0
# Derive the seed folder from `seed` so the folder name and the seed number used
# for file names further down cannot drift apart ("agentSeed0" for seed = 0).
seed_path = os.path.join(run_path, f"agentSeed{seed}")

# path to the preprocessed data file
data_path = os.path.join(abspath, "data", "preprocessed", "US_stocks_WDB.csv")

In [76]:
# mode
def _load_concatenated_results(filepaths: list, key: str, state_header_df: pd.DataFrame) -> pd.DataFrame:
    """Read the per-episode CSV files of one result type into one time series.

    Parameters
    ----------
    filepaths : list of str
        CSV files to read (one per episode). Each file is read with its first
        column as the index.
    key : str
        Result type ("pfvalue", "reward", "state_mem", ...); it decides how the
        columns are renamed.
    state_header_df : pd.DataFrame
        Frame whose flattened values are used as column names for "state_mem".

    Returns
    -------
    pd.DataFrame
        The concatenated frames, sorted by "datadate" with one row per date.
        An empty DataFrame is returned when `filepaths` is empty, because
        `pd.concat` would otherwise raise on an empty list.
    """
    if not filepaths:
        return pd.DataFrame()
    # stack the episodes below each other, since together they form one time series
    df = pd.concat([pd.read_csv(file, index_col=0) for file in filepaths])
    # the first column is always the date in the results files
    df = df.rename(columns={df.columns[0]: "datadate"})
    # rewards and portfolio value have exactly one other column; name it after the key
    if key in ["pfvalue", "reward"]:
        df = df.rename(columns={df.columns[1]: key})
    # for the state memory, the state header provides the column names
    if key == "state_mem":
        df.columns = ["datadate"] + state_header_df.values.flatten().tolist()
    # sort by date: glob does not necessarily return the file paths in chronological order
    df = df.sort_values("datadate")
    # The last state of an episode (where no action is taken anymore) is also saved as the
    # initial state of the next episode. That is useful for debugging, but for a time
    # series we keep only one row per date.
    df = df.drop_duplicates(subset=["datadate"], keep='last')
    return df


def get_results_dict_for_one_seed(seed_path: str,
                                  backtest_path: str,
                                  mode="test",
                                  ):
    """Collect the result files of one agent seed and load them as DataFrames.

    For every result type, all CSV files matching ``*{mode}*.csv`` are looked up
    in the corresponding sub-folder of `seed_path` and, separately, of
    ``<seed_path>/backtest``. The files of each type are concatenated into one
    date-sorted DataFrame.

    Parameters
    ----------
    seed_path : str
        Folder holding the results of one seed.
    backtest_path : str
        NOTE(review): currently ignored. The backtest folder has always been
        derived as ``<seed_path>/backtest``, and that behaviour is kept so
        existing callers see no change.
    mode : str
        Substring that a file name must contain to be picked up (default "test").

    Returns
    -------
    tuple
        ``(results_dicty, backtest_dicty, state_header_df, results_dict, backtest_dict)``:
        the first two map result type -> DataFrame, the third is the state
        header read from ``state_memory/state_header.csv``, and the last two map
        result type -> list of matched file paths (useful for debugging).
        Result types without any matching file map to an empty DataFrame.
    """
    # result type -> name of the sub-folder holding its CSV files
    folder_names = {
        "pfvalue": "portfolio_value",
        "reward": "rewards",
        "all_weights": "all_weights_cashAtEnd",
        "equity_weights": "asset_equity_weights",
        "policy_actions": "policy_actions",
        "exer_actions": "exercised_actions",
        "state_mem": "state_memory",
    }
    # the backtest results live in the same folder layout, one level below the seed folder
    backtest_path = os.path.join(seed_path, "backtest")

    # dictionaries with the matched file paths per result type
    results_dict = {key: glob.glob(os.path.join(seed_path, folder, f"*{mode}*.csv"))
                    for key, folder in folder_names.items()}
    backtest_dict = {key: glob.glob(os.path.join(backtest_path, folder, f"*{mode}*.csv"))
                     for key, folder in folder_names.items()}
    state_header_df = pd.read_csv(os.path.join(seed_path, "state_memory", "state_header.csv"), index_col=0)

    # load the data behind the collected paths
    results_dicty = {key: _load_concatenated_results(filepaths, key, state_header_df)
                     for key, filepaths in results_dict.items()}
    # The backtest frames must be built from the backtest paths; they used to be
    # built from the results paths and therefore duplicated the normal results.
    backtest_dicty = {key: _load_concatenated_results(filepaths, key, state_header_df)
                      for key, filepaths in backtest_dict.items()}

    # the last three outputs are optional, just used for debugging
    return results_dicty, backtest_dicty, state_header_df, results_dict, backtest_dict
print("done.")

done.


In [77]:
# Load the "test" mode results of the selected seed; the three debugging outputs are discarded.
results_dict, backtest_dict, _, _, _ = get_results_dict_for_one_seed(
    seed_path=seed_path,
    # Passed inline instead of through a notebook-level `backtest_path` variable, which no
    # visible cell defines. This is the same folder the function derives internally.
    backtest_path=os.path.join(seed_path, "backtest"),
    mode="test",
)

In [100]:
def calculate_and_save_performance_metrics(results_dict: dict, save_path: str, seed: int=None, mode: str="test"):
    """Compute performance statistics of the portfolio value and save them as CSV.

    The statistics come from ffn's ``calc_stats()`` applied to the portfolio
    value series (risk-free rate of 0, i.e. ffn's default is not overridden).

    Parameters
    ----------
    results_dict : dict
        Must contain a DataFrame under "pfvalue" with the columns "datadate"
        (parseable with the format ``%Y%m%d``) and "pfvalue". The DataFrame is
        not modified.
    save_path : str
        Existing folder the CSV file is written to.
    seed : int, optional
        Used in the column name (``seed{seed}``) and in the file name.
    mode : str
        Prefix of the file name (default "test").

    Returns
    -------
    None
        The metrics are written to
        ``<save_path>/{mode}_performance_metrics_seed{seed}.csv``.
    """
    # imported for its side effect: ffn makes .calc_stats() available on pandas objects
    import ffn
    ### CALCULATE
    # ffn needs a datetime index. The conversion is done on a copy (assign returns a new
    # frame) so the caller's DataFrame keeps its original "datadate" column.
    pfvalue_df = results_dict["pfvalue"]
    pfvalue_series = pfvalue_df.assign(
        datadate=pd.to_datetime(pfvalue_df["datadate"], format='%Y%m%d')
    ).set_index("datadate")["pfvalue"]
    # "perf" (performances) object holding the statistics
    perf = pfvalue_series.calc_stats()
    # the statistics are available as attributes (ann = annualized)
    sharpe_ratio_daily_ann = perf.daily_sharpe
    total_return = perf.total_return
    avg_daily_return_ann = perf.daily_mean
    std_daily_return_ann = perf.daily_vol
    maxdd = perf.max_drawdown
    avg_dd = perf.avg_drawdown
    avg_dd_days = perf.avg_drawdown_days

    ### SAVE
    df = pd.DataFrame({"performance_metric":
                       ["sharpe_ratio_daily_ann", "total_return", "avg_daily_return_ann", "std_daily_return_ann",
                        "maxdd", "avg_dd", "avg_dd_days" ],
                       f"seed{seed}":
                       [sharpe_ratio_daily_ann, total_return, avg_daily_return_ann, std_daily_return_ann,
                        maxdd, avg_dd, avg_dd_days]})
    df.to_csv(os.path.join(save_path, f"{mode}_performance_metrics_seed{seed}.csv"))
    return None
print("done.")

done.


In [101]:
# Write the test-mode metrics of this seed into the seed's own results folder.
calculate_and_save_performance_metrics(
    results_dict=results_dict,
    save_path=seed_path,
    seed=seed,
    mode="test",
)

In [55]:
# NOTE(review): `results_dicty` and its "PF_Value" column are not created by any visible
# cell at notebook level -- this cell relies on state left in the kernel.

# simple daily returns of the portfolio value
results_dicty["pfvalue"]["return_daily"] = results_dicty["pfvalue"]["PF_Value"].pct_change()
# daily log returns: difference between consecutive log portfolio values
results_dicty["pfvalue"]['log_return_daily'] = np.log(results_dicty["pfvalue"]["PF_Value"]).diff()
# cumulative simple return, accumulated in log space and transformed back
results_dicty["pfvalue"]['cum_return_daily2'] = np.exp(
    np.log1p(results_dicty["pfvalue"]['return_daily']).cumsum()
)
# cumulative log return
results_dicty["pfvalue"]['cum_logreturn_daily'] = results_dicty["pfvalue"]['log_return_daily'].cumsum()

results_dicty["pfvalue"].tail(5)

Unnamed: 0,datadate,PF_Value,cum_return_daily,return_daily,log_return_daily,cum_logreturn_daily,cum_return_daily2
43,2021-06-07,1969737.0,1.969737,-0.006111,-0.006129,0.6779,1.969737
44,2021-06-08,1962651.0,1.962651,-0.003598,-0.003604,0.674296,1.962651
45,2021-06-09,1958215.0,1.958215,-0.00226,-0.002263,0.672033,1.958215
46,2021-06-10,1935486.0,1.935486,-0.011607,-0.011675,0.660358,1.935486
47,2021-06-11,1945899.0,1.945899,0.00538,0.005366,0.665724,1.945899


In [88]:
# Annualized Sharpe ratio, computed by hand with a risk-free rate of 0.
# Rationale: the daily 1-year yield curve fell from ~0.5% (2009) to 0.05% (2021) and the
# test period only starts in 2016, so 0 (or an average) is a reasonable simplification.
# For now: 0.
# see also: https://quant.stackexchange.com/questions/28385/what-value-should-the-risk-free-monthly-return-rate-be-sharpe-ratio-calculation
# and: https://www.treasury.gov/resource-center/data-chart-center/interest-rates/Pages/TextView.aspx?data=yield

# annualization assumes 252 trading days per year
avg_daily_return_ann = results_dicty["pfvalue"]["return_daily"].mean() * 252
std_daily_return_ann = results_dicty["pfvalue"]["return_daily"].std() * np.sqrt(252)
sharpe_ratio_ann = avg_daily_return_ann / std_daily_return_ann
# total return: last portfolio value relative to the first one
total_return = results_dicty["pfvalue"]["PF_Value"].iloc[-1] / results_dicty["pfvalue"]["PF_Value"].iloc[0] - 1

# one value per line: total return, annualized mean, annualized volatility, Sharpe ratio
print(total_return, avg_daily_return_ann, std_daily_return_ann, sharpe_ratio_ann, sep="\n")

0.9458994134600001
0.15294309075252352
0.2213404189979843
0.6909858192412491


In [86]:
# Build the ffn statistics object `perf` for the portfolio value series.
# The date column is converted to datetime in place (on the shared results frame) and
# then used as the index of the series that calc_stats() is called on.
results_dict["pfvalue"]["datadate"] =  pd.to_datetime(results_dict["pfvalue"]["datadate"], format='%Y%m%d')
perf = results_dict["pfvalue"].set_index("datadate")["pfvalue"].calc_stats()


# NOTE(review): `df` and its 'close' column are not defined by any visible cell at
# notebook level -- the lines below look like a pasted reference recipe and depend on
# state left in the kernel; confirm the intended input or remove.
# daily return:
df['daily_return'] = df['close'].pct_change()
# calculate cumulative return (the column name keeps its original spelling)
df['cumluative_return'] = np.exp(np.log1p(df['daily_return']).cumsum())

In [87]:
# Show the built-in reports of the `perf` object returned by calc_stats().
# Judging by the method names: a plot of the series, a summary table, a return
# histogram and a monthly-returns table -- see the ffn documentation to confirm.
perf.plot()
perf.display()
perf.plot_histogram()
perf.display_monthly_returns()

<AxesSubplot:title={'center':'pfvalue Price Series'}, xlabel='datadate'>

#### CUMULATIVE RETURNS PLOT OF PORTFOLIO