### Forward optimization notebook

By "forward optimization" I mean optimization that only tunes the parameters the strategy already has.
In other words: finding, for each existing parameter of the strategy, the value that yields the best results.

This notebook produces a JSON file that serves as an interface: for each end date it stores the optimal parameters computed over the look-back periods ending on that date (e.g. the 6 and 12 months before it), so they can easily be read back later.

### Imports and variables

In [None]:
import datetime
import json
from datetime import timedelta
from dateutil.relativedelta import relativedelta
from dateutil import rrule

import pandas as pd
import tqdm

# from matplotlib import pyplot
# import seaborn as sns
import dask.dataframe as dd

# Timeframe of the dataset to optimize. It selects the dataset path, the CSV
# column layout and the results file used further down.
TIMEFRAME = "daily"
# TIMEFRAME = 'weekly'

# Dataframe backend used to load and process the dataset ("pandas" or "dask").
# FRAMEWORK = 'dask'
FRAMEWORK = "pandas"

# First and last end dates for which monthly date ranges are generated.
FIRST_DATE = datetime.datetime(2000, 1, 1)
LAST_DATE = datetime.datetime(2020, 12, 31)

# JSON file holding the optimal parameters found so far, keyed by end date.
FORWARD_OPT_RESULTS_JSON = f"results/{TIMEFRAME}/forward_opt_results.json"

### Utils

In [None]:
def get_sample(the_df, n=3):
    """Print the row count of ``the_df`` and return a small random sample.

    Args:
        the_df: A dataframe or series supporting ``len()`` and ``.sample()``.
        n: Maximum number of rows to sample. Fewer rows are returned when
            ``the_df`` holds fewer than ``n`` rows.

    Returns:
        A random sample of at most ``n`` rows of ``the_df``.
    """
    total_rows = len(the_df)
    print(f"There are {total_rows} rows in the dataframe/series")

    try:
        # Never request more rows than exist: sampling without replacement
        # raises ValueError in that case.
        return the_df.sample(min(n, total_rows))
    except ValueError:
        # NOTE(review): dask is believed to reject a row count and only accept
        # ``frac`` -- confirm. Fall back to the equivalent fraction, capped at
        # 1.0 because a fraction above 1 is invalid without replacement.
        fraction = min(1.0, n / total_rows) if total_rows else 1.0
        return the_df.sample(frac=fraction)

In [None]:
def save_results(key, results_for_key):
    """Store ``results_for_key`` under ``key`` in the results JSON file.

    The file at ``FORWARD_OPT_RESULTS_JSON`` is read, updated in memory and
    written back in full. A missing file is treated as an empty result set so
    that the very first save works.

    Args:
        key: Entry name to create or overwrite in the results file.
        results_for_key: JSON-serializable value to store under ``key``.

    Raises:
        TypeError: If the results cannot be serialized to JSON. The existing
            file is left untouched in that case.
    """
    import os

    try:
        with open(FORWARD_OPT_RESULTS_JSON, "r") as f:
            results_dict = json.load(f)
    except FileNotFoundError:
        results_dict = {}

    results_dict[key] = results_for_key

    # Serialize before opening the file for writing: opening with "w"
    # truncates it, so a serialization error would otherwise wipe out every
    # result stored so far.
    serialized = json.dumps(results_dict)

    parent_dir = os.path.dirname(FORWARD_OPT_RESULTS_JSON)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(FORWARD_OPT_RESULTS_JSON, "w") as f:
        f.write(serialized)


# save_results('a', {1:2,3:4})

In [None]:
def get_pair_entry(start_date, end_date):
    """Build the ``(label, start_date, end_date)`` entry for a date range.

    The label has the form ``<start>-to-<end>``, where both ends are rendered
    as two-digit year and month.
    """
    start_label = start_date.strftime('%y-%m')
    end_label = end_date.strftime('%y-%m')
    label = f"{start_label}-to-{end_label}"
    return label, start_date, end_date


def generate_date_ranges():
    """Map every monthly end date to the date ranges that end on it.

    End dates run monthly from ``FIRST_DATE`` up to ``LAST_DATE``. Each one is
    keyed by its ``%y-%m`` label and maps to a dict of ``get_pair_entry``
    tuples: the 6- and 12-month windows before it, plus a "max" window that
    always starts on 1998-01-01.
    """
    # The 1-month and 3-month look-back windows are currently disabled.
    lookback_windows = (("6-month", 6), ("12-month", 12))
    earliest_start = datetime.datetime(1998, 1, 1)

    ranges_by_month = {}
    monthly_end_dates = rrule.rrule(
        rrule.MONTHLY, dtstart=FIRST_DATE, until=LAST_DATE
    )
    for end_date in monthly_end_dates:
        ranges_for_month = {}
        for window_name, months_back in lookback_windows:
            window_start = end_date + relativedelta(months=-months_back)
            ranges_for_month[window_name] = get_pair_entry(window_start, end_date)
        ranges_for_month["max"] = get_pair_entry(earliest_start, end_date)

        ranges_by_month[end_date.strftime("%y-%m")] = ranges_for_month

    return ranges_by_month


def get_parsed_date_ranges():
    """Return the results already stored in the results JSON file.

    Returns:
        The decoded content of ``FORWARD_OPT_RESULTS_JSON``, or an empty dict
        when the file does not exist yet (nothing has been processed so far).
    """
    try:
        with open(FORWARD_OPT_RESULTS_JSON, "r") as f:
            return json.load(f)
    except FileNotFoundError:
        # First run: no results file means no date range has been parsed.
        return {}


# for k,v in generate_date_ranges().items():
#     print(k)
#     print(v)
# #     range_name, key, start_date, end_date = v
# #     print(range_name, key, start_date, end_date)
#     raise Exception

### Load the dataframe

In [None]:
# Pick the dataset location and the CSV column layout for the chosen timeframe.
# The two layouts differ only in the periods of the adx/ppo/stochastic columns
# (8/16/32 for daily, 3/6/9 for weekly).
if TIMEFRAME == "daily":
    path = "dataset/daily/"
    column_names = [
        "uid",
        "ticker",
        "date",
        "price_open",
        "maperiod",
        "rsi_open_period",
        "adx8",
        "adx16",
        "adx32",
        "ppo8",
        "ppo16",
        "ppo32",
        "stochastic8",
        "stochastic16",
        "stochastic32",
        "price_sell",
        "days_ago_close_period",
        "rsi_close_period",
    ]
elif TIMEFRAME == "weekly":
    path = "dataset/weekly/"
    column_names = [
        "uid",
        "ticker",
        "date",
        "price_open",
        "maperiod",
        "rsi_open_period",
        "adx3",
        "adx6",
        "adx9",
        "ppo3",
        "ppo6",
        "ppo9",
        "stochastic3",
        "stochastic6",
        "stochastic9",
        "price_sell",
        "days_ago_close_period",
        "rsi_close_period",
    ]
else:
    # Fail here rather than with a NameError on `path` further down.
    raise ValueError(f"Unsupported TIMEFRAME: {TIMEFRAME!r}")

# pandas reads the single merged CSV, dask reads every CSV in the directory.
if FRAMEWORK == "pandas":
    d = pd
    f = f"{path}/all_results.csv"
elif FRAMEWORK == "dask":
    d = dd
    f = f"{path}/*.csv"
else:
    # Fail here rather than with a NameError on `d` in the read below.
    raise ValueError(f"Unsupported FRAMEWORK: {FRAMEWORK!r}")

# Column names are supplied explicitly, so the first line of each file is read
# as data (presumably the CSV files have no header row -- TODO confirm).
df = d.read_csv(
    f,
    names=column_names,
)

In [None]:
# Report the number of loaded rows; under dask the row count is lazy and has
# to be computed explicitly.
print(df.shape[0].compute() if FRAMEWORK == "dask" else df.shape[0])

### Compute the change percentage
Then drop the `price_sell` and `price_open` columns, because they are no longer needed afterwards.

In [None]:
# Percentage change from price_open to price_sell, relative to price_open
df["perc_change"] = (
    df["price_sell"].sub(df["price_open"]).div(df["price_open"]).mul(100)
)

In [None]:
# The raw prices were only needed to derive perc_change above, so both columns
# are removed from the dataframe here.
df = df.drop(["price_sell", "price_open"], axis=1)

In [None]:
# Show the row count and a few random rows to check the dataframe by eye
get_sample(df)

In [None]:
def mean_optimal_func(tmp_df_groupby):
    """Return the group key whose rows have the highest mean ``perc_change``.

    Args:
        tmp_df_groupby: A dataframe grouped by the parameter being tuned.

    Returns:
        The index label of the group with the largest mean ``perc_change``.
    """
    mean_by_group = tmp_df_groupby["perc_change"].mean()
    if FRAMEWORK == "dask":
        # dask is lazy: materialize the means before ranking them
        mean_by_group = mean_by_group.compute()

    ranked = mean_by_group.sort_values(ascending=False)
    best_key = ranked.index[0]
    return best_key


def order_optimal_func(tmp_df_groupby):
    """Return the group key with the best average compounded outcome.

    Each group is scored by ordering its ``perc_change`` values by ``date``,
    compounding them from every possible starting row onwards (each run
    starting from 1) and averaging those outcomes. The key of the group with
    the highest score is returned.

    Args:
        tmp_df_groupby: A dataframe grouped by the parameter being tuned.

    Returns:
        The index label of the highest-scoring group.
    """

    def outcome_computation(x):
        # Series.iteritems() was removed in pandas 2.0; only the values are
        # needed here, so read them directly.
        changes = x.sort_values(["date"])["perc_change"].tolist()
        if not changes:
            # No rows means no outcome to average (avoids a division by zero)
            return float("nan")

        # `compounded` is the outcome of starting at the current row and
        # applying every later change. Walking backwards lets each starting
        # row reuse the result of the next one, so a single pass is enough
        # instead of recompounding the tail for every starting row.
        compounded = 1
        total = 0
        for perc_change in reversed(changes):
            compounded += (compounded * perc_change) / 100
            total += compounded

        return total / len(changes)

    if FRAMEWORK == "pandas":
        outcome_series = tmp_df_groupby.apply(outcome_computation)
    else:
        # NOTE(review): ("float") is just the string "float", not a tuple --
        # confirm this is the meta dask is expected to receive.
        outcome_series = tmp_df_groupby.apply(
            outcome_computation, meta=("float")
        ).compute()

    return outcome_series.sort_values(ascending=False).index[0]


def compute_optimal_params_using_func(df, func):
    """Tune the strategy parameters one after another using ``func``.

    The parameters are tuned in a fixed order. After each one is chosen, the
    data is narrowed to the rows using that value, so every later parameter is
    tuned given the earlier choices.

    Args:
        df: Dataframe holding one column per tunable parameter.
        func: Callable that receives ``df`` grouped by one parameter and
            returns the best value of that parameter.

    Returns:
        A dict mapping each parameter name to its chosen value (as ``int``).
    """
    tuning_order = ("days_ago_close_period", "rsi_open_period", "maperiod")

    result = {}
    for param_name in tuning_order:
        best_value = int(func(df.groupby([param_name])))
        result[param_name] = best_value
        # Keep only the rows that use the value just chosen
        df = df[df[param_name] == best_value]

    return result


def compute_optimal_results(df):
    """Compute the optimal parameters of ``df`` with every selection method.

    Returns:
        A dict with the parameters chosen by ``mean_optimal_func`` under
        "mean" and those chosen by ``order_optimal_func`` under "order".
    """
    selection_methods = (
        ("mean", mean_optimal_func),
        ("order", order_optimal_func),
    )
    return {
        method_name: compute_optimal_params_using_func(df, selector)
        for method_name, selector in selection_methods
    }

In [None]:
def outcome_computation(x):
    """Return the average compounded outcome over every starting row of ``x``.

    The ``perc_change`` values are ordered by ``date``. For each starting row
    an initial value of 1 is compounded with that row's change and all later
    ones; the mean of those outcomes is returned.

    Args:
        x: Dataframe with ``date`` and ``perc_change`` columns.

    Returns:
        The mean compounded outcome, or NaN when ``x`` has no rows.
    """
    changes = x.sort_values(["date"])["perc_change"].tolist()
    if not changes:
        # No rows means no outcome to average (avoids a division by zero)
        return float("nan")

    # `compounded` is the outcome of starting at the current row and applying
    # every later change. Walking backwards lets each starting row reuse the
    # result of the next one, so a single pass is enough instead of
    # recompounding the tail for every starting row.
    compounded = 1
    total = 0
    for perc_change in reversed(changes):
        compounded += (compounded * perc_change) / 100
        total += compounded

    return total / len(changes)

In [None]:
def filter_df_by_dates(df, start_date, end_date):
    """Return the rows of ``df`` whose ``date`` lies within the given range.

    Both bounds are inclusive. They are formatted as ``%Y-%m-%d`` strings
    before being compared against the ``date`` column.
    """
    lower_bound = start_date.strftime("%Y-%m-%d")
    upper_bound = end_date.strftime("%Y-%m-%d")

    not_before_start = df["date"] >= lower_bound
    not_after_end = df["date"] <= upper_bound
    return df[not_before_start & not_after_end]

In [None]:
# Parameter values the strategy uses out of the box for the active timeframe
if TIMEFRAME == "daily":
    STRATEGY_OUT_OF_THE_BOX_KWARGS = {
        "maperiod": 200,
        "rsi_open_period": 10,
        "days_ago_close_period": 10,
    }
else:
    STRATEGY_OUT_OF_THE_BOX_KWARGS = {
        "maperiod": 40,
        "rsi_open_period": 2,
        "days_ago_close_period": 2,
    }

# The same defaults under both selection methods. Both keys refer to the very
# same dict object, so mutating one entry changes the other as well.
DEFAULT_STRATEGY_KWARGS = dict.fromkeys(
    ("mean", "order"), STRATEGY_OUT_OF_THE_BOX_KWARGS
)

In [None]:
# Work out which end dates still need optimizing: every generated date range
# whose key is not yet present in the stored results.
all_date_ranges = generate_date_ranges()
parsed_date_ranges = get_parsed_date_ranges()
unparsed_date_ranges = {
    month_key: ranges
    for month_key, ranges in all_date_ranges.items()
    if month_key not in parsed_date_ranges
}
# TODO: check 01-11 and 02-12 (or up to '03-02'?) for daily timeframe
# Preview the first entry that is still pending, if there is one
if unparsed_date_ranges:
    print(next(iter(unparsed_date_ranges.items())))
else:
    print("All ranges have been parsed!")

In [None]:
with tqdm.tqdm(total=len(unparsed_date_ranges)) as pbar:
    # Single-entry cache to avoid recomputing identical windows. Two windows
    # that end on the same date and hold the same number of rows contain
    # exactly the same rows (the shorter window is a subset of the longer
    # one), so the previous result can be reused. The end date is part of the
    # key because an equal row count alone does not imply equal data: a window
    # from a different month could otherwise pick up a stale result.
    PREVIOUS_SIZE = None
    PREVIOUS_RES = None
    PREVIOUS_END_DATE = None

    for date_value, date_ranges in unparsed_date_ranges.items():
        optimal_results = {}

        def optimize_range(df, range_name, default_if_empty):
            """Return the optimal parameters for one range of the current date.

            Args:
                df: The full dataframe; it is filtered to the range here.
                range_name: Key of the range in ``date_ranges``.
                default_if_empty: Result to use when the range holds fewer
                    than 100 rows.
            """
            global PREVIOUS_SIZE, PREVIOUS_RES, PREVIOUS_END_DATE

            _, start_date, end_date = date_ranges[range_name]

            filtered_df = filter_df_by_dates(df, start_date, end_date)
            size = (
                filtered_df.shape[0]
                if FRAMEWORK == "pandas"
                else filtered_df.shape[0].compute()
            )

            if size == PREVIOUS_SIZE and end_date == PREVIOUS_END_DATE:
                print(f"Using chache for {range_name} - {start_date} to {end_date}")
                return PREVIOUS_RES

            if size < 100:
                print(
                    f"Empty or too small df from (size: {size}) {start_date} to {end_date}"
                )
                res = default_if_empty
            else:
                print(f"Parsing {size} lines from {start_date} to {end_date}")
                res = compute_optimal_results(filtered_df)

            PREVIOUS_SIZE = size
            PREVIOUS_RES = res
            PREVIOUS_END_DATE = end_date

            return res

        # The 12-month window is optimized first: it is the fallback for the
        # 6-month window and, on a cache hit, the source of its result.
        # NOTE(review): with too little data the 12-month result is None and
        # is stored as null -- confirm whether DEFAULT_STRATEGY_KWARGS was
        # meant to be the fallback here.
        optimal_results["12-month"] = optimize_range(df, "12-month", None)
        optimal_results["6-month"] = optimize_range(
            df, "6-month", optimal_results["12-month"]
        )

        print(date_value)
        print(optimal_results)
        # Persist after every end date so an interrupted run can resume
        save_results(date_value, optimal_results)
        pbar.update()