# Reward test
Let's see whether buying the asset at a discounted price or selling it at a slightly higher price actually looks profitable from the perspective of the discounted future reward.

In [1]:
import numpy as np
import plotly.graph_objects as go

In [2]:
# Weight applied per step of look-ahead when summing future utility.
discount_factor = 0.99

# Number of steps in one full oscillation of the price.
period = 16

# Baseline holdings before any action is taken.
start_cash = 1024
start_asset = 1024

# Cash paid for buying one unit / cash received for selling one unit.
sell_price = 1.1
buy_price = 1 / 1.1


def price_process(step):
    """Return the asset price at `step`.

    The price is exp(0.25 * sin(2 * pi * step / period)), so it repeats
    every `period` steps. `step` may be a scalar or a numpy array.
    """
    phase = step * 2 * np.pi / period
    return np.exp(0.25 * np.sin(phase))


def utility(cash, asset, step):
    """Return log(1 + cash + price * asset) with the price taken at `step`."""
    asset_value = price_process(step) * asset
    return np.log(1 + cash + asset_value)

def evaluation(cash, asset, step):
    """Return the discounted sum of future utilities of a fixed portfolio.

    The holdings (`cash`, `asset`) are kept unchanged while the utility is
    evaluated at `step`, `step + 1`, ... for 65536 steps in total. Each term
    is weighted by `discount_factor` raised to its distance from `step`.
    """
    lookahead = np.arange(65536)
    weights = discount_factor ** lookahead
    future_utilities = utility(cash, asset, step + lookahead)
    return np.sum(future_utilities * weights)

# Evaluate every scenario at 256 consecutive steps and plot it against the
# asset price at that step.
steps = np.arange(256)
prices = price_process(steps)

# (trace name, cash held, asset held) for each portfolio being compared:
# the untouched baseline, the baseline after buying one unit, and the
# baseline after selling one unit.
scenarios = [
    ("No action", start_cash, start_asset),
    ("After purchase", start_cash - buy_price, start_asset + 1),
    ("After sale", start_cash + sell_price, start_asset - 1),
]

traces = [
    go.Scatter(
        name=label,
        x=prices,
        y=[evaluation(cash=cash, asset=asset, step=step) for step in steps],
    )
    for label, cash, asset in scenarios
]

# Kept as a bare expression so the notebook renders the figure as cell output.
go.Figure(
    data=traces,
    layout=dict(xaxis_title="Asset price", yaxis_title="Reward"),
)

It turns out that the expected future utility depends much more on where we are on this circle (the closed loop each curve traces in the plot, because the price is periodic) than on what action we took. Probably a really bad sign!

Incorporating history might help a bit so that the critic knows where on this circle we are. More importantly, though, we need a reward signal which does not treat actions with so much indifference!