### Exemplo - Tensortrade

- Dados diários de OHLCV obtidos via API da Yahoo! Finance para uma ação específica.
- Função de recompensa anexada ao esquema de ações (profit based).
- O agente transfere todos os recursos de uma carteira para ativos e vice-versa.

In [None]:
import ray
import numpy as np
import pandas as pd
import yfinance as yf
import pandas_ta as ta

from ray import tune
from ray.tune.registry import register_env

import tensortrade.env.default as default

from tensortrade.feed.core import DataFeed, Stream
from tensortrade.oms.exchanges import Exchange
from tensortrade.oms.services.execution.simulated import execute_order
from tensortrade.oms.wallets import Wallet, Portfolio
from tensortrade.env.default.rewards import TensorTradeRewardScheme
from tensortrade.feed.core import Stream, DataFeed

from gym.spaces import Discrete
from tensortrade.env.default.actions import TensorTradeActionScheme
from tensortrade.env.generic import ActionScheme, TradingEnv
from tensortrade.core import Clock
from tensortrade.oms.instruments import ExchangePair, Instrument
from tensortrade.oms.wallets import Portfolio
from tensortrade.oms.orders import (
    Order,
    proportion_order,
    TradeSide,
    TradeType
)

import matplotlib.pyplot as plt
from tensortrade.env.generic import Renderer
import ray.rllib.agents.ppo as ppo

In [None]:
# Instruments traded in this notebook: the quote currency and a generic asset.
# NOTE(review): the second argument (2) is presumably the decimal precision
# used for quantities of each instrument — TODO confirm against Instrument.
BRL = Instrument("BRL", 2, "Brazilian Currency")
ASSET = Instrument("ASSET", 2, "Asset")

---

### Action Scheme

In [None]:
class BSH(TensorTradeActionScheme):
    """Two-action scheme that shifts funds between a cash and an asset wallet.

    The scheme remembers the last action that resulted in an order
    (``self.action``, initially 0). An order is only considered when the
    incoming action differs from it: while ``self.action`` is 0 funds move
    from ``cash`` to ``asset``, otherwise from ``asset`` back to ``cash``.
    Every attached listener is notified of every incoming action.
    """

    registered_name = "bsh"

    def __init__(self, cash: 'Wallet', asset: 'Wallet'):
        """Store the two wallets and start from action 0 with no listeners.

        Args:
            cash: Wallet that is the source when moving into the asset.
            asset: Wallet that is the source when moving back to cash.
        """
        super().__init__()
        self.cash = cash
        self.asset = asset

        self.listeners = []
        self.action = 0

    @property
    def action_space(self):
        """Return the discrete space with the two available actions."""
        return Discrete(2)

    def attach(self, listener):
        """Register ``listener`` for ``on_action`` callbacks and return self."""
        self.listeners.append(listener)
        return self

    def get_orders(self,
                   action: int,
                   portfolio: 'Portfolio'):
        """Translate ``action`` into at most one proportional order.

        Args:
            action: Action chosen for the current step.
            portfolio: Portfolio whose first exchange pair supplies the price
                and against which the order is created.

        Returns:
            A one-element list holding the created order, or ``None`` when no
            order was created for this step.
        """
        order = None

        if action != self.action:
            moving_to_asset = self.action == 0
            if moving_to_asset:
                src, tgt = self.cash, self.asset
            else:
                src, tgt = self.asset, self.cash

            source_balance = src.balance.as_float()

            if moving_to_asset:
                # Size the purchase in lots of 100 shares. The lot count is
                # rounded down to a multiple of 10 lots.
                # NOTE(review): confirm that the 10-lot granularity is
                # intended and not meant to be a single lot.
                lot_size = 100.00
                current_price = float(portfolio.exchange_pairs[0].price)

                if source_balance > 0 and current_price > 0:
                    qtd_lots = source_balance / (lot_size * current_price)
                    num_shares = int(qtd_lots - (qtd_lots % 10)) * lot_size
                    # Clamp so float rounding can never request more than
                    # the whole wallet.
                    proportional_lot_size = min(
                        (num_shares * current_price) / source_balance,
                        1.0
                    )
                else:
                    # Empty wallet or unusable price: nothing can be bought,
                    # and dividing by the balance would raise.
                    num_shares = 0.0
                    proportional_lot_size = 0.0
            else:
                # Sell the whole position, if there is one.
                proportional_lot_size = 1.0 if source_balance > 0 else 0.0

            print('--' * 50)

            if moving_to_asset:
                print('CASH TO ASSET')
                print('Source Balance: ', source_balance)
                print('Target Balance: ', tgt.balance.as_float())
                print('Proportional Lot Size', proportional_lot_size)
                print('Current Price: ', current_price)
                print('# Shares: ', num_shares)
                print('Current Price x # Shares: ', num_shares * current_price)
            else:
                print('ASSET TO CASH')
                print('Source Balance: ', source_balance)
                print('Target Balance: ', tgt.balance.as_float())
                print('Proportional Lot Size', proportional_lot_size)

            if proportional_lot_size > 0:
                order = proportion_order(
                    portfolio,
                    src,
                    tgt,
                    proportional_lot_size
                )
                # Only record the switch when an order was actually created,
                # so the remembered action keeps matching the wallet that
                # holds the funds.
                self.action = action

        for listener in self.listeners:
            listener.on_action(action)

        return [order]

    def reset(self):
        """Reset the parent scheme and return to action 0."""
        super().reset()
        self.action = 0

### Reward Scheme

In [None]:
class PBR(TensorTradeRewardScheme):
    """Position-based reward scheme (PBR).

    The reward for a step is the change in price since the previous step
    multiplied by the current position (-1 or 1). Missing values, such as
    the first difference, are replaced by 0.
    """

    registered_name = "pbr"

    def __init__(self, price: 'Stream'):
        """Build the internal reward feed on top of the ``price`` stream."""
        super().__init__()
        self.position = -1

        # Step-to-step price difference.
        price_change = Stream.sensor(price, lambda p: p.value, dtype="float").diff()
        # Current position, read from this scheme each time the feed advances.
        held_position = Stream.sensor(self, lambda rs: rs.position, dtype="float")

        reward_stream = (price_change * held_position).fillna(0).rename("reward")

        self.feed = DataFeed([reward_stream])
        self.feed.compile()

    def on_action(self, action: int):
        """Set the position to -1 for action 0 and to 1 for any other action."""
        if action == 0:
            self.position = -1
        else:
            self.position = 1

    def get_reward(self, portfolio: 'Portfolio'):
        """Advance the reward feed one step and return its "reward" value."""
        step_values = self.feed.next()
        return step_values["reward"]

    def reset(self):
        """Restore the initial position (-1) and reset the reward feed."""
        self.position = -1
        self.feed.reset()

### Renderer

In [None]:
class PositionChangeChart(Renderer):
    """Renderer that plots the price with position changes and the portfolio performance."""

    def __init__(self, color: str = "orange"):
        """Remember the colour used for the price line.

        Args:
            color: Matplotlib colour of the price line.
        """
        # Honour the caller's choice instead of always using "orange".
        self.color = color

    def render(self, env, **kwargs):
        """Draw the trading chart and the performance chart for ``env``.

        Args:
            env: Environment whose ``observer.renderer_history`` provides the
                ``action`` and ``price`` columns and whose action scheme
                exposes ``portfolio.performance``.
            **kwargs: Accepted for interface compatibility; unused.
        """
        history = pd.DataFrame(env.observer.renderer_history)

        actions = list(history.action)
        p = list(history.price)

        buy = {}
        sell = {}

        # Compare each action with the next one; a change marks a trade at
        # the index of the earlier step.
        for i, (previous, current) in enumerate(zip(actions, actions[1:])):
            if previous == current:
                continue
            if previous == 0 and current == 1:
                buy[i] = p[i]
            else:
                sell[i] = p[i]

        # Explicit dtype keeps an empty series numeric.
        buy = pd.Series(buy, dtype="float64")
        sell = pd.Series(sell, dtype="float64")

        fig, axs = plt.subplots(1, 2, figsize=(15, 5))

        fig.suptitle("Performance")

        axs[0].plot(np.arange(len(p)), p, label="price", color=self.color)
        axs[0].scatter(buy.index, buy.values, marker="v", color="red")  # BUY
        axs[0].scatter(sell.index, sell.values, marker="^", color="green")  # SELL
        axs[0].set_title("Trading Chart")
        axs[0].legend(['Price', 'Buys', 'Sells'])

        performance_df = pd.DataFrame.from_dict(env.action_scheme.portfolio.performance, orient='index')
        performance_df.plot(ax=axs[1])
        axs[1].set_title("Net Worth")

        plt.show()

---

### Train

Now in order to use our custom environment in ray we must first write a function that creates an instance of the TradingEnv from a configuration dictionary.

In [None]:
def create_training_env(config):
    """Create the trading environment used for training.

    Args:
        config: Mapping with the environment settings. ``"window_size"`` is
            required and is forwarded to ``default.create``. The optional
            keys ``"ticker"`` (default ``"PETR4"``), ``"start"`` (default
            ``"2021-01-01"``) and ``"end"`` (default ``"2021-12-31"``)
            select the Yahoo! Finance download; ``".SA"`` is appended to the
            ticker.

    Returns:
        The environment built by ``default.create``.

    Raises:
        ValueError: If the download contains no adjusted close prices.
    """
    ticker = config.get("ticker", "PETR4")
    start = config.get("start", "2021-01-01")
    end = config.get("end", "2021-12-31")

    # PRICES

    yahoo_df = yf.download(
        f'{ticker}.SA',
        start=start,
        end=end
    )

    # Drop missing prices once and keep the surviving dates, so the "date"
    # and "price" renderer streams always have the same length.
    adj_close = yahoo_df['Adj Close'].dropna()
    y = adj_close.values
    dates = list(adj_close.index)

    if len(y) == 0:
        raise ValueError(
            f"No adjusted close prices downloaded for {ticker}.SA "
            f"between {start} and {end}"
        )

    p = Stream.source(y, dtype="float").rename("BRL-ASSET")

    b3 = Exchange("B3", service=execute_order)(p)

    # Portfolio
    cash = Wallet(b3, 100000 * BRL)  # Money
    asset = Wallet(b3, 0 * ASSET)  # Stocks

    portfolio = Portfolio(BRL, [cash, asset])

    # Data: price, three rolling means and the log return
    feed = DataFeed([
        p,
        p.rolling(window=10).mean().rename("fast"),
        p.rolling(window=50).mean().rename("medium"),
        p.rolling(window=100).mean().rename("slow"),
        p.log().diff().fillna(0).rename("lr")
    ])

    # Reward
    reward_scheme = PBR(
        price=p
    )

    # Actions; the reward scheme listens to every action taken
    action_scheme = BSH(
        cash=cash,
        asset=asset
    ).attach(reward_scheme)

    # Visualization
    renderer_feed = DataFeed([
        Stream.source(dates).rename("date"),
        Stream.source(y, dtype="float").rename("price"),
        Stream.sensor(action_scheme, lambda s: s.action, dtype="float").rename("action")
    ])

    # Environment
    environment = default.create(
        feed=feed,
        portfolio=portfolio,
        action_scheme=action_scheme,
        reward_scheme=reward_scheme,
        renderer_feed=renderer_feed,
        renderer=PositionChangeChart(),
        window_size=config["window_size"],
        max_allowed_loss=0.6
    )

    return environment

# Register the environment factory under the name "TradingEnv", which the
# training and restore configurations below pass as their environment.
register_env("TradingEnv", create_training_env)

---

Now that the environment is registered we can run the training algorithm using the Proximal Policy Optimization (PPO) algorithm implemented in rllib.

In [None]:
# Number of past steps forwarded to the environment as "window_size".
window_size = 20

# Stopping criterion handed to tune.run.
stop_criteria = {
    "episode_reward_mean": 2.5
}

# Learning-rate schedule as [timestep, learning rate] pairs.
learning_rate_schedule = [
    [0, 1e-1],
    [int(1e2), 1e-2],
    [int(1e3), 1e-3],
    [int(1e4), 1e-4],
    [int(1e5), 1e-5],
    [int(1e6), 1e-6],
    [int(1e7), 1e-7]
]

# PPO configuration; "env" refers to the environment registered above.
training_config = {
    "env": "TradingEnv",
    "env_config": {
        "window_size": window_size
    },
    "log_level": "DEBUG",
    "framework": "tf2",
    "eager_tracing": False,
    "ignore_worker_failures": True,
    "num_workers": 1,
    "num_gpus": 1,
    "clip_rewards": True,
    "lr": 8e-6,
    "lr_schedule": learning_rate_schedule,
    "gamma": 0,
    "observation_filter": "MeanStdFilter",
    "lambda": 0.72,
    "vf_loss_coeff": 0.5,
    "entropy_coeff": 0.01
}

analysis = tune.run(
    "PPO",
    stop=stop_criteria,
    config=training_config,
    checkpoint_at_end=True
)

---

After training is complete, we would like to access the agent's policy. We can do that by restoring the agent from its checkpoint using the following code.

In [None]:
# Locate the checkpoint of the trial with the highest mean episode reward
best_trial = analysis.get_best_trial("episode_reward_mean", mode="max")
checkpoints = analysis.get_trial_checkpoints_paths(
    trial=best_trial,
    metric="episode_reward_mean"
)
# Each entry is indexed as [path, ...]; take the path of the first entry.
checkpoint_path = checkpoints[0][0]

# Configuration for the restored agent (same values as used for training)
restore_config = {
    "env_config": {
        "window_size": window_size  # We want to look at the last x samples (days)
    },
    "framework": "tf2",
    "eager_tracing": False,
    "log_level": "DEBUG",
    "ignore_worker_failures": True,
    "num_workers": 1,
    "num_gpus": 1,
    "clip_rewards": True,
    "lr": 8e-6,
    "lr_schedule": [
        [0, 1e-1],
        [int(1e2), 1e-2],
        [int(1e3), 1e-3],
        [int(1e4), 1e-4],
        [int(1e5), 1e-5],
        [int(1e6), 1e-6],
        [int(1e7), 1e-7]
    ],
    "gamma": 0,
    "observation_filter": "MeanStdFilter",
    "lambda": 0.72,
    "vf_loss_coeff": 0.5,
    "entropy_coeff": 0.01
}

# Build the trainer and load the checkpointed state into it
agent = ppo.PPOTrainer(env="TradingEnv", config=restore_config)
agent.restore(checkpoint_path)

---

With the agent restored, we can now run it through a full episode of the training environment and render the resulting trades.

In [None]:
# Build a fresh copy of the training environment
env = create_training_env({"window_size": window_size})

# Start the episode
obs = env.reset()
episode_reward = 0
done = False

# Let the agent act until the environment reports the episode is done,
# accumulating the reward along the way
while True:
    action = agent.compute_single_action(obs)
    obs, reward, done, info = env.step(action)
    episode_reward += reward
    if done:
        break

env.render()

---

### Validation Set (Out-of-Sample Data)

In [None]:
def create_eval_env(config):
    """Create an evaluation environment from caller-supplied prices.

    Args:
        config: Mapping with ``"y"``, the price values for the episode, and
            ``"window_size"``, which is forwarded to ``default.create``.

    Returns:
        A ``(environment, portfolio)`` tuple, so the caller can inspect the
        portfolio after the episode.
    """
    prices = config["y"]

    price_stream = Stream.source(prices, dtype="float").rename("BRL-ASSET")
    exchange = Exchange("B3", service=execute_order)(price_stream)

    # Start with 100000 BRL in cash and no asset
    cash_wallet = Wallet(exchange, 100000 * BRL)
    asset_wallet = Wallet(exchange, 0 * ASSET)
    portfolio = Portfolio(BRL, [cash_wallet, asset_wallet])

    # Observation features: price, three rolling means and the log return
    rolling_means = [
        price_stream.rolling(window=span).mean().rename(label)
        for label, span in (("fast", 10), ("medium", 50), ("slow", 100))
    ]
    log_return = price_stream.log().diff().fillna(0).rename("lr")
    feed = DataFeed([price_stream, *rolling_means, log_return])

    reward_scheme = PBR(price=price_stream)

    # The reward scheme listens to every action taken
    action_scheme = BSH(cash=cash_wallet, asset=asset_wallet).attach(reward_scheme)

    action_stream = Stream.sensor(
        action_scheme, lambda s: s.action, dtype="float"
    ).rename("action")
    renderer_feed = DataFeed([
        Stream.source(prices, dtype="float").rename("price"),
        action_stream
    ])

    environment = default.create(
        feed=feed,
        portfolio=portfolio,
        action_scheme=action_scheme,
        reward_scheme=reward_scheme,
        renderer_feed=renderer_feed,
        renderer=PositionChangeChart(),
        window_size=config["window_size"],
        max_allowed_loss=0.6
    )

    return environment, portfolio

In [None]:
# Download the out-of-sample prices and build the evaluation environment
eval_prices = yf.download('PETR4.SA', start='2022-01-01', end='2022-04-01')['Adj Close'].dropna().values
env, portfolio = create_eval_env({
    "window_size": window_size,
    "y": eval_prices
})

# Run one episode with the restored agent, accumulating the reward
obs = env.reset()
done = False
episode_reward = 0

while not done:
    action = agent.compute_single_action(obs)
    step_result = env.step(action)
    obs, reward, done, info = step_result
    episode_reward += reward

env.render()

# Show the first ledger entries recorded during the episode
portfolio.ledger.as_frame().head(10)

---

In [None]:
# Remove the column-width limit so long cell contents are displayed in full.
pd.set_option('display.max_colwidth', None)

In [None]:
# Display the first 15 rows of the evaluation portfolio's ledger.
portfolio.ledger.as_frame().head(15)

In [None]:
# Portfolio performance as a frame with one column per recorded step
df = pd.DataFrame(portfolio.performance)

# Trading dates of the evaluation period, used as column labels
new_column_list = list(yf.download('PETR4.SA', start='2022-01-01', end='2022-04-01').index.astype(str).values)

assert len(new_column_list) == df.shape[1], (
    f"Expected {df.shape[1]} dates, got {len(new_column_list)}"
)

# Assign the labels directly: DataFrame.set_axis no longer accepts
# `inplace` in pandas 2.0 and later.
df.columns = new_column_list

In [None]:
def plot_evaluation_results(df, initial_net_worth=100000):
    """Plot net worth, asset price and position for an evaluation run.

    Args:
        df: Performance frame whose index contains the rows ``"net_worth"``,
            ``"B3:/BRL-ASSET"`` and ``"B3:/ASSET:/total"``; its columns are
            used as the shared x axis.
        initial_net_worth: Value of the dashed reference line drawn on the
            net worth panel. Defaults to 100000.
    """
    fig, ax = plt.subplots(3, sharex=True, figsize=(15, 6))

    # Net Worth, with the starting value as a dashed reference line
    df.loc["net_worth"].plot(ax=ax[0])
    ax[0].set_title('Net Worth')
    ax[0].set_ylabel("Cash (R$)")
    ax[0].axhline(initial_net_worth, linestyle='dashed', color='black')
    ax[0].legend(['Current Net Worth', 'Initial Net Worth'])

    # Asset Price
    df.loc["B3:/BRL-ASSET"].plot(ax=ax[1])
    ax[1].set_title('Asset Price')
    ax[1].set_ylabel("Price (R$)")
    ax[1].legend(['Current Share Price'])

    # Positions
    df.loc["B3:/ASSET:/total"].plot(ax=ax[2])
    ax[2].set_title('Positions')
    ax[2].set_ylabel("Shares (#)")
    ax[2].set_xlabel("Datetime")
    ax[2].legend(['Current Position'])

    plt.tight_layout()

In [None]:
# Draw the three evaluation panels for the out-of-sample run.
plot_evaluation_results(df)

---