# A2C from scratch

In [1]:
import numpy as np
import plotly.graph_objects as go
import torch
from tqdm.notebook import tqdm
from tradezoo.agent import DecisionBatch, Observation
from tradezoo.trainer import Experience

## Architecture

In [2]:
class Normal(torch.nn.Module):
    """Learnable one-element Gaussian whose spread never drops below a floor.

    ``loc`` and ``scale`` are trainable parameters initialised to 0 and 1.
    The standard deviation handed to torch is ``uncertainty + |scale|``, so
    the sign of ``scale`` is irrelevant and ``uncertainty`` acts as a floor.
    """

    def __init__(self, uncertainty: float):
        super().__init__()
        self.uncertainty = uncertainty
        self.loc = torch.nn.Parameter(torch.zeros(1, dtype=torch.float))
        self.scale = torch.nn.Parameter(torch.ones(1, dtype=torch.float))

    def torch_distribution(self):
        """Return a ``torch.distributions.Normal`` built from the current parameters."""
        std = self.scale.abs() + self.uncertainty
        return torch.distributions.Normal(loc=self.loc, scale=std)

# Smoke check: draw one sample from a freshly initialised distribution
# (the notebook displays the value of this last expression).
Normal(uncertainty=0.001).torch_distribution().sample()

tensor([-0.0099])

In [3]:
class MockActor(torch.nn.Module):
    """Policy that ignores observations and holds two learnable ``Normal``s.

    One distribution models the log mid price and the other the log spread;
    both share the same ``uncertainty`` floor on their standard deviation.
    """

    def __init__(self, uncertainty: float):
        super().__init__()
        self.log_mid_price = Normal(uncertainty=uncertainty)
        self.log_spread = Normal(uncertainty=uncertainty)

    def decide(self):
        """Wrap the current distributions in a ``DecisionBatch``.

        NOTE(review): each ``Normal`` holds one-element parameters, so the
        batch presumably contains a single decision -- confirm against
        ``DecisionBatch``.
        """
        mid_price_distribution = self.log_mid_price.torch_distribution()
        spread_distribution = self.log_spread.torch_distribution()
        return DecisionBatch(
            log_mid_price=mid_price_distribution,
            log_spread=spread_distribution,
        )


# Smoke check: sample from a fresh actor and display the first action.
MockActor(uncertainty=0.001).decide().sample()[0]


Action(log_mid_price=-0.5833380818367004, log_spread=0.6447067260742188)

## Environment

In [4]:
def mock_experience(actor: MockActor) -> Experience:
    """Simulate one transition of a toy market for ``actor``.

    A random starting observation is drawn, the actor samples one action,
    and the action's quotes are matched against the observed best bid/ask
    (trading at most one asset each way). A second, independent random mid
    price defines the quotes of the resulting observation.

    The reward ignores balances and prices: it is the negated squared
    distance of ``(ask, bid)`` from ``(1.5, 0.5)``, so it peaks at zero.
    """
    # Starting state: quotes sit a factor of 1.1 above/below a random mid.
    old_mid_price = np.random.uniform(0.5, 1.5)
    old_observation = Observation(
        cash_balance=np.random.uniform(0, 64),
        asset_balance=np.random.uniform(0, 64),
        best_ask=old_mid_price * 1.1,
        best_bid=old_mid_price / 1.1,
    )

    (action,) = actor.decide().sample()
    ask_price = action.ask
    bid_price = action.bid

    cash = old_observation.cash_balance
    assets = old_observation.asset_balance

    # Our ask is at or below the market's best bid: sell up to one asset.
    if ask_price <= old_observation.best_bid:
        quantity_sold = min(assets, 1)
        cash += ask_price * quantity_sold
        assets -= quantity_sold

    # Our bid is at or above the market's best ask: buy up to one asset,
    # limited by the cash left after any sale above.
    if bid_price >= old_observation.best_ask:
        quantity_bought = min(1, cash / bid_price)
        cash -= bid_price * quantity_bought
        assets += quantity_bought

    # The next mid price is drawn independently of the previous one.
    new_mid_price = np.random.uniform(0.5, 1.5)
    new_observation = Observation(
        cash_balance=cash,
        asset_balance=assets,
        best_ask=new_mid_price * 1.1,
        best_bid=new_mid_price / 1.1,
    )

    ask_miss = ask_price - 1.5
    bid_miss = bid_price - 0.5
    reward = -(ask_miss ** 2 + bid_miss ** 2)

    return Experience(
        old_observation=old_observation,
        action=action,
        reward=reward,
        new_observation=new_observation,
    )


## Training

In [5]:
def train(learning_rate, num_steps):
    """Train a fresh ``MockActor`` with one policy-gradient step per experience.

    Args:
        learning_rate: Learning rate passed to the Adam optimizer.
        num_steps: Number of experiences to sample; one update is made for each.

    Returns:
        The trained ``MockActor``.
    """
    actor = MockActor(uncertainty=0.001)
    optimizer = torch.optim.Adam(actor.parameters(), lr=learning_rate)
    for _ in range(num_steps):
        optimizer.zero_grad()
        experience = mock_experience(actor)
        # TODO: introduce a critic and see if it still works
        td_error = experience.reward
        log_probabilities = actor.decide().log_probabilities([experience.action])
        # Minimising -reward * log-probability raises the likelihood of
        # actions in proportion to the reward they earned.
        actor_loss = (-td_error * log_probabilities).mean()
        actor_loss.backward()
        optimizer.step()
    return actor


In [6]:
# Train one agent, then plot how its sampled quotes are distributed.
example_agent = train(learning_rate=2e-2, num_steps=4096)
example_actions = [example_agent.decide().sample()[0] for _ in range(4096)]
go.Figure(
    data=[
        go.Histogram(
            name=trace_name,
            x=[getattr(action, attribute) for action in example_actions],
        )
        # One histogram per quantity, in legend order.
        for trace_name, attribute in (
            ("Mid", "mid_price"),
            ("Ask", "ask"),
            ("Bid", "bid"),
        )
    ]
)

In [7]:
# Learning rates to sweep: a 1-2-5 progression from 1e-3 up to 5e-1.
grid_learning_rates = [1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2, 1e-1, 2e-1, 5e-1]
# Training lengths to sweep (passed to train() as num_steps).
grid_num_steps = [256, 1024, 4096]
def mean_reward(learning_rate, num_steps):
    """Train an actor with the given settings and score it.

    The score is the mean reward over 256 freshly sampled experiences.
    """
    trained_actor = train(learning_rate=learning_rate, num_steps=num_steps)
    rewards = [mock_experience(trained_actor).reward for _ in range(256)]
    return np.mean(rewards)
# Sweep the grid: one row per learning rate, one column per step count.
# The progress bar advances once per learning rate.
grid_mean_rewards = [
    [mean_reward(learning_rate, num_steps) for num_steps in grid_num_steps]
    for learning_rate in tqdm(grid_learning_rates)
]
go.Figure(
    layout={
        "scene": {
            "xaxis_title": "Number of steps",
            "xaxis_type": "log",
            "yaxis_title": "Learning rate",
            "yaxis_type": "log",
            "zaxis_title": "Mean reward",
        },
    },
    data=[
        # Rows of z follow y (learning rates); columns follow x (step counts).
        go.Surface(x=grid_num_steps, y=grid_learning_rates, z=grid_mean_rewards),
    ],
)

  0%|          | 0/9 [00:00<?, ?it/s]