# DDPG from scratch

In [1]:
from dataclasses import dataclass
import numpy as np
import plotly.graph_objects as go
import torch
from tqdm.notebook import tqdm, trange

## Data structures

In [2]:
@dataclass(frozen=True)
class Observation:
    """Immutable snapshot of the balances and top-of-book prices seen by the agent."""

    cash_balance: float
    asset_balance: float
    best_ask: float
    best_bid: float

    @property
    def tensor(self):
        """Return the four fields as a float32 tensor of shape ``(1, 4)``.

        Column order is cash balance, asset balance, best ask, best bid.
        """
        features = (
            self.cash_balance,
            self.asset_balance,
            self.best_ask,
            self.best_bid,
        )
        # Build the flat feature vector first, then add the leading batch axis.
        return torch.tensor(features, dtype=torch.float32).unsqueeze(0)


@dataclass(frozen=True)
class Action:
    """Immutable wrapper around an action tensor holding an ask and a bid quote.

    The ``ask`` and ``bid`` properties index the tensor at ``[0, 0]`` and
    ``[0, 1]``, so it must have at least one row and two columns.
    """

    tensor: torch.Tensor

    def _component(self, column):
        """Return the first-row entry at ``column`` as a NumPy scalar."""
        # Detach first so a tensor that is part of an autograd graph can be
        # exported to NumPy.
        return self.tensor.detach().numpy()[0, column]

    @property
    def ask(self):
        """Ask quote: entry ``[0, 0]`` of the action tensor."""
        return self._component(0)

    @property
    def bid(self):
        """Bid quote: entry ``[0, 1]`` of the action tensor."""
        return self._component(1)


@dataclass(frozen=True)
class Experience:
    """Immutable record of one transition: observation, action, reward, next observation."""

    # Observation the action was chosen from.
    old_observation: Observation
    # Action taken in response to ``old_observation``.
    action: Action
    # Scalar reward received for the transition.
    reward: float
    # Observation seen after the action was applied.
    new_observation: Observation


## Architecture

In [3]:
class Actor(torch.nn.Module):
    """Policy network: a one-hidden-layer perceptron with a linear output.

    The defaults reproduce the original fixed architecture (4 observation
    features -> 64 hidden units -> 2 action components).

    Args:
        observation_size: Number of features in each input row.
        hidden_size: Width of the hidden layer.
        action_size: Number of values produced per input row.
    """

    def __init__(
        self,
        observation_size: int = 4,
        hidden_size: int = 64,
        action_size: int = 2,
    ):
        super().__init__()
        self.network = torch.nn.Sequential(
            torch.nn.Linear(observation_size, hidden_size),
            torch.nn.LeakyReLU(),
            # No output activation: the action components are unbounded.
            torch.nn.Linear(hidden_size, action_size),
        )

    def forward(self, tensor):
        """Map a ``(batch, observation_size)`` tensor to ``(batch, action_size)``."""
        return self.network(tensor)


class Critic(torch.nn.Module):
    """Value network: a one-hidden-layer perceptron producing one value per row.

    The defaults reproduce the original fixed architecture (6 input features
    -> 64 hidden units -> 1 output).

    Args:
        input_size: Number of features in each input row.
        hidden_size: Width of the hidden layer.
    """

    def __init__(self, input_size: int = 6, hidden_size: int = 64):
        super().__init__()
        self.network = torch.nn.Sequential(
            torch.nn.Linear(input_size, hidden_size),
            torch.nn.LeakyReLU(),
            # Single linear output: one unbounded scalar estimate per row.
            torch.nn.Linear(hidden_size, 1),
        )

    def forward(self, tensor):
        """Map a ``(batch, input_size)`` tensor to a ``(batch, 1)`` tensor."""
        return self.network(tensor)


@dataclass(frozen=True)
class Agent:
    """Actor-critic pair trained with single-sample, one-step DDPG-style updates.

    No replay buffer or target networks are used: every call to ``train_``
    performs one critic step followed by one actor step on a single
    experience.
    """

    # Policy network mapping an observation tensor to an action tensor.
    actor: Actor
    # Optimizer stepped after the actor loss is back-propagated; expected to
    # hold the actor's parameters.
    actor_optimizer: torch.optim.Optimizer
    # Value network scoring concatenated (observation, action) rows.
    critic: Critic
    # Optimizer stepped after the critic loss is back-propagated; expected to
    # hold the critic's parameters.
    critic_optimizer: torch.optim.Optimizer
    # Discount factor applied to the bootstrapped next-step value.
    discount: float

    def act(self, observation: Observation) -> Action:
        """Run the actor on ``observation`` and wrap the raw output in an ``Action``.

        The returned tensor is still attached to the actor's autograd graph.
        """
        tensor = self.actor(observation.tensor)
        return Action(tensor=tensor)

    def evaluate(self, observation: Observation, action: Action) -> torch.Tensor:
        """Return the critic's estimate for ``action`` taken at ``observation``.

        The observation and action tensors are concatenated along the feature
        dimension; ``squeeze`` then drops all size-1 dimensions of the output.
        """
        return self.critic(
            torch.cat([observation.tensor, action.tensor], dim=1)
        ).squeeze()

    def train_(self, experience: Experience) -> None:
        """Update the critic, then the actor, in place from one experience.

        Critic step: minimise the squared one-step TD error, treating the
        bootstrapped target as a constant. Actor step: maximise the freshly
        updated critic's value of the actor's own action at the new
        observation.
        """
        # The TD target is a fixed regression label: compute it without
        # tracking gradients so the critic loss does not differentiate through
        # the bootstrap term (or through the actor that produced new_action).
        with torch.no_grad():
            new_action = self.act(experience.new_observation)
            new_evaluation = self.evaluate(experience.new_observation, new_action)
            # float() keeps the arithmetic tensor-driven even when the reward
            # arrives as a NumPy scalar.
            target = float(experience.reward) + self.discount * new_evaluation

        # The stored action may still reference the actor's graph from when it
        # was sampled; the critic update must not back-propagate into it, and
        # detaching also lets the same experience be trained on more than once.
        taken_action = Action(tensor=experience.action.tensor.detach())
        td_error = target - self.evaluate(experience.old_observation, taken_action)

        critic_loss = td_error ** 2
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Re-evaluate with the critic that was just updated. Gradients flow
        # through the critic into the actor; the gradients this leaves on the
        # critic's parameters are cleared by zero_grad() at the next critic step.
        actor_loss = -self.evaluate(
            experience.new_observation, self.act(experience.new_observation)
        )
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()


## Training

In [4]:
def mock_experience(agent: Agent) -> Experience:
    """Generate one synthetic transition by letting ``agent`` quote in a toy market.

    A random mid price and random balances form the old observation. The
    agent's ask fills (selling up to one asset) when it is at or below the
    best bid, and its bid fills (buying up to one asset, limited by cash) when
    it is at or above the best ask. A fresh random mid price forms the new
    observation. The reward is the negative squared distance of the quotes
    from (ask=1.5, bid=0.5) and does not depend on the fills.
    """
    # Draw order (mid price, cash, assets) is kept fixed so seeded runs match.
    mid_before = np.random.uniform(0.5, 1.5)
    cash = np.random.uniform(0, 64)
    assets = np.random.uniform(0, 64)
    before = Observation(
        cash_balance=cash,
        asset_balance=assets,
        best_ask=mid_before * 1.1,
        best_bid=mid_before / 1.1,
    )

    action = agent.act(before)
    ask_quote = action.ask
    bid_quote = action.bid

    # Sell side: the ask crosses the book when it is no higher than the best bid.
    if ask_quote <= before.best_bid:
        sold = min(assets, 1)
        cash += ask_quote * sold
        assets -= sold

    # Buy side: the bid crosses the book when it is no lower than the best ask.
    if bid_quote >= before.best_ask:
        bought = min(1, cash / bid_quote)
        cash -= bid_quote * bought
        assets += bought

    mid_after = np.random.uniform(0.5, 1.5)
    after = Observation(
        cash_balance=cash,
        asset_balance=assets,
        best_ask=mid_after * 1.1,
        best_bid=mid_after / 1.1,
    )

    reward = -((ask_quote - 1.5) ** 2 + (bid_quote - 0.5) ** 2)
    return Experience(
        old_observation=before,
        action=action,
        reward=reward,
        new_observation=after,
    )


In [8]:
def train(actor_lr, critic_lr, num_steps=4096):
    """Build a fresh agent and train it online on mock experiences.

    Args:
        actor_lr: Adam learning rate for the actor.
        critic_lr: Adam learning rate for the critic.
        num_steps: Number of generate-then-train iterations to run.

    Returns:
        The trained ``Agent``.
    """
    policy = Actor()
    value_fn = Critic()
    policy_optimizer = torch.optim.Adam(policy.parameters(), lr=actor_lr)
    value_optimizer = torch.optim.Adam(value_fn.parameters(), lr=critic_lr)
    agent = Agent(
        actor=policy,
        actor_optimizer=policy_optimizer,
        critic=value_fn,
        critic_optimizer=value_optimizer,
        discount=0.99,
    )
    # Each iteration samples one transition with the current policy and
    # immediately trains on it.
    for _ in trange(num_steps):
        agent.train_(mock_experience(agent))
    return agent

In [9]:
# Train one agent, then sample its quotes on fresh mock observations.
example_agent = train(actor_lr=1e-3, critic_lr=1e-2)
example_actions = [
    mock_experience(example_agent).action
    for _ in range(4096)
]
# Overlay the distributions of the sampled ask and bid quotes; the figure is
# the cell's final expression so the notebook renders it.
go.Figure(
    data=[
        go.Histogram(
            name=label,
            x=[getattr(action, quote) for action in example_actions],
        )
        for label, quote in (("Ask", "ask"), ("Bid", "bid"))
    ]
)

  0%|          | 0/4096 [00:00<?, ?it/s]