
# 🤖 RL Execution Agent — Almgren–Chriss Style

_Date generated: 2025-09-03_

This notebook trains a **reinforcement learning execution agent** to minimize **implementation shortfall** when liquidating an order under market impact. The cost model is inspired by **Almgren–Chriss**: every trade incurs both a temporary and a permanent price impact.

**What you'll get**
- A realistic **execution environment** (multi-step, with market volume, spread, volatility, and impact).
- Baselines: **TWAP** (time-weighted average price) and **POV** (percentage of volume, i.e. a fixed participation rate).
- A **DQN agent** (PyTorch) that chooses a participation rate for each slice.
- Learning curve, backtests vs baselines, distribution of costs.


## 1) Setup

In [None]:

import math, random, os, sys, time
from dataclasses import dataclass
from typing import Tuple, List, Dict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Seed the stdlib, NumPy and PyTorch generators with the same fixed value.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device


## 2) Execution Environment (Almgren–Chriss‑like)

In [None]:

@dataclass
class ExecConfig:
    """Parameters of one simulated execution program.

    Every field has a default, so ``ExecConfig()`` describes the baseline
    sell program; individual fields can be overridden by keyword.
    """

    # --- order and schedule ---
    T: int = 40                     # number of time slices in an episode
    Q0: float = 100_000             # order size in shares
    p0: float = 100.0               # mid price at arrival (shortfall benchmark)
    # --- exogenous price dynamics ---
    sigma: float = 0.02             # daily volatility, as a fraction of price
    dt: float = 1/78                # slice length in days (78 slices/day, ~5 minutes each)
    # --- trading costs ---
    kappa: float = 1e-6             # permanent impact: price move per share traded
    eta: float = 5e-5               # temporary impact coefficient
    spread_bps: float = 5           # half-spread, in basis points of p0
    # --- market volume and noise ---
    vol_mu: float = 2e6             # mean market volume per slice
    vol_sd: float = 5e5             # standard deviation of market volume per slice
    noise_sd: float = 0.002         # std of the microstructure noise added to the mid
    # --- action space ---
    max_participation: float = 0.2  # highest participation rate an action can select (20%)
    action_levels: int = 7          # number of discrete participation levels
    # --- side ---
    buy: bool = False               # False = sell program (default), True = buy program

class ExecEnv:
    """Simulated single-order execution environment with Almgren–Chriss-style costs.

    An episode works an order of ``cfg.Q0`` shares over at most ``cfg.T``
    slices. Each step the caller picks a discrete action that maps linearly
    onto a participation rate in ``[0, cfg.max_participation]``. The
    environment trades that fraction of the slice's market volume (capped at
    the remaining inventory), shifts the mid price by permanent impact, and
    fills at the mid worsened by the half-spread and a temporary-impact term.

    The per-step reward is minus the implementation-shortfall increment versus
    the arrival price ``cfg.p0``, divided by the order notional. Inventory
    still open when time runs out is liquidated at a penalised price and that
    cost is folded into the final reward.

    Observations are float32 vectors
    ``[time fraction, inventory fraction, relative slice volume, last participation]``.
    """

    def __init__(self, cfg: "ExecConfig"):
        """Store the configuration and start the first episode."""
        self.cfg = cfg
        self.action_space = cfg.action_levels  # number of discrete actions
        self.reset()

    def reset(self):
        """Start a new episode with fresh random paths and return the initial state."""
        self.t = 0
        self.p = self.cfg.p0
        self.q_rem = self.cfg.Q0
        self.perm_impact_cum = 0.0
        self.last_participation = 0.0
        self._gen_paths()
        return self._state()

    def _gen_paths(self):
        """Draw the exogenous mid-price path and per-slice market volumes for one episode."""
        T = self.cfg.T
        dt = self.cfg.dt
        sigma_step = self.cfg.sigma * math.sqrt(dt)
        # Exogenous midprice process (without impact): arithmetic walk around p0
        shocks = np.random.normal(0, sigma_step, size=T)
        self.mid_exo = self.cfg.p0 * (1 + np.cumsum(shocks))
        # Market volumes per slice, floored at 1e3 so they stay strictly positive
        self.market_vol = np.maximum(np.random.normal(self.cfg.vol_mu, self.cfg.vol_sd, size=T), 1e3)
        # Half-spread in price units (one constant value for the whole episode)
        self.half_spread = (self.cfg.spread_bps/1e4) * self.cfg.p0

    def _state(self):
        """Return the scaled observation for the current time index.

        ``step`` advances ``self.t`` before building the returned state, so at
        the end of the horizon ``self.t == cfg.T``, one past the last slice.
        The volume lookup and the time fraction are therefore clamped to the
        final slice; without the clamp the terminal state raised IndexError.
        """
        T = self.cfg.T
        frac_time = min(1.0, self.t / max(1, T - 1))
        frac_inv = self.q_rem / max(1, self.cfg.Q0)
        vol_idx = min(self.t, T - 1)
        avg_vol = self.market_vol[vol_idx] / (self.cfg.vol_mu + 1e-9)
        last_part = self.last_participation
        # scaled features
        return np.array([frac_time, frac_inv, avg_vol, last_part], dtype=np.float32)

    def step(self, action: int):
        """Execute one slice at the participation level selected by ``action``.

        Args:
            action: integer in ``[0, action_space)``; 0 trades nothing and
                ``action_space - 1`` trades at ``cfg.max_participation``.

        Returns:
            ``(state, reward, done, info)``. ``info`` holds ``q_exec``,
            ``px_exec``, ``mid`` and ``part`` for the slice. When called on an
            episode that is already finished it returns a zero reward,
            ``done=True`` and an empty ``info``.
        """
        assert 0 <= action < self.action_space
        if self.q_rem <= 0 or self.t >= self.cfg.T:
            return self._state(), 0.0, True, {}

        # Map discrete action -> participation rate in [0, max_participation]
        part = (action / (self.action_space - 1)) * self.cfg.max_participation
        mkt_vol = self.market_vol[self.t]
        # shares to execute this slice (can't exceed remaining)
        q_exec = min(self.q_rem, part * mkt_vol)

        # Impact & price formation
        sign = -1 if not self.cfg.buy else +1   # selling pushes price down; buying up
        # Permanent impact accumulates over the episode, including this slice's trade
        dp_perm = sign * self.cfg.kappa * q_exec
        self.perm_impact_cum += dp_perm
        # Temporary impact as a fraction of price; scales with fraction of slice volume
        tmp_imp = self.cfg.eta * (q_exec / max(1.0, mkt_vol))
        micro_noise = np.random.normal(0, self.cfg.noise_sd)

        # Midprice evolution (exo + permanent impact + noise), floored at 0.01
        mid_no_impact = self.mid_exo[self.t]
        self.p = max(0.01, mid_no_impact + self.perm_impact_cum + micro_noise)

        # Realized execution price (we cross the spread + temp impact)
        if not self.cfg.buy:
            # SELL: execute near bid -> mid - half_spread - tmp_imp
            px_exec = self.p - self.half_spread - tmp_imp * self.p
        else:
            # BUY: execute near ask -> mid + half_spread + tmp_imp
            px_exec = self.p + self.half_spread + tmp_imp * self.p

        # Cashflow & inventory update
        cashflow = px_exec * q_exec * (+1 if not self.cfg.buy else -1)  # selling adds cash, buying uses cash
        self.q_rem -= q_exec
        self.t += 1
        self.last_participation = part

        # Reward shaping: negative implementation shortfall increment
        # IS = (arrival_mid - realized_px) * q for SELL (opposite for BUY)
        arrival = self.cfg.p0
        if not self.cfg.buy:
            inc_cost = (arrival - px_exec) * q_exec
        else:
            inc_cost = (px_exec - arrival) * q_exec

        reward = -inc_cost / max(1.0, self.cfg.Q0 * self.cfg.p0)  # scale by notional
        done = (self.q_rem <= 1e-8) or (self.t >= self.cfg.T)

        # Apply terminal penalty if time ends with unexecuted inventory
        if done and self.q_rem > 0:
            # Force market order for remaining at worse price (spread + 5x temp impact),
            # sized against the volume of the slice that just traded.
            pen_tmp = self.cfg.eta * (self.q_rem / (self.market_vol[self.t-1] if self.t>0 else self.cfg.vol_mu)) * 5.0
            worst_px = (self.p - self.half_spread - pen_tmp * self.p) if not self.cfg.buy else (self.p + self.half_spread + pen_tmp * self.p)
            rem_cost = ((arrival - worst_px) if not self.cfg.buy else (worst_px - arrival)) * self.q_rem
            reward += - rem_cost / max(1.0, self.cfg.Q0 * self.cfg.p0)
            self.q_rem = 0.0

        return self._state(), float(reward), done, {
            "q_exec": q_exec, "px_exec": float(px_exec), "mid": float(self.p), "part": float(part)
        }


## 3) Baselines: TWAP & POV

In [None]:

def run_policy(env: ExecEnv, policy_fn, episodes=50):
    """Play ``episodes`` full episodes with ``policy_fn`` and collect the totals.

    ``policy_fn(env, state)`` must return a discrete action. Returns a pair of
    arrays, one entry per episode: the summed reward, and the summed cost
    obtained by multiplying each negated reward by the notional ``Q0 * p0``.
    """
    episode_rewards = []
    episode_costs = []
    for _ in range(episodes):
        state = env.reset()
        reward_sum = 0.0
        cost_sum = 0.0
        finished = False
        while not finished:
            action = policy_fn(env, state)
            state, reward, finished, _info = env.step(action)
            reward_sum += reward
            # undo the notional scaling to express the cost in price units
            cost_sum += -reward * (env.cfg.Q0 * env.cfg.p0)
        episode_rewards.append(reward_sum)
        episode_costs.append(cost_sum)
    return np.array(episode_rewards), np.array(episode_costs)

def twap_policy(env: ExecEnv, s):
    """TWAP baseline: aim to spread the remaining shares evenly over the remaining slices.

    The even-split target is converted to a participation rate against the
    current slice's market volume, capped at ``max_participation``, and then
    rounded to the nearest discrete action. The state ``s`` is not used.
    """
    top_action = env.action_space - 1
    remaining_slices = max(1, env.cfg.T - env.t)
    per_slice_shares = env.q_rem / remaining_slices
    wanted_part = min(env.cfg.max_participation, per_slice_shares / env.market_vol[env.t])
    # nearest level on the discrete participation grid
    level = int(round(wanted_part / env.cfg.max_participation * top_action))
    return np.clip(level, 0, top_action)

def pov_policy(env: ExecEnv, s, pov=0.1):
    """POV baseline: always request participation ``pov``, capped at ``max_participation``.

    The capped rate is rounded to the nearest discrete action. The state ``s``
    is not used.
    """
    top_action = env.action_space - 1
    capped = min(env.cfg.max_participation, pov)
    level = int(round(capped / env.cfg.max_participation * top_action))
    return np.clip(level, 0, top_action)

# Baseline scenario: default configuration and a fresh environment.
cfg = ExecConfig()
env = ExecEnv(cfg)

# 30 episodes per baseline on the same environment instance: TWAP first, then POV at 10%.
r_twap, c_twap = run_policy(env, lambda e,s: twap_policy(e,s), episodes=30)
r_pov,  c_pov  = run_policy(env, lambda e,s: pov_policy(e,s, pov=0.1), episodes=30)

# Mean per-episode cost as returned by run_policy.
print("TWAP  : mean cost $", np.mean(c_twap))
print("POV10%: mean cost $", np.mean(c_pov))


## 4) DQN Agent (PyTorch)

In [None]:

class QNet(nn.Module):
    """Fully connected network mapping a state vector to one Q-value per action.

    Two hidden layers of 64 units with ReLU activations, followed by a linear
    output layer of width ``n_actions``.
    """

    def __init__(self, state_dim, n_actions):
        super().__init__()
        hidden = 64
        layers = [
            nn.Linear(state_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the Q-values for ``x`` (last dimension must be ``state_dim``)."""
        return self.net(x)

class ReplayBuffer:
    """Fixed-capacity store of (state, action, reward, next_state, done) transitions.

    Transitions are kept column-wise in five parallel lists. Once ``cap``
    entries are stored, each new transition overwrites the oldest one.
    """

    def __init__(self, cap=50_000):
        self.cap = cap
        self.ptr = 0  # overwrite cursor; only advances once the buffer is full
        self.s = []
        self.a = []
        self.r = []
        self.ns = []
        self.d = []

    def push(self, s,a,r,ns,d):
        """Add one transition, replacing the oldest entry when at capacity."""
        columns = (self.s, self.a, self.r, self.ns, self.d)
        transition = (s, a, r, ns, d)
        if len(self.s) < self.cap:
            # still growing: append to every column
            for column, value in zip(columns, transition):
                column.append(value)
            return
        # full: overwrite in ring order
        slot = self.ptr % self.cap
        for column, value in zip(columns, transition):
            column[slot] = value
        self.ptr += 1

    def sample(self, bs=64):
        """Draw ``bs`` transitions uniformly with replacement, as tensors on ``device``.

        Returns ``(S, A, R, NS, D)``; actions are int64, everything else float32.
        """
        picks = np.random.randint(0, len(self.s), size=bs)

        def as_tensor(column, dtype):
            rows = np.array([column[i] for i in picks])
            return torch.tensor(rows, dtype=dtype, device=device)

        return (
            as_tensor(self.s, torch.float32),
            as_tensor(self.a, torch.int64),
            as_tensor(self.r, torch.float32),
            as_tensor(self.ns, torch.float32),
            as_tensor(self.d, torch.float32),
        )

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.s)

def train_dqn(env: ExecEnv, episodes=300, gamma=0.99, lr=1e-3, eps_start=1.0, eps_end=0.05, eps_decay=0.995, target_sync=200):
    """Train a DQN on ``env`` with epsilon-greedy exploration and a target network.

    Args:
        env: environment exposing ``reset``, ``step``, ``_state`` and ``action_space``.
        episodes: number of training episodes.
        gamma: discount factor used in the bootstrap target.
        lr: Adam learning rate.
        eps_start, eps_end, eps_decay: exploration rate schedule; epsilon is
            multiplied by ``eps_decay`` after each episode and floored at ``eps_end``.
        target_sync: the target network is refreshed on learning steps whose
            global step count is a multiple of this value.

    Returns:
        ``(q, rewards)``: the trained online network and the list of total
        rewards per episode.
    """
    state_dim = len(env._state())
    n_actions = env.action_space

    online_net = QNet(state_dim, n_actions).to(device)
    target_net = QNet(state_dim, n_actions).to(device)
    target_net.load_state_dict(online_net.state_dict())
    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    memory = ReplayBuffer(100_000)

    rewards = []
    eps = eps_start
    step = 0  # global environment-step counter across all episodes

    for ep in range(episodes):
        s = env.reset()
        ep_reward = 0.0
        done = False
        while not done:
            step += 1

            # epsilon-greedy action selection
            if np.random.rand() < eps:
                a = np.random.randint(0, n_actions)
            else:
                with torch.no_grad():
                    a = online_net(torch.tensor(s, device=device)).argmax().item()

            ns, r, done, info = env.step(a)
            memory.push(s, a, r, ns, done)
            s = ns
            ep_reward += r

            # no learning until the buffer holds at least 512 transitions
            if len(memory) < 512:
                continue

            S, A, R, NS, D = memory.sample(128)
            with torch.no_grad():
                # bootstrap from the target network; (1 - D) zeroes it on terminal transitions
                best_next = target_net(NS).max(dim=1).values
                y = R + gamma * (1 - D) * best_next
            q_sa = online_net(S).gather(1, A.unsqueeze(1)).squeeze(1)
            loss = F.mse_loss(q_sa, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if step % target_sync == 0:
                target_net.load_state_dict(online_net.state_dict())

        rewards.append(ep_reward)
        eps = max(eps_end, eps * eps_decay)
        if (ep+1) % 20 == 0:
            print(f"Episode {ep+1}/{episodes}  avgR(last20)={np.mean(rewards[-20:]):.6f}  eps={eps:.3f}")
    return online_net, rewards

# Fresh default configuration and environment for training.
cfg = ExecConfig()
env = ExecEnv(cfg)
# Train for 200 episodes; keep the trained network and the per-episode reward history.
qnet, rew_hist = train_dqn(env, episodes=200)


## 5) Learning Curve

In [None]:

# Smooth the per-episode rewards with a 10-episode rolling mean
# (the first 9 points are NaN and therefore not drawn).
plt.figure(figsize=(10,3.5))
plt.plot(pd.Series(rew_hist).rolling(10).mean())
plt.title("DQN — Rolling Mean Reward (10 ep)")
plt.tight_layout(); plt.show()


## 6) Evaluation vs Baselines

In [None]:

def eval_policy(env: ExecEnv, policy, episodes=100):
    """Run ``policy`` for ``episodes`` episodes and return the total cost of each.

    ``policy(env, state)`` must return a discrete action. Each step's negated
    reward is multiplied by the notional ``Q0 * p0`` and summed per episode.
    """
    per_episode = []
    for _ in range(episodes):
        state = env.reset()
        running_cost = 0.0
        finished = False
        while not finished:
            action = policy(env, state)
            state, reward, finished, _info = env.step(action)
            running_cost += -reward * (env.cfg.Q0 * env.cfg.p0)
        per_episode.append(running_cost)
    return np.array(per_episode)

def policy_from_q(qnet: QNet):
    """Wrap a Q-network as a greedy ``policy(env, state) -> int`` callable.

    The returned function ignores ``env`` and picks the action with the
    largest predicted Q-value for the given state.
    """
    def fn(env, s):
        state_t = torch.tensor(s, dtype=torch.float32, device=device)
        with torch.no_grad():
            q_values = qnet(state_t)
        best = q_values.argmax().item()
        return int(best)
    return fn

# Fresh environment; the three policies are evaluated one after another, so each
# sees its own 100 newly simulated episodes rather than a shared set of paths.
env = ExecEnv(cfg)
cost_rl  = eval_policy(env, policy_from_q(qnet), episodes=100)
cost_twap = eval_policy(env, lambda e,s: twap_policy(e,s), episodes=100)
cost_pov  = eval_policy(env, lambda e,s: pov_policy(e,s, pov=0.1), episodes=100)

print("Mean costs ($): RL={:.2f}  TWAP={:.2f}  POV10={:.2f}".format(cost_rl.mean(), cost_twap.mean(), cost_pov.mean()))

# Overlaid histograms of per-episode cost for the three policies.
plt.figure(figsize=(10,3.5))
plt.hist(cost_rl, bins=30, alpha=0.6, label="RL")
plt.hist(cost_twap, bins=30, alpha=0.6, label="TWAP")
plt.hist(cost_pov, bins=30, alpha=0.6, label="POV10%")
plt.title("Distribution of Implementation Shortfall Costs")
plt.legend(); plt.tight_layout(); plt.show()


## 7) Example Episode — Inventory & Participation

In [None]:

def rollout(env: "ExecEnv", policy, record=True):
    """Play one episode with ``policy`` and return its cost and a per-step trace.

    Args:
        env: environment exposing ``reset``, ``step``, ``t``, ``q_rem`` and ``cfg``.
        policy: callable ``policy(env, state) -> action``.
        record: when False the trace is left empty.

    Returns:
        ``(total_cost, trace)`` where ``total_cost`` sums each step's negated
        reward multiplied by the notional ``Q0 * p0``, and ``trace`` is a
        DataFrame with one row per step and columns ``t``, ``q_rem``,
        ``part``, ``exec_px`` and ``mid``. ``t`` and ``q_rem`` are read after
        the step has been applied.
    """
    s = env.reset()
    # Only columns that are filled on every step are declared here: a declared
    # but never-populated column makes the DataFrame constructor fail on
    # unequal column lengths.
    rec = {"t": [], "q_rem": [], "part": [], "exec_px": [], "mid": []}
    total_cost = 0.0
    done = False
    while not done:
        a = policy(env, s)
        s, r, done, info = env.step(a)
        if record:
            rec["t"].append(env.t)
            rec["q_rem"].append(env.q_rem)
            rec["part"].append(info["part"])
            rec["mid"].append(info["mid"])
            rec["exec_px"].append(info["px_exec"])
        total_cost += -r * (env.cfg.Q0 * env.cfg.p0)
    return total_cost, pd.DataFrame(rec)

# One greedy episode of the trained agent on a fresh environment.
env = ExecEnv(cfg)
cost, df = rollout(env, policy_from_q(qnet))

# Top panel: inventory left after each slice; bottom panel: participation chosen per slice.
fig, ax = plt.subplots(2,1, figsize=(10,6))
ax[0].plot(df["t"], df["q_rem"]); ax[0].set_title("Remaining Inventory")
ax[1].plot(df["t"], df["part"]); ax[1].set_title("Participation per Slice")
plt.tight_layout(); plt.show()

# Mid price against the execution price recorded for each slice.
plt.figure(figsize=(10,3))
plt.plot(df["t"], df["mid"], label="Mid")
plt.plot(df["t"], df["exec_px"], label="Exec Px")
plt.title("Prices During Episode")
plt.legend(); plt.tight_layout(); plt.show()

print("Example episode cost ($):", cost)



## 8) Extensions (easy to add)
- Add **stochastic spreads** and **venue selection** (extend action space).
- Penalty on **aggressiveness** or **variance of participation**.
- Replace DQN with **PPO** or **DDQN**.
- Train on **buy** as well as **sell** programs (flip `buy=True`).
- Plug real **tape/LOB features** (depth imbalance, recent volume, toxicity).

---
