
# 🧪 Smart Order Router (SOR) — RL Experiments

This notebook simulates a **multi-venue trading environment** and trains a simple **reinforcement learning agent** to decide which venue to route an order to.

**Experiments**
- Synthetic environment with spreads, latencies, liquidity
- Baseline strategies: round-robin, random
- RL Agent (policy gradient) learns venue selection
- Compare performance across metrics: cost, slippage, latency

---


In [None]:

import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim

# Seed both NumPy's and PyTorch's global random number generators so that
# runs of this notebook are repeatable.
np.random.seed(42)
torch.manual_seed(42)


## 1) Synthetic Multi-Venue Environment

In [None]:

class MultiVenueEnv:
    """Synthetic multi-venue execution environment.

    Each venue has a spread, an available liquidity and a latency, all of
    which are re-drawn uniformly at random (from NumPy's global RNG) at every
    step. The observation is the concatenation
    ``[spreads, liquidity / 200, latency / 10]`` (length ``3 * n_venues``).

    The action is the index of the venue to route to. Its cost is computed
    from the conditions the agent has just observed, so the observation is
    informative about the reward of each action.

    Args:
        n_venues: Number of venues that can be chosen from.
        base_price: Reference price; stored on the instance but not used by
            the cost model.
        max_steps: Number of steps after which an episode is reported done.
    """

    def __init__(self, n_venues=3, base_price=100.0, max_steps=50):
        self.n_venues = n_venues
        self.base_price = base_price
        self.max_steps = max_steps
        self.reset()

    def reset(self):
        """Start a new episode and return the initial observation."""
        self.step_count = 0
        self._sample_conditions()
        return self._state()

    def _sample_conditions(self):
        """Draw fresh spread, liquidity and latency values for every venue."""
        self._spreads = np.random.uniform(0.01, 0.05, size=self.n_venues)
        self._liquidity = np.random.uniform(50, 200, size=self.n_venues)
        self._latency = np.random.uniform(1, 10, size=self.n_venues)

    def _state(self):
        """Return the observation vector for the current venue conditions."""
        return np.concatenate(
            [self._spreads, self._liquidity / 200.0, self._latency / 10.0]
        )

    def step(self, action):
        """Route the order to venue ``action`` and advance one step.

        Returns:
            A ``(state, reward, done)`` tuple: the next observation, the
            negated execution cost of the chosen venue, and whether the
            episode has reached ``max_steps``.
        """
        self.step_count += 1
        # Execution cost = spread + liquidity impact (larger when liquidity
        # is thin) + latency penalty, evaluated on the conditions the agent
        # observed before acting.
        cost = (
            self._spreads[action]
            + 0.01 * (100 / self._liquidity[action])
            + 0.001 * self._latency[action]
        )
        reward = -cost  # lower cost is better
        # Only after pricing the action do the venues move to new conditions.
        self._sample_conditions()
        done = self.step_count >= self.max_steps
        return self._state(), reward, done


## 2) Baseline Strategies

In [None]:

def baseline_round_robin(env, episodes=20):
    """Average episode reward when venues are picked in a fixed rotation.

    Within each episode the venue index cycles 0, 1, ..., n_venues - 1, 0, ...
    and the rotation restarts at 0 for every new episode.
    """
    episode_totals = []
    for _ in range(episodes):
        env.reset()
        cumulative = 0
        tick = 0
        finished = False
        while not finished:
            _, step_reward, finished = env.step(tick % env.n_venues)
            cumulative += step_reward
            tick += 1
        episode_totals.append(cumulative)
    return np.mean(episode_totals)

def baseline_random(env, episodes=20):
    """Average episode reward when every venue is chosen uniformly at random.

    One draw from NumPy's global RNG is made per environment step.
    """
    episode_totals = []
    for _ in range(episodes):
        env.reset()
        cumulative = 0
        finished = False
        while not finished:
            venue = np.random.randint(env.n_venues)
            _, step_reward, finished = env.step(venue)
            cumulative += step_reward
        episode_totals.append(cumulative)
    return np.mean(episode_totals)

# Evaluate both baselines on one shared three-venue environment,
# round-robin first and random second.
env = MultiVenueEnv(n_venues=3)
round_robin_score = baseline_round_robin(env)
print("Round-robin baseline:", round_robin_score)
random_score = baseline_random(env)
print("Random baseline:", random_score)


## 3) Policy Gradient Agent

In [None]:

class PolicyNet(nn.Module):
    """Two-layer MLP policy that maps a state to action probabilities.

    Args:
        state_dim: Length of the input state vector.
        n_actions: Number of discrete actions (output probabilities).
        hidden_dim: Width of the single hidden layer.
    """

    def __init__(self, state_dim, n_actions, hidden_dim=32):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_actions),
            # Softmax over the last axis so the outputs sum to 1 per state.
            nn.Softmax(dim=-1),
        )

    def forward(self, x):
        """Return the action probabilities for state tensor ``x``."""
        return self.net(x)

def select_action(model, state):
    """Sample an action from the policy's distribution for ``state``.

    Returns:
        A ``(action, log_prob)`` pair: the sampled action as a Python int and
        the log-probability of that action as a tensor.
    """
    observation = torch.tensor(state, dtype=torch.float32)
    policy = torch.distributions.Categorical(probs=model(observation))
    chosen = policy.sample()
    return chosen.item(), policy.log_prob(chosen)


## 4) Train Policy Gradient Agent

In [None]:

def _discounted_returns(rewards, gamma):
    """Return the discounted return-to-go for every step of one episode."""
    returns = []
    running = 0.0
    # Accumulate from the last step backwards, then flip once at the end;
    # this avoids the quadratic cost of inserting at the front of a list.
    for r in reversed(rewards):
        running = r + gamma * running
        returns.append(running)
    returns.reverse()
    return returns


def train_pg(env, episodes=200, gamma=0.99, lr=1e-2):
    """Train a policy-gradient (REINFORCE-style) agent on ``env``.

    One gradient step is taken per episode, using the log-probabilities of
    the sampled actions weighted by the normalized discounted returns.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning
            ``(state, reward, done)``, and ``n_venues``.
        episodes: Number of episodes to train for.
        gamma: Discount factor applied to future rewards.
        lr: Learning rate of the Adam optimizer.

    Returns:
        A ``(model, all_rewards, losses)`` tuple: the trained policy, the
        undiscounted total reward of each episode, and the loss of each
        episode.
    """
    state_dim = env.reset().shape[0]
    model = PolicyNet(state_dim, env.n_venues)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    all_rewards, losses = [], []

    for ep in range(episodes):
        state = env.reset()
        log_probs, rewards = [], []
        done = False
        while not done:
            action, logp = select_action(model, state)
            next_state, reward, done = env.step(action)
            log_probs.append(logp)
            rewards.append(reward)
            state = next_state

        returns = torch.tensor(
            _discounted_returns(rewards, gamma), dtype=torch.float32
        )
        # Standardize the returns to reduce gradient variance. The sample
        # standard deviation of a single value is NaN, which would poison the
        # loss and the weights, so one-step episodes keep their raw return.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        loss = -(torch.stack(log_probs) * returns).sum()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        all_rewards.append(np.sum(rewards))
        losses.append(loss.item())
        if (ep+1) % 20 == 0:
            print(f"Episode {ep+1}: avg_reward={np.mean(all_rewards[-20:]):.4f}")
    return model, all_rewards, losses

# Train the agent for 100 episodes on the environment created for the
# baselines; keep the per-episode rewards and losses for plotting.
model, rewards, losses = train_pg(env, episodes=100)


## 5) Results & Visualization

In [None]:

# Learning curve: total reward collected in each training episode.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(rewards, label="Episode reward")
ax.set_title("Policy Gradient Training — SOR Agent")
ax.set_xlabel("Episode")
ax.set_ylabel("Reward")
ax.legend()
plt.show()
