In [610]:
import numpy as np
import gym
from gym import spaces
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque, namedtuple

In [None]:
import numpy as np
import gym
from gym import spaces

class BankRunEnv(gym.Env):
    """Multi-agent bank-run game with a shared, noisily observed bank health.

    Each of ``n_agents`` depositors holds a fixed deposit. In every round
    before the last one, an active agent either waits (action 0) or withdraws
    (action 1). A withdrawer is paid its deposit and leaves the game, and the
    bank health is scaled down in proportion to the withdrawn share of the
    initial deposit total. If health falls to ``health_min`` or below, the
    bank collapses and every remaining agent is deactivated without payment.
    In the last round (``t == T - 1``) the actions are ignored and every agent
    that is still active is paid ``deposit * R_func(health)``.

    The class follows the legacy gym calling convention: ``reset()`` returns
    only the observations and ``step()`` returns
    ``(obs, rewards, dones, info)`` where the first three are per-agent lists.
    All randomness comes from the global ``np.random`` state.
    """

    metadata = {'render.modes': []}

    def __init__(
        self,
        n_agents: int = 5,
        T: int = 10,
        alpha: float = 0.3,
        health_min: float = 0.0,
        sigma: float = 0.1,
        deposit_low: float = 0.5,
        deposit_high: float = 1.5,
        R_func=None
    ):
        """Configure the game and draw each agent's deposit.

        Args:
            n_agents: number of depositors.
            T: number of rounds; round ``T - 1`` is the forced final payout.
            alpha: strength of the health decay caused by withdrawals.
            health_min: health level at or below which the bank collapses.
            sigma: standard deviation of the per-agent observation noise.
            deposit_low: lower bound of the uniform deposit distribution.
            deposit_high: upper bound of the uniform deposit distribution.
            R_func: maps final health to the payout multiplier; defaults to
                ``exp(health)`` when omitted (or otherwise falsy).
        """
        super().__init__()
        self.n_agents = n_agents
        self.T = T
        self.alpha = alpha
        self.health_min = health_min
        self.sigma = sigma
        self.R_func = R_func or (lambda h: np.exp(h))

        # obs = [noisy_health, last_withdraw_frac]
        self.observation_space = spaces.Box(
            low=np.array([-np.inf, 0.0], dtype=np.float32),
            high=np.array([np.inf, 1.0], dtype=np.float32),
            dtype=np.float32
        )
        # two actions: wait or withdraw
        self.action_space = spaces.Discrete(2)

        # each agent's fixed deposit, drawn once and reused by every episode
        self.fixed_deposits = np.random.uniform(
            deposit_low, deposit_high, size=self.n_agents
        ).astype(np.float32)

    def reset(self):
        """Start a new episode and return the list of per-agent observations."""
        # random initial health in [0.5, 1.5]
        self.initial_health = np.random.uniform(0.5, 1.5)
        self.health = self.initial_health

        self.t = 0
        self.collapsed = False
        self.last_withdraw_frac = 0.0

        # reset deposits & total; the total is not reduced by later withdrawals,
        # so the decay in step() is relative to the initial total
        self.deposits = self.fixed_deposits.copy()
        self.total_deposits = float(self.deposits.sum())
        self.active = np.ones(self.n_agents, dtype=bool)

        return self._get_obs()

    def _get_obs(self):
        """Return one ``[noisy_health, last_withdraw_frac]`` array per agent."""
        # every agent sees the true health plus its own Gaussian noise draw
        noisy = self.health + np.random.randn(self.n_agents) * self.sigma
        return [
            np.array([noisy[i], self.last_withdraw_frac], dtype=np.float32)
            for i in range(self.n_agents)
        ]

    def _terminal_obs(self):
        """Return all-zero observations, one independent array per agent."""
        # a fresh array per agent, so mutating one entry cannot affect the others
        return [np.zeros(2, dtype=np.float32) for _ in range(self.n_agents)]

    def step(self, actions):
        """Advance the game by one round.

        Args:
            actions: sequence of ``n_agents`` integers, 1 = withdraw and any
                other value = wait. Actions of inactive agents are ignored, and
                all actions are ignored in the final round.

        Returns:
            ``(obs, rewards, dones, info)``: per-agent observation list
            (all zeros once the episode is over), per-agent reward list,
            per-agent done flags (all equal), and a dict with the current
            ``'health'`` and ``'collapsed'`` values.

        Raises:
            AssertionError: if the episode has already ended.
            ValueError: if ``actions`` does not contain exactly one entry per
                agent.
        """
        assert 0 <= self.t < self.T and not self.collapsed, "Episode done"
        actions = np.asarray(actions, dtype=int)
        if actions.shape != (self.n_agents,):
            # without this check a scalar would broadcast to every agent
            raise ValueError(
                f"expected {self.n_agents} actions, got array of shape {actions.shape}"
            )
        rewards = np.zeros(self.n_agents, dtype=np.float32)

        if self.t < self.T - 1:
            # intermediate round: only agents that are still active can withdraw
            withdrawers = (actions == 1) & self.active

            # pay out the deposits and deactivate the withdrawers
            rewards[withdrawers] = self.deposits[withdrawers]
            # sum in float64 and cast so that health stays a plain Python float
            withdrawn_amt = float(self.deposits[withdrawers].sum(dtype=np.float64))
            self.active[withdrawers] = False

            # multiplicative decay of health
            if self.total_deposits > 0:
                frac = withdrawn_amt / self.total_deposits
                self.health *= max(0.0, 1.0 - self.alpha * frac)

            # share of agents (by head count, not by amount) that just withdrew
            self.last_withdraw_frac = int(withdrawers.sum()) / float(self.n_agents)

            # collapse check
            if self.health <= self.health_min:
                self.collapsed = True
                self.active[:] = False

            self.t += 1
            done = not self.active.any()
            obs = self._terminal_obs() if done else self._get_obs()
        else:
            # final period: everyone still active is forced to exit
            remaining = self.active.copy()
            payout = float(self.R_func(self.health))
            rewards[remaining] = self.deposits[remaining] * payout
            self.active[:] = False
            self.t += 1
            done = True
            obs = self._terminal_obs()

        dones = [done] * self.n_agents
        info = {'health': self.health, 'collapsed': self.collapsed}
        return obs, rewards.tolist(), dones, info

    def render(self, mode='human'):
        """Print the round counter, health, deposits and activity mask."""
        print(f"t={self.t}/{self.T-1}, health={self.health:.3f}, collapsed={self.collapsed}")
        print(f"Deposits: {self.deposits}")
        print(f"Active: {self.active}, last_withdraw_frac={self.last_withdraw_frac:.2f}")


In [612]:
class CentralizedQNet(nn.Module):
    """MLP that scores every joint action from the concatenated observations.

    The input has ``n_agents * obs_dim`` features and the output has one
    value per joint action, i.e. ``2 ** n_agents`` entries. Two hidden layers
    of width ``hidden_dim`` with ReLU activations sit in between.
    """

    def __init__(self, n_agents, obs_dim, hidden_dim=128):
        super().__init__()
        n_features = n_agents * obs_dim
        n_joint_actions = 2 ** n_agents

        # Two hidden Linear+ReLU stages followed by a linear output head.
        stages = []
        for fan_in, fan_out in ((n_features, hidden_dim), (hidden_dim, hidden_dim)):
            stages.append(nn.Linear(fan_in, fan_out))
            stages.append(nn.ReLU())
        stages.append(nn.Linear(hidden_dim, n_joint_actions))
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Return the joint-action values for a batch of flat observations."""
        q_values = self.net(x)
        return q_values

# One replay-memory record; field order is the positional order used by push().
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'reward', 'next_state', 'done'],
)

class ReplayBuffer:
    """Bounded FIFO memory of transitions with uniform random sampling."""

    def __init__(self, capacity=100000):
        # A deque with maxlen drops its oldest entry once capacity is reached.
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        """Wrap the positional fields in a Transition and append it."""
        record = Transition(*args)
        self.buffer.append(record)

    def sample(self, batch_size):
        """Return ``batch_size`` stored entries drawn without replacement."""
        drawn = random.sample(self.buffer, batch_size)
        return drawn

    def __len__(self):
        """Number of entries currently stored."""
        stored = len(self.buffer)
        return stored

def index_to_joint_action(idx, n_agents):
    """Decode a joint-action index into one binary action per agent.

    The index is read as an ``n_agents``-bit binary number whose most
    significant bit is the action of agent 0.

    Args:
        idx: integer joint-action index in ``[0, 2 ** n_agents)``.
        n_agents: number of agents, i.e. the length of the returned tuple.

    Returns:
        Tuple of ``n_agents`` ints, each 0 or 1.

    Raises:
        ValueError: if ``n_agents`` is negative or ``idx`` is outside the
            valid range. Without this check an oversized index would yield a
            tuple longer than ``n_agents``.
    """
    if n_agents < 0:
        raise ValueError(f"n_agents must be non-negative, got {n_agents}")
    if not 0 <= idx < 2 ** n_agents:
        raise ValueError(
            f"joint action index {idx} is out of range for {n_agents} agents"
        )
    # zero-padded binary string: one character per agent
    bits = format(idx, f'0{n_agents}b')
    return tuple(int(bit) for bit in bits)

In [613]:
def R_power(h, rho=0.10, beta=2):
    """Return the payout multiplier ``1 + rho * h ** beta`` for health ``h``."""
    premium = rho * (h ** beta)
    return 1.0 + premium

In [614]:
def train_centralized_dqn(env, n_agents, obs_dim, buffer, q_net, target_q_net,
                          optimizer, episodes=1000, batch_size=64,
                          gamma=0.99, sync_every=50):
    """Train a centralized DQN that picks one joint action for all agents.

    The per-agent observations are concatenated into one state vector and the
    per-agent rewards are summed into one team reward. Exploration is
    epsilon-greedy, with epsilon decaying linearly from 1 towards a floor of
    0.1 over the episodes. One gradient step is taken per episode, after the
    episode has finished, once the buffer holds at least ``batch_size``
    transitions.

    Args:
        env: environment whose ``reset()`` returns a list of per-agent
            observations and whose ``step()`` returns
            ``(obs, rewards, dones, info)``.
        n_agents: number of agents; there are ``2 ** n_agents`` joint actions.
        obs_dim: per-agent observation size (not used by this function).
        buffer: replay buffer providing ``push``, ``sample`` and ``len()``.
        q_net: online network that is optimised.
        target_q_net: network used for the bootstrap target.
        optimizer: optimizer over the parameters of ``q_net``.
        episodes: number of training episodes.
        batch_size: number of transitions per gradient step.
        gamma: discount factor.
        sync_every: copy ``q_net`` into ``target_q_net`` every this many
            episodes (including episode 0).
    """
    n_joint_actions = 2 ** n_agents
    loss_fn = nn.MSELoss()

    for ep in range(episodes):
        # linear decay with a floor so some exploration always remains
        epsilon = max(0.1, 1 - ep / episodes)

        joint_obs = np.concatenate(env.reset()).astype(np.float32)
        done = False
        while not done:
            if random.random() < epsilon:
                act_idx = random.randrange(n_joint_actions)
            else:
                with torch.no_grad():
                    qs = q_net(torch.from_numpy(joint_obs).unsqueeze(0))
                    act_idx = qs.argmax(dim=1).item()

            joint_act = index_to_joint_action(act_idx, n_agents)
            obs_, rewards, dones, info = env.step(joint_act)
            joint_obs_next = np.concatenate(obs_).astype(np.float32)
            done = any(dones)
            # the team reward is the sum of the individual payouts
            buffer.push(joint_obs, act_idx, sum(rewards), joint_obs_next, done)
            joint_obs = joint_obs_next

        if len(buffer) >= batch_size:
            batch = Transition(*zip(*buffer.sample(batch_size)))
            # Stack into one ndarray first: building a tensor directly from a
            # sequence of ndarrays is slow and makes PyTorch emit a warning.
            s = torch.as_tensor(np.stack(batch.state), dtype=torch.float32)
            a = torch.tensor(batch.action, dtype=torch.int64).unsqueeze(1)
            r = torch.tensor(batch.reward, dtype=torch.float32).unsqueeze(1)
            s2 = torch.as_tensor(np.stack(batch.next_state), dtype=torch.float32)
            d = torch.tensor(batch.done, dtype=torch.bool).unsqueeze(1)

            q_vals = q_net(s).gather(1, a)
            with torch.no_grad():
                next_q = target_q_net(s2).max(dim=1, keepdim=True)[0]
                # terminal transitions do not bootstrap from the next state
                target = r + gamma * next_q * (~d)
            loss = loss_fn(q_vals, target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if ep % sync_every == 0:
            target_q_net.load_state_dict(q_net.state_dict())

In [615]:
def evaluate(env, n_agents, q_net, episodes=5):
    """Run greedy rollouts of ``q_net`` and print a trace of every step.

    For each episode the initial health is printed, followed by one line per
    step with the post-step health, the chosen joint action, the noisy health
    signals the action was based on, and the rounded per-agent rewards.
    """
    for ep in range(episodes):
        flat_obs = np.concatenate(env.reset()).astype(np.float32)
        finished = False
        step = 0
        print(f"\n--- Eval Episode {ep} ---")
        print(env.health)
        while not finished:
            # greedy choice: joint action with the highest predicted value
            with torch.no_grad():
                state = torch.from_numpy(flat_obs).unsqueeze(0)
                act_idx = q_net(state).argmax(dim=1).item()
            joint_act = index_to_joint_action(act_idx, n_agents)
            obs_, rewards, dones, info = env.step(joint_act)

            # even positions of the flat vector are shown as the health signals
            signals = [round(float(flat_obs[2 * k]), 3) for k in range(len(flat_obs) // 2)]
            payouts = [round(value, 3) for value in rewards]
            print(f"Step {step}: Health={info['health']:.3f}, Actions={joint_act}, Obs={signals}, Rewards={payouts}")

            flat_obs = np.concatenate(obs_).astype(np.float32)
            finished = any(dones)
            step += 1

In [None]:
# Seed every RNG the experiment draws from (the env uses np.random, the
# epsilon-greedy policy and replay sampling use random, and the network
# initialisation uses torch) so a fresh-kernel run is reproducible.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

n_agents = 8
obs_dim = 2
env = BankRunEnv(n_agents=n_agents, T=15, alpha=1, health_min=0.3, sigma=0.25, R_func=None)

q_net = CentralizedQNet(n_agents, obs_dim)
target_q_net = CentralizedQNet(n_agents, obs_dim)
# start the target network as an exact copy of the online network
target_q_net.load_state_dict(q_net.state_dict())
optimizer = optim.Adam(q_net.parameters(), lr=2e-4)
buffer = ReplayBuffer(50000)

# Train
train_centralized_dqn(env, n_agents, obs_dim, buffer, q_net, target_q_net,
                        optimizer, episodes=5000, batch_size=128)

# Evaluate
evaluate(env, n_agents, q_net, episodes=10)


--- Eval Episode 0 ---
1.1831454022188115
Step 0: Health=1.068, Actions=(0, 0, 0, 0, 0, 1, 0, 0), Obs=[1.849, 1.054, 0.736, 1.308, 1.148, 1.091, 1.337, 1.416], Rewards=[0.0, 0.0, 0.0, 0.0, 0.0, 0.748, 0.0, 0.0]
Step 1: Health=1.068, Actions=(0, 0, 0, 0, 0, 1, 0, 0), Obs=[1.152, 0.628, 0.857, 0.893, 1.302, 1.161, 1.11, 1.219], Rewards=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Step 2: Health=1.068, Actions=(0, 0, 0, 0, 0, 1, 0, 0), Obs=[1.063, 0.931, 1.014, 0.993, 1.605, 1.053, 1.181, 0.782], Rewards=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Step 3: Health=0.757, Actions=(0, 0, 1, 0, 0, 0, 0, 1), Obs=[0.899, 0.695, 1.076, 0.913, 0.948, 0.647, 1.034, 1.148], Rewards=[0.0, 0.0, 0.731, 0.0, 0.0, 0.0, 0.0, 1.498]
Step 4: Health=0.757, Actions=(0, 0, 0, 0, 0, 1, 0, 0), Obs=[1.041, 0.738, 1.016, 0.554, 0.896, 0.918, 0.66, 0.507], Rewards=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Step 5: Health=0.588, Actions=(1, 1, 0, 0, 0, 1, 0, 0), Obs=[0.631, 0.896, 0.329, 0.425, 0.715, 0.591, 0.487, 0.605], 