<a href="https://colab.research.google.com/github/manikanta-eng/Reinforcement-learning/blob/main/rml_lab_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os, random, time, numpy as np, torch, torch.nn as nn, torch.optim as optim
try:
    from pettingzoo.mpe import simple_spread_v3
except Exception:
    from pettingzoo.mpe2 import simple_spread_v3

# Compute device chosen once at import time: CUDA when torch reports it as
# available, otherwise the CPU.
Dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class Net(nn.Module):
    """Fully connected network with two hidden ReLU layers.

    Args:
        ob: Size of the input feature vector.
        ac: Number of output units.
        h: Width of each of the two hidden layers.
    """

    def __init__(self, ob, ac, h=256):
        super().__init__()
        # Layers are listed input-to-output and wrapped in a Sequential that is
        # stored under ``m``.
        stack = [
            nn.Linear(ob, h),
            nn.ReLU(),
            nn.Linear(h, h),
            nn.ReLU(),
            nn.Linear(h, ac),
        ]
        self.m = nn.Sequential(*stack)

    def forward(self, x):
        """Return the output of the layer stack applied to ``x``."""
        out = self.m(x)
        return out

class Buf:
    """Fixed-capacity ring buffer of (s, a, r, ns, d) transitions.

    States and next states are kept in object arrays (one entry per
    transition); actions, rewards and done flags are kept in typed NumPy
    arrays. Once ``cap`` transitions have been stored, new ones overwrite the
    oldest.

    Args:
        cap: Maximum number of transitions to keep. Must be positive.

    Raises:
        ValueError: If ``cap`` is not positive.
    """

    def __init__(self, cap=200_000):
        if cap <= 0:
            raise ValueError(f"cap must be a positive integer, got {cap}")
        self.s = np.zeros((cap,), dtype=object)
        self.a = np.zeros((cap,), dtype=np.int64)
        self.r = np.zeros((cap,), dtype=np.float32)
        self.ns = np.zeros((cap,), dtype=object)
        self.d = np.zeros((cap,), dtype=np.bool_)
        self.i = 0  # next write position
        self.n = 0  # number of valid transitions currently stored
        self.cap = cap

    def push(self, s, a, r, ns, d):
        """Store one transition, overwriting the oldest entry when full."""
        slot = self.i
        # Keep private copies of the observations so that a caller (or an
        # environment) that later mutates its arrays in place cannot silently
        # change transitions that are already stored.
        self.s[slot] = np.array(s)
        self.ns[slot] = np.array(ns)
        self.a[slot] = a
        self.r[slot] = r
        self.d[slot] = d
        self.i = (slot + 1) % self.cap
        self.n = min(self.n + 1, self.cap)

    def sample(self, bs):
        """Draw ``bs`` transitions uniformly at random, with replacement.

        Returns:
            A tuple ``(s, a, r, ns, d)``. ``s`` and ``ns`` are stacked NumPy
            arrays; ``a`` (long), ``r`` (float32) and ``d`` (float32) are
            torch tensors placed on ``Dev``.

        Raises:
            ValueError: If the buffer holds no transitions yet.
        """
        if self.n == 0:
            raise ValueError("cannot sample from an empty buffer")
        idx = np.random.randint(0, self.n, size=bs)
        s = np.stack(self.s[idx])
        a = torch.as_tensor(self.a[idx], dtype=torch.long, device=Dev)
        r = torch.as_tensor(self.r[idx], dtype=torch.float32, device=Dev)
        ns = np.stack(self.ns[idx])
        d = torch.as_tensor(self.d[idx], dtype=torch.float32, device=Dev)
        return s, a, r, ns, d

    def __len__(self):
        """Return the number of transitions currently stored."""
        return self.n

class DQN:
    """Single-agent DQN learner with a soft-updated target network.

    Holds an online network ``q``, a target network ``t`` initialised with the
    same weights, an Adam optimiser over the online parameters, a smooth-L1
    loss, and its own replay buffer ``buf``.

    Args:
        ob: Observation vector length.
        ac: Number of discrete actions.
        lr: Adam learning rate.
        gamma: Discount factor applied to the bootstrapped next-state value.
        tau: Interpolation weight used when moving the target towards the
            online network after every update.
    """

    def __init__(self, ob, ac, lr=2.5e-4, gamma=0.99, tau=0.005):
        online = Net(ob, ac).to(Dev)
        target = Net(ob, ac).to(Dev)
        # The target starts as an exact copy of the online network.
        target.load_state_dict(online.state_dict())
        self.q = online
        self.t = target
        self.o = optim.Adam(online.parameters(), lr=lr)
        self.l = nn.SmoothL1Loss()
        self.g = gamma
        self.tau = tau
        self.ac = ac
        self.buf = Buf()

    def act(self, ob, eps):
        """Pick an action epsilon-greedily for a single observation.

        With probability ``eps`` a uniformly random action index is returned;
        otherwise the arg-max of the online network's outputs.
        """
        explore = np.random.rand() < eps
        if explore:
            return np.random.randint(self.ac)
        obs_t = torch.as_tensor(ob, dtype=torch.float32, device=Dev)
        with torch.no_grad():
            scores = self.q(obs_t.unsqueeze(0))
        return int(scores.argmax(1).item())

    def upd(self, bs=256):
        """Run one gradient step on a sampled batch and soft-update the target.

        Returns:
            The batch loss as a float, or ``0.0`` when the buffer holds fewer
            than ``bs`` transitions (no update is performed in that case).
        """
        if len(self.buf) < bs:
            return 0.0

        states, actions, rewards, next_states, dones = self.buf.sample(bs)
        states = torch.as_tensor(states, dtype=torch.float32, device=Dev)
        next_states = torch.as_tensor(next_states, dtype=torch.float32, device=Dev)

        # One-step TD target from the target network; the (1 - done) factor
        # removes the bootstrap term for transitions flagged as done.
        with torch.no_grad():
            bootstrap = self.t(next_states).max(1).values
            target = rewards + (1.0 - dones) * self.g * bootstrap

        # Q-value of the action that was actually taken in each transition.
        chosen = self.q(states).gather(1, actions.view(-1, 1)).squeeze(1)
        loss = self.l(chosen, target)

        self.o.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.q.parameters(), 10.0)
        self.o.step()

        # Move every target parameter a fraction ``tau`` towards its online
        # counterpart.
        with torch.no_grad():
            for online_p, target_p in zip(self.q.parameters(), self.t.parameters()):
                target_p.data.lerp_(online_p.data, self.tau)

        return float(loss.item())

def mk_env(seed=0, n=3, cyc=200, lr=0.5):
    """Create a discrete-action simple_spread parallel environment and reset it.

    Args:
        seed: Seed passed to the environment's ``reset``.
        n: Value passed as ``N`` to ``parallel_env``.
        cyc: Value passed as ``max_cycles``.
        lr: Value passed as ``local_ratio`` (not a learning rate).

    Returns:
        The environment, already reset once with ``seed``.
    """
    env_kwargs = {
        "N": n,
        "local_ratio": lr,
        "max_cycles": cyc,
        "continuous_actions": False,
    }
    env = simple_spread_v3.parallel_env(**env_kwargs)
    env.reset(seed=seed)
    return env

def seed_all(s):
    """Seed Python's ``random``, NumPy's global RNG and torch with ``s``.

    When CUDA is available, the generators of all CUDA devices are seeded too.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(s)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(s)

def flat_obs(obs_dict):
    """Return a new dict mapping each key to its value as a 1-D float32 array."""
    flat = {}
    for agent_id, ob in obs_dict.items():
        flat[agent_id] = np.asarray(ob, dtype=np.float32).reshape(-1)
    return flat

def pz_reset(env, seed=None):
    """Reset ``env`` and return only the observations.

    Handles both reset conventions: when ``env.reset`` returns a tuple it is
    unpacked as ``(observations, info)`` and the info part is discarded;
    any other return value is passed through unchanged.
    """
    result = env.reset(seed=seed)
    if not isinstance(result, tuple):
        return result
    observations, _info = result
    return observations

def train(
    total_steps=300_000, seed=7, n_agents=3, start_eps=1.0, end_eps=0.05, eps_decay_steps=200_000,
    batch_size=256, tau=0.005, gamma=0.99, lr=2.5e-4, max_cycles=200, local_ratio=0.5,
    eval_every=25_000, eval_episodes=5
):
    """Train one independent DQN learner per agent on simple_spread.

    Every agent has its own network, optimiser and replay buffer. At each
    environment step all agents act epsilon-greedily, store their own
    transition, and perform one update.

    Args:
        total_steps: Number of environment steps to run.
        seed: Seed for the RNGs and the first environment reset.
        n_agents: Number of agents in the environment.
        start_eps: Exploration rate at step 0.
        end_eps: Exploration rate once the decay has finished.
        eps_decay_steps: Steps over which epsilon decays linearly.
        batch_size: Batch size for each agent's update.
        tau: Target-network soft-update weight.
        gamma: Discount factor.
        lr: Adam learning rate.
        max_cycles: Step limit per episode passed to the environment.
        local_ratio: ``local_ratio`` passed to the environment.
        eval_every: Run an evaluation every this many steps; ``0`` or ``None``
            disables periodic evaluation.
        eval_episodes: Episodes per evaluation.

    Returns:
        Dict mapping each agent id to its trained ``DQN`` instance.
    """
    seed_all(seed)
    env = mk_env(seed=seed, n=n_agents, cyc=max_cycles, lr=local_ratio)
    try:
        ids = env.possible_agents
        obs = flat_obs(pz_reset(env, seed=seed))
        od = {i: obs[i].shape[0] for i in ids}
        ad = {i: env.action_space(i).n for i in ids}
        ag = {i: DQN(od[i], ad[i], lr=lr, gamma=gamma, tau=tau) for i in ids}
        step = 0
        ep_r = {i: 0.0 for i in ids}
        ep = 0
        t0 = time.time()

        def eps_at(s):
            # Linear decay from start_eps to end_eps, then constant.
            if s >= eps_decay_steps:
                return end_eps
            return end_eps + (start_eps - end_eps) * (1 - s / eps_decay_steps)

        while step < total_steps:
            eps = eps_at(step)
            acts = {i: ag[i].act(obs[i], eps) for i in ids}
            nobs, rew, term, trunc, _ = env.step(acts)
            nobs = flat_obs(nobs) if nobs else {}
            # An episode is over for an agent on termination OR truncation.
            dn = {i: bool(term[i] or trunc[i]) for i in ids}
            for i in ids:
                # Only a true termination is stored as "done". A truncated
                # episode was cut off by the step limit rather than finished,
                # so its next state still has value and the TD target must
                # keep bootstrapping from it.
                ag[i].buf.push(obs[i], acts[i], rew[i], nobs.get(i, obs[i]), bool(term[i]))
                ep_r[i] += rew[i]
            if nobs:
                obs = nobs
            step += 1
            for learner in ag.values():
                learner.upd(batch_size)
            if all(dn.values()):
                ep += 1
                if ep % 10 == 0:
                    m = float(np.mean(list(ep_r.values())))
                    print(f"steps={step} episodes={ep} eps={eps:.3f} mean_return_per_agent={m:.3f} buf={[len(ag[a].buf) for a in ids]}")
                obs = flat_obs(pz_reset(env))
                ep_r = {i: 0.0 for i in ids}
            # A falsy eval_every disables evaluation instead of dividing by zero.
            if eval_every and step % eval_every == 0:
                r = evaluate(ag, n_episodes=eval_episodes, seed=seed+123, n_agents=n_agents, max_cycles=max_cycles, local_ratio=local_ratio)
                print(f"[EVAL] steps={step} avg_team_return={r:.3f} elapsed={time.time()-t0:.1f}s")
    finally:
        # Release the environment even if training is interrupted or fails.
        env.close()
    return ag

def evaluate(agents, n_episodes=5, seed=123, n_agents=3, max_cycles=200, local_ratio=0.5):
    """Roll out greedy episodes in a fresh environment and average the team return.

    Args:
        agents: Mapping from agent id to an object with ``act(ob, eps=...)``.
        n_episodes: Number of episodes to run; episode ``e`` is reset with
            seed ``seed + e``.
        seed: Base seed for the evaluation environment.
        n_agents: Number of agents in the environment.
        max_cycles: Step limit per episode passed to the environment.
        local_ratio: ``local_ratio`` passed to the environment.

    Returns:
        Mean over episodes of the summed per-agent episode rewards.
    """
    env = mk_env(seed=seed, n=n_agents, cyc=max_cycles, lr=local_ratio)
    try:
        ids = env.possible_agents
        team = []
        for e in range(n_episodes):
            obs = flat_obs(pz_reset(env, seed=seed+e))
            ep_r = {i: 0.0 for i in ids}
            dn = {i: False for i in ids}
            while not all(dn.values()):
                # eps=0.0 makes every agent act greedily.
                acts = {i: agents[i].act(obs[i], eps=0.0) for i in ids}
                nobs, rew, term, trunc, _ = env.step(acts)
                nobs = flat_obs(nobs) if nobs else {}
                dn = {i: bool(term[i] or trunc[i]) for i in ids}
                for i in ids:
                    ep_r[i] += rew[i]
                if nobs:
                    obs = nobs
            team.append(sum(ep_r.values()))
    finally:
        # This function builds its own environment on every call, so it must
        # also release it; otherwise each evaluation leaves one open.
        env.close()
    return float(np.mean(team))

if __name__ == "__main__":
    # Hyperparameters for a short run, collected in one place and passed to
    # train() as keyword arguments.
    run_cfg = {
        "total_steps": 15000,
        "seed": 42,
        "n_agents": 3,
        "start_eps": 1.0,
        "end_eps": 0.05,
        "eps_decay_steps": 12000,
        "batch_size": 256,
        "tau": 0.01,
        "gamma": 0.99,
        "lr": 3e-4,
        "max_cycles": 200,
        "local_ratio": 0.5,
        "eval_every": 2500,
        "eval_episodes": 5,
    }
    agents = train(**run_cfg)


ModuleNotFoundError: No module named 'pettingzoo'

In [None]:
!pip install pettingzoo