# Multi-agent PPO for Microgrid Dynamic Pricing

This notebook scales the microgrid simulator to five parallel households and trains a decentralized Proximal Policy Optimization (PPO) agent to coordinate demand response. Agents share a policy network but experience independent discomfort and cost signals.

## Environment setup
Each observation contains normalized temperature and consumption for one household. The environment computes a neighborhood price $p = 0.1 + 0.5 \sum_i c_i (1 - a_i)$ and assigns each agent a reward that penalizes the realized cost and quadratic discomfort.

In [None]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import pandas as pd

class MultiAgentMicrogridEnv(gym.Env):
    """Multi-household microgrid with a shared, load-dependent price.

    The data frame is pivoted into two ``(num_dates, num_households)``
    matrices (temperature and consumption). One environment step corresponds
    to one row of those matrices.

    Note that ``reset`` returns only the observation and ``step`` returns a
    4-tuple ``(obs, rewards, done, info)``; callers in this notebook rely on
    that shape.

    Args:
        df: Frame with the columns ``Household_ID``, ``Date``,
            ``Avg_Temperature_C`` and ``Energy_Consumption_kWh``.
        num_agents: Number of households to keep (the first ``num_agents``
            distinct ``Household_ID`` values in order of appearance).

    Raises:
        ValueError: If ``df`` holds fewer than ``num_agents`` households.
    """

    def __init__(self, df, num_agents=5):
        super().__init__()
        self.num_agents = num_agents
        unique_ids = df['Household_ID'].unique()[:num_agents]
        df = df[df['Household_ID'].isin(unique_ids)]
        # Missing (date, household) cells fall back to 20 degrees C / 5 kWh.
        self.temps = df.pivot(index='Date', columns='Household_ID', values='Avg_Temperature_C').fillna(20).values
        self.cons = df.pivot(index='Date', columns='Household_ID', values='Energy_Consumption_kWh').fillna(5).values
        if self.temps.shape[1] < num_agents:
            # Fail here with a clear message instead of an indexing or
            # broadcasting error on the first call to step().
            raise ValueError(
                f"requested {num_agents} agents but the data only contains "
                f"{self.temps.shape[1]} households"
            )
        self.total_step = len(df['Date'].unique())
        self.current_step = 0

    def reset(self):
        """Rewind to the first date and return its observation."""
        self.current_step = 0
        return self._get_obs()

    def _get_obs(self):
        """Return a ``(num_households, 2)`` array of scaled (temp, consumption)."""
        t = self.temps[self.current_step] / 30.0
        c = self.cons[self.current_step] / 10
        return np.stack([t, c], axis=1)

    def step(self, actions):
        """Apply one load-shedding fraction per agent and advance one date.

        Args:
            actions: One value per agent. Any array-like whose flattened
                length equals the number of households is accepted, e.g.
                shape ``(N,)`` or ``(N, 1)``.

        Returns:
            ``(next_obs, rewards, done, info)`` where ``rewards`` is a 1-D
            array with one entry per agent and ``next_obs`` is all zeros once
            the last date has been consumed.
        """
        # Flatten once and use the same 1-D view for both the load and the
        # discomfort term, so (N, 1) inputs cannot produce array-valued
        # per-agent rewards.
        actions = np.asarray(actions, dtype=float).reshape(-1)
        current_temps = self.temps[self.current_step]
        current_cons = self.cons[self.current_step]
        actual_load = current_cons * (1 - actions)
        # The price is shared by all agents and grows with the total load.
        price = 0.1 + 0.5 * np.sum(actual_load)
        cost = actual_load * price
        discomfort = current_cons * (actions ** 2) * (current_temps / 30.0)
        rewards = -(cost + discomfort)
        self.current_step += 1
        done = self.current_step >= self.total_step
        next_obs = self._get_obs() if not done else np.zeros((self.num_agents, 2))
        return next_obs, rewards, done, {}


## Shared actor-critic policy
The PPO backbone mirrors the single-agent network but exposes an `evaluate` helper for clipped objective computation. A single policy serves all agents, encouraging coordinated responses.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal

class ActorCriticNetwork(nn.Module):
    """Two-layer shared trunk with a Gaussian policy head and a value head.

    ``forward`` yields the action mean (squashed into ``(0, 1)`` by a
    sigmoid), the action standard deviation (a sigmoid scaled by 0.3 and
    shifted by 0.01) and the state value.
    """

    def __init__(self, input_dim, action_dim):
        super().__init__()
        # Shared feature extractor.
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        # Policy heads: mean and standard deviation of the Gaussian.
        self.actor_mu = nn.Linear(64, action_dim)
        self.actor_sig = nn.Linear(64, action_dim)
        # Value head.
        self.critic = nn.Linear(64, 1)

    def forward(self, state):
        """Return ``(mean, std, value)`` for a batch of states."""
        hidden = F.relu(self.fc2(F.relu(self.fc1(state))))
        mean = torch.sigmoid(self.actor_mu(hidden))
        std = torch.sigmoid(self.actor_sig(hidden)).mul(0.3).add(0.01)
        value = self.critic(hidden)
        return mean, std, value

    def evaluate(self, state, action):
        """Return ``(log_prob(action), value, entropy)`` under the current policy."""
        mean, std, value = self.forward(state)
        policy = Normal(mean, std)
        return policy.log_prob(action), value, policy.entropy()


## PPO update rule
The surrogate objective clips the probability ratio $\frac{\pi_\theta(a|s)}{\pi_{\theta_{old}}(a|s)}$ to the interval $[1 - \epsilon, 1 + \epsilon]$ while adding a value loss and an entropy bonus to encourage exploration. Discounted returns are normalized per batch to stabilize learning.

In [None]:
import torch.optim as optim

class PPOAgent:
    """Clipped-surrogate PPO learner backed by a flat transition buffer.

    Transitions are appended with ``store`` and consumed by ``update``, which
    runs ``K_epochs`` gradient steps on the whole buffer, copies the new
    weights into ``old_policy`` and clears the buffer.

    Args:
        input_dim: Size of one state vector.
        action_dim: Size of one action vector.
        lr: Adam learning rate.
        gamma: Discount factor for the return computation.
        K_epochs: Number of gradient steps per ``update`` call.
        eps_clip: Half-width of the ratio clipping interval.
    """

    def __init__(self, input_dim, action_dim, lr=0.002, gamma=0.99, K_epochs=4, eps_clip=0.2):
        self.input_dim = input_dim
        self.action_dim = action_dim
        self.K_epochs = K_epochs
        self.eps_clip = eps_clip
        self.lr = lr
        self.gamma = gamma
        self.policy = ActorCriticNetwork(input_dim, action_dim)
        # Frozen copy used for acting; refreshed at the end of every update.
        self.old_policy = ActorCriticNetwork(input_dim, action_dim)
        self.old_policy.load_state_dict(self.policy.state_dict())
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        self.buffer = []

    def store(self, state, action, log_prob, reward, done):
        """Append one transition; ``state``/``action``/``log_prob`` are tensors."""
        self.buffer.append((state, action, log_prob, reward, done))

    def _discounted_returns(self, rewards, dones):
        """Return discounted returns, restarting the sum at terminal steps."""
        returns = []
        running = 0.0
        for reward, is_terminal in zip(reversed(rewards), reversed(dones)):
            if is_terminal:
                running = 0.0
            running = float(reward) + self.gamma * running
            returns.append(running)
        # Built back-to-front with O(1) appends, then flipped once.
        returns.reverse()
        return torch.tensor(returns, dtype=torch.float32)

    def update(self):
        """Run ``K_epochs`` PPO steps on the buffer, then sync and clear it.

        Does nothing when the buffer is empty.
        """
        if not self.buffer:
            return
        states, actions, log_probs, rewards, dones = zip(*self.buffer)
        n = len(rewards)

        returns = self._discounted_returns(rewards, dones)
        returns = returns - returns.mean()
        if n > 1:
            # std() of a single element is NaN, so only scale real batches.
            returns = returns / (returns.std() + 1e-6)

        # Explicit reshapes instead of squeeze(): actions must stay
        # (n, action_dim) so that Normal.log_prob pairs each action with its
        # own state. A squeezed (n,) action tensor broadcasts against the
        # (n, 1) mean and silently produces an (n, n) matrix of cross terms.
        old_states = torch.stack(states).reshape(n, -1).detach()
        old_actions = torch.stack(actions).reshape(n, -1).detach()
        old_logprobs = torch.stack(log_probs).reshape(n, -1).sum(dim=-1).detach()

        for _ in range(self.K_epochs):
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)
            # Reduce per-dimension terms to one scalar per transition.
            logprobs = logprobs.reshape(n, -1).sum(dim=-1)
            dist_entropy = dist_entropy.reshape(n, -1).sum(dim=-1)
            state_values = state_values.reshape(n)
            ratio = torch.exp(logprobs - old_logprobs)
            advantage = returns - state_values.detach()
            surr_1 = ratio * advantage
            surr_2 = torch.clamp(ratio, 1 - self.eps_clip, 1 + self.eps_clip) * advantage
            loss = -torch.min(surr_1, surr_2) + 0.5 * F.mse_loss(state_values, returns) - 0.01 * dist_entropy
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        self.old_policy.load_state_dict(self.policy.state_dict())
        self.buffer = []


## Multi-agent training loop
Each episode collects trajectories for all agents, aggregates them into a shared buffer, and performs updates every ten episodes. The average neighborhood reward highlights the policy trend.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the household data and build the environment and the shared learner.
df = pd.read_csv('household_energy_consumption.csv')
num_agents = 5
env = MultiAgentMicrogridEnv(df, num_agents=num_agents)
ppo_agent = PPOAgent(input_dim=2, action_dim=1, lr=0.0001, gamma=0.99)
num_episodes = 2500
update_interval = 10
all_rewards = []
print('Starting MARL PPO training...')
# Transitions are kept per agent so that each agent's trajectory stays
# contiguous when it is later copied into the learner's single buffer.
agent_memories = {agent_id: [] for agent_id in range(num_agents)}
for episode in range(num_episodes):
    obs = env.reset()
    episode_reward = 0
    done = False
    while not done:
        # Draw one action per agent from the frozen (old) policy.
        samples = []
        for agent_id in range(num_agents):
            state = torch.FloatTensor(obs[agent_id]).unsqueeze(0)
            with torch.no_grad():
                mu, sigma, _ = ppo_agent.old_policy(state)
                dist = torch.distributions.Normal(mu, sigma)
                action = dist.sample()
                log_prob = dist.log_prob(action)
            samples.append((state, action, log_prob))
        # The environment sees actions clipped to [0, 1]; the stored
        # transition keeps the raw sample and its log-probability.
        actions_np = np.array([torch.clamp(raw, 0.0, 1.0).item() for _, raw, _ in samples])
        next_obs, rewards, done, _ = env.step(actions_np)
        episode_reward += np.mean(rewards)
        for agent_id, (state, action, log_prob) in enumerate(samples):
            agent_memories[agent_id].append((state, action, log_prob, rewards[agent_id], done))
        obs = next_obs
    all_rewards.append(episode_reward)
    if episode > 0 and episode % update_interval == 0:
        # Flush agent by agent, then learn from everything collected so far.
        ppo_agent.buffer = []
        for agent_id in range(num_agents):
            for transition in agent_memories[agent_id]:
                ppo_agent.store(*transition)
            agent_memories[agent_id] = []
        ppo_agent.update()
    if episode % 10 == 0:
        print(f"Episode {episode} | Avg team reward: {episode_reward:.2f}")

# Learning curve: summed per-step mean reward for every episode.
plt.plot(all_rewards)
plt.title("MARL PPO training")
plt.xlabel("Episode")
plt.ylabel("Avg reward")
plt.show()


## Policy inspection
The trained policy runs on a six-step slice to visualize how predicted actions co-vary with the dynamic price signal and ambient temperature.

In [None]:
import matplotlib.pyplot as plt
import torch
import numpy as np
import pandas as pd

# Roll the trained policy's mean action over the first few dates and plot
# action against the resulting price, coloured by temperature.
ppo_agent.policy.eval()
episode_length = 6
start_step = 0
num_agents = env.num_agents
price_log = []
action_log = []
temp_log = []
print(f"Running validation loop for {episode_length} steps...")
for i in range(episode_length):
    current_idx = start_step + i
    raw_temps = env.temps[current_idx]
    raw_cons = env.cons[current_idx]
    # Same scaling the environment applies to its observations.
    norm_temps = raw_temps / 30.0
    norm_cons = raw_cons / 10.0
    states = torch.tensor(np.stack([norm_temps, norm_cons], axis=1), dtype=torch.float32)
    with torch.no_grad():
        # Use the mean action only; no sampling during inspection.
        mu, _, _ = ppo_agent.policy(states)
        actions = mu.numpy().flatten()
    actual_loads = raw_cons * (1 - actions)
    total_neighborhood_load = np.sum(actual_loads)
    dynamic_price = 0.1 + 0.5 * total_neighborhood_load
    # One point per agent; every agent faces the same price at this step.
    price_log.extend([dynamic_price] * num_agents)
    action_log.extend(actions)
    temp_log.extend(raw_temps)
plt.figure(figsize=(10, 7))
scatter = plt.scatter(price_log, action_log, c=temp_log, cmap="coolwarm", alpha=0.6, s=50)
cbar = plt.colorbar(scatter)
cbar.set_label('Temperature (C)', rotation=270, labelpad=15)
plt.xlabel("Dynamic grid price ($/kWh)")
plt.ylabel("Optimal load shedding")
# Report the actual agent count instead of a hard-coded 5.
plt.title(f"Smart city policy: price vs. action\n({num_agents} agents, {episode_length} steps)")
plt.grid(True, linestyle="--", alpha=0.5)
plt.show()
