# **_Autonomous and Adaptive Systems_ - 2025**
## **Mini-Project**:*Overcooked*

### Imports

In [2]:
import math
import numpy as np
import tensorflow as tf

from tensorflow import keras
from tqdm import tqdm
from overcooked_ai_py.mdp.overcooked_env import OvercookedEnv, Overcooked
from overcooked_ai_py.mdp.overcooked_mdp import OvercookedGridworld
from overcooked_ai_py.visualization.state_visualizer import StateVisualizer
from overcooked_ai_py.planning.planners import MediumLevelActionManager

2025-05-27 16:33:11.497146: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Testing _Overcooked_ environment

- `base_mdp` = An MDP grid world based off of the Overcooked game.
- `base_env` = An environment wrapper for the OvercookedGridworld Markov Decision Process. The environment keeps track of the current state of the agent, updates it as the agent takes actions, and provides rewards to the agent.
- `env`= Similar to gym env.

In [3]:
# Build the grid-world MDP for the chosen layout, wrap it in an environment with a
# 400-step horizon, and expose that environment through the gym-like `Overcooked` class.
base_mdp = OvercookedGridworld.from_layout_name("cramped_room", old_dynamics = True) # or other layout
base_env = OvercookedEnv.from_mdp(base_mdp, info_level=0, horizon=400)
# Observations are produced by the environment's own `featurize_state_mdp` featurizer.
env = Overcooked(base_env=base_env, featurize_fn=base_env.featurize_state_mdp)

There are different experimental layouts:
- ***Cramped Room*** presents low-level coordination challenges: in this shared, confined space it is very easy for the agents to collide.
- ***Asymmetric Advantages*** tests whether players can choose high-level strategies that play to their strengths.
- ***Coordination Ring***, players must coordinate to travel between the bottom left and top right corners of the layout.
- ***Forced Coordination*** removes collision coordination problems, and forces players to develop a high-level joint strategy, since neither player can serve a dish by themselves.
- ***Counter Circuit*** involves a non-obvious coordination strategy, where onions are passed over the counter to the pot, rather than being carried around.

#### **Actions**

The possible actions are: _up, down, left, right, noop,_ and _"interact"_

In [4]:
# Show the action space exposed by the gym-like wrapper.
print(f'The action space has dimension: {env.action_space}')

The action space has dimension: Discrete(6)


#### **Observations**

In [5]:
# Reset the environment, then take one step in which BOTH agents play the same
# randomly sampled action, to inspect what an observation looks like.
observation = env.reset()
action = env.action_space.sample()
observation, reward, done, info = env.step((action, action))
# Bare expression: displayed as the notebook cell's output.
observation

{'both_agent_obs': (array([ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0., -1.,  0.,  0.,  0.,  0.,
          2.,  0.,  0.,  0.,  0.,  2.,  2.,  0.,  0.,  1.,  1.,  0.,  0.,
          0.,  0.,  0.,  0.,  1., -1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  1.,  0.,  0.,  0., -2.,  2.,  0.,  0.,  0.,  0.,  0.,
          2.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0., -1., -1.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,
          0.,  2.,  0.,  1.,  1.]),
  array([ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0., -2.,
          2.,  0.,  0.,  0.,  0.,  0.,  2.,  0.,  0.,  1.,  1.,  0.,  0.,
          0.,  0.,  0.,  0., -1., -1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
          0.,  0., -1.,  0.,  0.,  0.,  0.,  2.,  0.,  0.,  0.,  0.,  2.,
          2.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,

### Understanding the _Overcooked_ **observations** to apply shaping

- `[0:4]` pi_orientation: length 4 one-hot-encoding of direction currently facing
- `[4:8]` pi_obj: length 4 one-hot-encoding of object currently being held (all 0s if no object held) (onion|soup|dish|tomato)
- `[8:20]` pi_closest_{onion|tomato|dish|soup|serving|empty_counter}: (dx, dy) where dx = x dist to item, dy = y dist to item. (0, 0) if item is currently held
- `[20:22]` pi_closest_soup_n_{onions|tomatoes}: int value for number of this ingredient in closest soup ???
- `[22:23]` pi_closest_pot_{j}_exists: {0, 1} depending on whether jth closest pot found. If 0, then all other pot features are 0. Note: can be 0 even if there are more than j pots on layout, if the pot is not reachable by player i
- `[23:27]` pi_closest_pot_{j}_{is_empty|is_full|is_cooking|is_ready}: {0, 1} depending on boolean value for jth closest pot
- `[27:29]` pi_closest_pot_{j}_{num_onions|num_tomatoes}: int value for number of this ingredient in jth closest pot
- `[29:30]` pi_closest_pot_{j}_cook_time: int value for seconds remaining on soup. -1 if no soup is cooking
- `[30:32]` pi_closest_pot_{j}: (dx, dy) to jth closest pot from player i location
- `[32:36]` pi_wall: length 4 boolean value of whether player i has wall in each direction

In [6]:
class OvercookedRewardShaping(Overcooked):
    """`Overcooked` wrapper that adds a dense shaping bonus to the step reward.

    The reward returned by `step` is the reward of the wrapped environment
    plus `SHAPING_COOKING_BONUS` for every agent observation whose
    "closest pot is cooking" flag is set. With two agents this adds up to
    twice the bonus per step while a soup is cooking.

    Other shaping terms were experimented with and are currently disabled:
    a penalty for holding any object, a bonus for holding a soup, and a
    bonus proportional to the number of onions in the pot.
    """

    # Bonus added per agent observation on each step in which the flag is set.
    SHAPING_COOKING_BONUS = 0.3
    # Position of the closest pot's `is_cooking` flag in a featurized
    # observation (kept as a slice so short vectors yield "not cooking").
    _POT_IS_COOKING = slice(25, 26)

    def step(self, actions):
        """Step the wrapped environment and return the shaped reward.

        Args:
            actions: joint action, forwarded unchanged to `Overcooked.step`.

        Returns:
            `(observation, shaped_reward, done, info)`, where `observation`,
            `done` and `info` come straight from the parent class.
        """
        observation, base_reward, done, info = super().step(actions)
        if base_reward != 0:
            print("Soup delivered! Voto: {}".format(base_reward)) # base_reward is 20 if soup is delivered
        shaped_reward = base_reward + self._compute_shaping(observation['both_agent_obs'])
        return observation, shaped_reward, done, info

    def _compute_shaping(self, observations):
        """Return the total shaping bonus for a collection of agent observations.

        Args:
            observations: iterable of per-agent feature vectors (array-like
                objects supporting slicing and `.any()`).

        Returns:
            `0` when no observation reports a cooking pot, otherwise the
            bonus summed over the observations that do.
        """
        shaping = 0
        for obs in observations:
            # Reward every agent that sees its closest pot cooking.
            if obs[self._POT_IS_COOKING].any():
                shaping += self.SHAPING_COOKING_BONUS
        return shaping

In [7]:
# Rebind `env` to the reward-shaping wrapper (same base environment and featurizer as before).
env = OvercookedRewardShaping(base_env=base_env, featurize_fn=base_env.featurize_state_mdp)

#### Testing a random episode

In [8]:
# Smoke test: run a single episode of 100 random steps.
for i_episode in range(1):
    observation = env.reset()
    
    for t in range(100):
        # One action is sampled per step and played by BOTH agents.
        action = env.action_space.sample()
        # The (observation, reward, done, info) tuple is stored but not inspected.
        state = env.step((action, action))

env.close()

### ***PPO-Clip***: Networks

In [9]:
# Sizes shared by the actor and critic networks.
num_inputs = env.observation_space.shape[0] # 96: length of one agent's observation vector
num_actions = env.action_space.n # 6 discrete actions
num_hidden = 128 # width of the hidden layers

#### _Actor_ model
- **Actor**: This takes as input the _state of our environment_ and returns a _probability value_ for each action in its action space.

In [10]:
# Actor-Critic for Player 1
actor = keras.models.Sequential([
    keras.layers.Input(shape=(num_inputs,)),
    keras.layers.Dense(num_hidden, activation='relu'),
    keras.layers.Dense(num_hidden, activation='relu'),
    keras.layers.Dense(num_actions, activation='softmax')
])

#### _Critic_ model
- **Critic**: This takes as input the _state of our environment_ and returns an estimate of _total rewards_ in the future.

In [11]:
# Value network (critic): maps one agent's observation vector to a single
# scalar value estimate (linear output, no activation).
critic = keras.models.Sequential([
    keras.layers.Input(shape=(num_inputs,)),
    keras.layers.Dense(num_hidden, activation='relu'),
    keras.layers.Dense(num_hidden//2, activation='relu'),
    keras.layers.Dense(1)
])

#### _PPO Buffer_

In [28]:
def policy_loss(old_log_prob, new_log_prob, advantage, eps=0.2):
    """Return the PPO-Clip policy loss (a quantity to be MINIMIZED).

    The clipped surrogate objective is something we want to maximize, so the
    loss is its negation. It must be negated exactly once: negating it twice
    would turn gradient descent into descent on the objective itself.

    Args:
        old_log_prob: log-probabilities of the taken actions under the policy
            that collected the data.
        new_log_prob: log-probabilities of the same actions under the current
            policy.
        advantage: advantage estimates for those actions.
        eps: clipping range; the probability ratio is clipped to
            `[1 - eps, 1 + eps]`.

    Returns:
        Scalar tensor: minus the mean clipped surrogate objective.
    """
    # The ratio is computed from a difference of logs rather than a division of
    # probabilities: both are equivalent, but logs are numerically more stable.
    ratio = tf.exp(new_log_prob - old_log_prob)
    clipped_ratio = tf.clip_by_value(ratio, 1 - eps, 1 + eps)
    surrogate = tf.minimum(ratio * advantage, clipped_ratio * advantage)

    # Maximizing the surrogate == minimizing its negative.
    return -tf.reduce_mean(surrogate)

class PPOBuffer:
    """Append-only storage for one trajectory of PPO transitions.

    Transitions are assumed to be stored in temporal order for a single
    stream of experience: `compute_advantages` treats entry `i + 1` as the
    successor of entry `i` unless `dones[i]` is set.
    """

    def __init__(self):
        self._reset()

    def _reset(self):
        """(Re)create the empty per-field lists."""
        self.obs, self.actions, self.log_probs = [], [], []
        self.rewards, self.values, self.dones = [], [], []

    def store(self, obs, action, log_prob, reward, value, done):
        """Append one transition.

        Args:
            obs: observation the action was taken in.
            action: action that was taken.
            log_prob: log-probability of `action` under the acting policy.
            reward: reward received for the transition.
            value: critic estimate for `obs`.
            done: whether the episode ended with this transition.
        """
        self.obs.append(obs)
        self.actions.append(action)
        self.log_probs.append(log_prob)
        self.rewards.append(reward)
        self.values.append(value)
        self.dones.append(done)

    def clear(self):
        """Drop every stored transition."""
        self._reset()

    def compute_advantages(self, gamma=0.99, lam=1.0, last_value=0.0):
        """Compute GAE advantages and the matching value targets.

        Args:
            gamma: discount factor.
            lam: GAE lambda. The default of 1.0 reproduces the plain
                discounted sum of TD errors.
            last_value: value estimate of the state that follows the last
                stored transition. Leave at 0.0 when the rollout ends on a
                terminal step; pass the critic's estimate to bootstrap a
                rollout that was cut off mid-episode.

        Returns:
            `(advantages, returns)`: two float32 arrays with one entry per
            stored transition, where `returns = advantages + values`.
        """
        n = len(self.rewards)
        # Filled back-to-front by index, which avoids O(n^2) list inserts.
        advantages = np.zeros(n, dtype=np.float32)
        returns = np.zeros(n, dtype=np.float32)

        gae = 0.0
        next_value = float(last_value)
        for i in reversed(range(n)):
            # Neither the bootstrap value nor the running advantage may leak
            # across an episode boundary.
            not_done = 1.0 - float(self.dones[i])
            value = float(self.values[i])
            delta = float(self.rewards[i]) + gamma * next_value * not_done - value
            gae = delta + gamma * lam * gae * not_done
            advantages[i] = gae
            returns[i] = gae + value
            next_value = value
        return advantages, returns


### ***PPO-Clip***: Training

In [13]:
# Bare expression: displays the number of discrete actions as the cell output.
env.action_space.n

6

In [14]:
# Separate Adam optimizers for the policy and value networks; the critic uses
# a larger learning rate than the actor.
actor_optimizer = keras.optimizers.Adam(learning_rate=3e-4)
critic_optimizer = keras.optimizers.Adam(learning_rate=1e-3)

# Mean-squared-error loss object (not referenced by the training loop below,
# which computes the squared error inline).
mse_loss = keras.losses.MeanSquaredError()

In [32]:
# PPO hyperparameters.
eps_clip = 0.2        # clipping range of the probability ratio
gamma = 0.99          # discount factor
update_epochs = 4     # passes over each rollout
rollout_steps = 2048  # transitions collected per update, summed over both agents
batch_size = 64

for episode in tqdm(range(1000)):
    # Force an environment reset at the start of every rollout.
    done = True
    # One buffer PER AGENT. Advantage estimation treats consecutive entries of
    # a buffer as consecutive timesteps, so the two agents' transitions must
    # not be interleaved in a single buffer: otherwise agent 2's value at time
    # t would be used as agent 1's "next value" at time t.
    buffers = (PPOBuffer(), PPOBuffer())

    while sum(len(b.rewards) for b in buffers) < rollout_steps:
        if done:
            obs = env.reset()
        agent_obs = obs['both_agent_obs']

        # Both agents share the actor and critic, so evaluate them in one batch.
        obs_batch = tf.convert_to_tensor(np.stack(agent_obs), dtype=tf.float32)
        probs = actor(obs_batch).numpy()
        values = critic(obs_batch).numpy()[:, 0]

        actions = [np.random.choice(num_actions, p=p) for p in probs]
        # Stored as plain floats so the buffers hold no TensorFlow objects.
        log_probs = [float(np.log(p[a] + 1e-8)) for p, a in zip(probs, actions)]

        next_obs, reward, done, _ = env.step(tuple(actions))

        # The (shared) team reward is credited to both agents.
        for buf, o, a, logp, v in zip(buffers, agent_obs, actions, log_probs, values):
            buf.store(o, a, logp, reward, v, done)

        obs = next_obs

    # Both buffers hold the same reward sequence; report its mean once.
    mean_reward = np.mean(buffers[0].rewards)

    # Compute advantages & returns independently for each agent's trajectory,
    # then pool everything into one training set.
    # NOTE: a rollout that stops mid-episode is not bootstrapped; the value
    # after its last transition is treated as 0.
    per_agent = [b.compute_advantages(gamma) for b in buffers]
    advs = np.concatenate([a for a, _ in per_agent])
    rets = np.concatenate([r for _, r in per_agent])

    obs_tensor = tf.convert_to_tensor(np.array([o for b in buffers for o in b.obs]), dtype=tf.float32)
    act_tensor = tf.convert_to_tensor(np.array([a for b in buffers for a in b.actions]), dtype=tf.int32)
    logp_old_tensor = tf.convert_to_tensor(np.array([lp for b in buffers for lp in b.log_probs]), dtype=tf.float32)
    adv_tensor = tf.convert_to_tensor(advs, dtype=tf.float32)
    ret_tensor = tf.convert_to_tensor(rets, dtype=tf.float32)

    # Normalize advantages over the whole rollout.
    adv_tensor = (adv_tensor - tf.reduce_mean(adv_tensor)) / (tf.math.reduce_std(adv_tensor) + 1e-8)

    # Training
    dataset = tf.data.Dataset.from_tensor_slices((obs_tensor, act_tensor, logp_old_tensor, adv_tensor, ret_tensor))
    # Shuffle buffer covers the whole rollout so batches are uniformly mixed.
    dataset = dataset.shuffle(len(advs)).batch(batch_size)

    for _ in range(update_epochs):
        for batch_obs, batch_act, batch_logp_old, batch_adv, batch_ret in dataset:
            # Independent tapes: each is released as soon as its gradient is taken.
            with tf.GradientTape() as actor_tape:
                batch_probs = actor(batch_obs)
                action_probs = tf.gather(batch_probs, batch_act[:, None], axis=1, batch_dims=1)
                logp = tf.math.log(action_probs + 1e-8)[:, 0]

                ratio = tf.exp(logp - batch_logp_old)
                clipped = tf.clip_by_value(ratio, 1 - eps_clip, 1 + eps_clip)
                # Named `actor_loss` so the `policy_loss` function is not overwritten.
                actor_loss = -tf.reduce_mean(tf.minimum(ratio * batch_adv, clipped * batch_adv))

            with tf.GradientTape() as critic_tape:
                value_preds = critic(batch_obs)[:, 0]
                value_loss = tf.reduce_mean(tf.square(batch_ret - value_preds))

            actor_grads = actor_tape.gradient(actor_loss, actor.trainable_variables)
            critic_grads = critic_tape.gradient(value_loss, critic.trainable_variables)

            actor_optimizer.apply_gradients(zip(actor_grads, actor.trainable_variables))
            critic_optimizer.apply_gradients(zip(critic_grads, critic.trainable_variables))

    print(f"Episode {episode}: reward={mean_reward:.2f}")


  0%|          | 0/1000 [00:00<?, ?it/s]

Soup delivered! Voto: 20


2025-05-27 17:11:41.508312: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:11:43.955241: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:11:46.739383: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:11:49.511907: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
  0%|          | 1/1000 [00:26<7:19:37, 26.40s/it]

Episode 0: reward=0.05


2025-05-27 17:12:10.705714: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:12:13.463461: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:12:16.364390: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:12:20.109989: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
  0%|          | 2/1000 [00:57<8:00:13, 28.87s/it]

Episode 1: reward=0.02
Soup delivered! Voto: 20
Soup delivered! Voto: 20


2025-05-27 17:12:55.207276: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:13:01.514711: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:13:04.804400: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:13:08.216821: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
  0%|          | 3/1000 [01:45<10:25:42, 37.66s/it]

Episode 2: reward=0.07


2025-05-27 17:13:27.928428: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:13:31.404258: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:13:34.210631: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:13:36.947357: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
  0%|          | 4/1000 [02:13<9:26:34, 34.13s/it] 

Episode 3: reward=0.01


2025-05-27 17:14:00.184300: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:14:02.858043: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:14:05.467411: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:14:08.071401: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
  0%|          | 5/1000 [02:44<9:08:01, 33.05s/it]

Episode 4: reward=0.02
Soup delivered! Voto: 20


2025-05-27 17:14:31.394476: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:14:34.003965: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:14:36.641213: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:14:39.209080: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
  1%|          | 6/1000 [03:16<8:56:42, 32.40s/it]

Episode 5: reward=0.04


2025-05-27 17:15:01.302985: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:15:04.008405: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:15:06.574822: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:15:09.203081: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
  1%|          | 7/1000 [03:46<8:43:10, 31.61s/it]

Episode 6: reward=0.03


2025-05-27 17:15:32.167719: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:15:34.834375: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:15:37.619507: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:15:40.239559: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
  1%|          | 8/1000 [04:17<8:39:36, 31.43s/it]

Episode 7: reward=0.00
Soup delivered! Voto: 20


2025-05-27 17:16:01.885220: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:16:04.583804: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:16:07.085531: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:16:09.630764: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
  1%|          | 9/1000 [04:46<8:28:34, 30.79s/it]

Episode 8: reward=0.05


2025-05-27 17:16:30.987220: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:16:33.564716: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:16:37.171108: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:16:39.842738: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
  1%|          | 10/1000 [05:16<8:25:06, 30.61s/it]

Episode 9: reward=0.02


2025-05-27 17:17:01.659706: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:17:04.254316: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:17:06.773405: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:17:09.381176: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
  1%|          | 11/1000 [05:46<8:19:10, 30.28s/it]

Episode 10: reward=0.02


2025-05-27 17:17:30.555001: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:17:33.041659: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:17:35.513571: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:17:38.025470: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
  1%|          | 12/1000 [06:14<8:10:27, 29.79s/it]

Episode 11: reward=0.00


2025-05-27 17:17:59.485867: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:18:02.203677: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:18:04.646406: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:18:07.094845: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
  1%|▏         | 13/1000 [06:43<8:06:23, 29.57s/it]

Episode 12: reward=0.02
Soup delivered! Voto: 20


2025-05-27 17:18:27.682271: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:18:30.255114: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:18:32.990050: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:18:35.733006: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
  1%|▏         | 14/1000 [07:12<8:01:17, 29.29s/it]

Episode 13: reward=0.05


2025-05-27 17:18:56.662611: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:18:59.078153: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:19:01.530659: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:19:03.953682: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
  2%|▏         | 15/1000 [07:40<7:55:31, 28.97s/it]

Episode 14: reward=0.02


2025-05-27 17:19:23.668821: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:19:26.079270: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:19:28.639559: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:19:31.072793: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
  2%|▏         | 16/1000 [08:07<7:45:55, 28.41s/it]

Episode 15: reward=0.02


2025-05-27 17:19:50.651554: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:19:53.234816: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:19:55.571419: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:19:57.914008: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
  2%|▏         | 17/1000 [08:34<7:37:43, 27.94s/it]

Episode 16: reward=0.03
Soup delivered! Voto: 20


2025-05-27 17:20:17.820365: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:20:20.203006: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:20:22.544079: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:20:24.904715: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
  2%|▏         | 18/1000 [09:01<7:32:35, 27.65s/it]

Episode 17: reward=0.03
Soup delivered! Voto: 20


2025-05-27 17:20:44.153589: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:20:46.486470: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:20:48.843528: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:20:51.156126: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
  2%|▏         | 19/1000 [09:28<7:25:14, 27.23s/it]

Episode 18: reward=0.06
Soup delivered! Voto: 20


2025-05-27 17:21:10.771807: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:21:13.290313: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:21:15.602660: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:21:18.164560: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
  2%|▏         | 20/1000 [09:55<7:23:42, 27.17s/it]

Episode 19: reward=0.04
Soup delivered! Voto: 20


2025-05-27 17:21:38.125735: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:21:40.856305: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:21:43.309668: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-27 17:21:45.658527: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
  2%|▏         | 21/1000 [10:22<7:24:51, 27.26s/it]

Episode 20: reward=0.03


  2%|▏         | 21/1000 [10:38<8:15:47, 30.39s/it]


KeyboardInterrupt: 

1 --> 2 (0 soup delivered) [Start filling the pot]
0 --> 5 (0 soup delivered) [Consistently filling the pot]
2 --> 8 (2 soup delivered) [Emptying the pot, but not delivering]
1 --> 11 (5 soup delivered) [Start delivering]
2 --> 11 (4 soup delivered) [Just filling the pot once]
2 --> 12 (6 soup delivered) []
2 --> 11 (1 soup delivered) []

### Test for Visualization

In [196]:
import pygame

# 1) Initialize Pygame & Visualizer
pygame.init()
visualizer = StateVisualizer()
soup_delivered = 0

try:
    # 2) Reset once and do one dummy render to get a surface
    grid = base_env.mdp.terrain_mtx
    observation = env.reset()  # observation of the starting state
    surf = visualizer.render_state(base_env.state, grid=grid)

    # 3) Use that surface's size for the window
    win_w, win_h = surf.get_size()
    screen = pygame.display.set_mode((win_w, win_h), pygame.RESIZABLE)
    clock = pygame.time.Clock()

    # 4) Main loop: play one episode with the trained actor, rendering each frame
    running = True
    while running:
        for ev in pygame.event.get():
            if ev.type == pygame.QUIT:
                running = False

        # Sample one action per chef from the shared policy network.
        actions = []
        for agent_obs in observation['both_agent_obs']:
            obs_tensor = keras.ops.expand_dims(keras.ops.convert_to_tensor(agent_obs), 0)
            action_probs = actor(obs_tensor)
            actions.append(np.random.choice(num_actions, p=np.squeeze(action_probs)))

        # The wrapper returns (observation, reward, done, info).
        observation, reward, done, info = env.step(tuple(actions))
        # The step reward includes a small shaping bonus; a value above 19
        # means the delivery reward was part of it.
        if reward > 19:
            soup_delivered += 1

        # render the new state and draw it
        surf = visualizer.render_state(base_env.state, grid=grid)
        screen.blit(surf, (0, 0))
        pygame.display.flip()

        clock.tick(15)   # cap at 15 FPS

        # Stop on the episode-end flag instead of waiting for the next step
        # to fail; leave the environment freshly reset for later cells.
        if done:
            env.reset()
            running = False
finally:
    # Always close the window, even if rendering or stepping raised.
    pygame.quit()

print(f"Soup delivered: {soup_delivered}")


Soup delivered: 0
