# Learning Code as Policy for Metaworld


In [1]:
import llfbench
import autogen.trace as trace
from autogen.trace.optimizers import FunctionOptimizer
from llfbench.agents.utils import set_seed
from collections import defaultdict
import copy
import pickle

from autogen.trace.trace_ops import trace_class


class TracedEnv:
    """
    Wrapper around an LLF-Bench environment whose `reset` and `step` calls are
    declared as trace operators, so their outputs can be used as targets when
    feedback is propagated back to the controller.
    """

    def __init__(self, env_name, seed=0):
        """
        Build the underlying environment with `llfbench.make(env_name)` and
        forward `seed` to its `seed` method.
        """
        self.env = llfbench.make(env_name)
        self.env.seed(seed)

    # n_outputs=2: the result is split into two traced outputs (obs, info).
    @trace.trace_op(n_outputs=2)
    def reset(self):
        """
        Reset the environment and return the initial observation and info.
        """
        return self.env.reset()  # obs, info

    # n_outputs=5: the result is split into five traced outputs; the caller
    # unpacks them as (next_obs, reward, termination, truncation, info).
    @trace.trace_op(n_outputs=5)
    def step(self, action):
        """
        Take action in the environment and return the next observation, reward, termination flag, truncation flag, and info.
        """
        return self.env.step(action)


def user_feedback(obs, action, next_obs):
    """
    Build the textual feedback describing a single environment transition.

    Args:
        obs: mapping whose "observation" entry exposes the pre-step
            observation through a `.data` attribute.
        action: object exposing the executed action through `.data`.
        next_obs: mapping whose "observation" and "feedback" entries expose
            the post-step observation and the environment's feedback through
            `.data`.

    Returns:
        A single sentence pair (str) stating the action, the observation
        before and after it, and the feedback received.
    """
    taken_action = action.data
    before = obs["observation"].data
    after = next_obs["observation"].data
    env_feedback = next_obs["feedback"].data
    # The message text (including its spelling) is kept exactly as it was,
    # because it is passed on verbatim as feedback.
    return (
        f"Taking action {taken_action} at observation {before} "
        f"resulted in next observation {after}. "
        f"Recieved feedback {env_feedback}."
    )

In [2]:
### Optimization for single step
def single_step(controller, env, user_feedback, horizon):
    """
    Roll out `controller` in `env`, running one optimizer update per attempt.

    A FunctionOptimizer is created over `controller.parameters()`; its
    objective is the optimizer's default objective followed by the
    instruction found in the initial observation. Each loop iteration tries
    one controller call plus one environment step:

    * on success, the feedback comes from `user_feedback` and is propagated
      from the next observation; the step counter advances;
    * on `trace.TraceExecutionError`, the error text is the feedback and is
      propagated from the error's exception node; the step counter does NOT
      advance, so the same step is retried after the update.

    The loop stops once `horizon` successful steps were taken, or as soon as
    the environment reports termination/truncation (in which case no update
    is run for that final transition).

    Args:
        controller: callable taking the detached observation and returning an
            action; must provide `parameters()`.
        env: object with `reset()` returning (obs, info) and `step(action)`
            returning (next_obs, reward, termination, truncation, info).
        user_feedback: callable `(obs, action, next_obs)` producing the
            feedback for a successful step.
        horizon: maximum number of successful environment steps.

    Returns:
        Tuple `(optimizer, sum_of_rewards)`.
    """
    optimizer = trace.optimizers.FunctionOptimizer(controller.parameters())

    # Start the episode and fold the task instruction into the objective.
    obs, info = env.reset()
    optimizer.objective = f"{optimizer.default_objective} Hint: {obs['instruction']}"

    total_reward = 0
    steps_taken = 0
    n_updates = 0
    while steps_taken < horizon:
        try:
            # Detach the observation so feedback is not back-propagated
            # across time steps.
            action = controller(obs["observation"].detach())
            next_obs, reward, termination, truncation, info = env.step(action)
        except trace.TraceExecutionError as exec_error:
            # Self-debugging: use the error itself as the feedback signal.
            feedback, target = str(exec_error), exec_error.exception_node
        else:
            feedback = user_feedback(obs, action, next_obs)  # not traced
            obs = next_obs
            target = next_obs["observation"]
            total_reward += reward.data  # not traced
            steps_taken += 1
            if termination or truncation:
                break

        # One optimization update per attempt (successful or failed).
        optimizer.zero_feedback()
        optimizer.backward(target, feedback)
        optimizer.step(verbose=True)
        n_updates += 1

    print("Sum of rewards:", total_reward)
    print("Success:", info.data["success"])
    print("# of optimization iterations:", n_updates)
    print("# of time steps:", steps_taken)

    return optimizer, total_reward

In [3]:
# Run experiment


# Maximum number of successful environment steps in the rollout.
horizon = 30
# Environment id passed to llfbench.make (presumably the Meta-World pick-place task).
env_name = "llf-metaworld-pick-place-v2"
# Traced wrapper around the environment; the seed is forwarded to the wrapped env.
env = TracedEnv(env_name, seed=0)

# Action space of the wrapped environment; the initial controller samples from it.
action_space = env.env.action_space


# Trainable policy: single_step hands controller.parameters() to the optimizer,
# which proposes replacement source code for this function. The body and
# docstring below are the starting point of that search, so they are left
# exactly as written; the initial policy ignores `obs` and samples a random
# action from the module-level `action_space`.
@trace.trace_op(trainable=True)
def controller(obs):
    """
    The controller computes the desired action based on the observation obs.
    """
    return action_space.sample()


# Run the rollout. As the last expression of the cell, the returned
# (optimizer, sum_of_rewards) tuple is shown as the cell output.
single_step(controller, env, user_feedback, horizon)

  logger.warn(
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(
  gym.logger.warn("Casting input x to numpy array.")
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(


Prompt
 
You're tasked debug and solve a coding/algorithm problem. You will see the code, the documentation of each function used in the code, and the feedback about the code's execution result.

Specifically, a problem will be composed of the following parts:
- #Code: the code whose results you need to improve.
- #Documentation: the documentation of each function used in the code.
- #Variables: the values of the variables that you need to change.
- #Inputs: the values of other inputs to the code
- #Others: the intermediate values created through the code execution.
- #Outputs: the result of the code.
- #Feedback: the feedback about the code's execution result.

In #Variables, #Outputs, and #Others, the format is:
<type> <variable_name> = <value>
You need to change the <value> of the variables in #Variables to improve the code's output in accordance to #Feedback and their data types specified in <type>. If <type> is (code), it means <value> is the source code of a python code, which ma



LLM response:
 {
"reasoning": "Given the feedback, the action taken based on the current controller logic moved the robot further away from the goal. The feedback suggests using [0.06, 0.63, 0.12, 0] as the action to get closer to the goal. To incorporate this feedback, we can modify the controller function to aim towards these suggested values as the target pose instead of purely relying on the difference between the current hand position and the goal position. This adjustment takes into consideration the implicit suggestion provided in the feedback about an effective next pose.",
"suggestion": {
"__code0": "def controller(obs):\n    import json\n    obs_dict = json.loads(obs)\n    # Convert space-separated strings to comma-separated to create valid JSON arrays\n    hand_pos_str = obs_dict['hand_pos'].replace(' ', ',').replace('[,', '[').replace(',,', ',')\n    goal_pos_str = obs_dict['goal_pos'].replace(' ', ',').replace('[,', '[').replace(',,', ',')\n    # Load corrected strings as 

(<autogen.trace.optimizers.function_optimizer.FunctionOptimizer at 0x7f699c391a00>,
 0.3421279090143542)