In [1]:
import time
from design import *
import importlib
import shutil
from utils import *
from openai import OpenAI
from prompts import *
import json
import numpy as np
from gymnasium.envs.robodesign.GPTWalker import GPTWalkerEnv
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
import os
import matplotlib.pyplot as plt


In [2]:
def load_logs_full(log_dir):
    # 找到 .tfevents 文件
    event_file = [f for f in os.listdir(log_dir) if f.startswith("events.out")][0]
    event_path = os.path.join(log_dir, event_file)

    # 加载日志
    event_acc = EventAccumulator(event_path)
    event_acc.Reload()

    # 获取所有 scalar tags
    all_tags = event_acc.Tags()["scalars"]

    # 筛选 reward 分量
    reward_tags = [tag for tag in all_tags if tag.startswith("reward/")]

    # 加载所有 reward 分量数据（完整）
    data_full = {}
    for tag in reward_tags:
        events = event_acc.Scalars(tag)
        values = [e.value for e in events]
        data_full[tag] = values

    # 加载 episode length（完整）
    ep_len_tag = "rollout/ep_len_mean"
    if ep_len_tag in all_tags:
        events = event_acc.Scalars(ep_len_tag)
        values = [e.value for e in events]
        data_full[ep_len_tag] = values

    return data_full


In [None]:
import prompts
class DGA:
    def __init__(self):
        api_key = "<api_key>"
        self.client = OpenAI(api_key=api_key)
        self.model = "gpt-4-turbo"

    def extract_code(self, text):
        match = re.search(r'```python\n(.*?)\n```', text, re.DOTALL)
        return match.group(1).strip() if match else None

    def indent_code(self, code):
        return "\n".join(line if line.strip() else line for line in code.split("\n"))

    def generate_rewardfunc_eureka(self, rewardfunc_nums, best_message, iteration, folder_name):
        messages = [
            {"role": "system", "content": "You are a reinforcement learning reward function designer"},
            {"role": "user", "content": rewardfunc_prompts + best_message + zeroshot_rewardfunc_format}
        ]

        responses = self.client.chat.completions.create(
            model=self.model, messages=messages, n=rewardfunc_nums
        )
        
        files = []
        for i, choice in enumerate(responses.choices):
            reward_code = self.extract_code(choice.message.content)
            if reward_code:
                full_code = "import numpy as np \n" + self.indent_code(reward_code) + "\n"
                file_name =  f"GPTrewardfunc_{i}_{iteration}.py"
                file_path = os.path.join(folder_name, "env", file_name)

                with open(file_path, "w") as fp:
                    fp.write(full_code)
                files.append(file_path)
                print(f"Saved: {file_path}")

        return files


In [9]:

folder_name = "results/eureka"
log_file = os.path.join(folder_name, "parameters.log")
logging.basicConfig(filename=log_file, level=logging.INFO, format="%(asctime)s - %(message)s")

# folder_name = setup_logging(div_flag=True)

best_fitness = float('-inf')  
best_morphology = None  
best_rewardfunc = None  
best_reward = None
best_material = None
best_efficiency = None

iterations = 5
morphology_nums = 1
rewardfunc_nums = 16

fitness_matrix = np.array([[None for _ in range(morphology_nums)] for _ in range(rewardfunc_nums)])
efficiency_matrix = np.array([[None for _ in range(morphology_nums)] for _ in range(rewardfunc_nums)])
fitness_list = []


In [10]:
morphology_list = [f'results/eureka/assets/GPTWalker_{i}.xml' for i in range(0,1) ]
parameter_list = [[1.45, 1.06, 0.6, 0.1, -0.13, 0.26, 0.05, 0.05, 0.04, 0.06]]
material_list = [compute_walker_volume(parameter) for parameter in parameter_list]

In [11]:
eureka_rewardfunc_prompts = """We trained a RL policy using the provided function code and tracked
the values of the individual components in the reward function as well as global policy matrics such as fitness function and episode lengths after
10000 epochs and the maximum, mean, and minimum values encountered:
{reward_reflection}

Please carefully analyze the policy feedback and provide a new, improved reward function that can better
solve the task. Some helpful tips for analyzing the policy feedback:

(1) If the fitness function rates are always zero or negative, then you must rewrite the entire reward function
(2) If the values for a certain reward component are near identiacal throughout, then this means RL is not able to optimize this component as it is written. You may 
consider 
    (a) Changing its scale or the value of its temperature parameter
    (b) Re-writing the reward component
    (c) Discarding the reward component
(3) If some reward components' magnitude is significanly larger, then you must rescale its value to a proper range
Please analyze each existing reward component in the suggested manner above first, and then write the reward function code
"""

In [18]:
designer = DGA()
best_message = ''

for iter in range(1, iterations+1):
    reward_reflection = ''
    rewardfunc_list = designer.generate_rewardfunc_eureka(rewardfunc_nums, best_message, iter, folder_name)
    logging.info(f"___________________coarse optimization iter{iter}_____________________")
    for i, rewardfunc in enumerate(rewardfunc_list):
        for j, morphology in enumerate(morphology_list):

            print(i, rewardfunc)
            print(j, morphology)

            shutil.copy(morphology, "GPTWalker.xml")
            shutil.copy(rewardfunc, "GPTrewardfunc.py")         

            import GPTrewardfunc
            importlib.reload(GPTrewardfunc)  # 重新加载模块
            from GPTrewardfunc import _get_rew
            GPTWalkerEnv._get_rew = _get_rew

            env_name = "GPTWalkerEnv"
            model_path = Train(j,  i, folder_name, total_timesteps=5e5, callback=True, iter=iter)
            fitness, reward = Eva(model_path,run_steps=100)

            material = material_list[j]
            efficiency = fitness/material
            fitness_matrix[i][j] = fitness
            efficiency_matrix[i][j] = efficiency

            logging.info(f"iteration:{iter}, morphology: {j}, rewardfunc: {i}, material cost: {material} reward: {reward} fitness: {fitness} efficiency: {efficiency}")

    best_efficiency = np.max(efficiency_matrix[:, 0])
    best_rewardfunc_index = np.argmax(efficiency_matrix[:, 0])
    logs_full = load_logs_full(f"results/eureka/sac_morphology0_rewardfunc{best_rewardfunc_index}_{iter}/SAC_1")  
    epoch_freq = 100000

    for tag, full_values in logs_full.items():
        sampled_values = full_values[::epoch_freq]
        max_val = max(sampled_values)
        mean_val = sum(sampled_values) / len(sampled_values)
        min_val = min(sampled_values)
        formatted_values = [f"{v:.2f}" for v in sampled_values]
        reward_reflection +=f"{tag}: {formatted_values}" + f"Max: {max_val:.2f}, Mean: {mean_val:.2f}, Min: {min_val:.2f}\n"

    with open(rewardfunc_list[best_rewardfunc], 'r') as f:
        reward_content = f.read()
    best_message = f"best rewardfunc:{reward_content} \n" + f"best fintess:{best_efficiency}" + eureka_rewardfunc_prompts.format(reward_reflection=reward_reflection)


Saved: results/eureka/env/GPTrewardfunc_0_1.py
Saved: results/eureka/env/GPTrewardfunc_1_1.py
Saved: results/eureka/env/GPTrewardfunc_2_1.py
Saved: results/eureka/env/GPTrewardfunc_3_1.py
Saved: results/eureka/env/GPTrewardfunc_4_1.py
Saved: results/eureka/env/GPTrewardfunc_5_1.py
Saved: results/eureka/env/GPTrewardfunc_6_1.py
Saved: results/eureka/env/GPTrewardfunc_7_1.py
Saved: results/eureka/env/GPTrewardfunc_8_1.py
Saved: results/eureka/env/GPTrewardfunc_9_1.py
Saved: results/eureka/env/GPTrewardfunc_10_1.py
Saved: results/eureka/env/GPTrewardfunc_11_1.py
Saved: results/eureka/env/GPTrewardfunc_12_1.py
Saved: results/eureka/env/GPTrewardfunc_13_1.py
Saved: results/eureka/env/GPTrewardfunc_14_1.py
Saved: results/eureka/env/GPTrewardfunc_15_1.py
0 results/eureka/env/GPTrewardfunc_0_1.py
0 results/eureka/assets/GPTWalker_0.xml
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56

TypeError: _get_rew() takes 1 positional argument but 3 were given

In [20]:
efficiency_matrix

array([[62.45056859357158],
       [61.746110644801895],
       [90.29340058311645],
       [55.89901035018392],
       [51.35890676016386],
       [43.69646114584481],
       [67.30537819499759],
       [50.468371811015146],
       [62.60567503070887],
       [66.92292732687368],
       [68.33714174126084],
       [76.54809257374136],
       [71.7619349832417],
       [67.15721269754005],
       [56.1436045425078],
       [80.63200158853387]], dtype=object)

In [56]:
efficiency_matrix

array([[3976.6642818633827],
       [4751.281561784006],
       [7224.448128240247],
       [6023.381126781108],
       [5764.19702099186],
       [6671.181033086907],
       [6587.733410099273],
       [5417.426909189873],
       [6372.382335747838],
       [4255.325796926133],
       [5247.62963314609],
       [5334.098274428634],
       [4565.410237548588],
       [5198.3884879938605],
       [6604.340892256646],
       [4453.497254902149]], dtype=object)

In [57]:
best_efficiency

8174.26861224769

In [38]:

logs_full = load_logs_full(f"results/eureka/sac_morphology0_rewardfunc7/SAC_1")  
# logs_full = load_logs_full(f"results\eureka\")  
epoch_freq = 100000

for tag, full_values in logs_full.items():
    sampled_values = full_values[::epoch_freq]
    max_val = max(sampled_values)
    mean_val = sum(sampled_values) / len(sampled_values)
    min_val = min(sampled_values)
    formatted_values = [f"{v:.2f}" for v in sampled_values]
    reward_reflection +=f"{tag}: {formatted_values}" + f"Max: {max_val:.2f}, Mean: {mean_val:.2f}, Min: {min_val:.2f}\n"

NameError: name 'reward_reflection' is not defined

results/eureka
results/eureka/assets
results/eureka/assets/.ipynb_checkpoints
results/eureka/.ipynb_checkpoints
results/eureka/env
results/eureka/env/.ipynb_checkpoints
results/eureka/sac_morphology0_rewardfunc0
results/eureka/sac_morphology0_rewardfunc0/SAC_1
results/eureka/coarse
results/eureka/coarse/.ipynb_checkpoints
results/eureka/sac_morphology0_rewardfunc1
results/eureka/sac_morphology0_rewardfunc1/SAC_1
results/eureka/sac_morphology0_rewardfunc2
results/eureka/sac_morphology0_rewardfunc2/SAC_1
results/eureka/sac_morphology0_rewardfunc3
results/eureka/sac_morphology0_rewardfunc3/SAC_1
results/eureka/sac_morphology0_rewardfunc4
results/eureka/sac_morphology0_rewardfunc4/SAC_1
results/eureka/sac_morphology0_rewardfunc5
results/eureka/sac_morphology0_rewardfunc5/SAC_1
results/eureka/sac_morphology0_rewardfunc6
results/eureka/sac_morphology0_rewardfunc6/SAC_1
results/eureka/sac_morphology0_rewardfunc7
results/eureka/sac_morphology0_rewardfunc7/SAC_1
results/eureka/sac_morphology0_r