### Franka Panda

* 环境

| Package   | Version |
|-----------|---------|
| gymnasium | 0.29.1  |
| numpy | 1.24.0 |
| scipy | 1.13.1 |
| torch |  2.3.1+cu121 |
| grpcio |   1.64.1 |
| 关卡名 | franka_emika_panda |

* 采用在线强化学习
* 任务目标是到达指定目标点
* 控制量为关节电机驱动角度，采用离散采样，每个关节生成 增加、减少、不变 三个控制信号
* time_step 为 0.0166666（约 1/60 秒），因此 fps 设为 60
* frame_skip 为 5，即每次 action 进行 5 次 step，因此约为 12 actions/s

In [None]:
import os
import sys

# NOTE: os.path.abspath('') resolves to the current working directory, not to
# the location of this file, so the name "current_file_path" is only accurate
# when the notebook is run from its own directory.
current_file_path = os.path.abspath('')
# The project root is taken to be the parent of that directory.
project_root = os.path.dirname(current_file_path)

# Add the project root to sys.path (presumably so the `envs` package imported
# below can be resolved); skipped when it is already present.
if project_root not in sys.path:
    sys.path.append(project_root)


import gymnasium as gym
from stable_baselines3 import PPO
import asyncio
import nest_asyncio
from gymnasium.envs.registration import register
from envs.mujoco.franka_emika_panda import FrankaEnv
from datetime import datetime
import torch.nn as nn
from envs.orca_gym_env import ActionSpaceType


nest_asyncio.apply()

def register_env(grpc_address, low_path_filter_alpha, update_goal_interval):
    """Register the Franka Panda environment with Gymnasium.

    The registered id is ``FrankaPanda-v0-OrcaGym-<suffix>``, where
    ``<suffix>`` is the last two characters of ``grpc_address``.

    Args:
        grpc_address: Address string forwarded to the environment as
            ``grpc_address``; its last two characters form the id suffix.
        low_path_filter_alpha: Value forwarded to the environment as ``alpha``.
        update_goal_interval: Value forwarded to the environment as
            ``update_goal_interval``.
    """
    print("register_env: ", grpc_address)

    env_id = f"FrankaPanda-v0-OrcaGym-{grpc_address[-2:]}"

    # Constructor arguments handed to FrankaEnv when the environment is made.
    env_kwargs = dict(
        frame_skip=5,
        action_space_type=ActionSpaceType.DISCRETE,
        action_step_count=200,
        grpc_address=grpc_address,
        agent_names=['Panda'],
        time_step=0.0166666,
        alpha=low_path_filter_alpha,
        update_goal_interval=update_goal_interval,
    )

    gym.register(
        id=env_id,
        entry_point="envs.mujoco.franka_emika_panda:FrankaEnv",
        kwargs=env_kwargs,
        max_episode_steps=512,
        reward_threshold=0.0,
    )

async def continue_training(env, total_timesteps, is_training):
    """Load or create a PPO model, optionally train it, then run test episodes.

    If ``frankapanda_ppo_model.zip`` exists in the working directory it is
    loaded; otherwise a new PPO model with a custom MLP policy is created.

    Args:
        env: Gymnasium environment used for training and for the test
            episodes. It is closed before this coroutine returns normally.
        total_timesteps: Number of environment steps to train for. Training
            runs in chunks of ``LOOP_LEN`` steps with a checkpoint saved after
            every full chunk; leftover steps (``total_timesteps % LOOP_LEN``)
            are trained afterwards so the requested budget is honoured.
        is_training: When true, train and save the model before testing;
            when false, go straight to the test episodes.
    """
    # Target wall-clock duration of one step (60 fps playback).
    FRAME_INTERVAL = 0.016666666666666666

    # Load an existing model, or initialise a new one.
    if os.path.exists("frankapanda_ppo_model.zip"):
        model = PPO.load("frankapanda_ppo_model", env=env)
    else:
        # Custom policy network definition.
        policy_kwargs = dict(
            net_arch=dict(
                pi=[256, 256, 256],  # policy network layers
                vf=[256, 256, 256]   # value-function network layers
            ),
            ortho_init=True,
            activation_fn=nn.ReLU
        )
        model = PPO(
            "MlpPolicy",
            env,
            verbose=1,
            learning_rate=0.0003,
            n_steps=2048,
            batch_size=128,
            gamma=0.99,
            clip_range=0.2,
            policy_kwargs=policy_kwargs,
        )

    # Train the model, saving a checkpoint after every LOOP_LEN steps.
    if is_training:
        LOOP_LEN = 100000
        # Clamp at zero so a negative budget cannot produce a bogus remainder.
        full_chunks, remainder = divmod(max(total_timesteps, 0), LOOP_LEN)

        for i in range(full_chunks):
            model.learn(LOOP_LEN)
            model.save(f"frankapanda_ppo_model_ckp{i}")
            print(f"-----------------Save Model: {i}-----------------")

        # Previously these steps were silently dropped (and budgets smaller
        # than LOOP_LEN resulted in no training at all).
        if remainder > 0:
            model.learn(remainder)

        model.save("frankapanda_ppo_model")

    # Test the model.
    observation, info = env.reset(seed=42)
    for test in range(10):
        total_reward = 0
        for _ in range(1000):
            start_time = datetime.now()

            action, _states = model.predict(observation, deterministic=True)
            observation, reward, terminated, truncated, info = env.step(action)

            total_reward += reward

            # Pace the loop to roughly 60 fps so the rendering plays back at
            # normal speed: sleep for whatever is left of the frame interval.
            remaining = FRAME_INTERVAL - (datetime.now() - start_time).total_seconds()
            if remaining > 0:
                await asyncio.sleep(remaining)

            if terminated or truncated:
                print(f"----------------Test: {test}----------------")
                print("Terminated: ", terminated, " Truncated: ", truncated)
                print("Total Reward: ", total_reward)
                print("---------------------------------------")
                # Start the next test from a fresh episode; total_reward is
                # re-initialised at the top of the outer loop.
                observation, info = env.reset()
                break

    env.close()

if __name__ == "__main__":
    # Bind env before the try block so the KeyboardInterrupt handler can tell
    # whether the environment was created before the interrupt arrived
    # (otherwise env.close() would raise NameError).
    env = None
    try:
        grpc_address = "localhost:50051"
        print("simulation running... , grpc_address: ", grpc_address)
        env_id = f"FrankaPanda-v0-OrcaGym-{grpc_address[-2:]}"
        low_path_filter_alpha=0.7
        update_goal_interval=10000
        register_env(grpc_address, low_path_filter_alpha, update_goal_interval)

        env = gym.make(env_id)
        print("启动仿真环境")
        asyncio.run(continue_training(env, total_timesteps=200000, is_training=True))
    except KeyboardInterrupt:
        print("关闭仿真环境")
        if env is not None:
            env.close()
    

### 测试用代码

* 测试控制结果


In [None]:
import os
import sys

# NOTE: os.path.abspath('') resolves to the current working directory, not to
# the location of this file, so the name "current_file_path" is only accurate
# when the notebook is run from its own directory.
current_file_path = os.path.abspath('')
# The project root is taken to be the parent of that directory.
project_root = os.path.dirname(current_file_path)

# Add the project root to sys.path (presumably so the `envs` package imported
# below can be resolved); skipped when it is already present.
if project_root not in sys.path:
    sys.path.append(project_root)


import gymnasium as gym
from stable_baselines3 import PPO
import asyncio
import nest_asyncio
from gymnasium.envs.registration import register
from envs.mujoco.franka_emika_panda import FrankaEnv
from datetime import datetime
import torch.nn as nn
from envs.orca_gym_env import ActionSpaceType
import numpy as np


nest_asyncio.apply()

def register_env(grpc_address, low_path_filter_alpha, update_goal_interval):
    """Register the Franka Panda environment with a continuous action space.

    The registered id is ``FrankaPanda-v0-OrcaGym-<suffix>``, where
    ``<suffix>`` is the last two characters of ``grpc_address``.

    Args:
        grpc_address: Address string forwarded to the environment as
            ``grpc_address``; its last two characters form the id suffix.
        low_path_filter_alpha: Value forwarded to the environment as ``alpha``.
        update_goal_interval: Value forwarded to the environment as
            ``update_goal_interval``.
    """
    print("register_env: ", grpc_address)

    env_id = f"FrankaPanda-v0-OrcaGym-{grpc_address[-2:]}"

    # Constructor arguments handed to FrankaEnv when the environment is made.
    env_kwargs = dict(
        frame_skip=1,
        action_space_type=ActionSpaceType.CONTINUOUS,
        action_step_count=200,
        grpc_address=grpc_address,
        agent_names=['Panda'],
        time_step=0.0166666,
        alpha=low_path_filter_alpha,
        update_goal_interval=update_goal_interval,
    )

    gym.register(
        id=env_id,
        entry_point="envs.mujoco.franka_emika_panda:FrankaEnv",  # update to the actual path
        kwargs=env_kwargs,
        max_episode_steps=512,
        reward_threshold=0.0,
    )

async def continue_training(env, total_timesteps, is_training):
    """Drive the environment with a noisy hand-written action for manual testing.

    Runs 10 rounds of 1000 steps. Every round starts from the same fixed base
    action, and fresh Gaussian noise (mean 0, std 0.1) is added to the current
    action on every step, so the action drifts as a random walk within a
    round. The environment is reset whenever an episode ends and is closed
    before the coroutine returns normally.

    Args:
        env: Gymnasium environment to step.
        total_timesteps: Unused by this function.
        is_training: Unused by this function.
    """
    # Target wall-clock duration of one step (about 60 fps playback).
    FRAME_INTERVAL = 0.016

    # Parameters of the Gaussian noise added to the action on each step.
    mean = 0
    std_dev = 0.1

    # Test the controller.
    observation, info = env.reset(seed=42)
    for test in range(10):
        action = np.array([0, -1,  0, -2, 0,  1.5, 2, 0, 0])

        for _ in range(1000):
            start_time = datetime.now()

            # Perturb the action array with Gaussian noise.
            noise = np.random.normal(mean, std_dev, action.shape)
            action = action + noise

            observation, reward, terminated, truncated, info = env.step(action)

            # Pace the loop to roughly 60 fps so the rendering plays back at
            # normal speed: sleep for whatever is left of the frame interval.
            remaining = FRAME_INTERVAL - (datetime.now() - start_time).total_seconds()
            if remaining > 0:
                await asyncio.sleep(remaining)

            # Stepping a finished episode is outside the Gymnasium contract,
            # so start a new episode before continuing the round.
            if terminated or truncated:
                observation, info = env.reset()

    env.close()

if __name__ == "__main__":
    # Bind env before the try block so the KeyboardInterrupt handler can tell
    # whether the environment was created before the interrupt arrived
    # (otherwise env.close() would raise NameError).
    env = None
    try:
        grpc_address = 'localhost:50051'
        print("simulation running... , grpc_address: ", grpc_address)
        env_id = f"FrankaPanda-v0-OrcaGym-{grpc_address[-2:]}"
        register_env(grpc_address, low_path_filter_alpha=0.5, update_goal_interval=10000)

        env = gym.make(env_id)
        print("启动仿真环境")
        asyncio.run(continue_training(env, total_timesteps=100000, is_training=True))
    except KeyboardInterrupt:
        print("关闭仿真环境")
        if env is not None:
            env.close()
    