In [1]:
import random
from MuJoCo_Gym.mujoco_rl import MuJoCo_RL
from MuJoCo_Gym.single_agent_wrapper import Single_Agent_Wrapper
import time
from stable_baselines3 import PPO, SAC
import copy
import numpy as np

### Implement a reward function
In version 3.0, the reward function is not a class, but an actual callable function, which is also handed over as an argument in the config dictionary. First the data fields are created in the dataStore. This will be done every time the environment has been reset, as the datastore is cleared back to {agent:{}, agent2:{} etc.} during reset.<br>
After that the agent gets a reward for getting closer to the target. To achieve this the reward function simply calculates the difference between the current distance and the distance at the previous timestep. If the agent gets closer to the target, the difference is positive and therefor the reward is positive as well.

In [2]:
class TargetCoordinates():
    def __init__(self, mujoco_gym):
        self.mujoco_gym = mujoco_gym
        self.observation_space = {"low":[-30, -30, -30, -30, -30, -30], "high":[30, 30, 30, 30, 30, 30]}
        self.action_space = {"low":[], "high":[]}
        # The datastore is used to store and preserve data over one or multiple timesteps
        self.dataStore = {}

    def dynamic(self, agent, actions):
        if "targets" not in self.mujoco_gym.dataStore.keys():
            self.mujoco_gym.dataStore["targets"] = self.mujoco_gym.filterByTag("target")
            self.mujoco_gym.dataStore[agent]["current_target"] = self.mujoco_gym.dataStore["targets"][random.randint(0, len(self.mujoco_gym.dataStore["targets"]) - 1)]["name"]
            distance = self.mujoco_gym.distance(agent, self.mujoco_gym.dataStore[agent]["current_target"])
            self.mujoco_gym.dataStore[agent]["distance"] = distance
        target_coords = np.array(self.mujoco_gym.getData(self.mujoco_gym.dataStore[agent]["current_target"])["position"])
        own_coords = np.array(self.mujoco_gym.getData(agent)["position"])
        return 0, np.concatenate((target_coords, own_coords))

In [3]:
def reward_function(mujoco_gym, agent):
    # Creates all the necessary fields to store the needed data within the dataStore at timestep 0
    distance = mujoco_gym.distance(agent, mujoco_gym.dataStore[agent]["current_target"])
    new_reward = mujoco_gym.dataStore[agent]["distance"] - distance
    mujoco_gym.dataStore[agent]["distance"] = copy.deepcopy(distance)
    reward = new_reward * 10
    speed = 0.0004
    if abs(distance) > speed:
        new_reward = speed * 5
    else:
        new_reward = -1 * speed * 5
    reward += new_reward
    return reward

### Done function
The current simulation run is over if the agent gets closer than one distance unit to the target. Note that the data field for distance does not have to be created again. This is because the reward functions are executed before the done function inside the environment class. This means that the distance fields already exist, even at timestep 0.

In [4]:
def done_function(mujoco_gym, agent):
    if mujoco_gym.dataStore[agent]["distance"] <= 1 or mujoco_gym.dataStore[agent]["distance"] > 15:
        return True
    else:
        return False

### Starting the environment
The path to the mujoco xml file and additional json info file are handed over, as well as the agents mujoco names and the reward/done function and the environment dynamic. The render mode is also set to true, meaning that the environment is rendered on screen while running. This should only be done for inference though, as rendering is quite ressource intensive.

In [5]:
environment_path = "Environment/MultiEnvs.xml"
info_path = "Environment/info_example.json"
agents = ["agent1_torso"]
config_dict = {"xmlPath":environment_path, "infoJson":info_path, "agents":agents, "rewardFunctions":[reward_function], "doneFunctions":[done_function], "environmentDynamics":[TargetCoordinates], "renderMode":True}
environment = MuJoCo_RL(config_dict)

0.0


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


### Turn the environment into a single agent openAI gym
To make the environment compatible with Stable Baselines3, it has to be converted into an OpenAI Gym first. This can be achieved by using the Single_Agent_Wrapper class, which implements a gym interface for the MultiAgentEnv. Note that the multi agent environment can only have one active agent. If more than one are handed over, the environment will throw an exception

In [6]:
gymEnvironment = Single_Agent_Wrapper(environment, agents[0])

### Train the agent using Stable Baselines

In [7]:
policy_kwargs = dict(net_arch=dict(pi=[4096, 4096, 4096], qf=[4096, 4096, 4096]))
model = SAC("MlpPolicy", gymEnvironment, verbose=1, train_freq=(128, "step"), batch_size=256, learning_starts=10000, learning_rate=0.0015, buffer_size=1500000, policy_kwargs=policy_kwargs, device="mps")
model.learn(total_timesteps=5000000, progress_bar=True)

Using mps device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


Output()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.02e+03 |
|    ep_rew_mean     | 4.08     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 993      |
|    time_elapsed    | 4        |
|    total_timesteps | 4100     |
---------------------------------


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.02e+03 |
|    ep_rew_mean     | 4.6      |
| time/              |          |
|    episodes        | 8        |
|    fps             | 1020     |
|    time_elapsed    | 8        |
|    total_timesteps | 8200     |
---------------------------------


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.02e+03 |
|    ep_rew_mean     | 4.76     |
| time/              |          |
|    episodes        | 12       |
|    fps             | 429      |
|    time_elapsed    | 28       |
|    total_timesteps | 12300    |
| train/             |          |
|    actor_loss      | -10.1    |
|    critic_loss     | 4.42     |
|    ent_coef        | 1.01     |
|    ent_coef_loss   | 0.113    |
|    learning_rate   | 0.0015   |
|    n_updates       | 18       |
---------------------------------


### Save the model for later use

In [None]:
model.save("models/sac_model")

### Perform inference while rendering the environment

In [None]:
model = SAC.load("models/sac_model")
testEnv = MuJoCo_RL(config_dict)
testEnvSingleAgent = Single_Agent_Wrapper(testEnv, agents[0])
obs = testEnvSingleAgent.reset()
reward = 0
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = testEnvSingleAgent.step(action)
    if dones:
        print(reward)
        break
    reward += rewards
    time.sleep(0.01)