# Robotics with Python: MuJoCo & Gymnasium

### Setup

In [2]:
# Dependencies (install once into the kernel's environment):
#pip install mujoco
#pip install gymnasium

import gymnasium as gym
import time

# Build the Humanoid-v4 task; render_mode="human" asks for an on-screen viewer.
env = gym.make("Humanoid-v4", render_mode="human")
# reset() starts an episode and returns the first observation plus a diagnostics dict.
# The following cells inspect `obs`, `info` and `env`, so this env is left open here.
obs, info = env.reset()
env.render()

In [4]:
# Diagnostics dict returned by env.reset() alongside the first observation.
info

{'x_position': np.float64(0.0016152704880662335),
 'y_position': np.float64(-0.006376264384969079),
 'tendon_length': array([ 0.0039346 , -0.00659782]),
 'tendon_velocity': array([-0.00065977,  0.00110333]),
 'distance_from_origin': np.float64(0.006577677877233183)}

In [2]:
# Initial observation returned by env.reset().
obs

array([ 1.40551438e+00,  1.00249109e+00,  1.02295992e-03,  6.11636741e-03,
        8.89343991e-03,  3.68799654e-03,  2.68748631e-03, -7.37148320e-03,
       -1.27035634e-03, -3.83448487e-03,  4.97995726e-03, -9.27465782e-04,
       -5.30276103e-03, -6.80604484e-04, -9.18529355e-03, -1.14950454e-03,
       -7.01506517e-03,  3.62228355e-03,  8.04498421e-03, -5.30207130e-03,
       -1.90562795e-03, -6.06947519e-05,  2.28593828e-05,  6.61675641e-03,
        9.77447190e-03, -6.79145658e-03,  9.35076611e-03,  9.40259276e-03,
        6.24602486e-03, -7.69409031e-03,  7.79339588e-03, -3.57860426e-03,
        8.84452451e-03,  2.14833940e-03,  3.15900130e-03, -3.69123743e-03,
        5.47496853e-03,  3.01387670e-03,  9.26404279e-03, -1.41988753e-03,
        1.43547743e-03, -1.56219701e-03,  6.16717799e-03,  5.88406125e-03,
        6.06353189e-03,  2.30359698e+00,  2.28564819e+00,  4.23921394e-02,
        3.81892603e-04,  4.27251199e-02, -1.34221053e-03, -9.78046855e-02,
        3.05445261e-03,  

In [3]:
# Space describing the actions accepted by env.step() (bounds, shape, dtype).
env.action_space

Box(-0.4, 0.4, (17,), float32)

In [6]:
# Draw one random action from the action space.
env.action_space.sample()

array([-0.05293579, -0.02909377,  0.12628955,  0.15863812, -0.14441639,
       -0.0146535 , -0.04183137,  0.30172417, -0.05358713, -0.20124765,
       -0.3164999 , -0.08170177, -0.13806766, -0.0086144 , -0.1124486 ,
       -0.22309826,  0.36465538], dtype=float32)

In [9]:
# RANDOM ACTIONS
import gymnasium as gym
import time

# Roll out uniformly random actions in a Humanoid-v4 viewer and log the reward.
env = gym.make("Humanoid-v4", render_mode="human")

reset = False #set to True to start a new episode when the humanoid falls or the episode ends
episode = 1
total_reward, step = 0, 0

try: #the finally clause closes the viewer even if the cell is interrupted or step() raises
    obs, info = env.reset()

    for _ in range(240):
        ## action
        step += 1
        action = env.action_space.sample() #random action
        obs, reward, terminated, truncated, info = env.step(action)
        ## reward
        total_reward += reward
        ## render
        env.render() #draw the current physics state in the viewer
        time.sleep(1/240) #pace the loop: 240 iterations x 1/240 s of sleep = about 1 second
        if (step == 1) or (step % 100 == 0): #print first step and every 100 steps
            print(f"EPISODE {episode} - Step:{step}, Reward:{reward:.1f}, Total:{total_reward:.1f}")
        ## reset
        # NOTE: with reset=False the loop keeps stepping after the episode has ended,
        # so the running total then mixes in post-episode rewards.
        if reset and (terminated or truncated): #print the last step
            print(f"EPISODE {episode} - Step:{step}, Reward:{reward:.1f}, Total:{total_reward:.1f}")
            obs, info = env.reset()
            episode += 1
            total_reward, step = 0, 0
            print("------------------------------------------")
finally:
    env.close()

EPISODE 1 - Step:1, Reward:4.9, Total:4.9
EPISODE 1 - Step:100, Reward:4.7, Total:467.3
EPISODE 1 - Step:200, Reward:4.9, Total:951.0


### Reinforcement Learning

In [8]:
import gymnasium as gym
import time
import numpy as np

# Minimal trial-and-error learner: keep one "preferred" action vector, perturb it
# with Gaussian noise, and pull it toward any perturbed action that earned a
# positive reward. The noise scale decays every step down to a fixed floor.
env = gym.make("Humanoid-v4", render_mode="human")

reset = True #reset if the humanoid falls or the episode ends
episode = 1
total_reward, step = 0, 0
exploration_rate = 0.5 #start wild
preferred_action = np.zeros(env.action_space.shape) #knowledge to update with experience
action_space = env.action_space #actions must respect its low/high bounds and dtype

try: #the finally clause closes the viewer even if the cell is interrupted or step() raises
    obs, info = env.reset()

    for _ in range(1000):
        ## action
        step += 1
        exploration = np.random.normal(loc=0, scale=exploration_rate, size=action_space.shape) #add random noise
        # Clip to the environment's own limits instead of a hard-coded [-1, 1], so the
        # action sent to the simulator (and learned from below) is always a valid one.
        action = np.clip(preferred_action+exploration, action_space.low, action_space.high).astype(action_space.dtype)
        obs, reward, terminated, truncated, info = env.step(action)
        ## reward
        total_reward += reward
        if reward > 0:
            preferred_action += (action-preferred_action)*0.05 #learning_rate
        exploration_rate = max(0.05, exploration_rate*0.99) #min_exploration=0.05, decay_exploration=0.99
        ## render
        env.render()
        time.sleep(1/240)
        if (step == 1) or (step % 100 == 0): #print first step and every 100 steps
            print(f"EPISODE {episode} - Step:{step}, Reward:{reward:.1f}, Total:{total_reward:.1f}")
        ## reset
        if reset and (terminated or truncated): #print the last step
            print(f"EPISODE {episode} - Step:{step}, Reward:{reward:.1f}, Total:{total_reward:.1f}")
            obs, info = env.reset()
            episode += 1
            total_reward, step = 0, 0
            print("------------------------------------------")
finally:
    env.close()

Exception ignored in: <function WindowViewer.__del__ at 0x7fd4ce8e15f0>
Traceback (most recent call last):
  File "/Users/mdp/opt/anaconda3/envs/TORCH/lib/python3.7/site-packages/gymnasium/envs/mujoco/mujoco_rendering.py", line 335, in __del__
    self.free()
  File "/Users/mdp/opt/anaconda3/envs/TORCH/lib/python3.7/site-packages/gymnasium/envs/mujoco/mujoco_rendering.py", line 330, in free
    glfw.destroy_window(self.window)
  File "/Users/mdp/opt/anaconda3/envs/TORCH/lib/python3.7/site-packages/glfw/__init__.py", line 1282, in destroy_window
    window_addr = ctypes.cast(ctypes.pointer(window),
TypeError: _type_ must have storage info


EPISODE 1 - Step:1, Reward:4.7, Total:4.7
EPISODE 1 - Step:20, Reward:5.6, Total:96.8
------------------------------------------
EPISODE 2 - Step:1, Reward:4.6, Total:4.6
EPISODE 2 - Step:37, Reward:5.0, Total:187.5
------------------------------------------
EPISODE 3 - Step:1, Reward:4.7, Total:4.7
EPISODE 3 - Step:53, Reward:5.9, Total:279.8
------------------------------------------
EPISODE 4 - Step:1, Reward:4.9, Total:4.9
EPISODE 4 - Step:33, Reward:6.2, Total:179.3
------------------------------------------
EPISODE 5 - Step:1, Reward:4.9, Total:4.9
EPISODE 5 - Step:37, Reward:6.2, Total:203.6
------------------------------------------
EPISODE 6 - Step:1, Reward:4.9, Total:4.9
EPISODE 6 - Step:42, Reward:6.2, Total:230.4
------------------------------------------
EPISODE 7 - Step:1, Reward:5.0, Total:5.0
EPISODE 7 - Step:41, Reward:6.3, Total:224.2
------------------------------------------
EPISODE 8 - Step:1, Reward:5.0, Total:5.0
EPISODE 8 - Step:34, Reward:6.2, Total:186.7
----

KeyboardInterrupt: 

### Deep Reinforcement Learning

###### Train

In [None]:
#pip install torch
#pip install stable-baselines3
#pip install tensorboard

import gymnasium as gym
from stable_baselines3 import SAC
from stable_baselines3.common.vec_env import DummyVecEnv

## environment
# Give DummyVecEnv a real factory. The previous `lambda: env` closed over a name
# that the same statement rebinds, and only worked because the factory is called
# immediately during construction.
env = DummyVecEnv([lambda: gym.make("Humanoid-v4")]) #no rendering to speed up

try: #release the simulator even if training is interrupted
    ## train
    print("Training START")
    model = SAC(policy="MlpPolicy", env=env, verbose=0,
                learning_rate=0.005, #optimizer step size
                ent_coef=0.005, #entropy coefficient (exploration)
                tensorboard_log="logs/") #>tensorboard --logdir=logs/
    model.learn(total_timesteps=500_000, #1h
                tb_log_name="model_humanoid", log_interval=10)
    print("Training DONE")

    ## save
    # NOTE: the PPO training cell writes to the same "model_humanoid" path.
    model.save("model_humanoid")
finally:
    env.close()

Training START


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/mdp/opt/anaconda3/envs/TORCH/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/_z/cl6cx94516g2tlbhzcbl4nfc0000gn/T/ipykernel_66008/1198057625.py", line 19, in <module>
    tb_log_name="model_humanoid", log_interval=10)
  File "/Users/mdp/opt/anaconda3/envs/TORCH/lib/python3.7/site-packages/stable_baselines3/sac/sac.py", line 313, in learn
    progress_bar=progress_bar,
  File "/Users/mdp/opt/anaconda3/envs/TORCH/lib/python3.7/site-packages/stable_baselines3/common/off_policy_algorithm.py", line 331, in learn
    self.train(batch_size=self.batch_size, gradient_steps=gradient_steps)
  File "/Users/mdp/opt/anaconda3/envs/TORCH/lib/python3.7/site-packages/stable_baselines3/sac/sac.py", line 215, in train
    replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env)  # type: ignore[union-attr]
  File "/Users/m

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/mdp/opt/anaconda3/envs/TORCH/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/_z/cl6cx94516g2tlbhzcbl4nfc0000gn/T/ipykernel_66008/1198057625.py", line 19, in <module>
    tb_log_name="model_humanoid", log_interval=10)
  File "/Users/mdp/opt/anaconda3/envs/TORCH/lib/python3.7/site-packages/stable_baselines3/sac/sac.py", line 313, in learn
    progress_bar=progress_bar,
  File "/Users/mdp/opt/anaconda3/envs/TORCH/lib/python3.7/site-packages/stable_baselines3/common/off_policy_algorithm.py", line 331, in learn
    self.train(batch_size=self.batch_size, gradient_steps=gradient_steps)
  File "/Users/mdp/opt/anaconda3/envs/TORCH/lib/python3.7/site-packages/stable_baselines3/sac/sac.py", line 215, in train
    replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env)  # type: ignore[union-attr]
  File "/Users/m

In [4]:
#pip install torch
#pip install stable-baselines3
#pip install tensorboard

import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

## environment
# Give DummyVecEnv a real factory. The previous `lambda: env` closed over a name
# that the same statement rebinds, and only worked because the factory is called
# immediately during construction.
env = DummyVecEnv([lambda: gym.make("Humanoid-v4")]) #no rendering to speed up

try: #release the simulator even if training is interrupted
    ## train
    print("Training START")
    model = PPO(policy="MlpPolicy", env=env, verbose=0,
                learning_rate=0.005, #optimizer step size
                ent_coef=0.005, #entropy bonus coefficient (exploration)
                tensorboard_log="logs/") #>tensorboard --logdir=logs/
    model.learn(total_timesteps=3_000_000, #1h
                tb_log_name="model_humanoid", log_interval=10)
    print("Training DONE")

    ## save
    # The test cell loads the policy back from this path with PPO.load().
    model.save("model_humanoid")
finally:
    env.close()

Training START


Exception ignored in: <function WindowViewer.__del__ at 0x7fef22e7e0e0>
Traceback (most recent call last):
  File "/Users/mdp/opt/anaconda3/envs/TORCH/lib/python3.7/site-packages/gymnasium/envs/mujoco/mujoco_rendering.py", line 335, in __del__
    self.free()
  File "/Users/mdp/opt/anaconda3/envs/TORCH/lib/python3.7/site-packages/gymnasium/envs/mujoco/mujoco_rendering.py", line 330, in free
    glfw.destroy_window(self.window)
  File "/Users/mdp/opt/anaconda3/envs/TORCH/lib/python3.7/site-packages/glfw/__init__.py", line 1282, in destroy_window
    window_addr = ctypes.cast(ctypes.pointer(window),
TypeError: _type_ must have storage info


Training DONE


###### Test

In [11]:
import gymnasium as gym
from stable_baselines3 import PPO
import time

# Replay the saved PPO policy in a Humanoid-v4 viewer and log the reward.
env = gym.make("Humanoid-v4", render_mode="human")

reset = False #set to True to start a new episode when the humanoid falls or the episode ends
episode = 1
total_reward, step = 0, 0

try: #the finally clause closes the viewer even if loading, predict() or step() fails
    model = PPO.load(path="model_humanoid", env=env)
    obs, info = env.reset()

    for _ in range(1000):
        ## action
        step += 1
        # NOTE(review): predict() is called with its default sampling behaviour;
        # consider deterministic=True for evaluation - confirm against the SB3 docs.
        action, _ = model.predict(obs)
        obs, reward, terminated, truncated, info = env.step(action)
        ## reward
        total_reward += reward
        ## render
        env.render()
        time.sleep(1/240)
        if (step == 1) or (step % 100 == 0): #print first step and every 100 steps
            print(f"EPISODE {episode} - Step:{step}, Reward:{reward:.1f}, Total:{total_reward:.1f}")
        ## reset
        if reset and (terminated or truncated): #print the last step
            print(f"EPISODE {episode} - Step:{step}, Reward:{reward:.1f}, Total:{total_reward:.1f}")
            obs, info = env.reset()
            episode += 1
            total_reward, step = 0, 0
            print("------------------------------------------")
finally:
    env.close()

EPISODE 1 - Step:1, Reward:4.7, Total:4.7
EPISODE 1 - Step:100, Reward:5.8, Total:509.3
EPISODE 1 - Step:200, Reward:4.6, Total:1014.7
EPISODE 1 - Step:300, Reward:4.9, Total:1478.7
EPISODE 1 - Step:400, Reward:4.6, Total:1941.6
EPISODE 1 - Step:500, Reward:4.7, Total:2404.6
EPISODE 1 - Step:600, Reward:4.7, Total:2879.2
EPISODE 1 - Step:700, Reward:4.6, Total:3344.9
EPISODE 1 - Step:800, Reward:4.5, Total:3816.8
EPISODE 1 - Step:900, Reward:4.7, Total:4287.6
EPISODE 1 - Step:1000, Reward:4.5, Total:4749.2


### Custom Env

In [3]:
# pip install gymnasium[mujoco] stable-baselines3 mujoco glfw

import gymnasium as gym
from gymnasium.envs.mujoco import AntEnv
from gymnasium.envs.registration import register
import numpy as np


class AntJumpEnv(AntEnv):
    """Ant variant that is rewarded for torso height (jumping) instead of walking.

    The parent ``step`` still advances the simulation and builds the observation
    and ``info`` dict; this subclass then replaces the reward and the
    ``terminated`` flag with jump-specific versions.

    NOTE(review): the recorded run of this cell failed with
    ``module 'mujoco_py' has no attribute 'load_model_from_path'``. That suggests
    the ``AntEnv`` imported from ``gymnasium.envs.mujoco`` resolved to the legacy
    mujoco_py-based class; importing it from ``gymnasium.envs.mujoco.ant_v4``
    is the likely fix - confirm against the installed gymnasium version.
    """

    def __init__(self, **kwargs):
        """Create the environment; all keyword arguments go to ``AntEnv``."""
        super().__init__(**kwargs)
        # Neither attribute is read by the reward below; they are kept as part of
        # the class's public surface.
        self.target_height = 0.8  # what counts as a good jump
        self.last_z = None  # torso height recorded at the most recent reset

    def step(self, action):
        """Advance one step and score it by torso height minus control effort.

        Args:
            action: control vector passed unchanged to ``AntEnv.step``.

        Returns:
            ``(obs, reward, terminated, truncated, info)`` where ``obs`` and
            ``truncated`` come from the parent, ``reward`` and ``terminated`` are
            recomputed here, and ``info`` gains a ``"torso_height"`` entry.
        """
        obs, _parent_reward, _parent_terminated, truncated, info = super().step(action)

        # get torso height (z position of the main body)
        torso_height = float(self.data.qpos[2])

        # --- custom reward ---
        # reward for jumping high: height above 0.6, capped at 1, scaled by 10
        jump_bonus = np.clip(torso_height - 0.6, 0, 1) * 10.0
        # penalize wasting energy
        ctrl_penalty = 0.05 * np.square(action).sum()
        reward = jump_bonus - ctrl_penalty

        # episode ends if it falls; the parent's own termination flag is discarded
        terminated = bool(torso_height < 0.2)

        info["torso_height"] = torso_height
        return obs, reward, terminated, truncated, info

    def reset_model(self):
        """Reset the simulation state and record the starting torso height.

        Returns:
            The initial observation from the parent hook. Only the observation is
            returned: the base environment's ``reset()`` builds the
            ``(obs, info)`` pair itself, so unpacking two values from
            ``super().reset_model()`` (as this method previously did) fails.
        """
        obs = super().reset_model()
        self.last_z = float(self.data.qpos[2])
        return obs


# 🔹 Register it so Gym can find it
register(
    id="AntJump-v1",
    # Pass the class itself rather than the string "__main__:AntJumpEnv", so the
    # registration also resolves when this code is not running as __main__.
    entry_point=AntJumpEnv,
    # Without a step limit `truncated` can never become True and an ant that
    # never falls would never end its episode.
    max_episode_steps=1000,
)

# Test the env (random actions)
if __name__ == "__main__":
    env = gym.make("AntJump-v1", render_mode="human")
    try:  # close the viewer even if the rollout is interrupted or step() raises
        obs, info = env.reset(seed=42)

        for _ in range(1000):
            action = env.action_space.sample()
            obs, reward, terminated, truncated, info = env.step(action)
            if terminated or truncated:
                obs, info = env.reset()
    finally:
        env.close()


AttributeError: module 'mujoco_py' has no attribute 'load_model_from_path'