### `Soft Actor-Critic` using `Snapbot`

In [9]:
import sys,mujoco
import numpy as np
import matplotlib.pyplot as plt
import imageio
import os
import cv2
import mujoco_viewer
sys.path.append('KU-DATA403-simulator-tutorials/package/helper/')
sys.path.append('KU-DATA403-simulator-tutorials/package/mujoco_usage/')
sys.path.append('KU-DATA403-simulator-tutorials/package/gym/')
sys.path.append('KU-DATA403-simulator-tutorials/package/rl/')
from mujoco_parser import *
from slider import *
from utility import *
from snapbot_env import *
from sac import *
np.set_printoptions(precision=2,suppress=True,linewidth=100)
plt.rc('xtick',labelsize=6); plt.rc('ytick',labelsize=6)
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
print ("Ready.")

Ready.


#### Parse `Snapbot` gym

In [10]:
xml_path = 'KU-DATA403-simulator-tutorials/asset/snapbot/scene_snapbot.xml'
env = MuJoCoParserClass(name='Snapbot',rel_xml_path=xml_path,verbose=True)
gym = SnapbotGymClass(
    env = env,
    HZ  = 50,
    history_total_sec = 0.2,
    history_intv_sec  = 0.1,
    VERBOSE =True,
)
print ("Ready.")

name:[Snapbot] dt:[0.002] HZ:[500]
n_qpos:[25] n_qvel:[24] n_qacc:[24] n_ctrl:[8]

n_body:[24]
 [0/24] [world] mass:[0.00]kg
 [1/24] [torso] mass:[0.24]kg
 [2/24] [Camera_module_1] mass:[0.06]kg
 [3/24] [Camera_module_2] mass:[0.06]kg
 [4/24] [Leg_module_1_1] mass:[0.06]kg
 [5/24] [Leg_module_1_2] mass:[0.08]kg
 [6/24] [Leg_module_1_3] mass:[0.02]kg
 [7/24] [Leg_module_1_4] mass:[0.01]kg
 [8/24] [Leg_module_1_4bar] mass:[0.01]kg
 [9/24] [Leg_module_2_1] mass:[0.06]kg
 [10/24] [Leg_module_2_2] mass:[0.08]kg
 [11/24] [Leg_module_2_3] mass:[0.02]kg
 [12/24] [Leg_module_2_4] mass:[0.01]kg
 [13/24] [Leg_module_2_4bar] mass:[0.01]kg
 [14/24] [Leg_module_4_1] mass:[0.06]kg
 [15/24] [Leg_module_4_2] mass:[0.08]kg
 [16/24] [Leg_module_4_3] mass:[0.02]kg
 [17/24] [Leg_module_4_4] mass:[0.01]kg
 [18/24] [Leg_module_4_4bar] mass:[0.01]kg
 [19/24] [Leg_module_5_1] mass:[0.06]kg
 [20/24] [Leg_module_5_2] mass:[0.08]kg
 [21/24] [Leg_module_5_3] mass:[0.02]kg
 [22/24] [Leg_module_5_4] mass:[0.01]kg
 [

#### `SAC` hyperparameters

In [None]:
n_episode         = 1000 # number of total episodes (rollouts)
max_epi_sec       = 3.0 # maximum episode length in second (IMPORTANT)
max_epi_tick      = int(max_epi_sec*gym.HZ) # maximum episode length in tick
n_warmup_epi      = 30 # number of warm-up episodes
buffer_limit      = 50000 # 50000
buffer_warmup     = buffer_limit // 5
init_alpha        = 0.1
max_torque        = 2.0
# Update
lr_actor          = 0.0005 # 0.0002 # 0.0005
lr_alpha          = 0.0003 # 0.0003
lr_critic         = 0.0001
n_update_per_tick = 1 # number of updates per tick
batch_size        = 256
gamma             = 0.95
tau               = 0.005
# Debug
print_every       = 50
eval_every        = 1
save_every        = 50
RENDER_EVAL       = False # False
print ("n_episode:[%d], max_epi_sec:[%.2f], max_epi_tick:[%d]"%
       (n_episode,max_epi_sec,max_epi_tick))
print ("n_warmup_epi:[%d], buffer_limit:[%.d], buffer_warmup:[%d]"%
       (n_warmup_epi,buffer_limit,buffer_warmup))

n_episode:[1000], max_epi_sec:[3.00], max_epi_tick:[150]
n_warmup_epi:[30], buffer_limit:[50000], buffer_warmup:[10000]


#### Initialize networks

In [12]:
device = 'cpu' # cpu / mps / cuda
replay_buffer = ReplayBufferClass(buffer_limit, device=device)
actor_arg = {'obs_dim':gym.o_dim,'h_dims':[256,256],'out_dim':gym.a_dim,
             'max_out':max_torque,'init_alpha':init_alpha,'lr_actor':lr_actor,
             'lr_alpha':lr_alpha,'device':device}
critic_arg = {'obs_dim':gym.o_dim,'a_dim':gym.a_dim,'h_dims':[256,256],'out_dim':1,
              'lr_critic':lr_critic,'device':device}
actor           = ActorClass(**actor_arg).to(device)
critic_one      = CriticClass(**critic_arg).to(device)
critic_two      = CriticClass(**critic_arg).to(device)
critic_one_trgt = CriticClass(**critic_arg).to(device)
critic_two_trgt = CriticClass(**critic_arg).to(device)
print ("Ready.")

Ready.


In [13]:
# Modify floor friction priority
env.model.geom('floor').priority = 1 # 0=>1
print ("Floor priority:%s"%(env.model.geom('floor').priority))
gym.env.ctrl_ranges[:,0] = -max_torque
gym.env.ctrl_ranges[:,1] = +max_torque
print ("gym.env.ctrl_ranges:\n",gym.env.ctrl_ranges)

Floor priority:[1]
gym.env.ctrl_ranges:
 [[-2.  2.]
 [-2.  2.]
 [-2.  2.]
 [-2.  2.]
 [-2.  2.]
 [-2.  2.]
 [-2.  2.]
 [-2.  2.]]


In [None]:
def reward_high_jump(info):
    z_pos = gym.env.get_p_body('torso')[2]
    z_vel = gym.env.get_qvel()[2]
    qvel = gym.env.get_qvel()
    qpos = gym.env.get_qpos()
    orientation_penalty = np.linalg.norm(qpos[3:6])  # assuming 3:6 are rotation components
    contact = gym.env.get_contact_info()
    airborne = not any(contact)
    reward = 0.0
    # 1. Height reward
    reward += 30.0 * z_pos 
    # 2. Vertical speed reward
    if airborne:
        reward += 40.0 * max(0, z_vel)
    else:
        reward += 20.0 * max(0, z_vel)
    ground = any(contact)
    if ground:
        if z_pos < 0.1:
            reward -= 50 * (0.1 - z_pos)
    if z_pos < 0.1:
        reward -= 30 * z_pos # punishment for being low

    if airborne and z_pos > 0.1:
        reward +=15 * z_pos
        reward += 0.5 * z_vel  

    if airborne and z_pos > 0.15:
        reward += 25 * z_pos
        reward += 0.8 * z_vel  

    if airborne and z_pos > 0.2:
        reward += 35 * z_pos
        reward += 1.25 * z_vel  

    if airborne and z_pos > 0.25:
        reward += 50 * z_pos
        reward += 3 * z_vel

    if airborne and z_pos > 0.3:
        reward += 75 * z_pos
        reward += 8 * z_vel

    if airborne and z_pos > 0.35:
        reward += 90 * z_pos
        reward += 15 * z_vel

    if airborne and z_pos > 0.4:
        reward += 120 * z_pos
        reward += 30 * z_vel
    
    # 4. Stability reward (penalize unstable torso pitch, roll, yaw)
    orientation_penalty = np.linalg.norm(qpos[3:6])  # assuming 3:6 are rotation components
    reward += (1.0 - orientation_penalty) * 1 # higher reward for better stability

    # 5. Explosive spring-like use of rear legs
    # Let's assume the back legs are actuators 6 and 7, and joints 16-23 are legs
    rear_joint_velocities = np.array([qvel[21], qvel[22]])
    ctrl_rear = np.array(gym.env.get_ctrl(['actuator_5_2', 'actuator_5_3']))

    # Encourage synchronization of rear actuator effort and joint movement (springiness)
    spring_effort = np.dot(np.abs(rear_joint_velocities), np.abs(ctrl_rear))
    reward += 20.0 * spring_effort

    # Normalize and clip reward for stability in SAC training
    reward = np.clip(reward, 0.0, 500.0)

    return reward

#### Train using `SAC`

In [None]:
REMOVE_PREV_FILES = True # remove previous files
TRACK_BEST_EPISODE = True
best_max_height = -np.inf
best_episode_data = None
best_episode_idx = 0


# Loop
np.random.seed(seed=0) # fix seed
print ("Start training.")
for epi_idx in range(n_episode+1): # for each episode
    zero_to_one = epi_idx/n_episode
    one_to_zero = 1-zero_to_one

    if TRACK_BEST_EPISODE:
        current_episode_actions = []
    # Reset gym
    s = gym.reset()

    # Loop
    USE_RANDOM_POLICY = (np.random.rand()<(0.1*one_to_zero)) or (epi_idx < n_warmup_epi)
    reward_total,reward_forward = 0.0,0.0
    max_height = 0.0
    for tick in range(max_epi_tick): # for each tick in an episode
        if USE_RANDOM_POLICY:
            a_np = gym.sample_action()
        else:
            a,log_prob = actor(np2torch(s,device=device))
            a_np = torch2np(a)
        # Step
        if TRACK_BEST_EPISODE:
            current_episode_actions.append(a_np.copy())
            
        s_prime,_,done,info = gym.step(a_np,max_time=max_epi_sec)
        reward = reward_high_jump(info)  #using long jump reward            
        replay_buffer.put((s,a_np,reward,s_prime,done))
        reward_total += reward 
        reward_forward += info['r_forward']
        s = s_prime

        # compute z_diff
        z_diff = gym.env.get_p_body('torso')[2]
        if z_diff > max_height:
            max_height = z_diff
        if done is True: break # terminate condition
        
        # Replay buffer
        if replay_buffer.size() > buffer_warmup:
             for _ in range(n_update_per_tick): 
                mini_batch = replay_buffer.sample(batch_size)
                # Update critics
                td_target = get_target(
                    actor,
                    critic_one_trgt,
                    critic_two_trgt,
                    gamma      = gamma,
                    mini_batch = mini_batch,
                    device     = device,
                )
                critic_one.train(td_target,mini_batch)
                critic_two.train(td_target,mini_batch)
                # Update actor
                actor.train(
                    critic_one,
                    critic_two,
                    target_entropy = -gym.a_dim,
                    mini_batch     = mini_batch,
                )
                # Soft update of critics
                critic_one.soft_update(tau=tau,net_target=critic_one_trgt)
                critic_two.soft_update(tau=tau,net_target=critic_two_trgt)

    # Compute x_diff
    x_diff = gym.env.get_p_body('torso')[0]

    # Print
    if (epi_idx%print_every)==0:
        epi_tick = tick
        print ("[%d/%d][%.1f%%]"%(epi_idx,n_episode,100.0*(epi_idx/n_episode)))
        print ("  reward:[%.1f] x_diff:[%.3f] epi_len:[%d/%d] buffer_size:[%d] alpha:[%.2f] z_diff:[%.3f]"%
               (reward_total,x_diff,epi_tick,max_epi_tick,
                replay_buffer.size(),actor.log_alpha.exp(),z_diff))
    
    # Evaluation
    if (epi_idx%eval_every)==0:
        if RENDER_EVAL: gym.init_viewer()
        s = gym.reset()
        reward_total = 0.0
        for tick in range(max_epi_tick):
            a,_ = actor(np2torch(s,device=device),SAMPLE_ACTION=False)
            s_prime,reward,done,info = gym.step(torch2np(a),max_time=max_epi_sec)
            reward_total += reward
            if RENDER_EVAL and ((tick%5) == 0):
                gym.render(
                    TRACK_TORSO      = True,
                    PLOT_WORLD_COORD = True,
                    PLOT_TORSO_COORD = True,
                    PLOT_SENSOR      = True,
                    PLOT_CONTACT     = True,
                    PLOT_TIME        = True,
                )
            s = s_prime
            if RENDER_EVAL:
                if not gym.is_viewer_alive(): break
        if RENDER_EVAL: gym.close_viewer()
        x_diff = gym.env.get_p_body('torso')[0]
        z_diff = gym.env.get_p_body('torso')[2]
        if z_diff > max_height:
            max_height = z_diff
        print ("  [Eval] reward:[%.3f] x_diff:[%.3f] epi_len:[%d/%d] max_height:[%.3f]"%
               (reward_total,x_diff,tick,max_epi_tick,max_height))
    
    if TRACK_BEST_EPISODE and max_height > best_max_height:
        best_max_height = max_height
        best_episode_actions = current_episode_actions.copy()
        best_episode_idx = epi_idx
        best_pth_path = './result/weights/sac_%s/highjump/best_highjump%d.pth'%(gym.name.lower(),epi_idx)
        torch.save(actor.state_dict(),best_pth_path)
    
    # Save network
    if (epi_idx%save_every)==0:
        pth_path = './result/weights/sac_%s/highjump/episode_%d.pth'%(gym.name.lower(),epi_idx)
        dir_path = os.path.dirname(pth_path)
        if not os.path.exists(dir_path): os.makedirs(dir_path)
        if (epi_idx == 0) and REMOVE_PREV_FILES: # remove all existing files
            files = os.listdir(path=dir_path)
            print ("  [Save] Remove existing [%d] pth files."%(len(files)))
            for file in files: os.remove(os.path.join(dir_path,file))
        torch.save(actor.state_dict(),pth_path)
        print ("  [Save] [%s] saved."%(pth_path))
    
    # Save best episode
    #if max_height > best_max_height:
     #   best_action_path = './result/weights/sac_%s/highjump/best_jump_actions.pth'%gym.name.lower()
      #  torch.save({
      #      'episode_idx': best_episode_idx,
       #     'max_height': best_max_height,
      #      'actions': best_episode_actions
      #  }, best_action_path)
      #  print ("  [Save] Best jump actions saved to [%s]."%(best_pth_path))

#def replay_best_episode():
#    best_pth_path = './result/weights/sac_%s/highjump/best_jump_actions.pth' % gym.name.lower()
#    if not os.path.exists(best_pth_path):
#        print("No best episode found")
#        return
    
#    best_data = torch.load(best_pth_path)
#    print(f"Replaying best jump (Height: {best_data['max_height']:.3f} from Episode {best_data['episode_idx']})")
    
#    gym.init_viewer()
#    s = gym.reset()
    
    #for action in best_data['actions']:
        #s_prime, _, _, _ = gym.step(action)
        #gym.render(
          #  TRACK_TORSO=True,
         #   PLOT_WORLD_COORD=True,
        #    PLOT_TORSO_COORD=True,
       #     PLOT_SENSOR=True,
      #      PLOT_CONTACT=True,
     #       PLOT_TIME=True,
    #    )
   #     if not gym.is_viewer_alive():
  #          break
    
 #   gym.close_viewer()

# After training completes
if TRACK_BEST_EPISODE:
    print(f"\nTraining complete! Best jump: {best_max_height:.3f} (Episode {best_episode_idx})")
    #replay_best_episode()  # Automatically play the best episode

    
print ("Done.")


Start training.
[0/1000][0.0%]
  reward:[4936.2] x_diff:[0.014] epi_len:[149/150] buffer_size:[150] alpha:[0.10] z_diff:[0.050]
  [Eval] reward:[1.090] x_diff:[-0.000] epi_len:[149/150] max_height:[0.098]
 NEW BEST JUMP: 0.098 (Episode 0)
  [Save] Best jump video saved to ./result/videos/sac_snapbot/highjump/best_jump_0.mp4
  [Save] Remove existing [7] pth files.
  [Save] [./result/weights/sac_snapbot/highjump/episode_0.pth] saved.
[1/1000][0.1%]
  reward:[5565.2] x_diff:[0.016] epi_len:[149/150] buffer_size:[300] alpha:[0.10] z_diff:[0.051]
  [Eval] reward:[1.090] x_diff:[-0.000] epi_len:[149/150] max_height:[0.098]


MESA: error: ZINK: failed to choose pdev
glx: failed to create drisw screen


 NEW BEST JUMP: 0.098 (Episode 1)
  [Save] Best jump video saved to ./result/videos/sac_snapbot/highjump/best_jump_1.mp4
[2/1000][0.2%]
  reward:[6486.6] x_diff:[-0.045] epi_len:[149/150] buffer_size:[450] alpha:[0.10] z_diff:[0.053]
  [Eval] reward:[1.090] x_diff:[-0.000] epi_len:[149/150] max_height:[0.098]
[3/1000][0.3%]
  reward:[6040.2] x_diff:[-0.406] epi_len:[149/150] buffer_size:[600] alpha:[0.10] z_diff:[0.048]
  [Eval] reward:[1.090] x_diff:[-0.000] epi_len:[149/150] max_height:[0.122]


MESA: error: ZINK: failed to choose pdev
glx: failed to create drisw screen


 NEW BEST JUMP: 0.122 (Episode 3)
  [Save] Best jump video saved to ./result/videos/sac_snapbot/highjump/best_jump_3.mp4
[4/1000][0.4%]
  reward:[4304.0] x_diff:[0.068] epi_len:[149/150] buffer_size:[750] alpha:[0.10] z_diff:[0.057]
  [Eval] reward:[1.090] x_diff:[-0.000] epi_len:[149/150] max_height:[0.098]
[5/1000][0.5%]
  reward:[3907.3] x_diff:[-0.060] epi_len:[149/150] buffer_size:[900] alpha:[0.10] z_diff:[0.042]
  [Eval] reward:[1.090] x_diff:[-0.000] epi_len:[149/150] max_height:[0.098]
[6/1000][0.6%]
  reward:[6021.3] x_diff:[0.020] epi_len:[149/150] buffer_size:[1050] alpha:[0.10] z_diff:[0.041]
  [Eval] reward:[1.090] x_diff:[-0.000] epi_len:[149/150] max_height:[0.098]
[7/1000][0.7%]
  reward:[4491.0] x_diff:[0.118] epi_len:[149/150] buffer_size:[1200] alpha:[0.10] z_diff:[0.046]
  [Eval] reward:[1.090] x_diff:[-0.000] epi_len:[149/150] max_height:[0.098]
[8/1000][0.8%]
  reward:[6570.7] x_diff:[0.055] epi_len:[149/150] buffer_size:[1350] alpha:[0.10] z_diff:[0.134]
  [Eval

MESA: error: ZINK: failed to choose pdev
glx: failed to create drisw screen


 NEW BEST JUMP: 0.134 (Episode 8)
  [Save] Best jump video saved to ./result/videos/sac_snapbot/highjump/best_jump_8.mp4
[9/1000][0.9%]
  reward:[3468.0] x_diff:[0.070] epi_len:[149/150] buffer_size:[1500] alpha:[0.10] z_diff:[0.051]
  [Eval] reward:[1.090] x_diff:[-0.000] epi_len:[149/150] max_height:[0.098]
[10/1000][1.0%]
  reward:[5207.2] x_diff:[-0.052] epi_len:[149/150] buffer_size:[1650] alpha:[0.10] z_diff:[0.045]
  [Eval] reward:[1.090] x_diff:[-0.000] epi_len:[149/150] max_height:[0.098]
[11/1000][1.1%]
  reward:[5973.0] x_diff:[0.128] epi_len:[149/150] buffer_size:[1800] alpha:[0.10] z_diff:[0.039]
  [Eval] reward:[1.090] x_diff:[-0.000] epi_len:[149/150] max_height:[0.098]
[12/1000][1.2%]
  reward:[4993.2] x_diff:[0.094] epi_len:[107/150] buffer_size:[1908] alpha:[0.10] z_diff:[0.083]
  [Eval] reward:[1.090] x_diff:[-0.000] epi_len:[149/150] max_height:[0.144]


MESA: error: ZINK: failed to choose pdev
glx: failed to create drisw screen


 NEW BEST JUMP: 0.144 (Episode 12)
  [Save] Best jump video saved to ./result/videos/sac_snapbot/highjump/best_jump_12.mp4
[13/1000][1.3%]
  reward:[4539.1] x_diff:[0.083] epi_len:[149/150] buffer_size:[2058] alpha:[0.10] z_diff:[0.071]
  [Eval] reward:[1.090] x_diff:[-0.000] epi_len:[149/150] max_height:[0.098]
[14/1000][1.4%]
  reward:[4953.1] x_diff:[0.006] epi_len:[149/150] buffer_size:[2208] alpha:[0.10] z_diff:[0.040]
  [Eval] reward:[1.090] x_diff:[-0.000] epi_len:[149/150] max_height:[0.098]
[15/1000][1.5%]
  reward:[3957.7] x_diff:[0.146] epi_len:[149/150] buffer_size:[2358] alpha:[0.10] z_diff:[0.057]
  [Eval] reward:[1.090] x_diff:[-0.000] epi_len:[149/150] max_height:[0.098]
[16/1000][1.6%]
  reward:[6354.3] x_diff:[0.010] epi_len:[149/150] buffer_size:[2508] alpha:[0.10] z_diff:[0.048]
  [Eval] reward:[1.090] x_diff:[-0.000] epi_len:[149/150] max_height:[0.098]
[17/1000][1.7%]
  reward:[6556.0] x_diff:[-0.140] epi_len:[149/150] buffer_size:[2658] alpha:[0.10] z_diff:[0.072

In [None]:
print(best_max_height)
print(best_episode_idx)

0.2920971022349266
915


In [None]:
# Configuration
max_epi_sec  = 15.0 # maximum episode length in second
max_epi_tick = int(max_epi_sec*gym.HZ) # maximum episode length in tick
# Actor
device     = 'cpu' # cpu / mps / cuda
max_torque = 2.0
init_alpha = 0.1
lr_actor   = 0.0005
lr_alpha   = 0.0003
actor = ActorClass(
    obs_dim    = gym.o_dim,
    h_dims     = [256,256],
    out_dim    = gym.a_dim,
    max_out    = max_torque,
    init_alpha = init_alpha,
    lr_actor   = lr_actor,
    lr_alpha   = lr_alpha,
    device     = device,
).to(device)

max_length = 0.0 
# Load pth
pth_path = './result/weights/sac_%s/highjump/best_highjump%d.pth'%(gym.name.lower(),best_episode_idx)#'./result/weights/sac_%s/longjump/best_longjump%d.pth'%(gym.name.lower(),epi_idx)
actor.load_state_dict(torch.load(pth_path,map_location=device))
# Run
gym.init_viewer()
s = gym.reset()
gym.viewer_pause() # pause
print ("   Viewer paused. Press [space] to resume.")
reward_total = 0.0
for tick in range(max_epi_tick):
    a,_ = actor(np2torch(s,device=device),SAMPLE_ACTION=False)
    s_prime,reward,done, info = gym.step(torch2np(a),max_time=max_epi_sec)
    gym.render(
        TRACK_TORSO      = True,
        PLOT_WORLD_COORD = True,
        PLOT_TORSO_COORD = True,
        PLOT_SENSOR      = True,
        PLOT_CONTACT     = True,
        PLOT_TIME        = True,
    )
    reward_total += reward
    s = s_prime
    if not gym.is_viewer_alive(): break
gym.close_viewer()
x_diff = gym.env.get_p_body('torso')[0]
z_diff = gym.env.get_p_body('torso')[2]
if x_diff > max_length:
        max_height = x_diff
print ("  [Eval] reward:[%.3f] x_diff:[%.3f] epi_len:[%d/%d]"%
       (reward_total,x_diff,tick,max_epi_tick))