### `Soft Actor-Critic` using `Snapbot`

In [29]:
import sys,mujoco
import numpy as np
import matplotlib.pyplot as plt
sys.path.append('KU-DATA403-simulator-tutorials/package/helper/')
sys.path.append('KU-DATA403-simulator-tutorials/package/mujoco_usage/')
sys.path.append('KU-DATA403-simulator-tutorials/package/gym/')
sys.path.append('KU-DATA403-simulator-tutorials/package/rl/')
from mujoco_parser import *
from slider import *
from utility import *
from snapbot_env import *
from sac import *
np.set_printoptions(precision=2,suppress=True,linewidth=100)
plt.rc('xtick',labelsize=6); plt.rc('ytick',labelsize=6)
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
print ("Ready.")

Ready.


#### Parse `Snapbot` gym

In [30]:
xml_path = 'KU-DATA403-simulator-tutorials/asset/snapbot/scene_snapbot.xml'
env = MuJoCoParserClass(name='Snapbot',rel_xml_path=xml_path,verbose=True)
gym = SnapbotGymClass(
    env = env,
    HZ  = 50,
    history_total_sec = 0.2,
    history_intv_sec  = 0.1,
    VERBOSE =True,
)
print ("Ready.")

name:[Snapbot] dt:[0.002] HZ:[500]
n_qpos:[25] n_qvel:[24] n_qacc:[24] n_ctrl:[8]

n_body:[24]
 [0/24] [world] mass:[0.00]kg
 [1/24] [torso] mass:[0.24]kg
 [2/24] [Camera_module_1] mass:[0.06]kg
 [3/24] [Camera_module_2] mass:[0.06]kg
 [4/24] [Leg_module_1_1] mass:[0.06]kg
 [5/24] [Leg_module_1_2] mass:[0.08]kg
 [6/24] [Leg_module_1_3] mass:[0.02]kg
 [7/24] [Leg_module_1_4] mass:[0.01]kg
 [8/24] [Leg_module_1_4bar] mass:[0.01]kg
 [9/24] [Leg_module_2_1] mass:[0.06]kg
 [10/24] [Leg_module_2_2] mass:[0.08]kg
 [11/24] [Leg_module_2_3] mass:[0.02]kg
 [12/24] [Leg_module_2_4] mass:[0.01]kg
 [13/24] [Leg_module_2_4bar] mass:[0.01]kg
 [14/24] [Leg_module_4_1] mass:[0.06]kg
 [15/24] [Leg_module_4_2] mass:[0.08]kg
 [16/24] [Leg_module_4_3] mass:[0.02]kg
 [17/24] [Leg_module_4_4] mass:[0.01]kg
 [18/24] [Leg_module_4_4bar] mass:[0.01]kg
 [19/24] [Leg_module_5_1] mass:[0.06]kg
 [20/24] [Leg_module_5_2] mass:[0.08]kg
 [21/24] [Leg_module_5_3] mass:[0.02]kg
 [22/24] [Leg_module_5_4] mass:[0.01]kg
 [

#### `SAC` hyperparameters

In [None]:
n_episode         = 3000 # number of total episodes (rollouts)
max_epi_sec       = 7.0 # maximum episode length in second (IMPORTANT)
max_epi_tick      = int(max_epi_sec*gym.HZ) # maximum episode length in tick
n_warmup_epi      = 10 # number of warm-up episodes
buffer_limit      = 100000 # larger buffer
buffer_warmup     = buffer_limit // 5
init_alpha        = 0.2
max_torque        = 2.5
# Update
lr_actor          = 0.0003 # 0.0002 # 0.0005
lr_alpha          = 0.0002 # 0.0003
lr_critic         = 0.0001
n_update_per_tick = 1 # number of updates per tick
batch_size        = 256
gamma             = 0.98
tau               = 0.01
# Debug
print_every       = 250
eval_every        = 50
save_every        = 50
RENDER_EVAL       = False # False
print ("n_episode:[%d], max_epi_sec:[%.2f], max_epi_tick:[%d]"%
       (n_episode,max_epi_sec,max_epi_tick))
print ("n_warmup_epi:[%d], buffer_limit:[%.d], buffer_warmup:[%d]"%
       (n_warmup_epi,buffer_limit,buffer_warmup))

n_episode:[500], max_epi_sec:[7.00], max_epi_tick:[350]
n_warmup_epi:[10], buffer_limit:[100000], buffer_warmup:[20000]


#### Initialize networks

In [32]:
device = 'cpu' # cpu / mps / cuda
replay_buffer = ReplayBufferClass(buffer_limit, device=device)
actor_arg = {'obs_dim':gym.o_dim,'h_dims':[256,256],'out_dim':gym.a_dim,
             'max_out':max_torque,'init_alpha':init_alpha,'lr_actor':lr_actor,
             'lr_alpha':lr_alpha,'device':device}
critic_arg = {'obs_dim':gym.o_dim,'a_dim':gym.a_dim,'h_dims':[256,256],'out_dim':1,
              'lr_critic':lr_critic,'device':device}
actor           = ActorClass(**actor_arg).to(device)
critic_one      = CriticClass(**critic_arg).to(device)
critic_two      = CriticClass(**critic_arg).to(device)
critic_one_trgt = CriticClass(**critic_arg).to(device)
critic_two_trgt = CriticClass(**critic_arg).to(device)
print ("Ready.")

Ready.


In [33]:
# Modify floor friction priority
env.model.geom('floor').priority = 1 # 0=>1
print ("Floor priority:%s"%(env.model.geom('floor').priority))
gym.env.ctrl_ranges[:,0] = -max_torque
gym.env.ctrl_ranges[:,1] = +max_torque
print ("gym.env.ctrl_ranges:\n",gym.env.ctrl_ranges)

Floor priority:[1]
gym.env.ctrl_ranges:
 [[-2.  2.]
 [-2.  2.]
 [-2.  2.]
 [-2.  2.]
 [-2.  2.]
 [-2.  2.]
 [-2.  2.]
 [-2.  2.]]


In [None]:
# Sideways Running (Minimize time to target)
def reward_side_run(info):
    x_pos = gym.env.get_p_body('torso')[2]
    #target_y = 5.0  # Target position
    x_vel = gym.env.get_qvel()[2]
    
    distance_progress = abs(x_pos - 0)
    time_penalty = -0.1  # Per timestep penalty

    
    y_vel = gym.env.get_qvel()[0]
    
    # Get orientation and angular velocity
    orientation = gym.env.get_qpos()[3:6]  # Roll, pitch, yaw
    ang_vel = gym.env.get_qvel()[3:6]
    
    # Contact information
    contact = gym.env.get_contact_info()
    # Energy efficiency (penalize high torque)
    
    # Main components of reward
    reward = 0.0

    reward += time_penalty
    
    reward += distance_progress
    # lat vel
    reward += 20.0 * abs(x_vel) 
    
    # punish forward
    reward -= 10.0 * abs(y_vel)
    
    # punish bad orr
    orientation_penalty = np.linalg.norm(orientation[:2])  # Roll and pitch
    reward -= 2.0 * orientation_penalty
    
    # punish ang vel
    ang_vel_penalty = np.linalg.norm(ang_vel)
    reward -= 5.0 * ang_vel_penalty
    
    # 5. Contact pattern reward (encourage rhythmic contact)
    if contact == True:  # At least two legs in contact
        reward += 10.0
    if contact == False:
        reward += 10.0
    
    
    # Clip reward for training stability
    reward = np.clip(reward, -10.0, 100.0)
    
    return reward

In [None]:
def time_to_distance(gym_env, target_distance = 5.0):
    start_x = gym_env.get_p_body('torso')[2]
    current_x = start_x
    start_time = gym_env.get_time()

    while True:
        gym_env.step(actor.get_action(gym_env.get_state))

#### Train using `SAC`

In [None]:
REMOVE_PREV_FILES = True # remove previous files
TRACK_BEST_EPISODE = True
best_lateral_speed = -np.inf
best_distance = -np.inf
best_episode_data = None
best_episode_idx = 0
max_distance = 0


# Loop
np.random.seed(seed=0) # fix seed
print ("Start training.")
for epi_idx in range(n_episode+1): # for each episode
    zero_to_one = epi_idx/n_episode
    one_to_zero = 1-zero_to_one
    # Reset gym
    s = gym.reset()
    # Loop
    USE_RANDOM_POLICY = (np.random.rand()<(0.1*one_to_zero)) or (epi_idx < n_warmup_epi)
    reward_total,reward_forward = 0.0,0.0
    max_height = 0.0
    for tick in range(max_epi_tick): # for each tick in an episode
        if USE_RANDOM_POLICY:
            a_np = gym.sample_action()
        else:
            a,log_prob = actor(np2torch(s,device=device))
            a_np = torch2np(a)
        # Step
        
        s_prime,_,done,info = gym.step(a_np,max_time=max_epi_sec)
        reward = reward_side_run(info)  #using long jump reward
        replay_buffer.put((s,a_np,reward,s_prime,done))
        reward_total += reward 
        reward_forward += info['r_forward']
        s = s_prime
        if done is True: break # terminate condition
        
        # Replay buffer
        if replay_buffer.size() > buffer_warmup:
             for _ in range(n_update_per_tick): 
                mini_batch = replay_buffer.sample(batch_size)
                # Update critics
                td_target = get_target(
                    actor,
                    critic_one_trgt,
                    critic_two_trgt,
                    gamma      = gamma,
                    mini_batch = mini_batch,
                    device     = device,
                )
                critic_one.train(td_target,mini_batch)
                critic_two.train(td_target,mini_batch)
                # Update actor
                actor.train(
                    critic_one,
                    critic_two,
                    target_entropy = -gym.a_dim,
                    mini_batch     = mini_batch,
                )
                # Soft update of critics
                critic_one.soft_update(tau=tau,net_target=critic_one_trgt)
                critic_two.soft_update(tau=tau,net_target=critic_two_trgt)

    # Compute x_diff
    x_diff = gym.env.get_p_body('torso')[0]
    
    # compute z_diff
    z_diff = gym.env.get_p_body('torso')[2]
    if z_diff > max_height:
        max_height = z_diff

    episode_time_sec = tick * (max_epi_sec / max_epi_tick)
    lateral_speed = y_diff / episode_time_sec if episode_time_sec > 0 else 0.0

    if x_diff > max_distance:
        max_distance = x_diff
        best_lateral_speed = lateral_speed 


    # Print
    if (epi_idx%print_every)==0:
        epi_tick = tick
        print ("[%d/%d][%.1f%%]"%(epi_idx,n_episode,100.0*(epi_idx/n_episode)))
        print ("  reward:[%.1f] x_diff:[%.3f] epi_len:[%d/%d] buffer_size:[%d] alpha:[%.2f]"%
               (reward_total,x_diff,epi_tick,max_epi_tick,
                replay_buffer.size(),actor.log_alpha.exp()))
    
    # Evaluation
    if (epi_idx%eval_every)==0:
        if RENDER_EVAL: gym.init_viewer()
        s = gym.reset()
        reward_total = 0.0
        for tick in range(max_epi_tick):
            a,_ = actor(np2torch(s,device=device),SAMPLE_ACTION=False)
            s_prime,reward,done,info = gym.step(torch2np(a),max_time=max_epi_sec)
            reward_total += reward
            if RENDER_EVAL and ((tick%5) == 0):
                gym.render(
                    TRACK_TORSO      = True,
                    PLOT_WORLD_COORD = True,
                    PLOT_TORSO_COORD = True,
                    PLOT_SENSOR      = True,
                    PLOT_CONTACT     = True,
                    PLOT_TIME        = True,
                )
            s = s_prime
            if RENDER_EVAL:
                if not gym.is_viewer_alive(): break
        if RENDER_EVAL: gym.close_viewer()
        x_diff = gym.env.get_p_body('torso')[0]
        print ("  [Eval] reward:[%.3f] x_diff:[%.3f] epi_len:[%d/%d]"%
               (reward_total,x_diff,tick,max_epi_tick))
    if TRACK_BEST_EPISODE and (max_distance > best_distance or (np.isclose(max_distance, best_distance) and lateral_speed > best_lateral_speed)):
        best_distance = max_distance
        best_lateral_speed = lateral_speed
        best_episode_data = current_episode_actions.copy()
        best_episode_idx = epi_idx
        best_pth_path = './result/weights/sac_%s/siderun/best%d.pth'%(gym.name.lower(), epi_idx)
        torch.save(actor.state_dict(),best_pth_path)
        print(f"NEW BEST: Distance: {best_distance:.3f}, Speed = {best_lateral_speed:.3f} Episode {epi_idx}")
    # Save network
    if (epi_idx%save_every)==0:
        pth_path = './result/weights/sac_%s/siderun/episode_%d.pth'%(gym.name.lower(),epi_idx)
        dir_path = os.path.dirname(pth_path)
        if not os.path.exists(dir_path): os.makedirs(dir_path)
        if (epi_idx == 0) and REMOVE_PREV_FILES: # remove all existing files
            files = os.listdir(path=dir_path)
            print ("  [Save] Remove existing [%d] pth files."%(len(files)))
            for file in files: os.remove(os.path.join(dir_path,file))
        torch.save(actor.state_dict(),pth_path)
        print ("  [Save] [%s] saved."%(pth_path))

print ("Done.")


Start training.
[0/500][0.0%]
  reward:[-2739.3] x_diff:[-0.064] epi_len:[349/350] buffer_size:[350] alpha:[0.20]
  [Eval] reward:[0.311] x_diff:[0.052] epi_len:[349/350]


NameError: name 'avg_lateral_speed' is not defined

In [None]:
# Configuration
max_epi_sec  = 15.0 # maximum episode length in second
max_epi_tick = int(max_epi_sec*gym.HZ) # maximum episode length in tick
# Actor
device     = 'cpu' # cpu / mps / cuda
max_torque = 2.0
init_alpha = 0.1
lr_actor   = 0.0004
lr_alpha   = 0.0003
actor = ActorClass(
    obs_dim    = gym.o_dim,
    h_dims     = [256,256],
    out_dim    = gym.a_dim,
    max_out    = max_torque,
    init_alpha = init_alpha,
    lr_actor   = lr_actor,
    lr_alpha   = lr_alpha,
    device     = device,
).to(device)

max_length = 0.0 
# Load pth
pth_path = './result/weights/sac_%s/siderun/episode_%d.pth'%(gym.name.lower(),750)#'./result/weights/sac_%s/longjump/best_longjump%d.pth'%(gym.name.lower(),epi_idx)
actor.load_state_dict(torch.load(pth_path,map_location=device))
# Run
gym.init_viewer()
s = gym.reset()
gym.viewer_pause() # pause
print ("   Viewer paused. Press [space] to resume.")
reward_total = 0.0
for tick in range(max_epi_tick):
    a,_ = actor(np2torch(s,device=device),SAMPLE_ACTION=False)
    s_prime,reward,done, info = gym.step(torch2np(a),max_time=max_epi_sec)
    gym.render(
        TRACK_TORSO      = True,
        PLOT_WORLD_COORD = True,
        PLOT_TORSO_COORD = True,
        PLOT_SENSOR      = True,
        PLOT_CONTACT     = True,
        PLOT_TIME        = True,
    )
    reward_total += reward
    s = s_prime
    if not gym.is_viewer_alive(): break
gym.close_viewer()
x_diff = gym.env.get_p_body('torso')[0]
z_diff = gym.env.get_p_body('torso')[2]
if x_diff > max_length:
        max_height = x_diff
print ("  [Eval] reward:[%.3f] x_diff:[%.3f] epi_len:[%d/%d]"%
       (reward_total,x_diff,tick,max_epi_tick))