# All State Plotting

### Import Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from os import listdir
from os.path import isfile, join
import json
from utils.custom_rta_cpole import CartPole
from train_ppo_cpole_rta import get_ppo_trainer
from utils.ga_masking import Agent
from matplotlib.patches import Rectangle
from matplotlib import colors as mcolors
from scipy.signal import savgol_filter
import scipy

### Set up rollout function

In [None]:
def rollout(trainer, pts, env_config=None, ag_type=None, render=False):
    """
    Run one episode per start state in `pts` and summarise reward and safety.

    Every episode starts with ``env.reset(init=True, state=pt)`` so that
    different agents can be compared from identical start states.

    Args:
        trainer: agent to evaluate. ``trainer.get_action(obs)`` is used when
            ``ag_type == 'ga'``; otherwise ``trainer.compute_single_action(obs)``.
        pts: iterable of start states, one episode is run for each.
        env_config: dict passed to ``CartPole``. Its "use_action_masking"
            entry (default False) selects whether the state is read from
            ``obs["actual_obs"]`` or from ``obs`` directly. ``None`` is
            treated as an empty config.
        ag_type: ``'ga'`` selects the ``get_action`` interface; any other
            value (including None) selects ``compute_single_action``.
        render: when True, ``env.render()`` is called on every step.

    Returns:
        Tuple ``(mean_reward, mean_steps, v_total, v_eps, trajectories, safe_perc)``:
        mean episode reward and mean episode length (both rounded to 4
        decimals), the total number of steps observed at ``|pos| >= 1.5``,
        the number of episodes with at least one such step, the list of
        per-episode state histories, and the percentage of safe steps.
        NOTE: ``safe_perc`` describes only the LAST episode in `pts`
        (0.0 if `pts` is empty), not an average over all episodes.
    """
    # A `{}` default would be one dict shared by every call (and by every
    # CartPole built from it), so the default is created per call instead.
    env_config = {} if env_config is None else env_config
    action_masking = env_config.get("use_action_masking", False)
    env = CartPole(env_config)
    eval_rewards = []
    eval_time = []
    trajectories = []
    v_total = 0
    v_eps = 0
    safe_perc = 0.0

    for pt in pts:
        obs = env.reset(init=True, state=pt)
        episode_reward = 0
        steps = 0
        unsafe_actions = 0
        history = []
        done = False
        while not done:
            # With action masking the raw state is nested inside the observation.
            state = obs["actual_obs"] if action_masking else obs
            history.append(state)

            # Check Safety Criteria: the first state component is the position.
            pos = state[0]
            if pos >= 1.5 or pos <= -1.5:
                unsafe_actions += 1
            if render:
                env.render()
            if ag_type == 'ga':
                action = trainer.get_action(obs)
            else:
                action = trainer.compute_single_action(obs)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            steps += 1

        # Episode finished: fold its counters into the running totals.
        v_total += unsafe_actions
        if unsafe_actions:
            v_eps += 1
        safe_perc = (1 - (unsafe_actions / steps)) * 100
        trajectories.append(history)
        eval_rewards.append(episode_reward)
        eval_time.append(steps)

    return round(np.mean(eval_rewards), 4), round(np.mean(eval_time), 4), v_total, v_eps, trajectories, safe_perc

In [None]:
def mean_confidence_interval(data, confidence=0.95):
    """
    Return the sample mean and the half-width of its Student-t confidence interval.

    Args:
        data: sequence of numeric samples.
        confidence: two-sided confidence level, 0.95 by default.

    Returns:
        Tuple ``(m, h)`` where ``m`` is the mean of `data` and ``h`` is the
        standard error of the mean multiplied by the t quantile at
        ``(1 + confidence) / 2`` with ``n - 1`` degrees of freedom, so the
        interval is ``m - h`` to ``m + h``.
    """
    # A bare `import scipy` does not guarantee that the `stats` submodule is
    # loaded on every SciPy version, so import it explicitly where it is used.
    from scipy import stats

    samples = np.asarray(data, dtype=float)
    n = len(samples)
    m = np.mean(samples)
    se = stats.sem(samples)
    h = se * stats.t.ppf((1 + confidence) / 2., n - 1)
    return m, h

### Get every agent and evaluate them on 50 rollouts

In [None]:
NUM_TRIALS = 50     # rollouts per agent and per training bound
EXTENDED = False    # True: load the "extended" checkpoints and use the wider start states
SEED = 4            # seed number embedded in the checkpoint paths
np.random.seed(10)  # fixed so the start states below are reproducible

# bounds used for training agents
# bounds = [1.5, 1.25, 1.0, 0.75, 0.5, 0.25, 0.1, 0.05]
# (0.1 and 0.05 are left out of this evaluation)

# Names of the four state components, in the order the rollout histories store them.
STATE_KEYS = ("x", "x_dot", "theta", "theta_dot")

# exp[bound][agent][state_key] -> list with one per-step series for each trial.
exp = {bound: {"ga": {}, "ppo": {}} for bound in (1.5, 1.25, 1.0, 0.75, 0.5, 0.25)}

ga_traj = []
ppo_traj = []
env_config = {"use_action_masking": False}

# Each entry is a one-element list because rollout() expects a list of start states.
init_pts = [[np.random.uniform(low=-0.05, high=0.05, size=(4,))] for _ in range(NUM_TRIALS)]
if EXTENDED:
    # The default draw above stays unconditional on purpose: it keeps the
    # extended points at the same position in the seeded random stream.
    init_pts = [[np.array([np.random.uniform(low=-0.5, high=0.5),
                       np.random.uniform(low=-0.05, high=0.05),
                       np.random.uniform(low=-0.10, high=0.10),
                       np.random.uniform(low=-0.05, high=0.05)])] for _ in range(NUM_TRIALS)]


def _record_trajectory(store, trajectory):
    """Append one rollout's state history to `store`, split into one series per state component."""
    for state_key, series in zip(STATE_KEYS, zip(*trajectory)):
        store[state_key].append(series)


for key in exp:
    for agent_name in ("ga", "ppo"):
        for state_key in STATE_KEYS:
            exp[key][agent_name][state_key] = []

    print("\nRolling out bound: ", key)
    # rollout ga agent
    ga_agent = Agent()
    if EXTENDED:
        ga_agent.load("trained_agents/seeded_extended/seed_{}/cartpole_ga_rta_seed-{}_extended_checkpoint-{}.json".format(str(SEED), str(SEED), str(key)))
    else:
        ga_agent.load("trained_agents/seed_{}/cartpole_ga_rta_seed-{}_checkpoint-{}.json".format(str(SEED), str(SEED),str(key)))
    ga_agent.strategy = None
    ga_safe_actions = []
    ga_avg_ep_reward = []
    for k in range(NUM_TRIALS):
        ga_eval_reward, ga_eval_time, ga_v_total, ga_v_eps, ga_pos, safe_perc = rollout(ga_agent, init_pts[k], env_config, ag_type='ga')
        # One start state per call, so the single trajectory is at index 0.
        _record_trajectory(exp[key]["ga"], ga_pos[0])
        ga_avg_ep_reward.append(ga_eval_reward)
        ga_safe_actions.append(safe_perc)

    # rollout ppo agent
    # NOTE(review): a new trainer is built for every bound; presumably one
    # trainer could be restored repeatedly instead - confirm before changing.
    ppo_agent, ec = get_ppo_trainer()
    name = "cartpole_ppo_rta_seed-" + str(SEED) + "_checkpoint-" + str(key)
    if EXTENDED:
        name = "cartpole_ppo_rta_seed-{}_extended_checkpoint-{}".format(str(SEED), str(key))
    # NOTE(review): the GA extended checkpoints are read from
    # trained_agents/seeded_extended/, but this path stays under
    # trained_agents/seed_<SEED>/ even when EXTENDED - confirm against the
    # directory layout.
    ppo_agent.restore("trained_agents/seed_" + str(SEED) +"/" +name+"/"+name)
    ppo_safe_actions = []
    ppo_avg_ep_reward = []
    for k in range(NUM_TRIALS):
        ppo_eval_reward, ppo_eval_time, ppo_v_total, ppo_v_eps, ppo_pos, safe_perc = rollout(ppo_agent, init_pts[k], env_config, ag_type='ppo')
        _record_trajectory(exp[key]["ppo"], ppo_pos[0])
        ppo_avg_ep_reward.append(ppo_eval_reward)
        ppo_safe_actions.append(safe_perc)

    # The return is not a percentage, so only the safety line carries "(%)".
    ppo_safe_m, ppo_safe_c = mean_confidence_interval(ppo_safe_actions)
    ppo_ret_m, ppo_ret_c = mean_confidence_interval(ppo_avg_ep_reward)
    print("PPO {} | Average Return: {}".format(key, ppo_ret_m), "Confidence: {}".format(ppo_ret_c))
    print("PPO {} | (%) Safe Percentage: {}".format(key, ppo_safe_m), "Confidence: {}\n".format(ppo_safe_c))

    ga_safe_m, ga_safe_c = mean_confidence_interval(ga_safe_actions)
    ga_ret_m, ga_ret_c = mean_confidence_interval(ga_avg_ep_reward)
    print("GA {} | Average Return: {}".format(key, ga_ret_m), "Confidence: {}".format(ga_ret_c))
    print("GA {} | (%) Safe Percentage: {}".format(key, ga_safe_m), "Confidence: {}".format(ga_safe_c))

## Position Plotting

In [None]:
# Shared font keyword arguments, unpacked into the title and axis-label calls of the plots.
csfont = dict(fontname='Times New Roman', fontsize=20)

In [None]:
# Cart position over time: one panel per training bound, every GA and PPO trial overlaid.
plt.close()
sns.set()
# plt.subplots creates its own figure; a separate plt.figure(1) call beforehand
# only left an extra, empty figure behind.
fig, axis = plt.subplots(2, 3, figsize=(10, 9))
axis = axis.flatten()

# fig.suptitle('RTA Position for CartPole-v0 Rollouts', fontsize=18)

for k, key in enumerate(exp.keys()):
    ax = axis[k]
    last_trial = NUM_TRIALS - 1
    for i in range(NUM_TRIALS):
        for agent, line_style in (("ga", 'b'), ("ppo", 'g--')):
            xs = exp[key][agent]['x'][i]
            # Only the last trial is labelled so each agent appears once in the legend.
            label_kw = {"label": agent} if i == last_trial else {}
            # Time (step index) runs along the vertical axis.
            ax.plot(xs, np.arange(len(xs)), line_style, **label_kw)
    # Position limits used by the rollout safety check (|x| >= 1.5 is unsafe).
    # The lines span steps 0-199 - presumably the episode horizon; confirm against the env.
    ax.plot(np.ones((200,))*1.5, np.linspace(0, 199, 200), 'r-.', label='Unsafe')
    ax.plot(np.ones((200,))*-1.5, np.linspace(0, 199, 200), 'r-.')
    ax.axvspan(-2.4, -1.5, color='yellow', alpha=0.2)
    ax.axvspan(1.5, 2.4, color='yellow', alpha=0.2)
    # "Goal" band drawn above step 199 across the safe position range.
    ax.add_patch(Rectangle((-1.5, 199), 3, 6,
                           facecolor=mcolors.cnames['lime'],
                           alpha=0.5,
                           fill=True, label="Goal"))
    ax.set_title("Training Constraint: x={}".format(str(key)), **csfont)
    ax.set_ylabel('Time', **csfont)
    ax.set_xlabel('X Position', **csfont)
    if k == 0:
        # Arrow annotation and legend only on the first panel to avoid clutter.
        ax.text(
            -1.9, 50, "Time", ha="center", va="center", rotation=90, size=16,
            bbox=dict(boxstyle="rarrow, pad=0.25", fc="cyan", ec="b", lw=2))
        ax.legend(loc='lower right')
    ax.set_xlim([-2.4, 2.4])
    ax.tick_params(axis='x', labelsize=14)
    ax.tick_params(axis='y', labelsize=14)

plt.tight_layout()
# plt.savefig("images/{}/cpole_rta{}_x_results.png".format("extended" if EXTENDED else "normal", "_extended" if EXTENDED else ""), bbox_inches='tight', dpi=200)
plt.show()

## Velocity Plotting

In [None]:
# Smoothed cart velocity over time: one panel per training bound, every GA and PPO trial overlaid.
def _smooth_series(series, window=43, polyorder=6):
    """Savitzky-Golay smooth `series`, shrinking the window for short series.

    Series too short to fit a `polyorder` polynomial are returned unsmoothed,
    because savgol_filter raises when polyorder is not below the window length.
    """
    n = len(series)
    if n < window:
        # Largest odd window that still fits inside the series.
        window = n if n % 2 else n - 1
    if window <= polyorder:
        return np.asarray(series)
    return savgol_filter(series, window, polyorder)


# plt.subplots creates its own figure; a separate plt.figure(2) call beforehand
# only left an extra, empty figure behind.
fig, axis = plt.subplots(2, 3, figsize=(10, 9))
axis = axis.flatten()
# fig.suptitle('RTA Velocity for CartPole-v0 Rollouts', fontsize=18)

for k, key in enumerate(exp.keys()):
    ax = axis[k]
    last_trial = NUM_TRIALS - 1
    for i in range(NUM_TRIALS):
        for agent, line_style in (("ga", 'b'), ("ppo", 'g--')):
            series = exp[key][agent]['x_dot'][i]
            # Only the last trial is labelled so each agent appears once in the legend.
            label_kw = {"label": agent} if i == last_trial else {}
            # Time (step index) runs along the vertical axis.
            ax.plot(_smooth_series(series), np.arange(len(series)), line_style, **label_kw)
    # "Goal" band drawn above step 199 across the full plotted range.
    ax.add_patch(Rectangle((-2.4, 199), 4.8, 6,
                           facecolor=mcolors.cnames['lime'],
                           alpha=0.5,
                           fill=True, label="Goal"))
    ax.set_title("Training Constraint: x={}".format(str(key)), **csfont)
    ax.set_ylabel('Time', **csfont)
    ax.set_xlabel('Velocity in X Direction', **csfont)
    if k == 0:
        # Arrow annotation and legend only on the first panel to avoid clutter.
        ax.text(
            -1.9, 50, "Time", ha="center", va="center", rotation=90, size=16,
            bbox=dict(boxstyle="rarrow, pad=0.25", fc="cyan", ec="b", lw=2))
        ax.legend(loc='lower right')
    ax.set_xlim([-2.4, 2.4])
    ax.tick_params(axis='x', labelsize=14)
    ax.tick_params(axis='y', labelsize=14)

plt.tight_layout()
# plt.savefig("images/{}/cpole_rta{}_x-dot_results.png".format("extended" if EXTENDED else "normal", "_extended" if EXTENDED else ""), bbox_inches='tight', dpi=200)
plt.show()

## Theta

In [None]:
# Pole angle over time: one panel per training bound, every GA and PPO trial overlaid.
plt.close()
sns.set()
# plt.subplots creates its own figure; a separate plt.figure(3) call beforehand
# only left an extra, empty figure behind.
fig, axis = plt.subplots(2, 3, figsize=(10, 9))
axis = axis.flatten()

# fig.suptitle('RTA Theta for CartPole-v0 Rollouts', fontsize=18)

for k, key in enumerate(exp.keys()):
    ax = axis[k]
    last_trial = NUM_TRIALS - 1
    for i in range(NUM_TRIALS):
        for agent, line_style in (("ga", 'b'), ("ppo", 'g--')):
            thetas = exp[key][agent]['theta'][i]
            # Only the last trial is labelled so each agent appears once in the legend.
            label_kw = {"label": agent} if i == last_trial else {}
            # Time (step index) runs along the vertical axis.
            ax.plot(thetas, np.arange(len(thetas)), line_style, **label_kw)
    # Angle limits at +/-0.2095 rad - presumably the env's pole-angle
    # termination threshold; confirm against the CartPole implementation.
    ax.plot(np.ones((200,))*0.2095, np.linspace(0, 199, 200), 'r-.', label='Unsafe')
    ax.plot(np.ones((200,))*-0.2095, np.linspace(0, 199, 200), 'r-.')
    ax.axvspan(-0.48, -0.2095, color='yellow', alpha=0.2)
    ax.axvspan(0.2095, 0.48, color='yellow', alpha=0.2)
    # "Goal" band above step 199; width 0.96 spans the full -0.48..0.48 axis
    # (the previous 0.94 stopped short of the right edge).
    ax.add_patch(Rectangle((-0.48, 199), 0.96, 6,
                           facecolor=mcolors.cnames['lime'],
                           alpha=0.5,
                           fill=True, label="Goal"))
    ax.set_title("Training Constraint: x={}".format(str(key)), **csfont)
    ax.set_ylabel('Time', **csfont)
    ax.set_xlabel('Theta Position (rad)', **csfont)
    if k == 0:
        # Arrow annotation and legend only on the first panel to avoid clutter.
        ax.text(
            -.35, 50, "Time", ha="center", va="center", rotation=90, size=16,
            bbox=dict(boxstyle="rarrow, pad=0.25", fc="cyan", ec="b", lw=2))
        ax.legend(loc='lower right')
    ax.set_xlim([-.48, .48])
    ax.tick_params(axis='x', labelsize=14)
    ax.tick_params(axis='y', labelsize=14)

plt.tight_layout()
# plt.savefig("images/{}/cpole_rta{}_theta_results.png".format("extended" if EXTENDED else "normal", "_extended" if EXTENDED else ""), bbox_inches='tight', dpi=200)
plt.show()

## Angular Velocity

In [None]:
# Smoothed pole angular velocity over time: one panel per training bound, every GA and PPO trial overlaid.
def _smooth_series(series, window=43, polyorder=6):
    """Savitzky-Golay smooth `series`, shrinking the window for short series.

    Series too short to fit a `polyorder` polynomial are returned unsmoothed,
    because savgol_filter raises when polyorder is not below the window length.
    """
    n = len(series)
    if n < window:
        # Largest odd window that still fits inside the series.
        window = n if n % 2 else n - 1
    if window <= polyorder:
        return np.asarray(series)
    return savgol_filter(series, window, polyorder)


plt.close()
# plt.subplots creates its own figure; a separate plt.figure(2) call beforehand
# only left an extra, empty figure behind.
fig, axis = plt.subplots(2, 3, figsize=(10, 9))
axis = axis.flatten()
# fig.suptitle('RTA Angular Velocity for CartPole-v0 Rollouts', fontsize=18)

for k, key in enumerate(exp.keys()):
    ax = axis[k]
    last_trial = NUM_TRIALS - 1
    for i in range(NUM_TRIALS):
        for agent, line_style in (("ga", 'b'), ("ppo", 'g--')):
            series = exp[key][agent]['theta_dot'][i]
            # Only the last trial is labelled so each agent appears once in the legend.
            label_kw = {"label": agent} if i == last_trial else {}
            # Time (step index) runs along the vertical axis.
            ax.plot(_smooth_series(series), np.arange(len(series)), line_style, **label_kw)
    # "Goal" band drawn above step 199 across the full plotted range.
    ax.add_patch(Rectangle((-2.4, 199), 4.8, 6,
                           facecolor=mcolors.cnames['lime'],
                           alpha=0.5,
                           fill=True, label="Goal"))
    ax.set_title("Training Constraint: x={}".format(str(key)), **csfont)
    ax.set_ylabel('Time', **csfont)
    ax.set_xlabel('Angular Velocity', **csfont)
    if k == 0:
        # Arrow annotation and legend only on the first panel to avoid clutter.
        ax.text(
            -1.9, 50, "Time", ha="center", va="center", rotation=90, size=16,
            bbox=dict(boxstyle="rarrow, pad=0.25", fc="cyan", ec="b", lw=2))
        ax.legend(loc='lower right')
    ax.set_xlim([-2.4, 2.4])
    ax.tick_params(axis='x', labelsize=14)
    ax.tick_params(axis='y', labelsize=14)

plt.tight_layout()
# plt.savefig("images/{}/cpole_amask{}_theta-dot_results.png".format("extended" if EXTENDED else "normal", "_extended" if EXTENDED else ""), bbox_inches='tight', dpi=200)
plt.show()