In [1]:
import gym
import torch
import sys
import os
import random
import numpy as np
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
sys.path.append('../')
from common import make_env
from evals import *
sys.path.append('../../')
import TD3

import pandas as pd

# Create the output folders used for exported figures. `exist_ok=True` makes
# the cell idempotent and avoids the check-then-create race of
# `os.path.exists` followed by `os.mkdir`.
for _out_dir in ("images", "images2"):
    os.makedirs(_out_dir, exist_ok=True)

In [2]:
# Agent response times to sweep over (rendered with an 's' suffix in the heatmaps below).
response_times = [
    0.01,
    0.02,
    0.04,
    0.08,
    0.16,
    0.32,
    0.64,
]

In [18]:
# Print every (response time, g-force, seed) combination whose *final* actor
# checkpoint file is absent from ../models_not_diverged.
for rt in response_times:
    for g_force in range(21):
        for seed in range(5):
            checkpoint = (
                f"../models_not_diverged/TD3_InvertedPendulum-v2_{seed}_0.02_"
                f"{float(g_force)}_{rt}_1.0_False_256_final_actor"
            )
            if os.path.isfile(checkpoint):
                continue
            print(rt, g_force, seed)

In [19]:
# Print every (response time, g-force, seed) combination whose *best* actor
# checkpoint file is absent from ../models_not_diverged.
# NOTE(review): only g-forces 0..4 are checked here, whereas the evaluation
# cell iterates over range(21) -- confirm this narrower range is intended.
for rt in response_times:
    for g_force in range(5):
        for seed in range(5):
            checkpoint = (
                f"../models_not_diverged/TD3_InvertedPendulum-v2_{seed}_0.02_"
                f"{float(g_force)}_{rt}_1.0_False_256_best_actor"
            )
            if os.path.isfile(checkpoint):
                continue
            print(rt, g_force, seed)

In [20]:
# Evaluation sweep for the non-delayed TD3 agents on InvertedPendulum-v2.
#
# For every (response_rate, g_force, seed) whose 'best' checkpoint exists in
# ../models_not_diverged, the policy is rolled out for `n_eval_episodes`
# episodes. During a rollout a horizontal "jitter" force (applied through
# model.opt.gravity[0]) is triggered every `disturb` seconds and its magnitude
# grows by 0.25 g after each application. One summary row per checkpoint is
# appended to `df`, and the state traces are exported to images2/<file>.html.
#
# Fixes relative to the previous version of this cell:
#   * jerk is now accumulated over all episodes and averaged, instead of being
#     reset at the start of each episode (which kept only the last episode's
#     value while dividing by the mean reward of all episodes);
#   * the first-step guard uses `is not None`, so a previous action of exactly
#     0.0 no longer skips the jerk update;
#   * the unused `force = g_force * 9.81` and `steps` variables were removed.
df = pd.DataFrame(columns=['seed', 'g_force', 'response_rate', 'reward', 'angle', 'jerk'])
default_timestep = 0.02
default_frame_skip = 2
jit_duration = 0.02
env_name = 'InvertedPendulum-v2'
n_eval_episodes = 10
for response_rate in response_times:
    for g_force in range(21):
        print(response_rate, g_force)
        for seed in range(5):
            states = []
            arguments = ['TD3', env_name, seed, jit_duration, float(g_force), response_rate, 1.0, False, 256, 'best']
            file_name = '_'.join([str(x) for x in arguments])
            # Pick timestep/frame_skip so that timestep * frame_skip == response_rate.
            if response_rate % default_timestep == 0:
                frame_skip = response_rate / default_timestep
                timestep = default_timestep
            elif jit_duration < response_rate:
                timestep = jit_duration
                frame_skip = response_rate / timestep
            else:
                timestep = response_rate
                frame_skip = 1
            jit_frames = 0  # How many frames the horizontal jitter force lasts each time
            if jit_duration:
                if jit_duration % timestep == 0:
                    jit_frames = int(jit_duration / timestep)
                else:
                    raise ValueError(
                        "jit_duration should be a multiple of the timestep: " + str(timestep))

            time_change_factor = (default_timestep * default_frame_skip) / (timestep * frame_skip)
            eval_env = make_env(env_name, seed, time_change_factor, timestep, frame_skip, False)
            eval_env._max_episode_steps = 100000
            state_dim = eval_env.observation_space.shape[0]
            action_dim = eval_env.action_space.shape[0]
            max_action = float(eval_env.action_space.high[0])
            kwargs = {
                "state_dim": state_dim,
                "action_dim": action_dim,
                "observation_space": eval_env.observation_space,
                "max_action": max_action,
                "discount": 0.99,
                "tau": 0.005,
            }
            # Target policy smoothing is scaled wrt the action scale
            kwargs["policy_noise"] = 2 * max_action
            kwargs["noise_clip"] = 0.5 * max_action
            kwargs["policy_freq"] = 2
            policy = TD3.TD3(**kwargs)
            policy_file = file_name
            # Only evaluate combinations for which a trained checkpoint exists.
            if os.path.exists('../models_not_diverged/'+policy_file+"_critic"):
                policy.load(f"../models_not_diverged/{policy_file}")
                avg_reward = 0.
                avg_angle = 0.
                total_jerk = 0.  # Sum of |action[t] - action[t-1]| over all episodes

                t = 0  # Global step index across all episodes (x axis of the exported plot)
                forces = []
                force_times = []
                for _ in range(n_eval_episodes):
                    state, done = eval_env.reset(), False
                    eval_env.model.opt.gravity[0] = 0
                    counter = 0  # Seconds elapsed since the last perturbation ended
                    disturb = 5  # Seconds between perturbations
                    jittering = False
                    force = 0.25  # Current perturbation magnitude in g
                    prev_action = None
                    while not done:
                        action = policy.select_action(np.array(state))
                        # Perform action
                        if not jittering and round(disturb - counter, 3) >= response_rate:  # Not during the frames when jitter force keeps existing
                            next_state, reward, done, _ = eval_env.step(action)
                            counter += response_rate
                        elif not jittering and round(disturb - counter, 3) < response_rate:
                            # The perturbation starts part-way through this agent step.
                            forces.append(force)
                            force_times.append(t)
                            jitter_force = force * 9.81 * (2 * (np.random.random() > 0.5) - 1)  # Jitter force strength w/ direction
                            next_state, reward, done, _ = eval_env.jitter_step_start(action, jitter_force,
                                                                                     (disturb - counter) / timestep,
                                                                                     frame_skip - ((disturb - counter) / timestep),
                                                                                     jit_frames)
                            jittered_frames = frame_skip - ((disturb - counter) / timestep)
                            if jittered_frames >= jit_frames:
                                jittered_frames = 0
                                jittering = False
                                eval_env.model.opt.gravity[0] = 0
                                counter = 0
                                force += 0.25

                            else:
                                jittering = True
                                eval_env.model.opt.gravity[0] = jitter_force
                                counter += response_rate
                        elif jit_frames - jittered_frames < frame_skip:  # Jitter force will disappear from now!
                            next_state, reward, done, _ = eval_env.jitter_step_end(
                                action, jitter_force, jit_frames - jittered_frames, frame_skip - (jit_frames - jittered_frames))
                            jittering = False  # Stop jittering now
                            eval_env.model.opt.gravity[0] = 0
                            counter = 0
                            force += 0.25
                        else:  # Jitter force keeps existing now!
                            next_state, reward, done, _ = eval_env.step(action)
                            jittered_frames += frame_skip
                            counter += response_rate
                            if jittered_frames == jit_frames:
                                jittering = False
                                eval_env.model.opt.gravity[0] = 0
                                counter = 0
                                force += 0.25

                        avg_reward += reward
                        avg_angle += abs(next_state[1])
                        state = next_state
                        counter = round(counter, 3)
                        if jit_duration:
                            if counter == disturb:
                                # The perturbation starts exactly on the next step boundary.
                                forces.append(force)
                                force_times.append(t)
                                jitter_force = force * 9.81 * (2 * (random.random() > 0.5) - 1)
                                eval_env.model.opt.gravity[0] = jitter_force
                                jittering = True
                                jittered_frames = 0

                        t += 1
                        # `is not None` so that a previous action of exactly 0.0 still counts.
                        if prev_action is not None:
                            total_jerk += abs(action[0] - prev_action)
                        prev_action = action[0]
                        states.append(state)
                states = np.array(states)
                fig = make_subplots(rows=2, cols=1)
                x = list(range(t))
                # Mark every perturbation with a vertical line labelled by its magnitude.
                for index, f in enumerate(force_times):
                    fig.add_shape(go.layout.Shape(type="line",
                                            x0=f,
                                            y0=-100,
                                            x1=f,
                                            y1=100,
                                            ),row=1,col=1)
                    fig.add_annotation(x=f,
                                       y=0,
                                       text=str(forces[index]),
                                       showarrow=False,
                                       row=1, col=1)
                fig.add_trace(go.Scatter(x=x, y=states[:,0], mode='lines', name='pos'), row=1, col=1)
                fig.add_trace(go.Scatter(x=x, y=states[:,1], mode='lines', name='angle'), row=2, col=1)
                fig.update_layout(xaxis_title="Frames")

                fig.write_html("images2/"+file_name+'.html')

                # Per-episode means; jerk is further normalised by the mean episode reward.
                avg_reward /= n_eval_episodes
                avg_angle /= n_eval_episodes
                jerk = (total_jerk / n_eval_episodes) / avg_reward
                df.loc[len(df.index)] = [seed, g_force, response_rate, avg_reward, avg_angle, jerk]
                

0.01 0
0.01 1
0.01 2
0.01 3
0.01 4
0.01 5
0.01 6
0.01 7
0.01 8
0.01 9
0.01 10
0.01 11
0.01 12
0.01 13
0.01 14
0.01 15
0.01 16
0.01 17
0.01 18
0.01 19
0.01 20
0.02 0
0.02 1
0.02 2
0.02 3
0.02 4
0.02 5
0.02 6
0.02 7
0.02 8
0.02 9
0.02 10
0.02 11
0.02 12
0.02 13
0.02 14
0.02 15
0.02 16
0.02 17
0.02 18
0.02 19
0.02 20
0.04 0
0.04 1
0.04 2
0.04 3
0.04 4
0.04 5
0.04 6
0.04 7
0.04 8
0.04 9
0.04 10
0.04 11
0.04 12
0.04 13
0.04 14
0.04 15
0.04 16
0.04 17
0.04 18
0.04 19
0.04 20
0.08 0
0.08 1
0.08 2
0.08 3
0.08 4
0.08 5
0.08 6
0.08 7
0.08 8
0.08 9
0.08 10
0.08 11
0.08 12
0.08 13
0.08 14
0.08 15
0.08 16
0.08 17
0.08 18
0.08 19
0.08 20
0.16 0
0.16 1
0.16 2
0.16 3
0.16 4
0.16 5
0.16 6
0.16 7
0.16 8
0.16 9
0.16 10
0.16 11
0.16 12
0.16 13
0.16 14
0.16 15
0.16 16
0.16 17
0.16 18
0.16 19
0.16 20
0.32 0
0.32 1
0.32 2
0.32 3
0.32 4
0.32 5
0.32 6
0.32 7
0.32 8
0.32 9
0.32 10
0.32 11
0.32 12
0.32 13
0.32 14
0.32 15
0.32 16
0.32 17
0.32 18
0.32 19
0.32 20
0.64 0
0.64 1
0.64 2
0.64 3
0.64 4
0.64 5
0.64 6
0.6

In [21]:
# Persist the results DataFrame to disk (serialized via torch.save).
torch.save(df, 'dataframe_inverted_pendulum_not_diverged')

In [79]:
# Reload the results DataFrame written by torch.save; this rebinds `df`.
df = torch.load('dataframe_inverted_pendulum_not_diverged')

In [80]:
# Rescale the metrics by the response time, then pivot them into
# (g_force x response_rate) tables holding the mean over seeds.
# NOTE(review): `df` is rewritten in place, so running this cell twice applies
# the scaling twice -- reload `df` before re-running.
df['reward'] = df['reward'].mul(df['response_rate'])
df['jerk'] = df['jerk'].div(df['response_rate'])
rewards, jerks = (
    pd.crosstab(index=df['g_force'], columns=df['response_rate'], values=df[metric], aggfunc='mean')
    for metric in ('reward', 'jerk')
)

In [81]:
# Plain heatmap of the mean scaled reward per (training g-force, response time).
# Axis labels are taken from the crosstab itself so they always line up with
# the rows/columns of `z`. Previously they came from `df[...].unique()`, whose
# first-appearance order is not guaranteed to match the sorted crosstab axes.
fig = go.Figure(data=go.Heatmap(
    z=rewards,
    x=[str(rt) for rt in rewards.columns.tolist()],
    y=[str(g) for g in rewards.index.tolist()],
))

fig.update_layout(
    title='Heatmap for with response times vs max. pertubation during training',
    xaxis_title="Response Rate",
    yaxis_title="Max Pertubation during training",
)

In [101]:
# Annotated heatmap of the mean scaled reward per (training g-force, response time).
# Each cell is labelled "<value>s / <value * 0.25 / 5>g". The 0.25 / 5 factor
# matches the evaluation loop, where the perturbation grows by 0.25 g every
# 5 s (`force += 0.25`, `disturb = 5`).
rounded = np.around(rewards.to_numpy(), decimals=2).astype(str)

# Build the labels as plain Python strings. Writing them back into the
# fixed-width NumPy string array would silently truncate any label longer than
# the array's item size.
at = [
    [str(cell) + 's / ' + str(np.around(float(cell) * 0.25 / 5, decimals=2)) + 'g' for cell in row]
    for row in rounded
]

# Axis labels come from the crosstab axes so they stay aligned with `z`
# (df[...].unique() is in first-appearance order, the crosstab is sorted).
fig = ff.create_annotated_heatmap(z=rewards.to_numpy(),
                                  y=[str(int(g)) + 'g' for g in rewards.index.tolist()],
                                  x=[str(rt) + 's' for rt in rewards.columns.tolist()],
                                  annotation_text=at, colorscale='blues', showscale=True,
                                  reversescale=False)

fig.update_layout(
    xaxis_title="Response Time",
    yaxis_title="Max Pertubation during training",
    xaxis_side='bottom',
    font=dict(size=15),
    height=700,
)
fig.show()



In [12]:
# Annotated heatmap of the mean scaled jerk per (training g-force, response time).
# Axis labels come from the `jerks` crosstab axes so they stay aligned with `z`
# (df[...].unique() is in first-appearance order, the crosstab is sorted).
fig = ff.create_annotated_heatmap(z=jerks.to_numpy(),
                                  y=[str(int(g)) + 'g' for g in jerks.index.tolist()],
                                  x=[str(rt) + 's' for rt in jerks.columns.tolist()],
                                  annotation_text=np.around(jerks.to_numpy(), decimals=2),
                                  colorscale='blues', showscale=True,
                                  reversescale=False)

fig.update_layout(
    xaxis_title="Response Time",
    yaxis_title="Max Pertubation during training",
    xaxis_side='bottom',
    font=dict(size=15),
    height=700,
)
fig.show()



# Delayed Environment

In [18]:
# Training perturbation levels (3 through 12 inclusive) covered in the delayed-environment section.
g_forces = list(range(3, 13))

In [19]:
# Print every (response time, g-force, seed) combination whose delayed-env
# *final* actor checkpoint file is absent from ../models_not_diverged.
for rt in response_times:
    for g_force in g_forces:
        for seed in range(5):
            checkpoint = (
                f"../models_not_diverged/TD3_InvertedPendulum-v2_{seed}_0.02_"
                f"{float(g_force)}_{rt}_1.0_True_256_final_actor"
            )
            if os.path.isfile(checkpoint):
                continue
            print(rt, g_force, seed)

0.01 4 0
0.01 7 0
0.02 4 2
0.04 5 4
0.04 6 4
0.08 3 4


In [20]:
# Print every (response time, g-force, seed) combination whose delayed-env
# *best* actor checkpoint file is absent from ../models_not_diverged.
for rt in response_times:
    for g_force in g_forces:
        for seed in range(5):
            checkpoint = (
                f"../models_not_diverged/TD3_InvertedPendulum-v2_{seed}_0.02_"
                f"{float(g_force)}_{rt}_1.0_True_256_best_actor"
            )
            if os.path.isfile(checkpoint):
                continue
            print(rt, g_force, seed)

In [21]:
# Evaluation sweep for the TD3 agents trained on the delayed environment.
#
# For every (response_rate, g_force, seed) whose 'best' checkpoint exists in
# ../models_not_diverged, the policy is rolled out for `n_eval_episodes`
# episodes. During a rollout a horizontal "jitter" force (applied through
# model.opt.gravity[0]) is triggered every `disturb` seconds and its magnitude
# grows by 0.25 g after each application. One summary row per checkpoint is
# appended to `df`. Trajectory plotting is disabled for this sweep.
#
# Fixes relative to the previous version of this cell:
#   * jerk is now accumulated over all episodes and averaged, instead of being
#     reset at the start of each episode (which kept only the last episode's
#     value while dividing by the mean reward of all episodes);
#   * the first-step guard uses `is not None`, so a previous action of exactly
#     0.0 no longer skips the jerk update;
#   * the unused `steps` variable and the commented-out plotting code were removed.
df = pd.DataFrame(columns=['seed', 'g_force', 'response_rate', 'reward', 'angle', 'jerk'])
default_timestep = 0.02
default_frame_skip = 2
jit_duration = 0.02
env_name = 'InvertedPendulum-v2'
n_eval_episodes = 10
for response_rate in response_times:
    for g_force in g_forces:
        print(response_rate, g_force)
        for seed in range(5):
            states = []
            arguments = ['TD3', env_name, seed, jit_duration, float(g_force), response_rate, 1.0, True, 256, 'best']
            file_name = '_'.join([str(x) for x in arguments])
            # Pick timestep/frame_skip so that timestep * frame_skip == response_rate.
            if response_rate % default_timestep == 0:
                frame_skip = response_rate / default_timestep
                timestep = default_timestep
            elif jit_duration < response_rate:
                timestep = jit_duration
                frame_skip = response_rate / timestep
            else:
                timestep = response_rate
                frame_skip = 1
            jit_frames = 0  # How many frames the horizontal jitter force lasts each time
            if jit_duration:
                if jit_duration % timestep == 0:
                    jit_frames = int(jit_duration / timestep)
                else:
                    raise ValueError(
                        "jit_duration should be a multiple of the timestep: " + str(timestep))

            time_change_factor = (default_timestep * default_frame_skip) / (timestep * frame_skip)
            eval_env = make_env(env_name, seed, time_change_factor, timestep, frame_skip, True)
            eval_env.env.env._max_episode_steps = 100000
            state_dim = eval_env.observation_space[0].shape[0]
            action_dim = eval_env.action_space.shape[0]
            max_action = float(eval_env.action_space.high[0])
            kwargs = {
                "state_dim": state_dim,
                "action_dim": action_dim,
                "observation_space": eval_env.observation_space,
                "max_action": max_action,
                "discount": 0.99,
                "tau": 0.005,
                "delayed_env":True
            }
            kwargs["policy_noise"] = 2 * max_action
            kwargs["noise_clip"] = 0.5 * max_action
            kwargs["policy_freq"] = 2
            policy = TD3.TD3(**kwargs)
            policy_file = file_name
            # Only evaluate combinations for which a trained checkpoint exists.
            if os.path.exists('../models_not_diverged/'+policy_file+"_critic"):
                policy.load(f"../models_not_diverged/{policy_file}")
                avg_reward = 0.
                avg_angle = 0.
                total_jerk = 0.  # Sum of |action[t] - action[t-1]| over all episodes

                t = 0  # Global step index across all episodes
                forces = []
                force_times = []
                for _ in range(n_eval_episodes):
                    state, done = eval_env.reset(), False
                    eval_env.model.opt.gravity[0] = 0
                    counter = 0  # Seconds elapsed since the last perturbation ended
                    disturb = 5  # Seconds between perturbations
                    jittering = False
                    force = 0.25  # Current perturbation magnitude in g
                    prev_action = None
                    while not done:
                        action = policy.select_action(np.array(state))
                        # Perform action
                        if not jittering and round(disturb - counter, 3) >= response_rate:  # Not during the frames when jitter force keeps existing
                            next_state, reward, done, _ = eval_env.step(action)
                            counter += response_rate
                        elif not jittering and round(disturb - counter, 3) < response_rate:
                            # The perturbation starts part-way through this agent step.
                            forces.append(force)
                            force_times.append(t)
                            jitter_force = force * 9.81 * (2 * (np.random.random() > 0.5) - 1)  # Jitter force strength w/ direction
                            next_state, reward, done, _ = eval_env.jitter_step_start(action, jitter_force,
                                                                                     (disturb - counter) / timestep,
                                                                                     frame_skip - ((disturb - counter) / timestep),
                                                                                     jit_frames)
                            jittered_frames = frame_skip - ((disturb - counter) / timestep)
                            if jittered_frames >= jit_frames:
                                jittered_frames = 0
                                jittering = False
                                eval_env.model.opt.gravity[0] = 0
                                counter = 0
                                force += 0.25

                            else:
                                jittering = True
                                eval_env.model.opt.gravity[0] = jitter_force
                                counter += response_rate
                        elif jit_frames - jittered_frames < frame_skip:  # Jitter force will disappear from now!
                            next_state, reward, done, _ = eval_env.jitter_step_end(
                                action, jitter_force, jit_frames - jittered_frames, frame_skip - (jit_frames - jittered_frames))
                            jittering = False  # Stop jittering now
                            eval_env.model.opt.gravity[0] = 0
                            counter = 0
                            force += 0.25
                        else:  # Jitter force keeps existing now!
                            next_state, reward, done, _ = eval_env.step(action)
                            jittered_frames += frame_skip
                            counter += response_rate
                            if jittered_frames == jit_frames:
                                jittering = False
                                eval_env.model.opt.gravity[0] = 0
                                counter = 0
                                force += 0.25

                        avg_reward += reward
                        avg_angle += abs(next_state[1])
                        state = next_state
                        counter = round(counter, 3)
                        if jit_duration:
                            if counter == disturb:
                                # The perturbation starts exactly on the next step boundary.
                                forces.append(force)
                                force_times.append(t)
                                jitter_force = force * 9.81 * (2 * (random.random() > 0.5) - 1)
                                eval_env.model.opt.gravity[0] = jitter_force
                                jittering = True
                                jittered_frames = 0

                        t += 1
                        # `is not None` so that a previous action of exactly 0.0 still counts.
                        if prev_action is not None:
                            total_jerk += abs(action[0] - prev_action)
                        prev_action = action[0]
                        states.append(state)

                # Per-episode means; jerk is further normalised by the mean episode reward.
                avg_reward /= n_eval_episodes
                avg_angle /= n_eval_episodes
                jerk = (total_jerk / n_eval_episodes) / avg_reward
                df.loc[len(df.index)] = [seed, g_force, response_rate, avg_reward, avg_angle, jerk]
                

0.01 3
0.01 4
0.01 5
0.01 6
0.01 7
0.01 8
0.01 9
0.01 10
0.01 11
0.01 12
0.02 3
0.02 4
0.02 5
0.02 6
0.02 7
0.02 8
0.02 9
0.02 10
0.02 11
0.02 12
0.04 3
0.04 4
0.04 5
0.04 6
0.04 7
0.04 8
0.04 9
0.04 10
0.04 11
0.04 12
0.08 3
0.08 4
0.08 5
0.08 6
0.08 7
0.08 8
0.08 9
0.08 10
0.08 11
0.08 12
0.16 3
0.16 4
0.16 5
0.16 6
0.16 7
0.16 8
0.16 9
0.16 10
0.16 11
0.16 12
0.32 3
0.32 4
0.32 5
0.32 6
0.32 7
0.32 8
0.32 9
0.32 10
0.32 11
0.32 12
0.64 3
0.64 4
0.64 5
0.64 6
0.64 7
0.64 8
0.64 9
0.64 10
0.64 11
0.64 12


In [22]:
# Persist the delayed-environment results DataFrame to disk (serialized via torch.save).
torch.save(df, 'dataframe_inverted_pendulum_not_diverged_delayed')

In [102]:
# Reload the delayed-environment results DataFrame; this rebinds `df`.
df = torch.load('dataframe_inverted_pendulum_not_diverged_delayed')

In [103]:
# Rescale the metrics by the response time, then pivot them into
# (g_force x response_rate) tables holding the mean over seeds.
# NOTE(review): `df` is rewritten in place, so running this cell twice applies
# the scaling twice -- reload `df` before re-running.
df['reward'] = df['reward'].mul(df['response_rate'])
df['jerk'] = df['jerk'].div(df['response_rate'])
rewards, jerks = (
    pd.crosstab(index=df['g_force'], columns=df['response_rate'], values=df[metric], aggfunc='mean')
    for metric in ('reward', 'jerk')
)

In [104]:
# Annotated heatmap of the mean scaled reward per (training g-force, response
# time) for the delayed environment.
# Each cell is labelled "<value>s / <value * 0.25 / 5>g". The 0.25 / 5 factor
# matches the evaluation loop, where the perturbation grows by 0.25 g every
# 5 s (`force += 0.25`, `disturb = 5`).
rounded = np.around(rewards.to_numpy(), decimals=2).astype(str)

# Build the labels as plain Python strings. Writing them back into the
# fixed-width NumPy string array would silently truncate any label longer than
# the array's item size.
at = [
    [str(cell) + 's / ' + str(np.around(float(cell) * 0.25 / 5, decimals=2)) + 'g' for cell in row]
    for row in rounded
]

# Axis labels come from the crosstab axes so they stay aligned with `z`
# (df[...].unique() is in first-appearance order, the crosstab is sorted).
fig = ff.create_annotated_heatmap(z=rewards.to_numpy(),
                                  y=[str(int(g)) + 'g' for g in rewards.index.tolist()],
                                  x=[str(rt) + 's' for rt in rewards.columns.tolist()],
                                  annotation_text=at, colorscale='blues', showscale=True,
                                  reversescale=False)

fig.update_layout(
    xaxis_title="Response Time",
    yaxis_title="Max Pertubation during training",
    xaxis_side='bottom',
    font=dict(size=15),
    height=700,
)
fig.show()



# Reflex

In [41]:
# The reflex section drops the 0.01 response time used earlier.
response_times = [
    0.02,
    0.04,
    0.08,
    0.16,
    0.32,
    0.64,
]
# Print every (response time, g-force, seed) combination whose reflex *final*
# actor checkpoint file is absent from ../reflex/models.
for rt in response_times:
    for g_force in [8]:
        for seed in range(5):
            checkpoint = (
                f"../reflex/models/TD3_reflex_InvertedPendulum-v2_{seed}_0.02_"
                f"{float(g_force)}_{rt}_1.0_0.01_0.1_final_actor"
            )
            if os.path.isfile(checkpoint):
                continue
            print(rt, g_force, seed)

0.04 8 0
0.16 8 4


In [42]:
# Print every (response time, g-force, seed) combination whose reflex *best*
# actor checkpoint file is absent from ../reflex/models.
for rt in response_times:
    for g_force in [8]:
        for seed in range(5):
            checkpoint = (
                f"../reflex/models/TD3_reflex_InvertedPendulum-v2_{seed}_0.02_"
                f"{float(g_force)}_{rt}_1.0_0.01_0.1_best_actor"
            )
            if os.path.isfile(checkpoint):
                continue
            print(rt, g_force, seed)

In [73]:
# Evaluation sweep for the TD3 "reflex" agents on InvertedPendulum-v2.
#
# For every (response_rate, seed) at g_force 8 whose 'best' checkpoint exists
# in ../reflex/models, the policy is rolled out for `n_eval_episodes` episodes.
# `policy.select_action` returns a (reflex, action) pair; a truthy `reflex`
# counts as two actions in the `actions` tally. During a rollout a horizontal
# "jitter" force (applied through model.opt.gravity[0]) is triggered every
# `disturb` seconds and grows by 0.25 g after each application. One summary row
# per checkpoint is appended to `df`.
#
# NOTE(review): the checkpoint name built here ends in "..._0.02_0.19_best",
# whereas the presence checks earlier in this section look for
# "..._0.01_0.1_best_actor" -- confirm which suffix is correct.
#
# Fixes relative to the previous version of this cell:
#   * jerk is now accumulated over all episodes and averaged, instead of being
#     reset at the start of each episode (which kept only the last episode's
#     value while dividing by the mean reward of all episodes);
#   * the first-step guard uses `is not None`, so a previous action of exactly
#     0.0 no longer skips the jerk update;
#   * the unused `steps` variable was removed.
df = pd.DataFrame(columns=['seed', 'g_force', 'response_rate', 'reward', 'angle', 'jerk', 'actions'])
default_timestep = 0.02
default_frame_skip = 2
jit_duration = 0.02
env_name = 'InvertedPendulum-v2'
reflex_response_rate = 0.02
n_eval_episodes = 10
for response_rate in response_times:
    for g_force in [8]:
        print(response_rate, g_force)
        for seed in range(5):
            states = []
            arguments = ['TD3', 'reflex', env_name, seed, jit_duration, float(g_force), response_rate, 1.0, reflex_response_rate, 0.19, 'best']
            file_name = '_'.join([str(x) for x in arguments])

            # The simulation timestep is chosen from the reflex response rate;
            # frame_skip always covers one full (slow) agent step.
            if reflex_response_rate % default_timestep == 0:
                frame_skip = response_rate / default_timestep
                timestep = default_timestep
            elif jit_duration < reflex_response_rate:
                timestep = jit_duration
                frame_skip = response_rate / timestep
            else:
                timestep = reflex_response_rate
                frame_skip = response_rate / timestep

            jit_frames = 0  # How many frames the horizontal jitter force lasts each time
            if jit_duration:
                if jit_duration % timestep == 0:
                    jit_frames = int(jit_duration / timestep)
                else:
                    raise ValueError(
                        "jit_duration should be a multiple of the timestep: " + str(timestep))

            reflex_frames = int(reflex_response_rate / timestep)

            time_change_factor = (default_timestep * default_frame_skip) / (timestep * frame_skip)
            eval_env = make_env(env_name, seed, time_change_factor, timestep, frame_skip, True)
            eval_env.env.env._max_episode_steps = 100000
            state_dim = eval_env.observation_space[0].shape[0]
            action_dim = eval_env.action_space.shape[0]
            max_action = float(eval_env.action_space.high[0])
            kwargs = {
                "state_dim": state_dim,
                "action_dim": action_dim,
                "observation_space": eval_env.observation_space,
                "max_action": max_action,
                "discount": 0.99,
                "tau": 0.005,
                "delayed_env": True,
                "reflex": True
            }
            kwargs["policy_noise"] = 2 * max_action
            kwargs["noise_clip"] = 0.5 * max_action
            kwargs["policy_freq"] = 2
            policy = TD3.TD3(**kwargs)
            policy_file = file_name
            # Only evaluate combinations for which a trained checkpoint exists.
            if os.path.exists('../reflex/models/' + policy_file + "_critic"):
                policy.load(f"../reflex/models/{policy_file}")
                avg_reward = 0.
                avg_angle = 0.
                total_jerk = 0.  # Sum of |action[t] - action[t-1]| over all episodes
                actions = 0

                t = 0  # Global step index across all episodes
                forces = []
                force_times = []
                for _ in range(n_eval_episodes):
                    state, done = eval_env.reset(), False
                    eval_env.model.opt.gravity[0] = 0
                    counter = 0  # Seconds elapsed since the last perturbation ended
                    disturb = 5  # Seconds between perturbations
                    jittering = False
                    force = 0.25  # Current perturbation magnitude in g
                    prev_action = None
                    while not done:
                        reflex, action = policy.select_action(state)

                        if reflex:
                            actions += 2
                        else:
                            actions += 1
                        # Perform action
                        if not jittering and round(disturb - counter, 3) >= response_rate:  # Not during the frames when jitter force keeps existing
                            next_state, reward, done = env_step(eval_env, reflex, action, reflex_frames, frame_skip)
                            counter += response_rate
                        elif not jittering and round(disturb - counter, 3) < response_rate:
                            # The perturbation starts part-way through this agent step.
                            forces.append(force)
                            force_times.append(t)
                            frames_simulated = 0
                            force_frames_simulated = 0
                            jitter_force = force * 9.81 * (
                                        2 * (np.random.random() > 0.5) - 1)  # Jitter force strength w/ direction

                            if reflex and round(disturb - counter, 3) / timestep >= reflex_frames:
                                next_state, reward, done, _ = eval_env.jitter_step_end(reflex, 0, reflex_frames, 0)
                                frames_simulated += reflex_frames
                            elif reflex and round(disturb - counter, 3) / timestep < reflex_frames:
                                next_state, reward, done, _ = eval_env.jitter_step_end(reflex, 0,
                                                                                  round(disturb - counter, 3) / timestep, 0)
                                next_state, reward, done, _ = eval_env.jitter_step_end(reflex, jitter_force,
                                                                                  reflex_frames - (round(disturb - counter, 3) / timestep), 0)
                                frames_simulated += reflex_frames
                                force_frames_simulated += reflex_frames - (round(disturb - counter, 3) / timestep)

                            next_state, reward, done, _ = eval_env.jitter_step_start(action, jitter_force,
                                                        max((round(disturb - counter, 3) / timestep) - frames_simulated, 0),
                                                        frame_skip - max((round(disturb - counter, 3) / timestep),
                                                        frames_simulated), jit_frames - force_frames_simulated)


                            jittered_frames = frame_skip - (round(disturb - counter, 3) / timestep)
                            if jittered_frames >= jit_frames:
                                jittered_frames = 0
                                jittering = False
                                eval_env.model.opt.gravity[0] = 0
                                counter = 0
                                force += 0.25
                            else:
                                jittering = True
                                eval_env.model.opt.gravity[0] = jitter_force
                                counter += response_rate

                        elif jit_frames - jittered_frames < frame_skip:  # Jitter force will disappear from now!
                            frames_simulated = 0
                            if reflex:
                                if reflex_frames <= jit_frames - jittered_frames:
                                    next_state, reward, done, _ = eval_env.jitter_step_end(reflex, jitter_force, reflex_frames, 0)
                                else:
                                    next_state, reward, done, _ = eval_env.jitter_step_end(reflex, jitter_force,
                                                                                      jit_frames - jittered_frames,
                                                                                      reflex_frames - jit_frames + jittered_frames)

                                frames_simulated += reflex_frames

                            next_state, reward, done, _ = eval_env.jitter_step_end(
                                action, jitter_force, max(jit_frames - jittered_frames - frames_simulated, 0),
                                frame_skip - max((jit_frames - jittered_frames), frames_simulated))
                            jittering = False  # Stop jittering now
                            eval_env.model.opt.gravity[0] = 0
                            jittered_frames = 0
                            counter = 0
                            force += 0.25
                        else:  # Jitter force keeps existing now!
                            next_state, reward, done = env_step(eval_env, reflex, action, reflex_frames, frame_skip)
                            jittered_frames += frame_skip
                            counter += response_rate
                            if jittered_frames == jit_frames:
                                jittering = False
                                eval_env.model.opt.gravity[0] = 0
                                counter = 0
                                force += 0.25

                        avg_reward += reward
                        avg_angle += abs(next_state[1])
                        state = next_state
                        counter = round(counter, 3)
                        if jit_duration:
                            if counter == disturb:
                                # The perturbation starts exactly on the next step boundary.
                                forces.append(force)
                                force_times.append(t)
                                jitter_force = force * 9.81 * (2 * (random.random() > 0.5) - 1)
                                eval_env.model.opt.gravity[0] = jitter_force
                                jittering = True
                                jittered_frames = 0

                        t += 1
                        # `is not None` so that a previous action of exactly 0.0 still counts.
                        if prev_action is not None:
                            total_jerk += abs(action[0] - prev_action)
                        prev_action = action[0]
                        states.append(state)

                # Per-episode means; jerk is further normalised by the mean episode reward.
                avg_reward /= n_eval_episodes
                avg_angle /= n_eval_episodes
                jerk = (total_jerk / n_eval_episodes) / avg_reward
                actions /= n_eval_episodes
                df.loc[len(df.index)] = [seed, g_force, response_rate, avg_reward, avg_angle, jerk, actions]


0.02 8
0.04 8
0.08 8
0.16 8
0.32 8
0.64 8


In [74]:
# Persist the results frame to disk via torch.save so the analysis cells below
# can be re-run without repeating the evaluation loop.
torch.save(df, 'dataframe_inverted_pendulum_not_diverged_torch')

In [75]:
# Reload the results frame written by the torch.save cell and rebind `df` to it.
# NOTE(review): torch.load unpickles arbitrary objects, so only load files you
# produced yourself.  Newer PyTorch releases reportedly default to
# weights_only=True, which would presumably refuse a pickled DataFrame --
# confirm against the installed torch version.
df = torch.load('dataframe_inverted_pendulum_not_diverged_torch')

In [76]:
# Rescale the stored metrics by the response time.
#
# The unscaled values are stashed in `<metric>_raw` columns the first time this
# cell runs, and the scaled columns are always recomputed from those originals.
# Without this, every re-run of the cell would multiply `reward` (and divide
# `jerk`) by `response_rate` one more time and silently corrupt the tables.
for metric in ('reward', 'jerk'):
    raw_column = metric + '_raw'
    if raw_column not in df.columns:
        df[raw_column] = df[metric]

# reward * response_rate: presumably converts a per-step count into elapsed
# seconds (the heatmap's commented-out title speaks of "evaluation seconds")
# -- TODO confirm.
df['reward'] = df['reward_raw'] * df['response_rate']
df['jerk'] = df['jerk_raw'] / df['response_rate']

# One table per metric: rows are g_force, columns are response_rate, and each
# cell is the mean over all rows of `df` sharing that (g_force, response_rate).
rewards = pd.crosstab(df['g_force'], df['response_rate'], values=df['reward'], aggfunc='mean')
jerks = pd.crosstab(df['g_force'], df['response_rate'], values=df['jerk'], aggfunc='mean')
# NOTE(review): `actions` is also the name of a numeric accumulator in the
# evaluation loop (`actions /= 10`); this line rebinds it to a DataFrame.
actions = pd.crosstab(df['g_force'], df['response_rate'], values=df['actions'], aggfunc='mean')

In [77]:
# Annotated heatmap of the mean reward per (g_force, response_rate) cell.
#
# Tick labels are taken from the crosstab's own axes rather than from
# df[...].unique(): unique() returns values in order of first appearance, while
# the crosstab axes define the actual row/column order (and length) of the
# matrix being plotted, so labels built from `rewards` cannot drift out of step
# with the data.
g_force_labels = [str(int(g)) + 'g' for g in rewards.index]
response_labels = [str(rt) + 's' for rt in rewards.columns]
reward_grid = rewards.to_numpy()

fig = ff.create_annotated_heatmap(
    z=reward_grid,
    x=response_labels,
    y=g_force_labels,
    annotation_text=np.around(reward_grid, decimals=2),  # cell text rounded to 2 dp
    colorscale='blues',
    showscale=True,
    reversescale=False,
)

fig.update_layout(
#     title='Average evaluation seconds for response times vs max. pertubation during training',
    xaxis_title="Response Time",
    yaxis_title="Max Pertubation during training",
    xaxis_side='bottom',
    font=dict(size=15),
    height=700,
)
fig.show()



In [78]:
# Display the mean-`actions` crosstab (rows: g_force, columns: response_rate).
actions

response_rate,0.02,0.04,0.08,0.16,0.32,0.64
g_force,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
8.0,7946.74,3659.14,1528.26,188.48,4.6,2.3


In [51]:
# Inspect component 1 of the current state -- the same index whose absolute
# value the evaluation loop accumulates into avg_angle.
state[1]

-0.049293086

In [26]:
# Query `policy` for the current state; select_action is unpacked here into a
# (reflex, action) pair.  Relies on `policy` and `state` already living in the
# kernel from earlier cells.
reflex, action = policy.select_action(state)

In [27]:
# Inspect the reflex output of the policy.
# NOTE(review): the recorded value (~85.3) is far larger in magnitude than the
# recorded action (-3.0); confirm whether reflex is meant to be bounded.
reflex

array([85.29129], dtype=float32)

In [28]:
# Inspect the action output of the policy (recorded value: -3.0).
action

array([-3.], dtype=float32)

In [30]:
# Advance the evaluation environment by one decision step with the same
# env_step call the evaluation loop uses in its "jitter force keeps existing"
# branch.  `reflex_frames` and `frame_skip` are whatever values the kernel
# still holds -- presumably left over from the evaluation cell; verify.
next_state, reward, done = env_step(eval_env, reflex, action, reflex_frames, frame_skip)

In [31]:
# Inspect the observation returned by env_step.  The recorded output has five
# components and its last one (-3.0) equals the action just applied --
# presumably the environment appends the applied action to the observation;
# confirm against the env wrapper.
next_state

array([ 1.01577258e+00,  1.57152149e+00,  1.00466071e-10, -1.59515444e-13,
       -3.00000000e+00])

In [33]:
# Start a fresh episode: take the initial observation from the environment and
# clear the termination flag.
state = eval_env.reset()
done = False

In [54]:
# Probe: overwrite component 1 of the observation (the index the evaluation
# loop treats as the angle for avg_angle) with 0.2 before querying the policy.
# NOTE(review): this edits the Python-side `state` array; whether the simulator
# inside eval_env is affected is not visible here -- confirm.
state[1] = 0.2

In [55]:
# Query `policy` again, this time on the hand-edited state (state[1] == 0.2 per
# the preceding cell by execution count).
reflex, action = policy.select_action(state)

In [56]:
# Display both policy outputs together as a tuple.
reflex, action

(array([2.9999998], dtype=float32), array([-3.], dtype=float32))

In [49]:
# Step the evaluation environment once with the current reflex/action.
# NOTE(review): this cell's execution count (49) is lower than the In [54]-[56]
# cells that precede it in the document, so its recorded output was not
# produced from the state[1] = 0.2 probe.  Re-run top to bottom before relying
# on it.
next_state, reward, done = env_step(eval_env, reflex, action, reflex_frames, frame_skip)

In [50]:
# Inspect the observation returned by the step.  In the recorded output
# component 1 is ~1.5715 -- looks like roughly pi/2, i.e. possibly a fallen
# pole; confirm the meaning of this component.
next_state

array([-1.0006816e+00,  1.5715215e+00,  2.4704256e-09,  2.2436794e-07,
       -3.0000000e+00], dtype=float32)

In [45]:
# Carry the latest observation forward as the current state.  This rebinds the
# name only; no copy is made, so `state` and `next_state` refer to the same
# object afterwards.
state= next_state

In [86]:
# Dump the reward crosstab as a NumPy array of strings (full float repr, no
# rounding).
# NOTE(review): the recorded output has seven entries per row, whereas the
# `actions` crosstab displayed earlier has six response_rate columns and a
# single g_force row -- this output appears to come from a different `rewards`
# than the one built in the cells above; re-run to confirm.
rewards.to_numpy().astype(str)

array([['71.1406', '74.1356', '88.4384', '69.8432', '40.224000000000004',
        '0.544', '0.64'],
       ['73.519', '96.1904', '90.91839999999999', '75.568',
        '46.57600000000001', '0.544', '0.64'],
       ['100.9576', '87.1704', '95.1824', '78.4992',
        '63.391999999999996', '0.544', '0.64'],
       ['123.7394', '108.71', '98.87920000000001', '118.8336',
        '58.63040000000001', '0.544', '0.64'],
       ['120.5666', '129.63160000000002', '109.04720000000002',
        '112.48159999999999', '80.8992', '0.544', '0.64'],
       ['135.1484', '133.0512', '129.60320000000002', '129.1968',
        '65.4464', '0.544', '0.64'],
       ['138.7362', '154.29360000000003', '146.5768', '131.8576',
        '70.6464', '0.544', '0.64'],
       ['118.00460000000001', '161.59160000000003', '156.8408',
        '113.7536', '66.30080000000001', '0.544', '0.64'],
       ['148.302', '170.2392', '161.8912', '139.54080000000002',
        '62.441599999999994', '0.544', '0.64'],
       ['140.0220