In [1]:
import numpy as np
import math
import gym

# CartPole

In [11]:
# Hacked reset (initial-state sampling) from OpenAI environment CartPole-v0
def cartpole_reset():
    """Sample a fresh CartPole start state.

    Returns:
        np.ndarray of shape (4,), each entry drawn independently and
        uniformly from [-0.05, 0.05) using NumPy's global random state.
    """
    bound = 0.05
    return np.array(np.random.uniform(low=-bound, high=bound, size=(4,)))

In [12]:
# Hacked Dynamics from OpenAI environment CartPole-v0
def cartpole_step(state, action):
    """Advance the CartPole system by one explicit-Euler step of ``tau`` seconds.

    Args:
        state: sequence of four numbers ``(x, x_dot, theta, theta_dot)``.
        action: ``1`` pushes the cart with ``+force_mag``; any other value
            pushes it with ``-force_mag``.

    Returns:
        Tuple ``(next_state, reward, done, info)``:
          * ``next_state`` -- a newly allocated float ndarray of shape (4,).
            It never shares memory with ``state``, so callers may modify it
            in place without touching their own copy of the input.
          * ``reward`` -- ``1`` for a step taken from a valid state, ``0`` if
            ``state`` was already outside the allowed region.
          * ``done`` -- ``True`` if the returned state is outside the allowed
            region (|x| > 2.4 or |theta| > 12 degrees).
          * ``info`` -- always an empty dict.
    """
    gravity = 9.8
    masscart = 1.0
    masspole = 0.1
    total_mass = (masspole + masscart)
    length = 0.5  # actually half the pole's length
    polemass_length = (masspole * length)
    force_mag = 10.0
    tau = 0.02  # seconds between state updates

    # Limits beyond which the episode is considered failed.
    theta_threshold_radians = 12 * 2 * math.pi / 360
    x_threshold = 2.4

    def out_of_bounds(position, angle):
        # Single definition of the failure test, used both for the incoming
        # state and for the freshly integrated one.
        return bool(
            position < -x_threshold
            or position > x_threshold
            or angle < -theta_threshold_radians
            or angle > theta_threshold_radians
        )

    x, x_dot, theta, theta_dot = state

    if out_of_bounds(x, theta):
        # The episode was already over: the state is frozen and no reward is
        # given. Return a copy rather than `state` itself so that in-place
        # edits of the result cannot leak back into the caller's state.
        return np.array(state, dtype=float), 0, True, {}

    force = force_mag if action == 1 else -force_mag
    costheta = math.cos(theta)
    sintheta = math.sin(theta)
    temp = (force + polemass_length * theta_dot * theta_dot * sintheta) / total_mass
    thetaacc = (gravity * sintheta - costheta * temp) / (
        length * (4.0 / 3.0 - masspole * costheta * costheta / total_mass)
    )
    xacc = temp - polemass_length * thetaacc * costheta / total_mass

    # Explicit Euler: positions advance with the *old* velocities, so the
    # order of these four assignments matters.
    x = x + tau * x_dot
    x_dot = x_dot + tau * xacc
    theta = theta + tau * theta_dot
    theta_dot = theta_dot + tau * thetaacc
    next_state = np.array([x, x_dot, theta, theta_dot])

    return next_state, 1, out_of_bounds(x, theta), {}

# Noisy CartPole

In [13]:
def noisycartpole_reset():
    """Return an initial state for the noisy CartPole.

    Delegates entirely to ``cartpole_reset()``; no noise is added here.
    """
    initial_state = cartpole_reset()
    return initial_state

In [14]:
def noisycartpole_step(state, action):
    """Take one ``cartpole_step`` and perturb the result with Gaussian noise.

    Zero-mean Gaussian noise with per-component standard deviations
    ``[0.01, 0, 0.01, 0]`` is added to the state returned by
    ``cartpole_step``, so only entries 0 and 2 are actually perturbed.
    ``reward``, ``done`` and ``info`` are passed through unchanged, i.e. they
    describe the state *before* the noise was applied.

    Args:
        state: current state, forwarded to ``cartpole_step``.
        action: action, forwarded to ``cartpole_step``.

    Returns:
        Tuple ``(next_state, reward, done, info)`` where ``next_state`` is a
        newly allocated float ndarray; ``state`` is never modified.
    """
    transitionSigmas = np.array([0.01, 0.0, 0.01, 0.0])

    next_state, reward, done, info = cartpole_step(state, action)  # CartPole step

    noise = np.random.randn(transitionSigmas.size) * transitionSigmas

    # Add out of place. The array returned by cartpole_step may be the very
    # object the caller passed in as `state`; an in-place `+=` would then
    # silently modify the caller's state (or, for a list, extend it).
    next_state = np.asarray(next_state, dtype=float) + noise

    return next_state, reward, done, info

# Evaluation

In [15]:
def _report(ok_message, error_message, matches, reference, hacked):
    """Print the OK line, or the error banner followed by both values."""
    if matches:
        print(ok_message)
        return
    print(error_message)
    print(reference)
    print(hacked)


env = gym.make('CartPole-v0')

for e in range(100):

    print('====== Epoch %s =======================' % (e))

    state = env.reset()
    # A hacked initial state is sampled as well, but it is not compared with
    # the gym one (that check was disabled in the original notebook).
    state_hacked = cartpole_reset()

    for t in range(1000):

        # Reference transition from the OpenAI Gym CartPole-v0 environment.
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)

        # Same transition computed by the hacked re-implementation.
        next_state_hacked, reward_hacked, done_hacked, _ = cartpole_step(state, action)

        # Side-by-side comparison of the two results.
        print('--- Timestep : %s' % (t))
        _report('Next State OK.', '*** Next State Error!!! ***',
                np.array_equal(next_state, next_state_hacked),
                next_state, next_state_hacked)
        _report('Reward OK.', '*** Reward Error!!! ***',
                reward == reward_hacked,
                reward, reward_hacked)
        _report('Done OK.', '*** Done Error!!! ***',
                done == done_hacked,
                done, done_hacked)

        # Continue from the gym state so both implementations see identical input.
        state = next_state
        if done:
            break

[2017-08-09 13:13:28,462] Making new env: CartPole-v0


--- Timestep : 0
Next State OK.
Reward OK.
Done OK.
--- Timestep : 1
Next State OK.
Reward OK.
Done OK.
--- Timestep : 2
Next State OK.
Reward OK.
Done OK.
--- Timestep : 3
Next State OK.
Reward OK.
Done OK.
--- Timestep : 4
Next State OK.
Reward OK.
Done OK.
--- Timestep : 5
Next State OK.
Reward OK.
Done OK.
--- Timestep : 6
Next State OK.
Reward OK.
Done OK.
--- Timestep : 7
Next State OK.
Reward OK.
Done OK.
--- Timestep : 8
Next State OK.
Reward OK.
Done OK.
--- Timestep : 9
Next State OK.
Reward OK.
Done OK.
--- Timestep : 10
Next State OK.
Reward OK.
Done OK.
--- Timestep : 11
Next State OK.
Reward OK.
Done OK.
--- Timestep : 12
Next State OK.
Reward OK.
Done OK.
--- Timestep : 13
Next State OK.
Reward OK.
Done OK.
--- Timestep : 14
Next State OK.
Reward OK.
Done OK.
--- Timestep : 15
Next State OK.
Reward OK.
Done OK.
--- Timestep : 16
Next State OK.
Reward OK.
Done OK.
--- Timestep : 17
Next State OK.
Reward OK.
Done OK.
--- Timestep : 18
Next State OK.
Reward OK.
Done OK.
---