In [1]:
import numpy as np
import gym
import time # to get the time
import math # needed for calculations

In [2]:
#runs an entire episode calculating how long the pole stayed upright, until it falls
def run_episode(env, parameters):
    observation = env.reset()
    totalreward = 0
    for _ in range(200):
        #env.render()
        action = 0 if np.matmul(parameters,observation) < 0 else 1
        observation, reward, done, info = env.step(action)
        totalreward += reward
        if done:
            break
    return totalreward


def get_discrete_state(state):
    discrete_state = state/np_array_win_size+ np.array([15,10,1,10])
    return tuple(discrete_state.astype(np.int))

In [3]:
#basic single episode implementation, returns reward value based on one set of paramters
env = gym.make('CartPole-v1')
print(env.action_space.n)
###############################################################
parameters = np.random.rand(4) * 2 - 1
print('the parameters for this test are', parameters)
run_episode(env, parameters)

2
the parameters for this test are [-0.77207358  0.19018166 -0.21072037  0.38291471]


98.0

In [4]:
#Random Policy Implementation
#random search, runs 10,000 iterations, once it finds a parameter set that returns reward of 200, it breaks and returns
bestparams = None
bestreward = 0
for _ in range(10000):
    parameters = np.random.rand(4) * 2 - 1
    reward = run_episode(env,parameters)
    if reward > bestreward:
        bestreward = reward
        bestparams = parameters
        # considered solved if the agent lasts 200 timesteps
        if reward == 200:
            break
print('iterations are', _)
print('params are', parameters)
print('reward is', reward)

iterations are 9
params are [0.43497743 0.62092678 0.96766357 0.6679255 ]
reward is 200.0


In [5]:
#defining variables for Q-Learning
LEARNING_RATE = 0.1

DISCOUNT = 0.95
EPISODES = 60000
total = 0
total_reward = 0
prior_reward = 0

Observation = [30, 30, 50, 50]
np_array_win_size = np.array([0.25, 0.25, 0.01, 0.1])

epsilon = 1

epsilon_decay_value = 0.99995

In [6]:
#Q Table setup
q_table = np.random.uniform(low=0, high=1, size=(Observation + [env.action_space.n]))
q_table.shape

(30, 30, 50, 50, 2)

In [7]:
for episode in range(EPISODES + 1): #go through the episodes
    t0 = time.time() #set the initial time
    discrete_state = get_discrete_state(env.reset()) #get the discrete start for the restarted environment 
    done = False
    episode_reward = 0 #reward starts as 0 for each episode

    if episode % 2000 == 0: 
        print("Episode: " + str(episode))
    while not done: 
        if np.random.random() > epsilon:
            action = np.argmax(q_table[discrete_state]) #take cordinated action
        else:
            action = np.random.randint(0, env.action_space.n) #do a random action
        new_state, reward, done, _ = env.step(action) #step action to get new states, reward, and the "done" status.
        episode_reward += reward #add the reward
        new_discrete_state = get_discrete_state(new_state)
        if episode % 2000 == 0: #render
            env.render()
        if not done: #update q-table
            max_future_q = np.max(q_table[new_discrete_state])
            current_q = q_table[discrete_state + (action,)]
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
            q_table[discrete_state + (action,)] = new_q
        discrete_state = new_discrete_state
    if epsilon > 0.05: #epsilon modification
        if episode_reward > prior_reward and episode > 10000:
            epsilon = math.pow(epsilon_decay_value, episode - 10000)
            if episode % 500 == 0:
                print("Epsilon: " + str(epsilon))
    t1 = time.time() #episode has finished
    episode_total = t1 - t0 #episode total time
    total = total + episode_total
    total_reward += episode_reward #episode total reward
    prior_reward = episode_reward
    if episode % 1000 == 0: #every 1000 episodes print the average time and the average reward
        mean = total / 1000
        print("Time Average: " + str(mean))
        total = 0
        mean_reward = total_reward / 1000
        print("Mean Reward: " + str(mean_reward))
        total_reward = 0

env.close()

Episode: 0
Time Average: 0.0005395569801330566
Mean Reward: 0.02
Time Average: 0.0005635209083557129
Mean Reward: 22.156
Episode: 2000
Time Average: 0.0009028017520904541
Mean Reward: 22.046
Time Average: 0.0005565083026885986
Mean Reward: 21.999
Episode: 4000
Time Average: 0.0007101035118103027
Mean Reward: 22.175
Time Average: 0.0005814833641052246
Mean Reward: 22.939
Episode: 6000
Time Average: 0.0007001378536224365
Mean Reward: 22.377
Time Average: 0.0005475616455078125
Mean Reward: 21.942
Episode: 8000
Time Average: 0.0010322530269622802
Mean Reward: 22.868
Time Average: 0.000545264482498169
Mean Reward: 21.7
Episode: 10000
Time Average: 0.0008856346607208252
Mean Reward: 21.902
Epsilon: 0.9753093024395111
Time Average: 0.0005724875926971435
Mean Reward: 22.421
Epsilon: 0.9277417467531685
Episode: 12000
Epsilon: 0.9048351558698463
Time Average: 0.0011609878540039063
Mean Reward: 23.762
Epsilon: 0.8824941446941661
Time Average: 0.0006390032768249512
Mean Reward: 24.884
Epsilon: 0.8