In [430]:
import numpy as np
from MountainCarEnv import MountainCarEnv
import matplotlib.pyplot as plot

In [431]:
# The render mode was spelled "rbg_array"; the standard Gym-style name for
# off-screen frame rendering is "rgb_array".
# NOTE(review): assumes the local MountainCarEnv follows the Gym render-mode
# names -- confirm against MountainCarEnv.py.
env = MountainCarEnv(render_mode="rgb_array")

In [432]:
# Evenly spaced bin edges spanning the environment's position and speed limits;
# get_state() passes these to np.digitize to discretise an observation.
pos_space = np.linspace(start=env.min_position, stop=env.max_position, num=12)
vel_space = np.linspace(start=-env.max_speed, stop=env.max_speed, num=20)

# Tabular action values, zero-initialised: one axis per discretised state
# variable and a final axis with one entry per action.
Q = np.zeros(shape=(pos_space.size, vel_space.size, env.action_space.n))


In [433]:
def epsilon_greedy_policy(state, Q, epsilon):
    """Choose an action for `state` with an epsilon-greedy rule.

    Args:
        state: Sequence whose first two items index the first two axes of `Q`.
        Q: Array indexed as Q[state[0], state[1]], giving one value per action.
        epsilon: Probability of taking a random action instead of the greedy one.

    Returns:
        A sample from the module-level `env.action_space` when exploring,
        otherwise the index of the largest entry of Q[state[0], state[1]]
        (the first such index on ties, per np.argmax).
    """
    # One Bernoulli(epsilon) draw decides between exploring and exploiting.
    if np.random.binomial(1, epsilon):
        return env.action_space.sample()

    action_values = Q[state[0], state[1]]
    return np.argmax(action_values)

def get_state(obs, pos_bins=None, vel_bins=None):
    """Map a continuous (position, velocity) observation to table indices.

    Args:
        obs: Two-item sequence unpacked as (position, velocity).
        pos_bins: Increasing bin edges for position. Defaults to the
            module-level `pos_space`.
        vel_bins: Increasing bin edges for velocity. Defaults to the
            module-level `vel_space`.

    Returns:
        Tuple (pos_bin, vel_bin), each in the range 0 .. len(bins) - 1, so the
        pair is always a valid index into an array whose first two axes have
        lengths len(pos_bins) and len(vel_bins).
    """
    if pos_bins is None:
        pos_bins = pos_space
    if vel_bins is None:
        vel_bins = vel_space

    pos, vel = obs
    # np.digitize returns len(bins) for any value >= bins[-1]. The last edge of
    # each default grid is the environment limit itself, so an observation
    # sitting exactly on that limit would index one past the end of the table.
    # Clamp into the valid range; indices for all other values are unchanged.
    pos_bin = np.clip(np.digitize(pos, pos_bins), 0, len(pos_bins) - 1)
    vel_bin = np.clip(np.digitize(vel, vel_bins), 0, len(vel_bins) - 1)
    return pos_bin, vel_bin

def run_mountain_car(Q, epsilon, alpha, gamma):
    """Play one episode on the module-level `env`, updating `Q` in place.

    Each step picks an action with `epsilon_greedy_policy`, applies it to the
    environment, and moves Q[state, action] towards
    reward + gamma * max(Q[next_state]) by a fraction `alpha`.

    Args:
        Q: Action-value array indexed as Q[pos_bin, vel_bin, action]; mutated.
        epsilon: Exploration probability forwarded to `epsilon_greedy_policy`.
        alpha: Step size applied to each temporal-difference error.
        gamma: Discount applied to the best value of the next state.

    Returns:
        Sum of the rewards received during the episode.
    """
    episode_return = 0
    state = get_state(env.reset())
    finished = False

    while not finished:
        action = epsilon_greedy_policy(state, Q, epsilon)
        obs, reward, finished, _ = env.step(action)
        episode_return += reward
        next_state = get_state(obs)

        # The target bootstraps from the greedy value of the next state on
        # every step, including the one that ends the episode.
        td_target = reward + gamma * np.max(Q[next_state[0], next_state[1]])
        td_error = td_target - Q[state[0], state[1], action]
        Q[state[0], state[1], action] += alpha * td_error

        state = next_state

    return episode_return



In [434]:
# Per-episode training history.
total_Q = []           # mean of the whole Q table after each episode
learning_rewards = []  # total reward of each episode

# Hyper-parameters.
num_episodes = 20000
epsilon = 1.0
alpha = 0.1
gamma = 0.99
log_every = 100        # episodes between progress reports

# Linear decay step: with this size epsilon reaches the 0.01 floor after
# roughly the first half of the episodes.
reduction = 2 * epsilon / num_episodes

for i in range(num_episodes):
    totalReward = run_mountain_car(Q, epsilon, alpha, gamma)
    if epsilon > 0.01:
        epsilon -= reduction
    learning_rewards.append(totalReward)
    total_Q.append(np.mean(Q))
    if (i + 1) % log_every == 0:
        # Report the average over the episodes since the previous report.
        # Taking np.mean of the latest scalar return only echoed one episode.
        recent_mean_reward = np.mean(learning_rewards[-log_every:])
        print(f"Episode {i+1} out of {num_episodes}; Reward {recent_mean_reward}; Epsilon {epsilon}")



Episode 100 out of 20000; Reward -500.0; Epsilon 0.9
Episode 200 out of 20000; Reward -500.0; Epsilon 0.9
Episode 300 out of 20000; Reward -500.0; Epsilon 0.9
Episode 400 out of 20000; Reward -500.0; Epsilon 0.9
Episode 500 out of 20000; Reward -500.0; Epsilon 0.9
Episode 600 out of 20000; Reward -500.0; Epsilon 0.9
Episode 700 out of 20000; Reward -500.0; Epsilon 0.9
Episode 800 out of 20000; Reward -500.0; Epsilon 0.9
Episode 900 out of 20000; Reward -500.0; Epsilon 0.9
Episode 1000 out of 20000; Reward -500.0; Epsilon 0.9
Episode 1100 out of 20000; Reward -500.0; Epsilon 0.9
Episode 1200 out of 20000; Reward -500.0; Epsilon 0.9
Episode 1300 out of 20000; Reward -500.0; Epsilon 0.9
Episode 1400 out of 20000; Reward -500.0; Epsilon 0.9
Episode 1500 out of 20000; Reward -500.0; Epsilon 0.9
Episode 1600 out of 20000; Reward -500.0; Epsilon 0.9
Episode 1700 out of 20000; Reward -500.0; Epsilon 0.9
Episode 1800 out of 20000; Reward -500.0; Epsilon 0.9
Episode 1900 out of 20000; Reward -50

KeyboardInterrupt: 

In [None]:
# Plot learning rewards: total reward of each training episode (1-based index).
plot.plot(list(range(1, len(learning_rewards) + 1)), learning_rewards)
plot.xlabel("Episode")
plot.ylabel("Total reward")
plot.title("Reward per training episode")
plot.show()

In [None]:
# Plot Q: mean value of the whole Q table recorded after each training episode.
plot.plot(list(range(1, len(total_Q) + 1)), total_Q, color='blue')
plot.xlabel("Episode")
plot.ylabel("Mean Q value")
plot.title("Mean Q-table value during training")
plot.show()
