In [128]:
import numpy as np
from MountainCarEnv import MountainCarEnv
import matplotlib.pyplot as plot

In [129]:
# Build the environment. The render mode string was misspelled as "rbg_array";
# "rgb_array" is the conventional name for frame-array rendering.
# NOTE(review): MountainCarEnv is a project-local module — confirm it accepts "rgb_array".
env = MountainCarEnv(render_mode="rgb_array")

In [130]:
# Discretisation grid: the same number of evenly spaced edges for position and
# velocity, spanning the limits exposed by the environment.
numBins = 20
pos_space = np.linspace(env.min_position, env.max_position, numBins)
vel_space = np.linspace(-env.max_speed, env.max_speed, numBins)

# Tabular action-value function indexed as Q[position_bin, velocity_bin, action],
# initialised to zero.
Q = np.zeros((pos_space.size, vel_space.size, env.action_space.n))


In [131]:
def epsilon_greedy_policy(state, Q, epsilon):
    """Choose an action for ``state`` using an epsilon-greedy rule.

    Parameters
    ----------
    state : sequence
        ``state[0]`` and ``state[1]`` index the first two axes of ``Q``.
    Q : np.ndarray
        Action-value table; the last axis is searched for the greedy action.
    epsilon : float
        Probability of exploring (passed to a single Bernoulli draw).

    Returns
    -------
    The result of ``env.action_space.sample()`` when exploring, otherwise the
    ``np.argmax`` over ``Q[state[0], state[1]]``.
    """
    # One Bernoulli(epsilon) draw decides between exploring and exploiting.
    if np.random.binomial(1, epsilon):
        return env.action_space.sample()

    action_values = Q[state[0], state[1]]
    return np.argmax(action_values)

def get_state(obs, pos_bins=None, vel_bins=None):
    """Map a continuous observation to a pair of discrete bin indices.

    Parameters
    ----------
    obs : sequence
        Unpacked as ``(pos, vel)``.
    pos_bins, vel_bins : array-like, optional
        Monotonically increasing bin edges. When omitted, the module-level
        ``pos_space`` / ``vel_space`` grids are used.

    Returns
    -------
    tuple[int, int]
        ``(pos_bin, vel_bin)``, each in ``[0, len(bins) - 1]``.

    Notes
    -----
    ``np.digitize`` returns ``len(bins)`` for any value at or above the last
    edge. A table sized ``len(bins)`` along that axis cannot be indexed with
    that value, so the result is clamped to the last valid index. Values that
    were already in range keep exactly the index they had before.
    """
    if pos_bins is None:
        pos_bins = pos_space
    if vel_bins is None:
        vel_bins = vel_space

    pos, vel = obs
    # Clamp so an observation sitting on (or beyond) the upper edge reuses the last bin.
    pos_bin = min(int(np.digitize(pos, pos_bins)), len(pos_bins) - 1)
    vel_bin = min(int(np.digitize(vel, vel_bins)), len(vel_bins) - 1)
    return pos_bin, vel_bin

def run_mountain_car(Q, epsilon, alpha, gamma):
    """Run one episode and update ``Q`` in place with one-step Q-learning.

    Parameters
    ----------
    Q : np.ndarray
        Action-value table indexed as ``Q[pos_bin, vel_bin, action]``; modified
        in place.
    epsilon : float
        Exploration probability forwarded to ``epsilon_greedy_policy``.
    alpha : float
        Step size applied to the temporal-difference error.
    gamma : float
        Discount applied to the best next-state value.

    Returns
    -------
    The sum of the rewards returned by ``env.step`` over the episode.
    """
    episode_return = 0
    state = get_state(env.reset())
    finished = False

    while not finished:
        action = epsilon_greedy_policy(state, Q, epsilon)
        # The environment is expected to return a 4-tuple; the last item is unused.
        observation, reward, finished, _ = env.step(action)
        episode_return += reward

        next_state = get_state(observation)
        # The target bootstraps from the greedy next-state value on every step,
        # including the one that ends the episode.
        best_next = np.max(Q[next_state[0], next_state[1]])
        td_error = reward + gamma * best_next - Q[state[0], state[1], action]
        Q[state[0], state[1], action] += alpha * td_error

        state = next_state

    return episode_return



In [132]:
# --- Training configuration -------------------------------------------------
numOfEpisodes = 1000000
printInterval = 1000

# Exploration schedule: epsilon decays linearly from 1 and is held at minEpsilon.
# With a step of 2.5/numOfEpisodes the floor is reached after roughly 40% of
# the episodes.
epsilon=1
minEpsilon = 0.01
reduction = 2.5*epsilon/numOfEpisodes

alpha=0.1    # learning rate
gamma=0.999  # discount factor

# Per-episode logs: total reward and the mean of the whole Q-table.
totalRewards = np.zeros(numOfEpisodes)
totalQ = np.zeros(numOfEpisodes)

for i in range(numOfEpisodes):
    totalReward = run_mountain_car(Q, epsilon, alpha, gamma)
    totalRewards[i] = totalReward
    totalQ[i] = np.mean(Q)
    # Clamp instead of "decrement while above the floor", so the final step
    # cannot leave epsilon slightly below minEpsilon.
    epsilon = max(minEpsilon, epsilon - reduction)
    if (i+1) % printInterval == 0:
        print(f"Episode {i+1} out of {numOfEpisodes}; Reward {totalReward}; Epsilon {epsilon}")



Episode 1000 out of 100000; Reward -500.0; Epsilon 0.97999999999998
Episode 2000 out of 100000; Reward -500.0; Epsilon 0.95999999999996
Episode 3000 out of 100000; Reward -500.0; Epsilon 0.93999999999994
Episode 4000 out of 100000; Reward -500.0; Epsilon 0.91999999999992
Episode 5000 out of 100000; Reward -500.0; Epsilon 0.8999999999999
Episode 6000 out of 100000; Reward -500.0; Epsilon 0.87999999999988
Episode 7000 out of 100000; Reward -500.0; Epsilon 0.85999999999986
Episode 8000 out of 100000; Reward -500.0; Epsilon 0.83999999999984
Episode 9000 out of 100000; Reward -500.0; Epsilon 0.81999999999982
Episode 10000 out of 100000; Reward -500.0; Epsilon 0.7999999999998
Episode 11000 out of 100000; Reward -493.0; Epsilon 0.77999999999978
Episode 12000 out of 100000; Reward -500.0; Epsilon 0.75999999999976
Episode 13000 out of 100000; Reward -500.0; Epsilon 0.73999999999974
Episode 14000 out of 100000; Reward -500.0; Epsilon 0.71999999999972
Episode 15000 out of 100000; Reward -500.0; E

In [None]:
# Collapse the per-episode logs into one mean per window of printInterval
# episodes. Episodes after the last complete window are not included.
numWindows = numOfEpisodes // printInterval
avgRewards = np.zeros(numWindows)
avgQ = np.zeros(numWindows)

for index in range(numWindows):
    window = slice(index * printInterval, (index + 1) * printInterval)
    avgQ[index] = np.mean(totalQ[window])
    avgRewards[index] = np.mean(totalRewards[window])



def _plot_series(values, xlabel, ylabel, title):
    """Show one line plot of ``values`` with the given axis labels and title."""
    plot.plot(values)
    plot.xlabel(xlabel)
    plot.ylabel(ylabel)
    plot.title(title)
    plot.show()
    plot.close()


# Windowed averages: the x-axis counts averaging windows, labelled "Time".
_plot_series(avgRewards, "Time", "Reward", "Reward vs Time")

# Raw per-episode rewards.
_plot_series(totalRewards, "Episode", "Reward", "Reward vs Episode")

# The plotted values (mean Q) are on the y-axis; the labels were previously swapped.
_plot_series(avgQ, "Time", "Q", "Q vs Time")

# Raw per-episode mean of the Q-table.
_plot_series(totalQ, "Episode", "Q", "Q vs Episode")


