In [75]:
import numpy as np
from MountainCarEnv import MountainCarEnv
import matplotlib.pyplot as plot

In [76]:
# Shared environment instance used by every function and cell below.
# render_mode="human" is forwarded to MountainCarEnv as-is
# (presumably opens an on-screen window per step — confirm in MountainCarEnv).
env = MountainCarEnv(render_mode="human")

In [77]:
# Bin edges used by np.digitize to discretize the continuous observation.
# NOTE(review): these ranges are not checked against MountainCarEnv's actual
# observation bounds — confirm they match the environment.
pos_space = np.linspace(-5, 5, 10)
vel_space = np.linspace(-3, 3, 2)

# np.digitize returns an index in 0..len(bins) inclusive (0 below the first
# edge, len(bins) at or above the last edge), so each discretized axis needs
# len(bins) + 1 slots. Allocating only len(bins) slots raises IndexError as
# soon as an observation reaches the last edge.
# The table is indexed as Q[pos_bin, vel_bin, action] and starts at zero.
Q = np.zeros((len(pos_space) + 1, len(vel_space) + 1, env.action_space.n))

In [78]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    A single Bernoulli draw with success probability ``epsilon`` decides
    whether to explore. On success the action is sampled from the global
    ``env`` action space; otherwise the action with the highest value in
    ``Q[state[0], state[1]]`` is returned.
    """
    if np.random.binomial(1, epsilon):
        # Exploration branch: defer to the environment's own sampler.
        return env.action_space.sample()
    # Exploitation branch: greedy with respect to the current Q-table.
    greedy_action = np.argmax(Q[state[0], state[1]])
    return greedy_action

def get_state(obs):
    """Map an observation to a pair of bin indices.

    ``obs`` is unpacked into two values; the first is digitized against the
    global ``pos_space`` edges and the second against the global ``vel_space``
    edges. Returns ``(pos_bin, vel_bin)``.
    """
    position, velocity = obs
    return np.digitize(position, pos_space), np.digitize(velocity, vel_space)

def run_mountain_car(explore_per = 0.5, learn=True, alpha=0.9, gamma=0.99, Q: np.ndarray = None):
    """Play one episode on the global ``env``, optionally applying Q-learning updates.

    Args:
        explore_per: Exploration probability forwarded to
            ``epsilon_greedy_policy`` as its ``epsilon`` argument.
        learn: When True, ``Q`` is updated in place after every step.
        alpha: Learning rate used in the update.
        gamma: Discount factor applied to the best next-state value.
        Q: Q-table indexed as ``Q[pos_bin, vel_bin, action]``. Required, even
            though the signature defaults it to None.

    Returns:
        The sum of the rewards returned by ``env.step`` during the episode.

    Raises:
        TypeError: If ``Q`` is None.
    """
    # Fail fast with a clear message, before the environment is reset, instead
    # of the opaque "'NoneType' object is not subscriptable" raised mid-episode.
    if Q is None:
        raise TypeError("run_mountain_car() requires a Q-table: pass Q=<np.ndarray>")

    obs = env.reset()
    done = False
    total_reward = 0
    while not done:
        state = get_state(obs)
        action = epsilon_greedy_policy(state, Q, explore_per)
        obs, reward, done, _ = env.step(action)
        total_reward += reward
        if learn:
            # The next-state lookup is only needed for the update, so it is
            # skipped entirely during evaluation runs.
            new_state = get_state(obs)
            best_next = np.max(Q[new_state[0], new_state[1]])
            current = Q[state[0], state[1], action]
            # Q-learning update: move the current estimate toward the
            # bootstrapped target reward + gamma * max_a' Q(s', a').
            Q[state[0], state[1], action] = current + alpha * (reward + gamma * best_next - current)
    return total_reward


In [None]:
# Number of episodes used for learning and for greedy evaluation.
N_TRAIN_EPISODES = 100
N_TEST_EPISODES = 10

mean_Q = []
max_Q = []
learning_rewards = []
testing_rewards = []

# Learn
for episode in range(N_TRAIN_EPISODES):
    totalReward = run_mountain_car(explore_per=0.5, learn=True, Q=Q, alpha=0.1)
    learning_rewards.append(totalReward)

    # Track how the Q-table evolves after each training episode.
    mean_Q.append(np.mean(Q))
    max_Q.append(np.max(Q))

# Test (no exploration, no learning)
for episode in range(N_TEST_EPISODES):
    totalReward = run_mountain_car(explore_per=0, learn=False, Q=Q)
    testing_rewards.append(totalReward)

# Plot
# rcParams only apply to figures/axes created afterwards, so they must be set
# before the figure is built for the size and tick settings to take effect.
# The notebook imports matplotlib.pyplot under the alias `plot`.
plot.rcParams["figure.figsize"] = (25, 10)
plot.rcParams['xtick.labelsize'] = 'x-large'
plot.rcParams['ytick.labelsize'] = 'x-large'

episode_numbers = range(1, len(max_Q) + 1)
fig, ax = plot.subplots()
ax.plot(episode_numbers, max_Q, color='red', label='Valor máximo de Q')
ax.plot(episode_numbers, mean_Q, color='blue', label='Valor promedio de Q')
ax.set_xlabel('Nro de partida', fontsize=20)
ax.set_ylabel('Valor de Q', fontsize=20)
ax.legend(loc='best')
plot.show()

In [79]:
# import pygame

# # Initialize Pygame
# pygame.init()

# # Create a Pygame display surface
# display = pygame.display.set_mode((600, 400))

# # Your game or visualization code goes here

# # Close the existing Pygame window
# pygame.display.quit()

In [80]:
# obs = env.reset()
# print(obs)
# done = False
# while not done:
#     state = get_state(obs)
#     action = epsilon_greedy_policy(state, Q, 0.5)
#     obs, reward, done, _ = env.step(action)
#     print('->', state, action, reward, obs, done)