In [26]:
import numpy as np
from MountainCarEnv import MountainCarEnv
import matplotlib.pyplot as plot

In [27]:
# Build the environment from the project-local MountainCarEnv module. This global `env`
# is passed to the training/testing functions and is also read directly by
# epsilon_greedy_policy to sample random actions.
# NOTE(review): render_mode="human" presumably renders every step in a window, which
# would slow down the 2000 training episodes considerably — confirm rendering is wanted.
env = MountainCarEnv(render_mode="human")

In [28]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    """Choose an action for ``state`` using an epsilon-greedy rule.

    One Bernoulli draw with success probability ``epsilon`` decides whether to
    explore. When exploring, the action is sampled from the module-level
    ``env.action_space``; otherwise it is the index of the largest value in
    ``Q[state]`` (``np.argmax`` picks the first index on ties).

    Parameters
    ----------
    state : index into ``Q`` (the notebook passes a ``(pos_bin, vel_bin)`` tuple)
    Q : array of action values, indexed as ``Q[state]``
    epsilon : probability of taking a random action

    Returns
    -------
    The selected action.
    """
    if np.random.binomial(1, epsilon):
        return env.action_space.sample()
    return np.argmax(Q[state])

def optimal_policy(state, Q):
    """Return the greedy action for ``state``: the index of the largest value in ``Q[state]``."""
    return np.argmax(Q[state])

def get_state(obs, pos_bins=None, vel_bins=None):
    """Discretize a continuous observation into a pair of bin indices.

    Parameters
    ----------
    obs : sequence of two numbers
        Unpacked as ``(position, velocity)``.
    pos_bins, vel_bins : array-like, optional
        Monotonic bin edges for each component. When omitted, the module-level
        ``pos_space`` and ``vel_space`` are used, so existing one-argument calls
        behave exactly as before.

    Returns
    -------
    tuple
        ``(pos_bin, vel_bin)`` as produced by ``np.digitize``; each index lies in
        ``0..len(edges)``, which is why a Q-table needs ``len(edges) + 1`` slots
        per component.
    """
    pos, vel = obs
    # Resolve the defaults at call time so the globals may be defined in a later cell.
    if pos_bins is None:
        pos_bins = pos_space
    if vel_bins is None:
        vel_bins = vel_space
    return np.digitize(pos, pos_bins), np.digitize(vel, vel_bins)

In [29]:
# Bin edges that get_state uses to discretize the (position, velocity) observation.
# NOTE(review): if this environment keeps the usual MountainCar ranges (position roughly
# [-1.2, 0.6], velocity roughly [-0.07, 0.07]) only a handful of these bins can ever be
# visited — confirm the observation ranges against MountainCarEnv.
pos_space = np.linspace(-5, 5, 10)
vel_space = np.linspace(-3, 3, 2)

# Sanity check of the discretization on a hand-picked observation.
state = get_state(np.array([-0.4, 0.2]))

# Discrete action ids 0..n-1.
actions = list(range(env.action_space.n))

# np.digitize yields indices 0..len(edges), hence the +1 on each state axis.
# Deriving the shape keeps the table valid if the bins or the action set change.
Q = np.zeros((len(pos_space) + 1, len(vel_space) + 1, len(actions)))
Q

array([[[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]]])

In [30]:
# Per-episode training traces. q_learning appends to these module-level lists
# (mean and max of the whole Q-table, and the episode's total reward).
mean_Q = []
max_Q = []
learning_rewards = []
# Placeholder; rebound later to the list returned by test_policy.
testing_rewards = []

epsilon = 0.1  # Exploration rate
alpha = 0.1  # Learning rate
num_episodes = 2000  # Number of episodes for learning
num_test_episodes = 100  # Number of episodes for testing
# Fresh Q-table indexed as Q[pos_bin, vel_bin, action]; the +1 covers the extra
# index np.digitize can return beyond the last edge.
Q = np.zeros((len(pos_space) + 1, len(vel_space) + 1, len(actions)))  # Q-table initialization


In [31]:
def q_learning(env, Q, num_episodes, epsilon, alpha, gamma=1.0):
    """Train ``Q`` in place with tabular Q-learning and return it.

    Parameters
    ----------
    env : environment whose ``reset()`` result is accepted by ``get_state`` and
        whose ``step(action)`` returns ``(observation, reward, done, info)``.
    Q : array of action values indexed as ``Q[state][action]``; updated in place.
    num_episodes : number of training episodes to run.
    epsilon : exploration probability handed to ``epsilon_greedy_policy``.
    alpha : learning rate.
    gamma : discount factor applied to the bootstrapped value. The default of
        ``1.0`` (undiscounted) matches the update this function used before
        the parameter existed.

    Returns
    -------
    The same ``Q`` array that was passed in.

    Side effects
    ------------
    After every episode, appends the episode's total reward to the module-level
    ``learning_rewards`` and the mean / max of the whole table to ``mean_Q`` /
    ``max_Q``.
    """
    for _episode in range(num_episodes):
        state = get_state(env.reset())
        done = False
        total_reward = 0

        while not done:
            action = epsilon_greedy_policy(state, Q, epsilon)
            observation, reward, done, _info = env.step(action)
            next_state = get_state(observation)

            # A terminal transition has no future return. The terminal observation
            # shares its bin with non-terminal states, so Q[next_state] is generally
            # non-zero and must be masked out instead of bootstrapped from.
            # NOTE(review): if the env also sets `done` on a time limit, those
            # truncated episodes are treated as terminal too; the 4-tuple step
            # result gives no way to tell the two apart.
            bootstrap = 0.0 if done else gamma * np.max(Q[next_state])
            Q[state][action] += alpha * (reward + bootstrap - Q[state][action])

            state = next_state
            total_reward += reward

        learning_rewards.append(total_reward)
        mean_Q.append(np.mean(Q))
        max_Q.append(np.max(Q))

    return Q


In [32]:
def test_policy(env, Q, num_episodes):
    """Run the greedy policy derived from ``Q`` and collect episode returns.

    Each of the ``num_episodes`` episodes starts from ``env.reset()`` and follows
    ``optimal_policy`` until ``env.step`` reports ``done``. ``Q`` is only read.

    Returns
    -------
    list
        Total reward of each episode, in the order the episodes were played.
    """
    episode_returns = []

    for _episode in range(num_episodes):
        current = get_state(env.reset())
        episode_return = 0
        finished = False

        while not finished:
            greedy_action = optimal_policy(current, Q)
            observation, reward, finished, _info = env.step(greedy_action)
            current = get_state(observation)
            episode_return += reward

        episode_returns.append(episode_return)

    return episode_returns


In [33]:
# Learning
# q_learning updates the Q-table in place (and returns it) and appends per-episode
# statistics to mean_Q, max_Q and learning_rewards. Re-running this cell therefore
# continues training from the current table and keeps extending those lists.
Q = q_learning(env, Q, num_episodes, epsilon, alpha)

# Testing
# Play num_test_episodes episodes with the greedy policy and keep each episode's total reward.
testing_rewards = test_policy(env, Q, num_test_episodes)

# Plot the maximum and mean Q-table value recorded after each training episode.
# pyplot is imported under the alias `plot` in this notebook.

# Figure defaults have to be set before the first plotting call creates the figure;
# set afterwards they would only affect later figures.
plot.rcParams["figure.figsize"] = (25, 10)
plot.rcParams['xtick.labelsize'] = 'x-large'
plot.rcParams['ytick.labelsize'] = 'x-large'

plot.plot(range(1, len(max_Q) + 1), max_Q, color='red', label='Valor máximo de Q')
plot.plot(range(1, len(mean_Q) + 1), mean_Q, color='blue', label='Valor promedio de Q')
plot.xlabel('Nro de partida', fontsize=20)
plot.ylabel('Valor de Q', fontsize=20)
plot.legend(loc='best')
plot.show()

In [None]:
# import pygame

# pygame.init()

# display = pygame.display.set_mode((600, 400))

# pygame.display.quit()

In [None]:
# obs = env.reset()
# print(obs)
# done = False
# while not done:
#     state = get_state(obs)
#     action = epsilon_greedy_policy(state, Q, 0.5)
#     obs, reward, done, _ = env.step(action)
#     print('->', state, action, reward, obs, done)