<a href="https://colab.research.google.com/github/loregi01/Cart/blob/main/Cart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import gym
from gym import Env
import numpy as np
import random
from collections import defaultdict

Let's define the policies we will use: the Random policy for the first attempt, the (epsilon-greedy) Learning policy for training in the second attempt, and the Greedy policy for evaluating the learned Q-table.

In [18]:
class RandomPolicy:
    """Policy that ignores the observation and draws an action at random."""

    def __init__(self, n_actions):
        # Number of discrete actions; valid action indices are 0 .. n_actions - 1.
        self.n_actions = n_actions

    def __call__(self, obs) -> int:
        """Return a uniformly drawn action index; ``obs`` is not consulted."""
        highest_action = self.n_actions - 1
        return random.randint(0, highest_action)

class GreedyPolicy:
    """Deterministic policy that always picks the highest-valued action of a Q-table."""

    def __init__(self, Q):
        # Q maps a state key to a sequence of per-action values.
        self.Q = Q

    def __call__(self, obs) -> int:
        """Return the index of the largest Q-value stored for ``obs``.

        Ties resolve to the lowest action index (``np.argmax`` semantics).
        If ``Q`` is a ``defaultdict``, looking up an unseen ``obs`` creates
        its entry as a side effect of the lookup.
        """
        # np.argmax yields a NumPy integer; convert it so the result matches
        # the declared ``int`` return type.
        return int(np.argmax(self.Q[obs]))

class LearningPolicy:
    """Epsilon-greedy policy over a Q-table.

    With probability ``eps`` a random action is drawn; otherwise the action
    with the largest Q-value for the current state is returned.
    """

    def __init__(self, Q, n_actions=None):
        """Store the Q-table and determine how many actions are available.

        Args:
            Q: Mapping (or indexable table) from state key to per-action values.
            n_actions: Number of actions. When omitted it is inferred from ``Q``.
        """
        self.Q = Q
        if n_actions is None:
            n_actions = self._infer_n_actions(Q)
        self.n_actions = n_actions

    @staticmethod
    def _infer_n_actions(Q):
        """Return the length of one row of ``Q`` without inserting keys when possible."""
        factory = getattr(Q, "default_factory", None)
        if callable(factory):
            # Building a throw-away default row avoids ``Q[0]``, which would add
            # a spurious state ``0`` to a defaultdict-backed Q-table.
            return len(factory())
        # Tables without a default factory (lists, arrays, plain dicts): fall
        # back to measuring the row stored under index/key 0.
        return len(Q[0])

    def __call__(self, obs, eps: float) -> int:
        """Return an action for ``obs``, exploring with probability ``eps``."""
        if random.random() > eps:
            # Exploit: NumPy integer converted to match the ``int`` annotation.
            return int(np.argmax(self.Q[obs]))
        # Explore: uniform draw over all action indices.
        return random.randint(0, self.n_actions - 1)


First we try the environment using the random policy: no learning takes place, and a new action is chosen at random at every time step.

We define a function for the first attempt; the same function is reused later to evaluate the greedy policy built from the learned Q-table.

In [19]:
def random_attempt(env: "gym.Env", policy, discount_factor: float, n_episodes: int, render=False) -> float:
    """Run ``policy`` for ``n_episodes`` episodes and return the mean discounted return.

    No learning happens here. Any callable policy can be evaluated, not only a
    random one.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``. Both the
            classic Gym API (``reset() -> obs``, 4-tuple ``step``) and the
            Gym >= 0.26 / Gymnasium API (``reset() -> (obs, info)``, 5-tuple
            ``step``) are accepted.
        policy: Callable mapping a state key to an action.
        discount_factor: Factor the reward weight is multiplied by after each step.
        n_episodes: Number of episodes to run; must not be zero.
        render: Accepted for backward compatibility; not used by this function.

    Returns:
        The sum of discounted rewards over all episodes divided by ``n_episodes``.

    Raises:
        ZeroDivisionError: If ``n_episodes`` is 0.
    """
    total_return = 0.0

    for episode in range(n_episodes):
        print("Episode: " + str(episode))

        reset_result = env.reset()
        # Newer Gym returns (observation, info); older versions return the bare
        # observation. The dict check keeps tuple-valued observations intact.
        if (
            isinstance(reset_result, tuple)
            and len(reset_result) == 2
            and isinstance(reset_result[1], dict)
        ):
            observation = reset_result[0]
        else:
            observation = reset_result

        discounting = 1.0  # weight of the next reward; restarts every episode
        done = False

        while not done:
            # The state key handed to the policy is the sum of the first two
            # observation components.
            state = observation[0] + observation[1]
            action = policy(state)

            step_result = env.step(action)
            if len(step_result) == 5:
                # Newer API splits episode end into terminated / truncated.
                observation, reward, terminated, truncated, _info = step_result
                done = terminated or truncated
            else:
                observation, reward, done, _info = step_result

            total_return += reward * discounting
            discounting *= discount_factor

    return total_return / n_episodes

In [None]:
# Hyper-parameters shared by the cells of this notebook.
learning_rate = 0.01
n_episodes = 20000
# Epsilon-greedy exploration settings: starting value, per-episode decrement
# (sized so that start_epsilon would reach 0 after half of the episodes) and
# lower bound.
start_epsilon = 1.0
epsilon_decay = start_epsilon / (n_episodes / 2)
final_epsilon = 0.1
eps = start_epsilon
discount_factor = 0.95

env = gym.make('MountainCar-v0')

# Baseline: average discounted return of a uniformly random policy.
# NOTE: random_attempt accepts `render` but does not use it in its body.
random_trial = random_attempt (env, RandomPolicy(env.action_space.n), discount_factor, n_episodes, render = True)

print(random_trial)

In [21]:
def learning_attempt(env: "gym.Env", learning_rate: float, discount_factor: float, n_episodes: int, final_epsilon: float = 0.0):
    """Train a tabular Q-function with epsilon-greedy Q-learning.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and
            ``action_space.n``. Both the classic Gym API (``reset() -> obs``,
            4-tuple ``step``) and the Gym >= 0.26 / Gymnasium API
            (``reset() -> (obs, info)``, 5-tuple ``step``) are accepted.
        learning_rate: Step size of each Q-value update.
        discount_factor: Weight of the bootstrapped next-state value.
        n_episodes: Number of training episodes.
        final_epsilon: Lower bound for the exploration rate. The default of
            ``0.0`` leaves the linear decay unclamped.

    Returns:
        A ``defaultdict`` mapping each visited state key to an array holding
        one Q-value per action.

    NOTE(review): the state key is the sum of the first two observation
    components, a continuous value under which different observations can
    share a key and exact revisits are presumably rare. Discretising the
    observation would likely learn better, but the evaluation code must then
    build its keys the same way.
    """
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    policy = LearningPolicy(Q)

    for episode in range(n_episodes):
        # Exploration decays linearly from 1.0 towards 0 across the run and is
        # never allowed below final_epsilon.
        eps = max(final_epsilon, (n_episodes - episode) / n_episodes)

        reset_result = env.reset()
        # Newer Gym returns (observation, info); older versions return the bare
        # observation. The dict check keeps tuple-valued observations intact.
        if (
            isinstance(reset_result, tuple)
            and len(reset_result) == 2
            and isinstance(reset_result[1], dict)
        ):
            observation = reset_result[0]
        else:
            observation = reset_result

        # The same key construction is used for the first step and every later
        # one, so all Q-table entries are addressed consistently.
        state = observation[0] + observation[1]
        done = False

        while not done:
            action = policy(state, eps)

            step_result = env.step(action)
            if len(step_result) == 5:
                # Newer API splits episode end into terminated / truncated.
                new_observation, reward, terminated, truncated, _info = step_result
                done = terminated or truncated
            else:
                new_observation, reward, done, _info = step_result

            next_state = new_observation[0] + new_observation[1]

            # Q-learning update: move Q(s, a) towards r + gamma * max_a' Q(s', a').
            q_row = Q[state]
            td_target = reward + discount_factor * np.max(Q[next_state])
            q_row[action] += learning_rate * (td_target - q_row[action])

            state = next_state

    return Q

In [28]:
# Train the Q-table on `env` with the hyper-parameters defined earlier in the notebook.
q_table = learning_attempt (env, learning_rate, discount_factor, n_episodes)

In [None]:
# Evaluate the greedy policy derived from the learned Q-table. The third
# argument of random_attempt is the discount factor, so pass discount_factor
# (not learning_rate) to keep this score comparable with the random baseline.
training = random_attempt(env, GreedyPolicy(q_table), discount_factor, n_episodes, render = True)
print(training)