<a href="https://colab.research.google.com/github/manmeet3/my_colabs/blob/master/sarsa-mountain-car.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sarsa Mountain Car

In [1]:
"""
sarsa_MountainCar.py

A SARSA modification of Q_Learning algorithm
"""
import gym
import numpy as np

In [2]:
#MAX_NUM_EPISODES = 500
MAX_NUM_EPISODES = 5000
STEPS_PER_EPISODE = 1000 #  This is specific to MountainCar. May change with env
EPSILON_MIN = 0.005
max_num_steps = MAX_NUM_EPISODES * STEPS_PER_EPISODE
EPSILON_DECAY = 500 * EPSILON_MIN / max_num_steps
ALPHA = 0.3 # Learning rate
GAMMA = 0.999  # Discount factor
NUM_DISCRETE_BINS = 30  # Number of bins to Discretize each observation dim

In [3]:
class SARSA(object):
    def __init__(self, env):
        self.obs_shape = env.observation_space.shape
        self.obs_high = env.observation_space.high
        self.obs_low = env.observation_space.low
        self.obs_bins = NUM_DISCRETE_BINS  # Number of bins to Discretize each observation dim
        self.bin_width = (self.obs_high - self.obs_low) / self.obs_bins
        self.action_shape = env.action_space.n
        # Create a multi-dimensional array (aka. Table) to represent the
        # Q-values
        self.Q = np.zeros((self.obs_bins + 1, self.obs_bins + 1,
                           self.action_shape))  # (51 x 51 x 3)
        self.alpha = ALPHA  # Learning rate
        self.gamma = GAMMA  # Discount factor
        self.epsilon = 1.0

    def discretize(self, obs):
        return tuple(((obs - self.obs_low) / self.bin_width).astype(int))

    def get_action(self, obs):
        discretized_obs = self.discretize(obs)
        # Epsilon-Greedy action selection
        if self.epsilon > EPSILON_MIN:
            self.epsilon -= EPSILON_DECAY
        if np.random.random() > self.epsilon:
            return np.argmax(self.Q[discretized_obs])
        else:  # Choose a random action
            return np.random.choice([a for a in range(self.action_shape)])

    def learn(self, obs, action, reward, next_obs, next_action): # modified
        discretized_obs = self.discretize(obs)
        discretized_next_obs = self.discretize(next_obs)
        td_target = reward + self.gamma * self.Q[discretized_next_obs][next_action] # modified
        td_error = td_target - self.Q[discretized_obs][action]
        self.Q[discretized_obs][action] += self.alpha * td_error

In [4]:
def train(agent, env):
    best_reward = -float('inf')
    for episode in range(MAX_NUM_EPISODES):
        done = False
        obs = env.reset()
        total_reward = 0.0   
        while not done:
            action = agent.get_action(obs)
            next_obs, reward, done, info = env.step(action)
            next_action = agent.get_action(next_obs) # modified
            agent.learn(obs, action, reward, next_obs, next_action) # modified
            obs = next_obs
            action = next_action
            total_reward += reward
        if total_reward > best_reward:
            best_reward = total_reward
        print("Episode#:{} reward:{} best_reward:{} eps:{}".format(episode,
                                     total_reward, best_reward, agent.epsilon))
    # Return the trained policy
    return np.argmax(agent.Q, axis=2)

In [5]:
def test(agent, env, policy):
    done = False
    obs = env.reset()
    total_reward = 0.0
    while not done:
        action = policy[agent.discretize(obs)]
        next_obs, reward, done, info = env.step(action)
        obs = next_obs
        total_reward += reward
    return total_reward

In [6]:
if __name__ == "__main__":
    env = gym.make('MountainCar-v0')
    agent = SARSA(env)
    learned_policy = train(agent, env)
    # Use the Gym Monitor wrapper to evalaute the agent and record video
    gym_monitor_path = "./gym_monitor_output"
    env = gym.wrappers.Monitor(env, gym_monitor_path, force=True)
    for _ in range(1000):
        test(agent, env, learned_policy)
    env.close()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Episode#:1 reward:-200.0 best_reward:-200.0 eps:0.9996000000000329
Episode#:2 reward:-200.0 best_reward:-200.0 eps:0.9994000000000494
Episode#:3 reward:-200.0 best_reward:-200.0 eps:0.9992000000000658
Episode#:4 reward:-200.0 best_reward:-200.0 eps:0.9990000000000823
Episode#:5 reward:-200.0 best_reward:-200.0 eps:0.9988000000000987
Episode#:6 reward:-200.0 best_reward:-200.0 eps:0.9986000000001152
Episode#:7 reward:-200.0 best_reward:-200.0 eps:0.9984000000001316
Episode#:8 reward:-200.0 best_reward:-200.0 eps:0.9982000000001481
Episode#:9 reward:-200.0 best_reward:-200.0 eps:0.9980000000001645
Episode#:10 reward:-200.0 best_reward:-200.0 eps:0.997800000000181
Episode#:11 reward:-200.0 best_reward:-200.0 eps:0.9976000000001974
Episode#:12 reward:-200.0 best_reward:-200.0 eps:0.9974000000002139
Episode#:13 reward:-200.0 best_reward:-200.0 eps:0.9972000000002303
Episode#:14 reward:-200.0 best_reward:-200.0 eps:0.9970000000

NoSuchDisplayException: ignored

Last error is from lack of a screen connected to Colab. Code runs find in anaconda on local machine

In [None]:
# Reference
# https://github.com/srnand/Reinforcement-Learning-using-OpenAI-Gym/tree/master/Mountain_Car