# Steepest Ascent Hill Climbing with Adaptive Noise Scaling

---

In this notebook, we will train steepest ascent hill climbing with adaptive noise scaling with OpenAI Gym's Cartpole environment.

### 1. Import the Necessary Packages

In [1]:
import gym
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
import math
%matplotlib inline

### 2. Define the Policy

In [2]:
env = gym.make('CartPole-v0')
print('observation space:', env.observation_space)
print('action space:', env.action_space)

class Policy():
    def __init__(self, s_size=4, a_size=2):
        self.w = np.random.rand(s_size, a_size)  # weights for simple linear policy: state_space x action_space
        self.s_size = s_size
        self.a_size = a_size
        
    def forward(self, state):
        x = np.dot(state, self.w)
        return np.exp(x)/sum(np.exp(x))
    
    def act(self, state):
        probs = self.forward(state)
        #action = np.random.choice(2, p=probs) # option 1: stochastic policy
        action = np.argmax(probs)              # option 2: deterministic policy
        return action

observation space: Box(4,)
action space: Discrete(2)


In [3]:
policy = Policy()
# Define Evaluation function
def run_episode(weights, gamma=1.0, max_t=5000):
    policy.w = weights
    episode_return = 0.0
    state = env.reset()
    for t in range(max_t):
        action = policy.act(state)
        state, reward, done, _ = env.step(action)
        episode_return += reward * math.pow(gamma, t)
        if done:
            break 

    return episode_return

### 3. Train the Agent with Stochastic Policy Search (Steepest Ascent Hill Climbing with Adaptive Noise Scaling)

In [4]:
env = gym.make('CartPole-v0')
env.seed(101)
np.random.seed(101)

#policy = Policy()

def steepest_ascent_hill_climbing(n_episodes=1000, max_t=1000, gamma=1.0, print_every=10, pop_size=50, noise_scale=1e-2):
    """Implementation of hill climbing with adaptive noise scaling.
        
    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        gamma (float): discount rate
        print_every (int): how often to print average score (over last 100 episodes)
    """
    scores_deque = deque(maxlen=100)
    scores = []
    best_R = -np.Inf
    best_w = policy.w
    for i_episode in range(1, n_episodes+1):
        weights_pop = [best_w + (np.random.rand(policy.s_size, policy.a_size)) for i in range(pop_size)]
        rewards = np.array([run_episode(weights, gamma, max_t) for weights in weights_pop])

        best_id = np.argmax(rewards)
        best_weight = weights_pop[best_id]

        # run episode with best weights
        R = run_episode(best_weight, gamma=1.0)
        scores_deque.append(R)
        scores.append(R)

        if R >= best_R: # found better weights 
            best_R = R
            best_w = policy.w
            noise_scale = max(1e-3, noise_scale / 2)
            policy.w += noise_scale * np.random.rand(*policy.w.shape)
        else: # did not find better weights 
            noise_scale = min(2, noise_scale * 2)
            policy.w = best_w + noise_scale * np.random.rand(*policy.w.shape)

        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
        if np.mean(scores_deque)>=195.0:
            print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
            policy.w = best_w
            break
        
    return scores
            
scores = steepest_ascent_hill_climbing()

Episode 10	Average Score: 156.00
Episode 20	Average Score: 178.00
Episode 30	Average Score: 185.33
Episode 40	Average Score: 189.00
Episode 50	Average Score: 191.20
Episode 60	Average Score: 192.67
Episode 70	Average Score: 192.79
Episode 80	Average Score: 192.88
Episode 90	Average Score: 193.59
Episode 100	Average Score: 194.04
Environment solved in 101 episodes!	Average Score: 195.96


### 4. Plot the Scores

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(scores)+1), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

### 5. Watch a Smart Agent!

In [None]:
env = gym.make('CartPole-v0')
state = env.reset()
for t in range(200):
    action = policy.act(state)
    env.render()
    state, reward, done, _ = env.step(action)
    if done:
        break 

env.close()