# Evolution Strategies Method

In [None]:
import os
import random

import numpy as np

In [None]:
import gym

## Set Configs

In [None]:
# Noise scale configuration value (presumably intended as the `sigma`
# argument of ESAgent, whose default is also 0.1 — confirm at the call site).
SIGMA = 0.1

## Set Environment

In [None]:
# Create the environment by id, keep its `.unwrapped` form, and seed it.
ENV_NAME = 'BipedalWalker-v2'
env = gym.make(ENV_NAME).unwrapped
# Trailing semicolon keeps the notebook from echoing the return value.
env.seed(90);

In [None]:
# Show the observation and action spaces of the environment.
print('Environment Display:')
env.reset()  # put the environment into a fresh initial state
# env.render()

print(
    'State space {}'.format(env.observation_space),
    'Action space {}'.format(env.action_space),
    sep='\n',
)

## Define [ES](https://arxiv.org/pdf/1703.03864.pdf) Agent

In [None]:
class ESAgent():
    """Linear multi-layer policy that is perturbed and scored for Evolution Strategies.

    The policy is a chain of matrix products (no bias, no non-linearity)
    applied to the L2-normalised, flattened observation sequence. The class
    generates Gaussian perturbations of its weights and scores each perturbed
    candidate by running episodes in ``env``; it does not itself apply a
    weight update.
    """

    def __init__(self, env, population_size=20, sigma=0.1, episode_average=1,
                 initial_exploration=1.0, final_exploration=0.0, exploration_dec_steps=1e6):
        """Store the environment and hyper-parameters and zero-initialise the weights.

        Args:
            env: Environment exposing ``reset()``, ``step(action)`` and
                ``action_space.sample()``.
            population_size: Number of perturbations drawn by ``get_population``.
            sigma: Scale applied to each perturbation in ``get_weights_candidate``.
            episode_average: Number of episodes averaged by ``get_reward``.
            initial_exploration: Starting probability of taking a random action.
            final_exploration: Lower bound of the exploration probability.
            exploration_dec_steps: Number of steps over which the exploration
                probability decays linearly from its initial value.
        """

        self.env = env

        # Layer shapes are fixed: 24 inputs -> 16 -> 16 -> 4 outputs.
        self.weights = [np.zeros(shape=(24, 16)),
                        np.zeros(shape=(16, 16)),
                        np.zeros(shape=(16, 4))]

        self.population_size = population_size
        self.sigma = sigma
        self.episode_average = episode_average
        self.initial_exploration = initial_exploration
        self.final_exploration = final_exploration
        self.exploration_dec_steps = exploration_dec_steps
        self.memory_length = 1

        self.exploration = initial_exploration

    def act(self, sequence, weights=None):
        """Return the policy output for ``sequence``.

        Args:
            sequence: Observation(s); flattened into a single row vector.
            weights: Optional list of layer matrices to use instead of
                ``self.weights`` (lets a candidate be evaluated without
                replacing the agent's own weights).

        Returns:
            1-D array holding the output of the last layer.
        """

        if weights is None:
            weights = self.weights

        action = np.expand_dims(np.array(sequence).flatten(), 0)

        # An all-zero observation has norm 0; dividing would produce NaNs,
        # so it is passed through unnormalised instead.
        norm = np.linalg.norm(action)
        if norm > 0:
            action = action / norm

        for layer in weights:
            action = np.dot(action, layer)

        return action[0]

    def get_population(self):
        """Draw ``population_size`` standard-normal perturbations, one array per layer."""

        return [[np.random.randn(*weight.shape) for weight in self.weights]
                for _ in range(self.population_size)]

    def get_weights_candidate(self, weight, sample):
        """Return ``weight[k] + sigma * sample[k]`` for every layer ``k``."""

        return [weight[index] + self.sigma * noise
                for index, noise in enumerate(sample)]

    def get_rewards(self, population):
        """Score every perturbation in ``population`` against the current weights.

        Returns:
            1-D NumPy array with one average reward per perturbation.
        """

        # get_reward leaves self.weights untouched, so every candidate is
        # built from the same base weights.
        return np.array([
            self.get_reward(self.get_weights_candidate(self.weights, sample))
            for sample in population
        ])

    def get_reward(self, weights):
        """Return the mean episode reward obtained when acting with ``weights``.

        Runs ``episode_average`` episodes with epsilon-greedy action selection.
        ``self.exploration`` is decayed on every step and the decay persists
        across calls. The agent's own ``self.weights`` are not modified.
        """

        total_reward = 0.0

        for _ in range(self.episode_average):

            state = self.env.reset()
            sequence = [state] * self.memory_length
            done = False

            while not done:

                # Linear decay of the exploration probability, clamped at the floor.
                self.exploration = max(
                    self.final_exploration,
                    self.exploration - self.initial_exploration / self.exploration_dec_steps)
                if random.random() < self.exploration:
                    action = self.env.action_space.sample()
                else:
                    action = self.act(sequence, weights)

                next_state, reward, done, _ = self.env.step(action)
                total_reward += reward
                # Slide the observation window forward by one state.
                sequence = sequence[1:]
                sequence.append(next_state)

        return total_reward / self.episode_average

    def set_weights(self, weights):
        """Replace the agent's weights with ``weights``."""
        self.weights = weights

In [None]:
# Pass the configured noise scale explicitly so the SIGMA setting is actually used.
agent = ESAgent(env, sigma=SIGMA)

## Train The Agent

In [None]:
def train_agent(num_episodes=5000, learning_rate=0.03):
    """Train the module-level ``agent`` on ``env`` with Evolution Strategies.

    Each iteration draws a population of perturbations, scores every
    perturbed candidate, moves the weights along the reward-weighted sum of
    the perturbations, and then runs one episode with the updated weights to
    record a score.

    Args:
        num_episodes: Number of ES iterations (one recorded score each).
        learning_rate: Step size of the weight update.

    Returns:
        List with one total episode reward per iteration.
    """

    scores = []
    for i_episode in range(1, num_episodes + 1):

        # Snapshot the weights so every candidate is a perturbation of the
        # same base, regardless of what evaluation does to agent.weights.
        base_weights = list(agent.weights)

        # generate random variations around the current policy
        population = agent.get_population()

        # evaluate each perturbed policy
        rewards = np.array([
            agent.get_reward(agent.get_weights_candidate(base_weights, sample))
            for sample in population
        ])

        new_weights = base_weights
        if len(population) > 0:
            spread = rewards.std()
            # With identical rewards there is no direction to prefer; skip the step.
            if spread > 0:
                advantages = (rewards - rewards.mean()) / spread
                step = learning_rate / (len(population) * agent.sigma)
                new_weights = []
                for index, weight in enumerate(base_weights):
                    noise = np.array([sample[index] for sample in population])
                    # Sum of advantage_i * noise_i over the population.
                    new_weights.append(
                        weight + step * np.tensordot(advantages, noise, axes=1))
        agent.set_weights(new_weights)

        # Score the updated policy; reset here because candidate evaluation
        # used the same environment.
        state = env.reset()
        sequence = [state] * agent.memory_length
        total_reward = 0.0
        done = False

        while not done:

            # env.render()
            action = agent.act(sequence)

            next_state, reward, done, _ = env.step(action)

            # Keep `sequence` a list of the most recent states.
            sequence = sequence[1:] + [next_state]
            total_reward += reward

        scores.append(total_reward)

        print(f'Episode {i_episode}, Total Reward: {total_reward:.2f}, Average Score: {np.mean(scores):.2f}')

    print('Training completed.')

    return scores

In [None]:
# Run training; the returned list of scores is plotted in the evaluation cell.
scores = train_agent(num_episodes=5000)

## Evaluate The Agent

In [None]:
# Plot the recorded training scores and save the figure under ./images/.
# NOTE(review): `plt` is not imported in any visible cell — presumably
# `import matplotlib.pyplot as plt`; confirm and add it to the import cell.
plt.figure(figsize=(10, 5))
plt.plot(np.arange(len(scores)), scores, color='green')
plt.xlabel('Num of episodes')
plt.ylabel('Score')
# exist_ok=True creates the directory only when missing, without a separate
# existence check.
os.makedirs('./images/', exist_ok=True)
plt.savefig('./images/plot_of_evolution_strategies_evaluation.png')
plt.show()

---