# Evolution Strategies Method

In [1]:
import os
import random
import numpy as np
import matplotlib.pyplot as plt

import pickle

In [2]:
import gym
from evostra import EvolutionStrategy

In [3]:
from model import Model

## Set Configs

In [4]:
# Hyper-parameters forwarded positionally to ESAgent(...) further down.

# Population size handed to evostra's EvolutionStrategy.
POPULATION_SIZE = 20
# Number of episodes whose total rewards are averaged in one ESAgent.get_reward call.
EPISODE_AVERAGE = 1
# Passed to ESAgent as `max_time`.
MAX_TIME = 1500
# Passed to EvolutionStrategy as its sigma argument.
SIGMA = 0.1
# Passed to EvolutionStrategy as its learning-rate argument.
LR = 0.1
# Passed to EvolutionStrategy as its decay argument.
DECAY_RATE = 0.995
# Starting probability of taking a random action instead of the model's action.
INITIAL_EXPLORATION = 1.0
# Lower bound the exploration probability is clamped to.
FINAL_EXPLORATION = 0.0
# Divisor of the per-step exploration decrement (larger value -> slower decay).
EXPLORATION_DEC_STEPS = 1e6
# Passed to agent.train(print_every=...), which forwards it to es.run(print_step=...).
PRINT_EVERY = 1

## Set Environment

In [5]:
# Build the Gym environment used throughout the notebook and seed it so that
# environment-side randomness is repeatable between runs.
ENV_NAME = 'BipedalWalker-v2'
env = gym.make(ENV_NAME)
# Original author's note on this line: "remove unwrapped".
env.seed(90);  # trailing ';' keeps the cell from echoing seed()'s return value

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [6]:
# Quick sanity check of the environment: reset it once and show the shapes of
# its observation and action spaces.
print('Environment Display:')
env.reset() # reset environment to a new, random state
# env.render()

# Both spaces print as gym `Box` objects (see the cell output below).
print('State space {}'.format(env.observation_space))
print('Action space {}'.format(env.action_space))

Environment Display:
State space Box(24,)
Action space Box(4,)


## Define [ES](https://arxiv.org/pdf/1703.03864.pdf) Agent

In [7]:
class ESAgent():
    """Agent whose policy weights are optimised with evostra's EvolutionStrategy.

    The agent owns a `Model` (policy) and an `EvolutionStrategy` optimiser. The
    optimiser is given `get_reward` as its fitness function, so every candidate
    weight vector is scored by rolling it out in `env`.

    Parameters
    ----------
    env : Gym-style environment exposing `reset()`, `step(action)`, `render()`
        and `action_space.sample()`.
    population_size, sigma, learning_rate, decay_rate :
        Forwarded unchanged to `EvolutionStrategy`.
    episode_average : int
        Number of episodes averaged for one fitness evaluation.
    max_time : int or None
        Maximum number of environment steps per episode; `None` disables the cap.
    initial_exploration, final_exploration : float
        Start value and floor of the probability of taking a random action
        during fitness evaluation.
    exploration_dec_steps : float
        Number of environment steps over which the exploration probability
        falls linearly from `initial_exploration` to `final_exploration`.
    """

    def __init__(self, env, population_size=20, episode_average=1, max_time=1500,
                 sigma=0.1, learning_rate=0.1, decay_rate=0.995,
                 initial_exploration=1.0, final_exploration=0.0, exploration_dec_steps=1e6):

        self.env = env

        self.population_size = population_size
        self.episode_average = episode_average
        self.max_time = max_time
        self.sigma = sigma
        self.learning_rate = learning_rate
        self.decay_rate = decay_rate
        self.initial_exploration = initial_exploration
        self.final_exploration = final_exploration
        self.exploration_dec_steps = exploration_dec_steps
        # Number of most recent observations fed to the model at once.
        self.agent_history_length = 1

        # Current random-action probability; decays during get_reward rollouts.
        self.exploration = self.initial_exploration

        self.model = Model()
        self.es = EvolutionStrategy(self.model.get_weights(), self.get_reward,
                                    self.population_size, self.sigma, self.learning_rate, self.decay_rate, num_threads=1)

        # One entry per get_reward call (i.e. per evaluated weight vector).
        self.scores = []

    def get_predicted_action(self, sequence):
        """Return the model's action for the given list of recent observations."""
        return self.model.predict(np.array(sequence))

    def _run_episode(self, explore, render=False):
        """Play one episode (capped at `max_time` steps) and return its total reward.

        When `explore` is true the exploration probability is decayed once per
        step and a random action is taken with that probability; otherwise the
        model's action is always used.
        """
        state = self.env.reset()
        sequence = [state] * self.agent_history_length

        # Linear schedule from initial_exploration down to final_exploration.
        exploration_step = (self.initial_exploration - self.final_exploration) / self.exploration_dec_steps

        total_reward = 0.0
        steps = 0
        done = False

        # The step cap protects against environments that never report `done`.
        while not done and (self.max_time is None or steps < self.max_time):
            if render:
                self.env.render()

            take_random = False
            if explore:
                self.exploration = max(self.final_exploration, self.exploration - exploration_step)
                take_random = random.random() < self.exploration

            if take_random:
                action = self.env.action_space.sample()
            else:
                action = self.get_predicted_action(sequence)

            next_state, reward, done, _ = self.env.step(action)
            total_reward += reward
            # Slide the observation window forward by one step.
            sequence = sequence[1:]
            sequence.append(next_state)
            steps += 1

        return total_reward

    def get_reward(self, weights):
        """Fitness function for the optimiser.

        Loads `weights` into the model, plays `episode_average` episodes with
        exploration enabled, records the mean total reward in `self.scores`
        and returns it.
        """
        # NOTE(review): while self.exploration is high most actions are random
        # and do not depend on `weights` -- confirm this is intended for ES.
        self.model.set_weights(weights)

        total_reward = 0.0
        for _ in range(self.episode_average):
            total_reward += self._run_episode(explore=True)

        average_reward = total_reward / self.episode_average
        self.scores.append(average_reward)

        return average_reward

    def train(self, num_episodes, print_every=10):
        """Run the optimiser, save the resulting weights and return `self.scores`.

        `num_episodes` and `print_every` are passed to `es.run`. The weights are
        written to `../agents/ES_{ENV_NAME}.pth`, where `ENV_NAME` is the
        notebook-level constant.
        """
        self.es.run(num_episodes, print_step=print_every)
        self.save(f'../agents/ES_{ENV_NAME}.pth')

        return self.scores

    def save(self, policy_path):
        """Pickle the optimiser's current weights to `policy_path`.

        The parent directory of `policy_path` is created if it does not exist.
        """
        directory = os.path.dirname(policy_path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(policy_path, 'wb') as weights:
            pickle.dump(self.es.get_weights(), weights)

    def load(self, policy_path):
        """Load pickled weights from `policy_path` into the model and optimiser."""
        # SECURITY: pickle.load executes arbitrary code embedded in the file;
        # only load policy files produced by a trusted source (e.g. `save`).
        with open(policy_path, 'rb') as weights:
            self.model.set_weights(pickle.load(weights))
        self.es.weights = self.model.get_weights()

    def watch(self, num_episodes=10, render=True):
        """Play `num_episodes` episodes without exploration and print each total reward."""
        self.model.set_weights(self.es.weights)
        for i_episode in range(1, num_episodes+1):
            total_reward = self._run_episode(explore=False, render=render)
            print(f'Episode: {i_episode}, Total Reward: {total_reward:.2f}')

In [8]:
# Instantiate the agent with the notebook-level configuration; keyword
# arguments make the constant-to-parameter mapping explicit.
agent = ESAgent(
    env=env,
    population_size=POPULATION_SIZE,
    episode_average=EPISODE_AVERAGE,
    max_time=MAX_TIME,
    sigma=SIGMA,
    learning_rate=LR,
    decay_rate=DECAY_RATE,
    initial_exploration=INITIAL_EXPLORATION,
    final_exploration=FINAL_EXPLORATION,
    exploration_dec_steps=EXPLORATION_DEC_STEPS,
)

## Train The Agent

In [None]:
# Run the optimiser for 1000 iterations, logging every PRINT_EVERY iterations.
# train() also pickles the final weights to ../agents/ES_{ENV_NAME}.pth and
# returns the list of rewards recorded by get_reward.
scores = agent.train(num_episodes=1000, print_every=PRINT_EVERY)

Episode: 1, Average Reward: -118.74
Episode: 2, Average Reward: -73.00
Episode: 3, Average Reward: -109.01
Episode: 4, Average Reward: -103.03
Episode: 5, Average Reward: -99.42
Episode: 6, Average Reward: -88.58
Episode: 7, Average Reward: -124.86
Episode: 8, Average Reward: -118.71
Episode: 9, Average Reward: -96.63
Episode: 10, Average Reward: -88.53
Episode: 11, Average Reward: -105.72
Episode: 12, Average Reward: -106.54
Episode: 13, Average Reward: -110.71
Episode: 14, Average Reward: -105.52
Episode: 15, Average Reward: -111.57
Episode: 16, Average Reward: -108.74
Episode: 17, Average Reward: -115.19
Episode: 18, Average Reward: -105.82
Episode: 19, Average Reward: -223.40
Episode: 20, Average Reward: -114.24
Episode: 21, Average Reward: -118.60
Episode: 22, Average Reward: -104.23
Episode: 23, Average Reward: -91.33
Episode: 24, Average Reward: -106.31
Episode: 25, Average Reward: -118.69
Episode: 26, Average Reward: -117.17
Episode: 27, Average Reward: -107.49
Episode: 28, Ave

## Evaluate The Agent

In [None]:
# Plot the rewards recorded during training and save the figure to ../images/.
os.makedirs('../images/', exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(np.arange(len(scores)), scores, color='green')
ax.set_xlabel('Num of episodes')
ax.set_ylabel('Score')
# A stray non-ASCII character before the closing parenthesis made the original
# savefig line a SyntaxError.
fig.savefig('../images/plot_of_evolution_strategies_evaluation.png')
plt.show()

## 🎬 Watch The Smart Agent

In [None]:
# Restore the weights that agent.train() pickled to this path.
# load() uses pickle, so only point it at files you created yourself.
agent.load(f'../agents/ES_{ENV_NAME}.pth');

In [None]:
# Roll out 10 episodes using only the model's actions; rendering is on by
# default and each episode's total reward is printed.
agent.watch(num_episodes=10)

---