In [1]:
import os
import numpy as np
import gym
from gym import wrappers
import pybullet_envs

In [2]:
class hp():
    """Hyperparameters for the random-search training run.

    Every setting can be overridden with a keyword argument; ``hp()`` with no
    arguments yields exactly the values that used to be hard-coded.

    Parameters
    ----------
    n : int
        Number of training iterations (``train`` loops ``range(n)``).
    episode_len : int
        Maximum number of environment steps per episode.
    learn_rate : float
        Step size used when the policy weights are updated.
    direction : int
        Number of perturbations sampled per iteration; each one is rolled
        out once with a positive and once with a negative sign.
    best_direction : int
        How many of those perturbations are kept for the weight update.
        Must not exceed ``direction``.
    noise : float
        Scale applied to a perturbation before it is added to / subtracted
        from the weights.
    seed : int
        Value handed to ``np.random.seed``.
    env_name : str
        Gym environment id handed to ``gym.make``.

    Raises
    ------
    AssertionError
        If ``best_direction`` is larger than ``direction``.
    """
    def __init__(self, n=1000, episode_len=1000, learn_rate=0.02,
                 direction=16, best_direction=16, noise=0.03, seed=1,
                 env_name="HalfCheetahBulletEnv-v0"):
        self.n = n                                         #number of training iterations
        self.episode_len = episode_len                     #maximum steps per episode
        self.learn_rate = learn_rate                       #learning rate
        self.direction = direction                         #perturbations per iteration (each tried + and -)
        self.best_direction = best_direction               #perturbations kept for the update
        # Cannot keep more directions than were sampled.
        assert self.best_direction <= self.direction, \
            "best_direction must not exceed direction"
        self.noise = noise                                 #scale of the weight perturbation
        self.seed = seed                                   #random seed
        self.env_name = env_name                           #environment name

In [3]:
class Normalise():
    """Running per-feature mean and variance, used to standardise inputs.

    ``observe`` folds one sample into the statistics; ``normalise`` rescales
    an input with the statistics gathered so far.
    """

    def __init__(self, input_size):
        # Four independent zero-filled accumulators of length input_size:
        # sample count, running mean, sum of squared deviations, variance.
        self.n, self.mean, self.mean_diff, self.var = (
            np.zeros(input_size) for _ in range(4)
        )

    def observe(self, x):
        """Update count, mean, squared-deviation sum and variance with ``x``."""
        self.n += 1
        gap_before = x - self.mean               # deviation from the mean *before* it moves
        self.mean += gap_before / self.n
        gap_after = x - self.mean                # deviation from the freshly updated mean
        self.mean_diff += gap_before * gap_after
        # Variance is floored at 1e-2 so the later division stays well-behaved.
        self.var = (self.mean_diff / self.n).clip(min=1e-2)

    def normalise(self, inputs):
        """Return ``inputs`` shifted by the running mean and divided by the running std."""
        return (inputs - self.mean) / np.sqrt(self.var)

In [4]:
class Policy():
    """Linear policy: an action is the weight matrix times the input.

    The methods read ``noise``, ``direction``, ``best_direction`` and
    ``learn_rate`` from the module-level ``hp`` object.
    """

    def __init__(self, input_size, output_size):
        # Weight matrix, one row per output and one column per input.
        self.theta = np.zeros((output_size, input_size))

    def evaluate(self, inputs, delta=None, direction=None):
        """Return the action for ``inputs``.

        With ``direction`` left as None the unperturbed weights are used.
        ``'positive'`` adds ``hp.noise * delta`` to the weights first; any
        other value subtracts it.
        """
        if direction is None:
            return self.theta.dot(inputs)
        perturbation = hp.noise * delta
        if direction == 'positive':
            return (self.theta + perturbation).dot(inputs)
        return (self.theta - perturbation).dot(inputs)

    def sample_deltas(self):
        """Return ``hp.direction`` standard-normal matrices shaped like ``theta``."""
        shape = self.theta.shape
        deltas = []
        for _ in range(hp.direction):
            deltas.append(np.random.randn(*shape))
        return deltas

    def update(self, rollouts, sigma_r):
        """Move ``theta`` along the reward-weighted sum of the rollout deltas.

        ``rollouts`` yields ``(positive_reward, negative_reward, delta)``
        triples; ``sigma_r`` divides the step size.
        """
        step = np.zeros(self.theta.shape)
        for reward_pos, reward_neg, delta in rollouts:
            step += (reward_pos - reward_neg) * delta
        scale = hp.learn_rate / (hp.best_direction * sigma_r)
        self.theta += scale * step

In [5]:
def explore(env, normaliser, policy, direction=None , delta=None):
    """Play one episode and return the sum of its clipped rewards.

    Each step feeds the current state to ``normaliser.observe``, normalises
    it, asks ``policy.evaluate`` for an action (passing ``delta`` and
    ``direction`` through) and steps ``env``. Every reward is clipped to
    [-1, 1] before it is added. The episode stops when the environment
    reports done or after ``hp.episode_len`` steps (``hp`` is the
    module-level settings object).
    """
    observation = env.reset()
    total, steps, finished = 0, 0, False
    while not (finished or steps >= hp.episode_len):
        normaliser.observe(observation)
        scaled = normaliser.normalise(observation)
        action = policy.evaluate(scaled, delta, direction)
        observation, reward, finished, _ = env.step(action)
        # Clip the step reward to the [-1, 1] band.
        if reward > 1:
            reward = 1
        elif reward < -1:
            reward = -1
        total += reward
        steps += 1
    return total

#### Training the A.I

In [6]:
def train(env, policy, normaliser, hp):
    """Run ``hp.n`` iterations of random-search policy improvement.

    Per iteration:
      1. sample perturbations from ``policy.sample_deltas()``;
      2. roll out every perturbation with a positive sign, then every one
         with a negative sign, via ``explore``;
      3. take the standard deviation of all collected rewards;
      4. rank the perturbations by ``max(r_pos, r_neg)`` and keep the
         ``hp.best_direction`` highest-scoring ones;
      5. call ``policy.update`` with those rollouts;
      6. roll out the unperturbed policy once and print its reward.

    Parameters
    ----------
    env : environment passed through to ``explore``.
    policy : object providing ``sample_deltas``, ``update`` and whatever
        ``explore`` needs.
    normaliser : observation normaliser passed through to ``explore``.
    hp : settings object providing ``n``, ``direction`` and ``best_direction``.
    """
    for i in range(hp.n):

        # Initialising deltas or perturbations for adjusting the weights
        deltas = policy.sample_deltas()

        # All positive-direction rollouts run first, then all negative ones.
        positive_rewards = [
            explore(env, normaliser, policy, direction="positive", delta=deltas[k])
            for k in range(hp.direction)
        ]
        negative_rewards = [
            explore(env, normaliser, policy, direction="negative", delta=deltas[k])
            for k in range(hp.direction)
        ]

        # Standard deviation over every reward collected this iteration
        sigma_r = np.array(positive_rewards + negative_rewards).std()

        # Score each direction by the better of its two rollouts and keep the
        # highest-scoring ones. (The score must look at BOTH rewards and the
        # ranking must be descending, otherwise the worst directions are kept.)
        scores = [max(r_pos, r_neg)
                  for r_pos, r_neg in zip(positive_rewards, negative_rewards)]
        order = sorted(range(len(scores)), key=lambda k: scores[k],
                       reverse=True)[:hp.best_direction]
        rollouts = [[positive_rewards[k], negative_rewards[k], deltas[k]] for k in order]

        # Updating our policy. The update divides by sigma_r, so when every
        # rollout returned the same reward (sigma_r == 0) it is skipped:
        # there is no signal to follow and the division would yield NaN weights.
        if sigma_r > 0:
            policy.update(rollouts, sigma_r)

        # Printing the reward of the unperturbed policy after the update
        reward_evaluation = explore(env, normaliser, policy)
        print('Step: ',i,' Reward :', reward_evaluation)

#### Creating the output directories

In [7]:
def mkdir(base,name):
    """Create the directory ``base/name`` if needed and return its path.

    Missing parent directories are created as well. An already existing
    directory is left untouched.

    Raises
    ------
    FileExistsError
        If the path exists but is not a directory.
    """
    path = os.path.join(base,name)
    # exist_ok=True replaces the separate "exists?" check, which could race
    # with another process creating the directory in between.
    os.makedirs(path, exist_ok=True)
    return path
# Output directories, created when this cell runs: exp/brs for the
# experiment, and its "monitor" subdirectory, which is later handed to the
# Monitor wrapper as the destination for the recorded videos.
work_dir = mkdir('exp','brs')
monitor_dir = mkdir(work_dir, 'monitor')

#### Creating the objects and running the training

In [8]:
# Instantiate the settings. The instance intentionally takes over the name
# `hp`, because Policy and explore read their settings from that global.
# The isinstance guard keeps this cell re-runnable: on a second run `hp` is
# already an instance, and calling it again would raise TypeError.
if isinstance(hp, type):
    hp = hp()
np.random.seed(hp.seed)
env = gym.make(hp.env_name)
env = wrappers.Monitor(env, monitor_dir , force = True)         #populates the videos to monitor_dir
nb_inputs = env.observation_space.shape[0]
nb_outputs = env.action_space.shape[0]
policy = Policy(nb_inputs,nb_outputs)
normaliser = Normalise(nb_inputs)
try:
    train(env, policy, normaliser, hp)
finally:
    # Always release the environment, including when training is stopped
    # with a keyboard interrupt, so the Monitor wrapper can finish its files.
    env.close()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
WalkerBase::__init__ start
[33mWARN: Environment '<class 'pybullet_envs.gym_locomotion_envs.HalfCheetahBulletEnv'>' has deprecated methods '_step' and '_reset' rather than 'step' and 'reset'. Compatibility code invoked. Set _gym_disable_underscore_compat = True to disable this behavior.[0m
Step:  0  Reward : -960.0487993108393
Step:  1  Reward : -968.6851444661884
Step:  2  Reward : -936.953898240947
Step:  3  Reward : -937.1927955478343
Step:  4  Reward : -935.8571926134282
Step:  5  Reward : -908.5218897226828
Step:  6  Reward : -955.3801738286089
Step:  7  

Step:  192  Reward : 492.75549106137817
Step:  193  Reward : 498.58590451844475
Step:  194  Reward : 494.3528936269539
Step:  195  Reward : 495.02299545492446
Step:  196  Reward : 489.41403181937034
Step:  197  Reward : 489.9133306491733
Step:  198  Reward : 494.9378975638943
Step:  199  Reward : 491.0268264012637
Step:  200  Reward : 485.8098173070283
Step:  201  Reward : 68.05330614585941
Step:  202  Reward : 478.76452809203096
Step:  203  Reward : 488.87255397341187
Step:  204  Reward : 479.9809915918315
Step:  205  Reward : 484.40609228345534
Step:  206  Reward : 489.72090289284523
Step:  207  Reward : 490.5366861375139
Step:  208  Reward : 490.53854387372013
Step:  209  Reward : 491.99912008534517
Step:  210  Reward : 483.4483531984253
Step:  211  Reward : 482.2188336722392
Step:  212  Reward : 491.5337330094416
Step:  213  Reward : 490.50238887457425
Step:  214  Reward : 491.4881615961411
Step:  215  Reward : -129.28078790044285
Step:  216  Reward : 471.3540944235787
Step:  217  

KeyboardInterrupt: 