# How to train an AI to balance the cart pole
The code is adapted from [Udacity's cross-entropy method (CEM) notebook](https://github.com/udacity/deep-reinforcement-learning/blob/master/cross-entropy/CEM.ipynb).

In [1]:
import os

import gym
import numpy as np
import pandas as pd


# Create the CartPole-v1 environment shared by the cells below.
env = gym.make("CartPole-v1")

# Step/reward accumulators for the random-action rollout, and the first
# observation of a freshly reset episode.
total_steps = 0
total_reward = 0.0
obs = env.reset()


Read more about the Gym CartPole-v1 environment [here](https://gym.openai.com/envs/CartPole-v1/).

In [2]:

# Baseline: play one episode with uniformly random actions and report how
# long it lasted.
#
# Start from zeroed counters and a freshly reset episode so that this cell
# can be re-run on its own: otherwise a second run would add to the previous
# totals and keep stepping an environment that has already signalled `done`.
total_reward = 0.0
total_steps = 0
obs = env.reset()

done = False
while not done:
    action = env.action_space.sample()
    env.render()
    obs, reward, done, _ = env.step(action)
    total_reward += reward
    total_steps += 1

print("Episode done in %d steps, total reward %.2f" %
      (total_steps, total_reward))



Episode done in 34 steps, total reward 34.00


![rewardCurve](<./rewardCurve.png>)
<img src="rewardCurve.png" alt="rewardCurve" style="zoom:40%;" />


In [3]:
class LinearPolicy(object):
    """Deterministic linear policy that picks the highest-scoring action.

    Action scores are computed as ``ob.dot(W) + b``; both ``W`` and ``b`` are
    sliced out of one flat parameter vector.
    """

    def __init__(self, theta, ob_space, ac_space):
        """
        theta: flat vector of parameters of length (dim_ob + 1) * n_actions;
            the first dim_ob * n_actions entries are reshaped into W and the
            remaining n_actions entries into b
        ob_space: observation space; shape[0] is used as dim_ob
        ac_space: action space; n is used as n_actions
        """
        n_inputs = ob_space.shape[0]
        n_outputs = ac_space.n
        n_weights = n_inputs * n_outputs
        # One weight per (input, output) pair plus one bias per output.
        assert len(theta) == n_weights + n_outputs
        self.W = theta[:n_weights].reshape(n_inputs, n_outputs)
        self.b = theta[n_weights:].reshape(1, n_outputs)

    def act(self, ob):
        """Return the index of the action with the largest score for `ob`."""
        scores = ob.dot(self.W) + self.b
        return scores.argmax()



In [4]:
def run_episode(policy, env, num_steps, render=False):
    """Run `policy` in `env` for one episode and return the summed reward.

    policy: object whose act(ob) returns the action for an observation
    env: environment providing reset(), step(action) and render()
    num_steps: maximum number of steps before the rollout is cut off
    render: when true, env.render() is called on every third step

    The rollout stops early as soon as env.step reports the episode as done.
    """
    observation = env.reset()
    episode_return = 0
    step = 0
    while step < num_steps:
        action = policy.act(observation)
        observation, reward, finished, _unused_info = env.step(action)
        episode_return += reward
        # Rendering happens before the termination check, so the final frame
        # of an episode is still drawn when its step index is a multiple of 3.
        if render and step % 3 == 0:
            env.render()
        if finished:
            break
        step += 1
    return episode_return


# Length of the flat parameter vector expected by LinearPolicy:
# (observation dimension + 1) entries for each action.
dim_theta = env.action_space.n * (env.observation_space.shape[0] + 1)
print("observation space:", env.observation_space.shape)
print("action space:", env.action_space.n)
print("parameter number:", dim_theta)


# Cross-entropy method settings
popsize = 25     # parameter vectors sampled per iteration
n_elite = 5      # top-scoring samples used to refit the distribution
num_steps = 500  # step limit passed to run_episode for each rollout

# Initial sampling distribution: zero mean and unit standard deviation
theta_mean = np.zeros(dim_theta)
theta_std = np.ones(dim_theta)


# Mean population reward recorded once per iteration
reward_list = []

for itr in range(50):
    # Draw popsize parameter vectors from the current diagonal Gaussian
    thetas = np.random.multivariate_normal(
        mean=theta_mean,
        cov=np.diag(np.array(theta_std**2)),
        size=popsize)

    # Score every sampled vector with a single rollout
    rewards = []
    for theta in thetas:
        policy = LinearPolicy(theta, env.observation_space, env.action_space)
        r = run_episode(policy, env, num_steps)
        rewards.append(r)
    rewards = np.array(rewards)

    # argsort is ascending, so the last n_elite indices are the best samples
    elite_inds = np.argsort(rewards)[-n_elite:]
    elite_thetas = thetas[elite_inds]

    # Refit the sampling distribution to the elite samples
    theta_mean = np.mean(elite_thetas, axis=0)
    theta_std = np.std(elite_thetas, axis=0)

    print("[Iteration %2i] mean: %5.3g max: %5.3g" %
          (itr, np.mean(rewards), np.max(rewards)))
    reward_list.append(np.mean(rewards))

    # Policy built from the updated mean; enable the line below to watch it
    policy = LinearPolicy(theta_mean, env.observation_space, env.action_space)
    # run_episode(policy, env, num_steps, render=True)

# Save the per-iteration mean rewards as a one-column CSV.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; otherwise the write fails after training has finished.
os.makedirs("./CemLabData", exist_ok=True)
df = pd.DataFrame({"reward": reward_list})
df.to_csv("./CemLabData/results.csv",
          index=False, header=True)


observation space: (4,)
action space: 2
parameter number: 10
[Iteration  0] mean:  17.1 max:    60
[Iteration  1] mean:  44.8 max:   149
[Iteration  2] mean:  78.6 max:   315
[Iteration  3] mean:   134 max:   456
[Iteration  4] mean:   236 max:   500
[Iteration  5] mean:   343 max:   500
[Iteration  6] mean:   417 max:   500
[Iteration  7] mean:   474 max:   500
[Iteration  8] mean:   487 max:   500
[Iteration  9] mean:   482 max:   500
[Iteration 10] mean:   492 max:   500
[Iteration 11] mean:   499 max:   500
[Iteration 12] mean:   500 max:   500
[Iteration 13] mean:   500 max:   500
[Iteration 14] mean:   500 max:   500
[Iteration 15] mean:   500 max:   500
[Iteration 16] mean:   500 max:   500
[Iteration 17] mean:   500 max:   500
[Iteration 18] mean:   500 max:   500
[Iteration 19] mean:   500 max:   500
[Iteration 20] mean:   500 max:   500
[Iteration 21] mean:   500 max:   500
[Iteration 22] mean:   500 max:   500
[Iteration 23] mean:   500 max:   500
[Iteration 24] mean:   500 