In [1]:
# Show the path of the Python interpreter that backs this kernel
import sys

interpreter_path = sys.executable
print(interpreter_path)

/Users/lucien/.pyenv/versions/3.7.6/bin/python3


In [2]:
# Import necessary libraries
import copy
import json

import gym
import numpy as np

In [3]:
# Build the gym environment that the rest of the notebook interacts with
ENV_ID = "CartPole-v1"
env = gym.make(ENV_ID)
print("Created env:", env)

Created env: <TimeLimit<CartPoleEnv<CartPole-v1>>>


In [4]:
# Reset the environment and keep the value it returns as the starting state
state = env.reset()
print(
    "The starting state is:",
    state,
)

The starting state is: [-0.03985798  0.04552898  0.01451837 -0.02588705]


In [18]:
# Look at how the cartpole behaves by taking a random policy.
# Renders 100 frames; whenever an episode ends, a fresh one is started so that
# step() is never called on an environment that has already reported done.
env.reset()
try:
    for _ in range(100):
        env.render()
        # Take a random action; only the `done` flag is needed here
        _obs, _reward, done, _info = env.step(env.action_space.sample())
        if done:
            env.reset()
finally:
    # Release the render window even if the loop is interrupted
    env.close()



In [7]:
# Import ray and rllib library
import ray
from ray.rllib.agents.ppo import PPOTrainer, DEFAULT_CONFIG
from ray.tune.logger import pretty_print

In [8]:
# Ray has to be started before any trainer is created.
# ignore_reinit_error=True lets this cell be re-run in the same kernel;
# log_to_driver=False is passed so worker logs are not forwarded to this notebook.
info = ray.init(
    ignore_reinit_error=True,
    log_to_driver=False,
)

2020-10-29 21:43:02,414	INFO services.py:1166 -- View the Ray dashboard at http://127.0.0.1:8265


In [26]:
# The actual training happens here!
# Trains a PPO agent on CartPole-v1 for N iterations, keeps every raw result
# dict, and records a compact reward summary per iteration (as a dict and as
# a JSON string).

# Deep-copy the defaults: DEFAULT_CONFIG.copy() is shallow, so assigning into
# the nested config['model'] dict would also modify the shared default config.
config = copy.deepcopy(DEFAULT_CONFIG)
config['num_workers'] = 1
config['num_sgd_iter'] = 30
config['sgd_minibatch_size'] = 128
config['model']['fcnet_hiddens'] = [100, 100]
config['num_cpus_per_worker'] = 0

agent = PPOTrainer(config, 'CartPole-v1')

N = 20              # number of training iterations
results = []        # full result dicts returned by agent.train()
episode_data = []   # per-iteration summary dicts
episode_json = []   # the same summaries serialized with json.dumps

# Metrics copied from each training result into the summary, in output order
summary_keys = ('episode_reward_min',
                'episode_reward_mean',
                'episode_reward_max',
                'episode_len_mean')

for n in range(N):
    result = agent.train()
    results.append(result)

    episode = {'n': n}
    episode.update((key, result[key]) for key in summary_keys)

    episode_data.append(episode)
    episode_json.append(json.dumps(episode))

    print(f'{n:3d}: Min/Mean/Max reward: {result["episode_reward_min"]:8.4f}/{result["episode_reward_mean"]:8.4f}/{result["episode_reward_max"]:8.4f}')



  0: Min/Mean/Max reward:   8.0000/ 24.5309/ 97.0000
  1: Min/Mean/Max reward:  12.0000/ 40.9000/130.0000
  2: Min/Mean/Max reward:  13.0000/ 61.0300/236.0000
  3: Min/Mean/Max reward:  13.0000/ 86.1700/236.0000
  4: Min/Mean/Max reward:  13.0000/118.1100/363.0000
  5: Min/Mean/Max reward:  13.0000/152.3100/476.0000
  6: Min/Mean/Max reward:  19.0000/177.4000/500.0000
  7: Min/Mean/Max reward:  28.0000/208.8400/500.0000
  8: Min/Mean/Max reward:  28.0000/240.0100/500.0000
  9: Min/Mean/Max reward:  37.0000/267.7000/500.0000
 10: Min/Mean/Max reward:  49.0000/295.3300/500.0000
 11: Min/Mean/Max reward:  55.0000/321.1000/500.0000
 12: Min/Mean/Max reward:  55.0000/340.3500/500.0000
 13: Min/Mean/Max reward:  55.0000/362.0000/500.0000
 14: Min/Mean/Max reward:  55.0000/376.3600/500.0000
 15: Min/Mean/Max reward: 129.0000/383.8000/500.0000
 16: Min/Mean/Max reward: 173.0000/397.7600/500.0000
 17: Min/Mean/Max reward: 173.0000/404.0500/500.0000
 18: Min/Mean/Max reward: 173.0000/409.9300/50

In [40]:
# Inspect our result: run one rendered episode driven by the trained agent
# and report the reward accumulated over that episode.
state = env.reset()
done = False
total_reward = 0
try:
    while not done:
        # Ask the trained agent (not a random policy) for the next action
        action = agent.compute_action(state)
        state, reward, done, _ = env.step(action)
        total_reward += reward
        env.render()
finally:
    # Release the render window even if the episode is interrupted
    env.close()
print("Total reward", total_reward)

Total reward 1000.0
