## Importing required dependencies

In [35]:
import sys
# NOTE(review): hard-coded, machine-specific absolute path. It only resolves on
# a host with this exact miniconda install; confirm whether it is still needed
# or run the kernel from the environment that has the packages installed.
sys.path.append('/home/dell/miniconda3/lib/python3.8/site-packages')

import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

## Loading Environment

In [36]:
# Id of the gym task used in this notebook, and the environment built from it.
environment_name = 'CartPole-v0'
env = gym.make(environment_name)

In [37]:
# Echo the environment id so the cell output shows which task is loaded.
environment_name


'CartPole-v0'

In [38]:
# Baseline: play a few episodes with uniformly sampled actions and report the
# total reward collected in each one.
episodes = 5
for episode in range(1, episodes + 1):
    state = env.reset()
    score = 0
    done = False

    # Keep stepping until the environment signals the end of the episode.
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward

    print('Episode: {}, Reward: {}'.format(episode, score))
    print(score)
env.close()
    

Episode: 1, Reward: 32.0
32.0
Episode: 2, Reward: 12.0
12.0
Episode: 3, Reward: 11.0
11.0
Episode: 4, Reward: 18.0
18.0
Episode: 5, Reward: 29.0
29.0


## Understanding the environment

In [39]:
# Draw one random action from the action space to see what an action looks like.
env.action_space.sample()

1

In [40]:
# `done` still holds the value assigned by the last env.step() call of the
# random-rollout loop above.
done

True

In [41]:
# Display the observation space reported by the environment.
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [42]:
# Display the action space reported by the environment.
env.action_space

Discrete(2)

## Training

In [43]:
# Directory handed to the models as `tensorboard_log`.
# Per the original note, this directory needs to be created first.
log_path = os.path.join('Training', 'Logs')
log_path

'Training/Logs'

In [44]:
# Build the vectorized training environment and the PPO agent.
# The factory creates the gym environment itself rather than closing over the
# name `env`: that name is rebound to the DummyVecEnv by this very statement,
# so a lambda returning `env` only works while the wrapper happens to call it
# before the rebinding takes effect.
env = DummyVecEnv([lambda: gym.make(environment_name)])

# MLP policy, verbose console logging, TensorBoard events written under log_path.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [45]:
# Train the PPO agent for 20,000 environment timesteps.
model.learn(total_timesteps=20000)

Logging to Training/Logs/PPO_5
-----------------------------
| time/              |      |
|    fps             | 1201 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 934          |
|    iterations           | 2            |
|    time_elapsed         | 4            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0093868235 |
|    clip_fraction        | 0.127        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.685       |
|    explained_variance   | -0.00294     |
|    learning_rate        | 0.0003       |
|    loss                 | 6.37         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0202      |
|    value_loss           | 48.9         |
----------------------------

<stable_baselines3.ppo.ppo.PPO at 0x7fccd4a089a0>

## Save and Reload model

In [46]:
# Destination (without extension) for the saved PPO model.
PPO_path = os.path.join('Training', 'Saved Models', 'ppo_model_cartpole')

In [47]:
# Persist the trained model at PPO_path.
model.save(PPO_path)

## Evaluation

In [48]:
# Run 10 evaluation episodes with rendering enabled; the displayed result is
# the mean and the standard deviation of the episode reward.
evaluate_policy(model, env, n_eval_episodes=10, render=True)

(200.0, 0.0)

In [49]:
# Release the environment after evaluation.
# NOTE(review): the test loop in the next cell calls env.reset() on this same
# object after it has been closed — confirm that is intended.
env.close()

## Test

In [50]:
# Watch the trained agent: same episode loop as the random baseline, but the
# action now comes from model.predict instead of random sampling.
episodes = 5
for episode in range(1, episodes + 1):
    obs = env.reset()
    score = 0
    done = False

    # `env` is the vectorized wrapper here, so reward and done come back as
    # one-element arrays (see the `[192.]`-style totals in the output).
    while not done:
        env.render()
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward

    print('Episode: {}, Reward: {}'.format(episode, score))
    print(score)
env.close()

Episode: 1, Reward: [192.]
[192.]
Episode: 2, Reward: [200.]
[200.]
Episode: 3, Reward: [200.]
[200.]
Episode: 4, Reward: [200.]
[200.]
Episode: 5, Reward: [200.]
[200.]


### Unfolding things: one prediction and one environment step by hand

In [51]:
# Start a fresh episode and keep the first observation.
obs=env.reset()

In [52]:
# Inspect the observation returned by env.reset().
obs

array([[0.01306982, 0.03553905, 0.00657437, 0.02909288]], dtype=float32)

In [53]:
# Ask the trained model for an action for this observation; the second return
# value is discarded.
action, _=model.predict(obs)

In [54]:
# Inspect the action chosen by the model.
action

array([1])

In [55]:
# For comparison, draw a random action from the action space.
env.action_space.sample()

1

In [56]:
# Apply the model's action; the cell displays the tuple returned by env.step(),
# which the loops in this notebook unpack as (obs, reward, done, info).
env.step(action)

(array([[ 0.0137806 ,  0.2305661 ,  0.00715622, -0.26150852]],
       dtype=float32),
 array([1.], dtype=float32),
 array([False]),
 [{}])

### Viewing logs in tensorboard

In [57]:
# Folder of one training run inside the TensorBoard log directory.
# NOTE(review): the run name 'PPO_3' is hard-coded, while the PPO training
# outputs recorded in this notebook log to PPO_5 and PPO_7 — confirm which run
# is meant to be viewed.
training_log_path = os.path.join(log_path, 'PPO_3')

In [58]:
# Show the resulting run path.
training_log_path

'Training/Logs/PPO_3'

## Adding a callback to the training stage

In [61]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
import os

In [62]:
# Directory passed to the evaluation callback as `best_model_save_path`.
save_path = os.path.join('Training', 'Saved Models')

In [63]:
# Ends training once the best mean evaluation reward reaches the threshold.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)

# Evaluate on a dedicated environment instead of the training `env`, so the
# periodic evaluation episodes do not reset or step the environment the agent
# is collecting rollouts from. It is wrapped in DummyVecEnv to match the type
# of the training environment.
eval_env = DummyVecEnv([lambda: gym.make(environment_name)])

# Every `eval_freq` callback steps: evaluate, save a new best model under
# save_path, and let stop_callback decide whether training should continue.
eval_callback = EvalCallback(
    eval_env,
    callback_on_new_best=stop_callback,
    eval_freq=10000,
    best_model_save_path=save_path,
    verbose=1,
)

In [64]:
# Fresh PPO agent for the callback-driven training run; TensorBoard events are
# written under log_path.
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device


In [66]:
# Train for up to 20,000 timesteps with the evaluation callback attached.
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training/Logs/PPO_7
-----------------------------
| time/              |      |
|    fps             | 1433 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 953         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009754866 |
|    clip_fraction        | 0.111       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.594      |
|    explained_variance   | 0.445       |
|    learning_rate        | 0.0003      |
|    loss                 | 39.1        |
|    n_updates            | 50          |
|    policy_gradient_loss | -0.0199     |
|    value_loss           | 60.5        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x7fccd6197790>

## Changing Policies

In [68]:
# Custom network layout: four hidden layers of 128 units each for the policy
# head ('pi') and, separately, for the value head ('vf').
net_arch = [{'pi': [128] * 4, 'vf': [128] * 4}]

In [69]:
# PPO agent whose policy is built with the custom net_arch defined above.
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    policy_kwargs={'net_arch': net_arch},
)

Using cpu device


In [70]:
# Use a fresh evaluation callback for this model. An EvalCallback instance
# keeps its best mean reward between learn() calls, so reusing the one from the
# previous run would compare this network against the earlier model's best
# score; if that score already sat at the 200 threshold, no "new best" could
# ever be recorded, and neither the best-model save nor the early stop would
# fire for this network.
net_arch_eval_callback = EvalCallback(
    # Dedicated evaluation environment, separate from the training `env`.
    DummyVecEnv([lambda: gym.make(environment_name)]),
    callback_on_new_best=StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1),
    eval_freq=10000,
    best_model_save_path=save_path,
    verbose=1,
)

# Train for up to 20,000 timesteps with the new callback attached.
model.learn(total_timesteps=20000, callback=net_arch_eval_callback)

-----------------------------
| time/              |      |
|    fps             | 944  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 627         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.013080381 |
|    clip_fraction        | 0.173       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.682      |
|    explained_variance   | -0.012      |
|    learning_rate        | 0.0003      |
|    loss                 | 2.37        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0197     |
|    value_loss           | 17.3        |
-----------------------------------------
----------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x7fccd6197490>

## Using an alternative algorithm

In [71]:
from stable_baselines3 import DQN

# Swap the algorithm: a DQN agent on the same environment, logging to the same
# TensorBoard directory.
model = DQN(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device


In [72]:
# Train the DQN agent for 20,000 environment timesteps (no callback attached).
model.learn(total_timesteps=20000)

Logging to Training/Logs/DQN_1
----------------------------------
| rollout/            |          |
|    exploration rate | 0.927    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 4171     |
|    time_elapsed     | 0        |
|    total timesteps  | 154      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.881    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 4771     |
|    time_elapsed     | 0        |
|    total timesteps  | 251      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.837    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 5189     |
|    time_elapsed     | 0        |
|    total timesteps  | 344      |
----------------------------------
------------------------

----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 108      |
|    fps              | 5231     |
|    time_elapsed     | 0        |
|    total timesteps  | 2588     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 112      |
|    fps              | 5239     |
|    time_elapsed     | 0        |
|    total timesteps  | 2656     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 116      |
|    fps              | 5272     |
|    time_elapsed     | 0        |
|    total timesteps  | 2740     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 216      |
|    fps              | 5270     |
|    time_elapsed     | 0        |
|    total timesteps  | 4827     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 220      |
|    fps              | 5308     |
|    time_elapsed     | 0        |
|    total timesteps  | 4957     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 224      |
|    fps              | 5305     |
|    time_elapsed     | 0        |
|    total timesteps  | 5061     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 324      |
|    fps              | 5410     |
|    time_elapsed     | 1        |
|    total timesteps  | 7078     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 328      |
|    fps              | 5416     |
|    time_elapsed     | 1        |
|    total timesteps  | 7184     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 332      |
|    fps              | 5430     |
|    time_elapsed     | 1        |
|    total timesteps  | 7279     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 432      |
|    fps              | 5472     |
|    time_elapsed     | 1        |
|    total timesteps  | 9541     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 436      |
|    fps              | 5433     |
|    time_elapsed     | 1        |
|    total timesteps  | 9615     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 440      |
|    fps              | 5422     |
|    time_elapsed     | 1        |
|    total timesteps  | 9690     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 540      |
|    fps              | 5289     |
|    time_elapsed     | 2        |
|    total timesteps  | 12021    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 544      |
|    fps              | 5303     |
|    time_elapsed     | 2        |
|    total timesteps  | 12132    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 548      |
|    fps              | 5303     |
|    time_elapsed     | 2        |
|    total timesteps  | 12197    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 648      |
|    fps              | 5380     |
|    time_elapsed     | 2        |
|    total timesteps  | 14288    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 652      |
|    fps              | 5384     |
|    time_elapsed     | 2        |
|    total timesteps  | 14400    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 656      |
|    fps              | 5385     |
|    time_elapsed     | 2        |
|    total timesteps  | 14464    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 756      |
|    fps              | 5452     |
|    time_elapsed     | 3        |
|    total timesteps  | 16716    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 760      |
|    fps              | 5460     |
|    time_elapsed     | 3        |
|    total timesteps  | 16826    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 764      |
|    fps              | 5467     |
|    time_elapsed     | 3        |
|    total timesteps  | 16923    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 864      |
|    fps              | 5422     |
|    time_elapsed     | 3        |
|    total timesteps  | 18881    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 868      |
|    fps              | 5421     |
|    time_elapsed     | 3        |
|    total timesteps  | 18958    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 872      |
|    fps              | 5423     |
|    time_elapsed     | 3        |
|    total timesteps  | 19021    |
----------------------------------
----------------------------------
| rollout/          

<stable_baselines3.dqn.dqn.DQN at 0x7fccd6194880>