In [19]:
import os
import gym
import numpy as np
from gym.spaces import Box, Discrete
from stable_baselines3 import PPO

In [36]:
avg_temp = 38
variance = 4
good_variance = 1
low_good = avg_temp - good_variance
high_good = avg_temp + good_variance

# state, reward, done, additional_info = env.step(action)

class CloudEnv(gym.Env):
    def __init__(self):
        self.reset()
        self.action_space = Discrete(2)
        self.observation_space = Box(low=0, high=100, shape=(1,), dtype=np.float32)

    def get_state(self):
        return self.temp

    def get_reward(self):
        if low_good <= self.temp and self.temp >= high_good:
            return 1
#         return -max(low_good - self.temp, self.temp - high_good)
        return -1

    def is_done(self):
        return self.timestamp >= 100

    def render(self):
        pass

    def step(self, action):
        if action == 0:
            self.temp += 1
        else:
            self.temp -= 1
        self.temp = max(0, min(100, self.temp))
        self.timestamp += 1
        return self.get_state(), self.get_reward(), self.is_done(), {}

    def reset(self):
        self.temp = np.random.randint(avg_temp - variance, avg_temp + variance)
        self.timestamp = 0
        return self.get_state()

In [37]:
env = CloudEnv()

In [38]:
log_path = os.path.join('Training', 'Logs')
model = PPO(policy='MlpPolicy', env=env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [39]:
model.learn(total_timesteps=100000)

Logging to Training/Logs/PPO_4
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 100      |
|    ep_rew_mean     | 4.6      |
| time/              |          |
|    fps             | 1667     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 33          |
| time/                   |             |
|    fps                  | 1138        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.020043207 |
|    clip_fraction        | 0.236       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.674      |
|    explained_variance   | 0.00216     |

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 96.6         |
| time/                   |              |
|    fps                  | 837          |
|    iterations           | 11           |
|    time_elapsed         | 26           |
|    total_timesteps      | 22528        |
| train/                  |              |
|    approx_kl            | 0.0032175537 |
|    clip_fraction        | 0.0375       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.382       |
|    explained_variance   | -1.03e-05    |
|    learning_rate        | 0.0003       |
|    loss                 | 128          |
|    n_updates            | 100          |
|    policy_gradient_loss | -0.000561    |
|    value_loss           | 151          |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 100           |
|    ep_rew_mean          | 97.4          |
| time/                   |               |
|    fps                  | 813           |
|    iterations           | 21            |
|    time_elapsed         | 52            |
|    total_timesteps      | 43008         |
| train/                  |               |
|    approx_kl            | 0.00073907373 |
|    clip_fraction        | 0.0243        |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.367        |
|    explained_variance   | -1.43e-06     |
|    learning_rate        | 0.0003        |
|    loss                 | 86.9          |
|    n_updates            | 200           |
|    policy_gradient_loss | 0.000884      |
|    value_loss           | 166           |
-------------------------------------------
------------------------------------------
| rollout/                |      

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 97.4         |
| time/                   |              |
|    fps                  | 821          |
|    iterations           | 31           |
|    time_elapsed         | 77           |
|    total_timesteps      | 63488        |
| train/                  |              |
|    approx_kl            | 0.0022978762 |
|    clip_fraction        | 0.0358       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.338       |
|    explained_variance   | -3.58e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | 50.8         |
|    n_updates            | 300          |
|    policy_gradient_loss | -0.000293    |
|    value_loss           | 174          |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 97.7         |
| time/                   |              |
|    fps                  | 825          |
|    iterations           | 41           |
|    time_elapsed         | 101          |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0022350647 |
|    clip_fraction        | 0.0557       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.277       |
|    explained_variance   | -1.19e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | 42.3         |
|    n_updates            | 400          |
|    policy_gradient_loss | 0.00346      |
|    value_loss           | 173          |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

<stable_baselines3.ppo.ppo.PPO at 0x7fc67d36dcd0>

In [35]:
save_path = os.path.join('Training', 'Saved Models', 'ShowerTrainPPO100k')
model.save(save_path)