## 1. Import Dependencies

In [1]:
# IMPORT GYM STUFF
import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete

# IMPORT HELPERS
import numpy as np
import random
import os

# IMPORT STABLE BASELINES STUFF
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

## 2. Types of Spaces

In [2]:
# Discrete space with 3 possible values (the same space ShowerEnv uses for its actions).
Discrete(3)

Discrete(3)

In [3]:
# Draw one random element from the space.
Discrete(3).sample()

2

In [4]:
# Continuous box with bounds 0..1 and shape (3, 3); the repr shows the dtype defaults to float32.
Box(0, 1, shape=(3, 3))

Box(0.0, 1.0, (3, 3), float32)

In [5]:
# Sampling yields a float32 array with the space's shape.
Box(0, 1, shape=(3, 3)).sample()

array([[0.18188913, 0.72655404, 0.3825662 ],
       [0.06926471, 0.10515564, 0.8369738 ],
       [0.3734838 , 0.366442  , 0.83899665]], dtype=float32)

In [6]:
# Stable Baselines3 does not support Tuple spaces.
# Tuple and Dict are container spaces that wrap several other spaces.
Tuple((Discrete(3), Box(0, 1, shape=(3, ))))

Tuple(Discrete(3), Box(0.0, 1.0, (3,), float32))

In [7]:
# A Tuple sample is a Python tuple with one sample per wrapped space (here an int and a float32 array).
Tuple((Discrete(3), Box(0, 1, shape=(3, )))).sample()

(1, array([0.6121955 , 0.29989195, 0.9227487 ], dtype=float32))

In [8]:
# Dict bundles named sub-spaces: a 2-valued 'height' and a 1-element float 'speed' in 0..100.
Dict({'height': Discrete(2), 'speed': Box(0, 100, shape=(1,))})

Dict('height': Discrete(2), 'speed': Box(0.0, 100.0, (1,), float32))

In [9]:
# A Dict sample is a mapping from each key to a sample of its sub-space.
Dict({'height': Discrete(2), 'speed': Box(0, 100, shape=(1,)), 'color':MultiBinary(4)}).sample()

OrderedDict([('color', array([0, 0, 1, 0], dtype=int8)),
             ('height', 0),
             ('speed', array([62.861362], dtype=float32))])

In [10]:
# Four independent binary (0/1) entries.
MultiBinary(4)

MultiBinary(4)

In [11]:
# Sampling yields an int8 array of 0s and 1s with one entry per flag.
MultiBinary(4).sample()

array([0, 1, 0, 1], dtype=int8)

In [12]:
# Three discrete components with 5, 2 and 2 possible values respectively.
MultiDiscrete([5, 2, 2])

MultiDiscrete([5 2 2])

In [13]:
# Sampling yields one integer per component.
MultiDiscrete([5, 2, 2]).sample()

array([2, 0, 1])

## 3. Building an Environment
- Build an agent to give us the best shower possible 
- The water temperature varies randomly
- The ideal temperature range is 37 to 39 degrees
- Train an agent to automatically respond to the changes in temperature and get it within the ideal temperature

In [70]:
class ShowerEnv(Env):
    """Toy environment: keep a shower's temperature inside the ideal band.

    Observation:
        A 1-element float32 array holding the current temperature, always
        kept inside ``observation_space`` (0 to 100).
    Actions:
        ``Discrete(3)``: 0 lowers the temperature by 1, 1 leaves it unchanged,
        2 raises it by 1.
    Reward:
        +1 for each step that ends with the temperature between 37 and 39
        (inclusive), -1 otherwise.
    Episode:
        Ends (``terminated=True``) after ``EPISODE_LENGTH`` steps.
    """

    EPISODE_LENGTH = 60      # number of steps (seconds of shower) per episode
    START_TEMPERATURE = 38   # centre of the random start temperature
    START_JITTER = 3         # start temperature is START_TEMPERATURE +/- this value
    IDEAL_RANGE = (37, 39)   # inclusive band that earns a positive reward

    def __init__(self):
        self.action_space = Discrete(3)  # actions we can take (down, stay, up)
        self.observation_space = Box(low=0, high=100, shape=(1,))  # temperature array
        # Store the state as a float32 array from the start so that an
        # observation returned before the first reset() still matches
        # observation_space.
        self.state = self._sample_start_state()
        self.shower_length = self.EPISODE_LENGTH

    def _sample_start_state(self):
        """Return a fresh start observation drawn from the env's seeded RNG."""
        # Generator.integers excludes the upper bound, hence the +1.
        offset = self.np_random.integers(-self.START_JITTER, self.START_JITTER + 1)
        return np.array([self.START_TEMPERATURE + offset], dtype=np.float32)

    def step(self, action):
        """Apply ``action`` and advance the episode by one step.

        Args:
            action: element of ``action_space`` (0 = down, 1 = stay, 2 = up).

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``truncated`` is always False and ``info`` is an empty dict.
        """
        # Map the action index to a temperature change: 0 -> -1, 1 -> 0, 2 -> +1.
        shifted = self.state + (action - 1)
        # Build a new float32 array instead of mutating in place, so that
        # observations handed out earlier are not silently overwritten, and
        # clip so the observation never leaves the declared bounds.
        self.state = np.clip(
            shifted, self.observation_space.low, self.observation_space.high
        ).astype(np.float32)

        # decrease shower time by 1 second
        self.shower_length -= 1

        # calculate reward
        ideal_low, ideal_high = self.IDEAL_RANGE
        reward = 1 if ideal_low <= self.state[0] <= ideal_high else -1

        # check whether the shower is done
        terminated = self.shower_length <= 0

        # NOTE: per-step temperature noise is currently disabled.
        # set placeholder for info
        info = {}

        return self.state, reward, terminated, False, info

    def render(self):
        """No visualisation is implemented."""
        pass

    def reset(self, seed=None, options=None):
        """Start a new episode.

        Args:
            seed: optional seed for the environment's random generator.
            options: accepted for gymnasium API compatibility; unused.

        Returns:
            ``(observation, info)`` with a random start temperature and an
            empty info dict.
        """
        # Seeds self.np_random when a seed is given, making resets reproducible.
        super().reset(seed=seed)
        self.state = self._sample_start_state()
        self.shower_length = self.EPISODE_LENGTH
        return self.state, {}

In [71]:
# Single environment instance reused by the checks, training and evaluation cells below.
env = ShowerEnv()

In [72]:
# Three discrete actions: down, stay, up.
env.action_space

Discrete(3)

In [73]:
# Random action index; the test loop in section 4 uses the same call as a random policy.
env.action_space.sample()

2

In [74]:
# One float32 value (the temperature) bounded to 0..100.
env.observation_space

Box(0.0, 100.0, (1,), float32)

In [75]:
# A random point inside the observation bounds, not an actual state of the shower.
env.observation_space.sample()

array([69.825134], dtype=float32)

In [76]:
# reset() returns (observation, info): a 1-element float32 temperature array and an empty dict.
env.reset()

(array([39.], dtype=float32), {})

In [77]:
from stable_baselines3.common.env_checker import check_env

In [78]:
# Validate the custom env against the interface Stable Baselines3 expects; no output means no issues were reported.
check_env(env, warn=True)

## 4. Test Environment

In [80]:
# Roll out a random policy for a few episodes as a sanity check of the environment.
episodes = 5
for episode in range(1, episodes + 1):
    # reset() returns (observation, info), so unpack both.
    obs, info = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        # step() returns (observation, reward, terminated, truncated, info), in that order.
        obs, reward, terminated, truncated, info = env.step(action)
        # An episode is over when it either terminates or is truncated.
        done = terminated or truncated
        score += reward
    print(f'Episode:{episode} -> Score:{score}')
env.close()

Episode:1 -> Score:-8
Episode:2 -> Score:-48
Episode:3 -> Score:-60
Episode:4 -> Score:-6
Episode:5 -> Score:-38


## 5. Train Model

In [81]:
# TensorBoard logs for this experiment are written under Training/CustomEnv/Logs.
log_path = os.path.join('Training', 'CustomEnv', 'Logs')
# The observation is a small numeric vector rather than an image, so use MlpPolicy instead of CnnPolicy.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [83]:
# Train for 400k environment steps; with verbose=1 a progress table is printed after each rollout.
model.learn(total_timesteps=400000)

Logging to Training/CustomEnv/Logs/PPO_2
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 60       |
|    ep_rew_mean     | 21.7     |
| time/              |          |
|    fps             | 2926     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 60          |
|    ep_rew_mean          | 22.6        |
| time/                   |             |
|    fps                  | 1337        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.011166927 |
|    clip_fraction        | 0.139       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1          |
|    explained_variance   | -0.

<stable_baselines3.ppo.ppo.PPO at 0x7fc0c10fef50>

## 6. Save Model

In [84]:
# Destination for the trained model, kept next to the training logs.
shower_path = os.path.join('Training', 'CustomEnv', 'Saved Models', 'Shower_400k_Model_PPO')

In [85]:
# Persist the trained model to shower_path.
model.save(shower_path)



In [86]:
# Drop the in-memory model so the next cell demonstrates loading from the saved copy.
del model

In [87]:
# Reload the saved model and attach the environment to it.
model = PPO.load(shower_path, env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [88]:
# Run 10 evaluation episodes; the result is (mean episode reward, standard deviation).
evaluate_policy(model, env, n_eval_episodes=10, render=False)



(23.6, 54.73426714591143)