In [1]:
import os
from environment import LettuceGreenhouse
import time
import matplotlib.pyplot as plt
import numpy as np
from stable_baselines3 import PPO
import yaml
import datetime
from stable_baselines3.common import results_plotter
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.results_plotter import load_results, ts2xy, plot_results
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.evaluation import evaluate_policy
from torch.utils.tensorboard import SummaryWriter

In [None]:
## Run the simulation....



In [2]:
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Callback for saving a model (the check is done every ``check_freq`` steps)
    based on the training reward (in practice, we recommend using ``EvalCallback``).

    In addition, every ``plot_freq`` steps the CO2 supply-rate and indoor-CO2
    series found in the first environment's ``info`` dict are plotted and
    logged to TensorBoard as figures.

    :param check_freq: Number of callback calls between two reward checks.
    :param log_dir: Path to the folder where the model will be saved.
      It must contain the file created by the ``Monitor`` wrapper.
    :param verbose: Verbosity level: 0 for no output, 1 for info messages, 2 for debug messages
    :param plot_freq: Number of callback calls between two figure logs.
      Defaults to ``check_freq``; pass ``1`` to log on every step.
    """

    def __init__(self, check_freq: int, log_dir: str, verbose: int = 1, plot_freq=None):
        super().__init__(verbose)
        self.check_freq = check_freq
        # Rendering two figures on every single step dominates training time,
        # so figure logging is throttled to the reward-check cadence by default.
        self.plot_freq = check_freq if plot_freq is None else plot_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, "best_model")
        self.best_mean_reward = -np.inf
        self.writer = SummaryWriter(log_dir=self.log_dir)

    def _init_callback(self) -> None:
        # Create folder if needed
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        """Run the periodic reward check and figure logging; always continue training."""
        if self.n_calls % self.check_freq == 0:
            self._save_if_improved()
        if self.n_calls % self.plot_freq == 0:
            self._log_co2_figures()
        return True

    def _on_training_end(self) -> None:
        # Push any buffered figure events to disk once a learn() call finishes.
        self.writer.flush()

    def _save_if_improved(self) -> None:
        """Save the model if the mean reward of the last 100 episodes is a new best."""
        # Retrieve training reward from the Monitor files in log_dir
        x, y = ts2xy(load_results(self.log_dir), "timesteps")
        if len(x) == 0:
            # No episode has finished yet, nothing to average
            return

        # Mean training reward over the last 100 episodes
        mean_reward = np.mean(y[-100:])
        if self.verbose >= 1:
            print(f"Num timesteps: {self.num_timesteps}")
            print(f"Best mean reward: {self.best_mean_reward:.2f} - Last mean reward per episode: {mean_reward:.2f}")

        # New best model: remember the reward and save the agent
        if mean_reward > self.best_mean_reward:
            self.best_mean_reward = mean_reward
            if self.verbose >= 1:
                print(f"Saving new best model to {self.save_path}")
            self.model.save(self.save_path)

    def _log_co2_figures(self) -> None:
        """Log the CO2 supply-rate and indoor-CO2 figures for the first environment."""
        info = self.locals['infos'][0]
        self._log_series_figure(
            'Supply Rate of CO2',
            info['timestep_plot'],
            info['supply_co2_plot'],
            'Supply rate of carbon dioxide [mg]/[m^2][s]',
        )
        self._log_series_figure(
            'Indoor CO2',
            info['timestep_plot'],
            info['indoor_co2_plot'],
            'Indoor CO2 concentration [ppm]',
        )

    def _log_series_figure(self, tag: str, x, y, ylabel: str) -> None:
        """Plot ``y`` against ``x`` and log the figure to TensorBoard under ``tag``."""
        # Explicit figure/axes avoid drawing onto whatever figure pyplot considers current.
        fig, ax = plt.subplots()
        ax.plot(x, y)
        ax.set_xlabel('Time in 15 min steps')
        ax.set_ylabel(ylabel)
        # close=True releases the figure after it has been rendered into the event file.
        self.writer.add_figure(tag, fig, self.num_timesteps, close=True)
class EvalCallback(BaseCallback):
    """
    Callback for evaluating an agent.

    Every ``eval_freq`` calls the current policy is evaluated deterministically
    on ``eval_env`` for ``n_eval_episodes`` episodes. The best mean reward seen
    so far is tracked and, when ``best_model_save_path`` is given, the model is
    saved each time that best is improved.

    NOTE(review): this class has the same name as stable-baselines3's own
    ``EvalCallback``; importing that one after this definition would replace it.

    :param eval_env: (gym.Env) The environment used for initialization
    :param n_eval_episodes: (int) The number of episodes to test the agent
    :param eval_freq: (int) Evaluate the agent every eval_freq call of the callback.
    :param best_model_save_path: (str or None) Where to save the model when the
        mean evaluation reward improves. ``None`` (default) disables saving.
    """

    def __init__(self, eval_env, n_eval_episodes=5, eval_freq=20, best_model_save_path=None):
        super().__init__()
        self.eval_env = eval_env
        self.n_eval_episodes = n_eval_episodes
        self.eval_freq = eval_freq
        self.best_model_save_path = best_model_save_path
        self.best_mean_reward = -np.inf
        # Mean reward of the most recent evaluation (``-inf`` until one has run)
        self.last_mean_reward = -np.inf

    def _on_step(self):
        """
        This method will be called by the model.

        :return: (bool)
        """

        # self.n_calls is automatically updated because
        # we derive from BaseCallback
        if self.n_calls % self.eval_freq == 0:
            # Evaluate the agent over self.n_eval_episodes episodes on self.eval_env
            mean_reward, _std_reward = evaluate_policy(
                self.model,
                self.eval_env,
                n_eval_episodes=self.n_eval_episodes,
                deterministic=True,
            )
            self.last_mean_reward = mean_reward

            # Save the agent if needed and update self.best_mean_reward
            if mean_reward > self.best_mean_reward:
                self.best_mean_reward = mean_reward
                if self.best_model_save_path is not None:
                    self.model.save(self.best_model_save_path)

            print("Best mean reward: {:.2f}".format(self.best_mean_reward))

        return True
# Create the log and model directories for this run.
# One timestamp is shared by both paths: two separate time.time() calls can
# straddle a second boundary and give the run two different ids.
run_id = int(time.time())
log_dir = f"logs/{run_id}/"
os.makedirs(log_dir, exist_ok=True)
models_dir = f"models/{run_id}/"
# exist_ok=True so re-running this cell within the same second does not raise.
os.makedirs(models_dir, exist_ok=True)


# Create the callback: every 1000 steps it checks whether the mean training
# reward has improved and, if so, saves the model under log_dir.
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

Above is our callback class, which tracks the best mean training reward and saves the model each time a new best is reached.

Next, as shown below, we create our environment, which in our case is the greenhouse environment. We also reset the greenhouse environment to initialize it.

The model is then defined with a multilayer perceptron (MLP) policy, and we pass in the callback that runs while the model is learning: the callback checks for a new best model every 1000 steps while we train the model for 10000 timesteps.
-- Here "timesteps" means environment steps: when an episode of the environment finishes, training carries on with further episodes until 10000 timesteps are reached.

In [3]:
# Create the greenhouse environment object and reset it to initialize it
gh = LettuceGreenhouse()
gh.reset()

# Wrap the environment in Monitor: logs are saved in log_dir/monitor.csv
# (episode reward, length, time, ...); the best-reward callback reads them back.
env = Monitor(gh, log_dir)

# Seed for reproducibility
seed = 5

# Number of timesteps requested per learn() call
n_steps = 10000

# Number of consecutive learn() calls
n_rounds = 2

# Train the PPO agent with an MLP policy; the seed makes the run repeatable
model = PPO("MlpPolicy", env, seed=seed, verbose=1)

for _ in range(n_rounds):
    # Pass the callback object to the model's `learn()` method.
    # reset_num_timesteps=False keeps the timestep counter running across rounds.
    model.learn(total_timesteps=n_steps, reset_num_timesteps=False, callback=callback)
    # Persist the agent after each round, named by the timesteps trained so far;
    # otherwise models_dir stays empty and the final model is never saved.
    model.save(os.path.join(models_dir, str(model.num_timesteps)))

Using cpu device
Wrapping the env in a DummyVecEnv.


: 

: 

In [4]:
# results_plotter.plot_results([log_dir],1e5, results_plotter.X_TIMESTEPS, "PPO Greenhouse Data")

: 

: 

In [5]:
# Evaluate the trained agent for two episodes on the environment it was trained on
mean_reward, std_reward = evaluate_policy(
    model=model,
    env=model.get_env(),
    n_eval_episodes=2,
)
print(f"The mean reward is {mean_reward} and the standard deviation of the reward is {std_reward}")

KeyboardInterrupt: 

In [16]:

ppo_model_name = f'PPO-run'
folder =  ppo_model_name + "_1"
log_dir_tensorboard = os.path.join(log_dir, folder)

%load_ext tensorboard
%tensorboard --logdir $log_dir_tensorboard

ERROR: Failed to launch TensorBoard (exited with 2).
Contents of stderr:
usage: tensorboard [-h] [--helpfull] [--logdir PATH] [--logdir_spec PATH_SPEC]
                   [--host ADDR] [--bind_all] [--port PORT]
                   [--reuse_port BOOL] [--load_fast {false,auto,true}]
                   [--extra_data_server_flags EXTRA_DATA_SERVER_FLAGS]
                   [--grpc_creds_type {local,ssl,ssl_dev}]
                   [--grpc_data_provider PORT] [--purge_orphaned_data BOOL]
                   [--db URI] [--db_import] [--inspect] [--version_tb]
                   [--tag TAG] [--event_file PATH] [--path_prefix PATH]
                   [--window_title TEXT] [--max_reload_threads COUNT]
                   [--reload_interval SECONDS] [--reload_task TYPE]
                   [--reload_multifile BOOL]
                   [--reload_multifile_inactive_secs SECONDS]
                   [--generic_data TYPE]
                   [--samples_per_plugin SAMPLES_PER_PLUGIN]
                   [-