# Setup

In [1]:
import math
import numpy as np
from tensorforce.environments import Environment

# Define the Environment class

Here we define the Environment class we want to work with. The class template comes from the Tensorforce getting-started guide: https://tensorforce.readthedocs.io/en/latest/basics/getting-started.html

We want to define an environment that returns the resulting temperature when a heater is turned on or off.

In [44]:
class ThermostatEnvironment(Environment):
    """Toy heater environment for Tensorforce.

    The single state value is the current temperature.  The action is
    binary: 1 switches the heater on (temperature relaxes towards 1.0) and
    0 switches it off (temperature relaxes towards 0.0).  The reward is 0
    while the temperature is inside the 0.4-0.6 band and negative outside.
    """

    def __init__(self, tau=3.0):
        """Create the environment.

        Args:
            tau: Time constant of the exponential temperature response, in
                timesteps.  Must be positive.  Defaults to 3.0, the value
                that was previously hard-coded.

        Raises:
            ValueError: If ``tau`` is not strictly positive.
        """
        if tau <= 0:
            raise ValueError("tau must be positive, got {!r}".format(tau))

        self.tau = tau
        self.current_temp = 0.0
        self.timestep = 0

        super().__init__()

    def states(self):
        """Return the state spec: a single float (the temperature)."""
        return dict(type='float', shape=(1,))

    def actions(self):
        """Return the action spec: an int in {0, 1} (heater off / on)."""
        return dict(type='int', num_values=2)

    # Optional, should only be defined if environment has a natural maximum
    # episode length
    def max_episode_timesteps(self):
        """Return the episode length limit used by ``execute``."""
        return 5

    # Optional
    def close(self):
        """Release resources; simply delegates to the base class."""
        super().close()

    def reset(self):
        """Start a new episode and return the initial state.

        The temperature and the timestep counter are both set back to zero.
        The returned state is the actual temperature wrapped in a shape-(1,)
        float array, matching the spec from ``states()``.
        """
        self.current_temp = 0.0
        self.timestep = 0
        # Report the real temperature rather than an unrelated random value,
        # so the first observation agrees with the environment's state.
        return np.array([self.current_temp], dtype=float)

    def response(self, action):
        """Return the temperature one timestep after applying ``action``.

        The temperature relaxes exponentially, with time constant
        ``self.tau``, towards the value of ``action``: towards 1.0 when the
        action is 1 and towards 0.0 when the action is 0.
        """
        return action + (self.current_temp - action) * math.exp(-1.0 / self.tau)

    def reward_compute(self):
        """Return the reward for the current temperature.

        The reward is 0 when the temperature is strictly within 0.1 of 0.5
        (i.e. between 0.4 and 0.6).  Otherwise it is the negated distance
        from the temperature to the nearest edge of that band.
        """
        delta = abs(self.current_temp - 0.5)
        if delta < 0.1:
            return 0
        else:
            return -delta + 0.1

    def execute(self, actions):
        """Apply one action and advance the simulation by one timestep.

        Args:
            actions: 0 (heater off) or 1 (heater on).

        Returns:
            A ``(state, terminal, reward)`` tuple where ``state`` is the new
            temperature, ``terminal`` is True once the timestep counter is
            strictly greater than ``max_episode_timesteps()``, and ``reward``
            is the value from ``reward_compute()``.
        """
        ## Check the action is either 0 or 1 -- heater on or off.
        assert actions == 0 or actions == 1

        ## Increment timestamp
        self.timestep += 1

        ## Update the current_temp
        self.current_temp = self.response(actions)

        ## Compute the reward
        reward = self.reward_compute()

        ## The only way to go terminal is to exceed max_episode_timestamp.
        # NOTE(review): with the strict '>' the flag first turns True on call
        # max_episode_timesteps() + 1, not on the last allowed step -- confirm
        # this is the intended boundary.
        terminal = self.timestep > self.max_episode_timesteps()

        # NOTE(review): the state is returned as a bare float whereas
        # states() declares shape (1,) and reset() returns a shape-(1,)
        # array -- confirm what the consuming agent expects.
        return self.current_temp, terminal, reward

# Create the environment instance that the cells below exercise.
environment = ThermostatEnvironment()

# Exercise the environment

In [45]:
# Start from a freshly reset environment so that re-running this cell always
# reproduces the same trajectory instead of continuing from leftover state.
environment.reset()

# Keep the heater on (action 1) for seven consecutive steps and print each
# (state, terminal, reward) tuple returned by the environment.
for _ in range(7):
    print(environment.execute(1))

(0.28346868942621073, False, -0.11653131057378927)
(0.486582880967408, False, 0)
(0.6321205588285577, False, -0.03212055882855766)
(0.7364028618842733, False, -0.13640286188427328)
(0.8111243971624382, False, -0.21112439716243822)
(0.8646647167633874, True, -0.26466471676338743)
(0.903028032135595, True, -0.30302803213559504)
