In [1]:
import os
import sys
import numpy as np

from tensorforce.agents import Agent
from tensorforce.environments import Environment
from tensorforce.execution import Runner

from LANGEVIN2D_ENV import Langevin2D_Env

In [2]:
# Saver directory: checkpoints are written under ./agents/<run tag>, resolved
# against the current working directory.
directory = os.path.join(os.getcwd(), 'agents' ,'saver_data_D_1em4_dta_0p01_maxa_1_ep100_lstm2_64_gr_1_wn_1_r_m_e_0p1')

# Environment Parameters
# "dt" is the simulation time step and "T" the simulated time per episode
# (both are used below to derive step counts). The remaining entries are
# presumably the model coefficients and initial state consumed by
# Langevin2D_Env -- confirm against that class.
env_params = {
    "dt": 0.0005,
    "T" : 100.0,
    "a" : 10.0 +10.0j,
    "b" : -5.0e2,
    "D" : 1.0e-4,
    "x0": 0.03 + 0.0j
    }

# Controller Parameters: lower/upper bound on the forcing value.
optimization_params = {
    "min_value_forcing": -1.0,
    "max_value_forcing": 1.0
    }

# Training Parameters: number of episodes and time between agent actions.
training_params = {
    "num_episodes" : 100,
    "dt_action"    : 0.01
}

# Compute environment and action input timesteps.
# round() instead of int(): the float quotient of two decimal step sizes can
# land just below the intended integer (e.g. 0.3 / 0.1 == 2.999...), and int()
# would then silently truncate to one step too few.
n_env_steps = round(training_params["dt_action"] / env_params["dt"])
if n_env_steps < 1:
    raise ValueError("training_params['dt_action'] must be at least env_params['dt']")

# Number of agent actions per episode: whole action windows that fit into the
# total number of simulation steps (integer arithmetic, so no truncation error).
max_episode_timesteps = round(env_params["T"] / env_params["dt"]) // n_env_steps


In [3]:
# Create an instance of the complex Stuart-Landau environment
environment = Langevin2D_Env(n_env_steps = n_env_steps)
# The parameter dictionaries are attached as attributes after construction
# rather than being passed to the constructor.
# NOTE(review): only n_env_steps is passed at construction time; the
# max_episode_timesteps value computed earlier is not handed to the
# environment here -- confirm the environment derives a matching value itself.
environment.env_params = env_params
environment.optimization_params = optimization_params

200000


In [4]:
# Policy network, given as three layer stacks:
#   1. 'observation' -> two dense layers (32 units each) -> registered as 'intermed-1'
#   2. 'prev_action' -> two dense layers (32 units each) -> registered as 'intermed-2'
#   3. both registered tensors concatenated -> two internal LSTM layers
#      (64 units each) -> dense layer (16 units)
policy_network = [
    [
        {'type': 'retrieve', 'tensors': 'observation'},
        {'type': 'dense', 'size': 32},
        {'type': 'dense', 'size': 32},
        {'type': 'register', 'tensor': 'intermed-1'},
    ],
    [
        {'type': 'retrieve', 'tensors': 'prev_action'},
        {'type': 'dense', 'size': 32},
        {'type': 'dense', 'size': 32},
        {'type': 'register', 'tensor': 'intermed-2'},
    ],
    [
        {
            'type': 'retrieve',
            'tensors': ['intermed-1', 'intermed-2'],
            'aggregation': 'concat',
        },
        {'type': 'internal_lstm', 'size': 64, 'length': 1, 'bias': True},
        {'type': 'internal_lstm', 'size': 64, 'length': 1, 'bias': True},
        {'type': 'dense', 'size': 16},
    ],
]

# Network specification string used for the critic when the agent is created.
network = "auto"

In [5]:
# Build the PPO agent for the environment and policy network defined above
agent = Agent.create(
    # --- Agent + environment ---
    agent='ppo',  # Agent specification
    environment=environment,  # Environment object
    exploration=0.1,
    # --- Policy network ---
    network=policy_network,  # Policy NN specification
    # --- Optimization ---
    batch_size=1,  # Number of episodes per update batch
    learning_rate=1e-2,  # Optimizer learning rate
    subsampling_fraction=0.75,  # Fraction of batch timesteps to subsample
    optimization_steps=25,
    # --- Reward estimation ---
    likelihood_ratio_clipping=0.2,  # The epsilon of the PPO clipped objective
    estimate_terminal=False,  # Whether to estimate the value of terminal states
    # TODO: gae_lambda=0.97 doesn't currently exist - ???
    # --- Critic ---
    critic_network=network,  # Critic NN specification
    critic_optimizer={
        'type': 'multi_step',
        'num_steps': 5,
        'optimizer': {'type': 'adam', 'learning_rate': 1e-2},
    },
    # --- Regularization ---
    entropy_regularization=0.01,  # To discourage policy from being too 'certain'
    # --- TensorFlow saver: checkpoints go to the run directory ---
    saver={'directory': directory},
    # --- TensorBoard summarizer: written to a sub-folder of the run directory ---
    summarizer={
        'directory': os.path.join(directory, 'summarizer'),
        'labels': "all",
    },
)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Create CheckpointSaverHook.
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /Users/lucienviala/Documents/IMPERIAL/PROJECT/CODE/2D_LANGEVIN_CONTROL_OBS_ACT/agents/saver_data_D_1em4_dta_0p01_maxa_1_ep100_lstm2_64_gr_1_wn_1_r_m_e_0p1/agent-0
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /Users/lucienviala/Documents/IMPERIAL/PROJECT/CODE/2D_LANGEVIN_CONTROL_OBS_ACT/agents/saver_data_D_1em4_dta_0p01_maxa_1_ep100_lstm2_64_gr_1_wn_1_r_m_e_0p1/agent.


In [6]:
# Set up control time with reference to simulation time.
# The action interval is read from training_params instead of being re-typed
# here, so it cannot drift from the value used to size the environment.
dt_action = training_params["dt_action"]
dt = environment.env_params["dt"]
T = environment.env_params["T"]
# round() instead of int(): a float quotient that lands just below the intended
# integer would otherwise be truncated to one step too few.
n_env_steps = round(dt_action / dt)
# Whole action windows that fit into the total number of simulation steps.
n_actions = round(T / dt) // n_env_steps
print(n_env_steps,n_actions)

20 10000


In [7]:
# Allocate a zero-filled time array with one entry per episode timestep,
# then reset the environment to obtain the initial state
time = np.zeros(environment.max_episode_timesteps())
state = environment.reset()

# Accumulator for the episode reward (per the original note, the reward is
# defined as the magnitude of the complex state; the definition lives in the
# environment class, not in this notebook)
sum_rewards = 0.0

# Agents with internal RNN layers need their internal state initialised
# before the first call to act()
internals = agent.initial_internals()

print(state, type(state))

{'observation': array([-0.056704  ,  0.06827483]), 'prev_action': array([0., 0.])} <class 'dict'>


In [8]:
# Display the state specification held by the agent, to check it against the
# keys of the state dict returned by environment.reset()
agent.states_spec

OrderedDict([('observation', {'type': 'float', 'shape': (2,)}),
             ('prev_action', {'type': 'float', 'shape': (2,)})])

In [14]:
# Request one action for the current state, passing in the current internals
# and receiving the updated internals back alongside the action.
# NOTE(review): evaluation=True presumably makes this an evaluation-only call
# (no training side effects) -- confirm for the installed Tensorforce version.
# NOTE(review): the execution counter jumps from In [8] to In [14]; confirm
# the notebook still runs top to bottom on a fresh kernel.
action , internals = agent.act(states=state, internals=internals, evaluation=True)

In [15]:
# Show the action returned by the agent for the initial state
print(action)

[-0.00605154  0.00741863]
