In [1]:
import csv
import os
import sys

import numpy as np

from tensorforce.agents import Agent
from tensorforce.environments import Environment
from tensorforce.execution import Runner

from LANGEVIN2D_ENV import Langevin2D_Env

In [2]:
# Saver directory: <cwd>/agents/<run tag>. It is only referenced by the saver /
# summarizer options that are commented out in the agent definition.
directory = os.path.join(
    os.getcwd(),
    'agents',
    'saver_data_D_1em4_dta_0p01_maxa_1_ep100_lstm2_64_gr_1_wn_1_r_m_e_0p1',
)

# Environment Parameters (attached to the Langevin2D_Env instance after it is built).
# "dt" is the environment step and "T" the total time used in the step counts below.
# NOTE(review): a, b, D and x0 are presumably the Stuart-Landau coefficients, noise
# level and initial state - confirm against LANGEVIN2D_ENV.
env_params = {
    "dt": 0.0005,
    "T" : 100.0,
    "a" : 10.0 +10.0j,
    "b" : -5.0e2,
    "D" : 1.0e-4,
    "x0": 0.03 + 0.0j
    }

# Controller Parameters
optimization_params = {
    "min_value_forcing": -1.0,
    "max_value_forcing": 1.0
    }

# Training Parameters
training_params = {
    "num_episodes" : 100,
    "dt_action"    : 0.01
}

# Compute environment and action input timesteps.
# round() is used instead of int(): a float quotient can land just below the
# intended integer (0.3 / 0.1 == 2.9999999999999996) and int() would then drop a
# whole step by truncating.
n_env_steps = round(training_params["dt_action"] / env_params["dt"])
if n_env_steps < 1:
    raise ValueError(
        "dt_action must be at least one environment step (dt); got "
        "dt_action={} and dt={}".format(training_params["dt_action"], env_params["dt"])
    )

# Total number of environment steps in one episode, then the number of complete
# action intervals that fit into it (integer floor division, no float error).
n_sim_steps = round(env_params["T"] / env_params["dt"])
max_episode_timesteps = n_sim_steps // n_env_steps


In [3]:
# Create an instance of the complex Stuart-Landau environment.
# Only n_env_steps is passed to the constructor; the two parameter dictionaries
# are attached as attributes after the object has been built.
# NOTE(review): if Langevin2D_Env.__init__ already derives values from its
# parameters, these later assignments may come too late - confirm against the
# class definition in LANGEVIN2D_ENV.
environment = Langevin2D_Env(n_env_steps = n_env_steps)
environment.env_params = env_params
environment.optimization_params = optimization_params

200000


In [4]:
# Specify network architecture - 2 layers/64 neurons
# The same "auto" specification is used for the critic network and the policy network.
network = "auto"
policy_network = network

In [5]:
# Specify the agent parameters - PPO algorithm

# Critic optimizer: an Adam optimizer wrapped in a 'multi_step' optimizer (num_steps=5)
critic_optimizer_spec = dict(
    type='multi_step',
    num_steps=5,
    optimizer=dict(type='adam', learning_rate=1e-2),
)

# Keyword arguments for Agent.create, grouped by purpose
ppo_spec = dict(
    # --- Agent + Environment ---
    agent='ppo',                    # Agent specification
    environment=environment,        # Environment object
    exploration=0.1,
    # --- Network ---
    network=policy_network,         # Policy NN specification
    # --- Optimization ---
    batch_size=1,                   # Number of episodes per update batch
    learning_rate=1e-2,             # Optimizer learning rate
    subsampling_fraction=0.75,      # Fraction of batch timesteps to subsample
    optimization_steps=25,
    # --- Reward estimation ---
    likelihood_ratio_clipping=0.2,  # The epsilon of the PPO CLIP objective
    estimate_terminal=False,        # Whether to estimate the value of terminal states
    # TODO: gae_lambda=0.97 doesn't currently exist - ???
    # --- Critic ---
    critic_network=network,         # Critic NN specification
    critic_optimizer=critic_optimizer_spec,
    # --- Regularization ---
    entropy_regularization=0.01,    # To discourage policy from being too 'certain'
    # --- TensorFlow (disabled) ---
    #saver=dict(directory=directory),  # TensorFlow saver configuration for periodic implicit saving
    # --- TensorBoard Summarizer (disabled) ---
    #summarizer=dict(directory=os.path.join(directory, 'summarizer') , labels="all")
)

agent = Agent.create(**ppo_spec)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [6]:
# Set up control time with reference to simulation time.
# The action interval is read from training_params (defined in the configuration
# cell) instead of repeating the literal 0.01, so the two cannot drift apart.
dt_action = training_params["dt_action"]
dt = environment.env_params["dt"]
T = environment.env_params["T"]

# round() rather than int(): a float quotient can fall just below the intended
# integer (0.3 / 0.1 == 2.9999999999999996) and int() would truncate it.
n_env_steps = round(dt_action / dt)
# Complete action intervals per episode, via integer floor division of the
# rounded environment step count.
n_actions = round(T / dt) // n_env_steps
print(n_env_steps,n_actions)

20 10000


In [7]:
# Runner definition - Serial runner
# NOTE(review): episodes are capped at 2 timesteps here although
# max_episode_timesteps is computed in the configuration cell - confirm whether
# this short cap is a deliberate smoke-test setting.
runner_spec = dict(
    environment=environment,
    agent=agent,
    max_episode_timesteps=2,
    #evaluation=True
)
runner = Runner(**runner_spec)

In [8]:
# Proceed to training
# The best agent is saved to <cwd>/best_agent.
# NOTE(review): num_episodes is fixed at 3 while training_params["num_episodes"]
# is 100 - confirm which one is intended for a real training run.
best_agent_dir = os.path.join(os.getcwd(), 'best_agent')
runner.run(num_episodes=3, save_best_agent=best_agent_dir)

Episodes:   0%|          | 0/3 [00:00, reward=0.00, ts/ep=0, sec/ep=0.00, ms/ts=0.0, agent=0.0%](0.03+0j)
Episodes:  33%|███▎      | 1/3 [00:08, reward=-0.12, ts/ep=2, sec/ep=8.62, ms/ts=4312.2, agent=100.0%](0.03+0j)
Episodes:  67%|██████▋   | 2/3 [00:13, reward=-0.10, ts/ep=2, sec/ep=4.56, ms/ts=2279.5, agent=100.0%](0.03+0j)
Episodes: 100%|██████████| 3/3 [00:17, reward=-0.12, ts/ep=2, sec/ep=4.50, ms/ts=2248.9, agent=100.0%]

In [9]:
# Print statistics
print("Learning finished. Total episodes: {ep}. Average reward of last 100 episodes: {ar}.".format(
    ep=runner.episodes,
    ar=np.mean(runner.episode_rewards[-100:]))
)

# Export the per-episode returns as a ';'-separated CSV under saved_models/.
name = "returns_tf.csv"
returns_path = os.path.join("saved_models", name)

# makedirs(exist_ok=True) replaces the exists()/mkdir() pair, which could fail if
# the directory appeared between the check and the creation.
os.makedirs("saved_models", exist_ok=True)

if not os.path.exists(returns_path):
    # newline="" is what the csv module expects for file objects; without it the
    # "\n" line terminator is translated again by the platform text layer.
    with open(returns_path, "w", newline="") as csv_file:
        spam_writer = csv.writer(csv_file, delimiter=";", lineterminator="\n")
        spam_writer.writerow(["Episode", "Return"])
        # Episodes are numbered from 1
        spam_writer.writerows(enumerate(runner.episode_rewards, start=1))
else:
    # An existing file is never overwritten; say so instead of skipping silently.
    print("Returns not written: {path} already exists.".format(path=returns_path))

# NOTE(review): only the runner is closed here; agent and environment were created
# separately and are not closed in this cell - confirm whether they should be.
runner.close()


Episodes: 100%|██████████| 3/3 [00:17, reward=-0.12, ts/ep=2, sec/ep=4.50, ms/ts=2248.9, agent=100.0%]Learning finished. Total episodes: 3. Average reward of last 100 episodes: -0.11271708571155668.

