In [1]:
!sudo apt-get update
#!sudo apt-get install -y xvfb ffmpeg freeglut3-dev
#!pip install 'imageio==2.4.0'
#!pip install pyvirtualdisplay
!pip install tf-agents[reverb]
!pip install pyglet xvfbwrapper


Hit:1 http://security.ubuntu.com/ubuntu jammy-security InRelease            
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease                      
Hit:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Reading package lists... Done


In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import functools
import os
import time

from absl import app
from absl import flags
from absl import logging
import gin
import tensorflow as tf  # pylint: disable=g-explicit-tensorflow-version-import
from tf_agents.agents.ppo import ppo_clip_agent
from tf_agents.drivers import dynamic_episode_driver
from tf_agents.environments import parallel_py_environment
from tf_agents.environments import suite_mujoco
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import actor_distribution_network
from tf_agents.networks import actor_distribution_rnn_network
from tf_agents.networks import value_network
from tf_agents.networks import value_rnn_network
from tf_agents.policies import policy_saver
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.system import system_multiprocessing as multiprocessing
from tf_agents.utils import common

from tf_agents.policies import random_tf_policy

from utils import load_dataset
from envs.trading_env_ext import TradingEnv

2023-09-21 03:09:29.670733: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-21 03:09:29.725794: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-21 03:09:29.727742: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# --- Training budget and data collection ------------------------------------
num_environment_steps = 5_000_000  # original was 25M
collect_episodes_per_iteration = 30
# num_parallel_environments = 30  (disabled)
batch_size = 1  # passed to the replay buffer as its batch_size
replay_buffer_capacity = 1001  # passed to the replay buffer as max_length
# NOTE(review): with batch_size=1 and 30 episodes collected per iteration,
# confirm that one iteration's frames fit into replay_buffer_capacity --
# TODO verify against the TradingEnv episode length.

# --- PPO optimisation --------------------------------------------------------
num_epochs = 25
learning_rate = 1e-3
actor_fc_layers = (200, 100)
value_fc_layers = (200, 100)

# --- Environment -------------------------------------------------------------
WINDOW_SIZE = 10

# --- Evaluation, logging and summaries (intervals are compared against the
# --- global train step) ------------------------------------------------------
num_eval_episodes = 30
eval_interval = 400
log_interval = 100
summary_interval = 50
debug_summaries = False
summarize_grads_and_vars = False

In [4]:
# Metrics handed to the periodic evaluation in the training cell. Both are
# constructed with buffer_size=num_eval_episodes, in this order: average
# return first, average episode length second.
eval_metrics = [
    metric_cls(buffer_size=num_eval_episodes)
    for metric_cls in (
        tf_metrics.AverageReturnMetric,
        tf_metrics.AverageEpisodeLengthMetric,
    )
]

In [5]:
# Emit INFO-level absl log lines; the training loop reports its progress
# through logging.info.
logging.set_verbosity(logging.INFO)

# Enable TF2 behaviour before the global step is created.
tf.compat.v1.enable_v2_behavior()

# Step counter shared by the summary condition, the agent (as its
# train_step_counter) and the eval/log scheduling in the training cell.
global_step = tf.compat.v1.train.get_or_create_global_step()

In [6]:
# Dataset loaded from ./KO.csv; every environment in this notebook is built on it.
df = load_dataset('./KO.csv')


def get_train_env():
    """Return a new TradingEnv over the training range, frame_bound=(WINDOW_SIZE, 1500)."""
    return TradingEnv(
        df=df,
        window_size=WINDOW_SIZE,
        frame_bound=(WINDOW_SIZE, 1500),
    )


def get_eval_env():
    """Return a new TradingEnv over the evaluation range, frame_bound=(1600, 2500)."""
    return TradingEnv(
        df=df,
        window_size=WINDOW_SIZE,
        frame_bound=(1600, 2500),
    )

In [None]:
# Build the PPO agent and run the collect -> train loop.
#
# The whole cell runs under `record_if`, so TF summaries are only recorded when
# global_step is a multiple of summary_interval.
# NOTE(review): no summary writer is created in the visible cells, so the
# `tf_summaries` calls below presumably write nothing -- confirm.
with tf.compat.v2.summary.record_if(
    lambda: tf.math.equal(global_step % summary_interval, 0)
):
    # Separate TradingEnv instances for evaluation and for training.
    eval_tf_env = tf_py_environment.TFPyEnvironment(get_eval_env())
    tf_env = tf_py_environment.TFPyEnvironment(get_train_env())

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    # Actor and value networks: fully connected layers with tanh activations.
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        tf_env.observation_spec(),
        tf_env.action_spec(),
        fc_layer_params=actor_fc_layers,
        activation_fn=tf.keras.activations.tanh,
    )
    value_net = value_network.ValueNetwork(
        tf_env.observation_spec(),
        fc_layer_params=value_fc_layers,
        activation_fn=tf.keras.activations.tanh,
    )

    # Agent. global_step is handed over as the agent's train_step_counter, so
    # the agent (not this loop) advances it.
    tf_agent = ppo_clip_agent.PPOClipAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        optimizer,
        actor_net=actor_net,
        value_net=value_net,
        entropy_regularization=0.0,
        importance_ratio_clipping=0.2,
        normalize_observations=False,
        normalize_rewards=False,
        use_gae=True,
        num_epochs=num_epochs,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=global_step,
    )
    tf_agent.initialize()

    # Metrics. step_metrics are also used as the x-axes for the train summaries.
    environment_steps_metric = tf_metrics.EnvironmentSteps()
    step_metrics = [
        tf_metrics.NumberOfEpisodes(),
        environment_steps_metric,
    ]

    train_metrics = step_metrics + [
        tf_metrics.AverageReturnMetric(),
        tf_metrics.AverageEpisodeLengthMetric(),
    ]

    # Policies, replay buffer and collection driver.
    eval_policy = tf_agent.policy
    collect_policy = tf_agent.collect_policy

    # NOTE(review): the buffer is created with max_length=replay_buffer_capacity
    # and batch_size=batch_size; confirm that this is large enough for the
    # collect_episodes_per_iteration episodes gathered before each train step.
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        tf_agent.collect_data_spec,
        batch_size=batch_size,
        max_length=replay_buffer_capacity,
    )

    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env,
        collect_policy,
        observers=[replay_buffer.add_batch] + train_metrics,
        num_episodes=collect_episodes_per_iteration,
    )

    def train_step():
        """Train the agent on everything currently stored in the replay buffer."""
        trajectories = replay_buffer.gather_all()
        return tf_agent.train(experience=trajectories)

    # Wrap the hot paths with common.function.
    collect_driver.run = common.function(collect_driver.run, autograph=False)
    tf_agent.train = common.function(tf_agent.train, autograph=False)
    train_step = common.function(train_step)

    collect_time = 0
    train_time = 0
    timed_at_step = global_step.numpy()

    # Evaluation and logging are scheduled with thresholds instead of an exact
    # `global_step % interval == 0` test. The step counter is advanced by the
    # agent (presumably by more than one per train call -- confirm), so an
    # exact-multiple test silently skips whenever the increment does not divide
    # the interval. Thresholds stay aligned to multiples of the interval, which
    # gives the same schedule as the modulo test whenever that test would fire.
    next_eval_step = global_step.numpy()
    next_log_step = global_step.numpy()

    # Training
    while environment_steps_metric.result() < num_environment_steps:
        # Step value at the start of the iteration; used for scheduling and in
        # the log line below.
        global_step_val = global_step.numpy()

        if global_step_val >= next_eval_step:
            metric_results = metric_utils.eager_compute(
                eval_metrics,
                eval_tf_env,
                eval_policy,
                num_episodes=num_eval_episodes,
                train_step=global_step,
            )
            print(f'eval results: {metric_results}')
            next_eval_step = (global_step_val // eval_interval + 1) * eval_interval

        # Collect collect_episodes_per_iteration episodes with the collect policy.
        start_time = time.time()
        collect_driver.run()
        collect_time += time.time() - start_time

        # Train on the collected data, then empty the buffer for the next
        # iteration.
        start_time = time.time()
        total_loss, _ = train_step()
        replay_buffer.clear()
        train_time += time.time() - start_time

        for train_metric in train_metrics:
            train_metric.tf_summaries(
                train_step=global_step, step_metrics=step_metrics
            )

        if global_step_val >= next_log_step:
            logging.info(
                'step = %d, loss = %f, env_steps = %d',
                global_step_val,
                total_loss,
                environment_steps_metric.result(),
            )
            logging.info(
                'collect_time = %.3f, train_time = %.3f', collect_time, train_time
            )
            # Reset the timing window so the next report covers only new work.
            timed_at_step = global_step_val
            collect_time = 0
            train_time = 0
            next_log_step = (global_step_val // log_interval + 1) * log_interval

    # One final eval before exiting (currently disabled).
    # metric_utils.eager_compute(
    #     eval_metrics,
    #     eval_tf_env,
    #     eval_policy,
    #     num_episodes=num_eval_episodes,
    #     train_step=global_step
    # )
    

  self._minimum[self._minimum == -np.inf] = low
  self._minimum[self._minimum == np.inf] = high
  self._maximum[self._maximum == -np.inf] = low
  self._maximum[self._maximum == np.inf] = high
  self._minimum[self._minimum == -np.inf] = low
  self._minimum[self._minimum == np.inf] = high
  self._maximum[self._maximum == -np.inf] = low
  self._maximum[self._maximum == np.inf] = high


tf.Tensor(0, shape=(), dtype=int64)


In [None]:
# Environment used by render_policy_eval: the Python env is kept in its own
# variable so it can be rendered, and the TF wrapper around that same instance
# is what gets stepped.
eval_py_env = TradingEnv(
    df=df,
    window_size=WINDOW_SIZE,
    frame_bound=(2000, 2500),
)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

def render_policy_eval(policy, filename):
    """Run one episode of ``policy`` on ``eval_env`` and render the result.

    The episode is played on the module-level ``eval_env`` until a time step
    reports ``is_last()``; afterwards the module-level ``eval_py_env`` is
    rendered in 'human' mode.

    Args:
        policy: object whose ``action(time_step)`` returns a step with an
            ``action`` attribute.
        filename: currently unused -- saving the figure is disabled.
    """
    step = eval_env.reset()
    while True:
        if step.is_last():
            break
        chosen = policy.action(step)
        step = eval_env.step(chosen.action)
    eval_py_env.render('human')
    # plt.savefig(filename)

# Roll out and render the trained agent's policy.
render_policy_eval(policy=tf_agent.policy, filename="trained-agent")

In [None]:
# Baseline for comparison: a random policy built from the same specs as
# eval_env, rendered with the same helper.
random_policy = random_tf_policy.RandomTFPolicy(
    time_step_spec=eval_env.time_step_spec(),
    action_spec=eval_env.action_spec(),
)
render_policy_eval(policy=random_policy, filename="random-agent")