In [1]:
import pandas as pd
import pandas_ta
import numpy as np
 
# from gym_anytrading.datasets import FOREX_EURUSD_1H_ASK, STOCKS_GOOGL
import matplotlib.pyplot as plt

In [2]:
import os
import time
import datetime
from six.moves import range

import tensorflow as tf

In [8]:
# --- Data configuration ---------------------------------------------------
# Column of the price frame that is always kept alongside the features.
PRICE_COLUMN = 'close'
# When True, the raw price-range columns are prepended to FEATURE_COLUMNS
# in the column-selection cell further down.
USE_PRICE_RANGE_COLUMNS = False

# Aggregation frequency handed to the price and tweet preprocessing.
freq = 'h'
start_date = '2016-01-02'
end_date = '2019-03-28'

# Paths are assembled with os.path.join so the notebook also runs on
# non-Windows machines; on Windows the resulting strings are unchanged.
ranges_dict_path = os.path.join('data', 'ranges_dict.pickle')
save_path = os.path.join('.', 'data', f'featured_prices_{freq}_start_{start_date}.csv')

# prices_path = os.path.join('.', 'data', 'prices_freq-min_2019-01-01_2019-03-28.csv')
prices_path = os.path.join(
    '.', 'data', 'sources', 'coinbaseUSD_1-min_data_2014-12-01_to_2019-01-09.csv'
)

# Tweets scraped from Twitter between 2016-01-01 and 2019-03-29,
# collecting tweets that contain "Bitcoin" or "BTC".
tweets_path = 'data/sources/tweets_historical.csv'

In [4]:
import features.price_features as price_features

# Technical indicators to generate and the options they all start from.
indicators = ['rsi', 'macd']
basic_args = {'append': True, 'ewm': True, 'adjust': True, 'freq': freq, 'signal_indicators': True}

# Every indicator gets its OWN copy of the shared options. Building the
# mapping from `[basic_args] * n` would make all indicators point at the same
# dict, so a per-indicator override (like the ones commented out below)
# would silently apply to every indicator.
args = {indicator: dict(basic_args) for indicator in indicators}

# args['rsi']['xa'] = 70
# args['rsi']['xb'] = 30

# Load the raw prices, aggregate them and compute the indicator features.
# Returns the featured price frame plus `ranges_dict`, which is later read as
# {group: {'cols': [...], 'normalize': bool}} when choosing feature columns.
prices_df, ranges_dict = price_features.main(
    prices_path=prices_path,
    ranges_dict_path=ranges_dict_path,
    save_path=save_path,
    onlyRead=False,
    freq=[freq, 'd'],
    timestamp_col='Timestamp',
    cleanNans=True,
    start_date=start_date,
    args=args
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lluis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Lluis\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Loading data...
Filling All Time data
Filling NA data
Aggregating from min to h level
Aggregating from min to d level
Generating TA features...
1 out of 2 featuresDropping 792 rows because of NaN values


In [9]:
from features.tweets_preprocess import (
    tweetsPreprocess,
    VADER_COLUMNS,
    TEXTBLOB_COLUMNS,
)

# TODO: Save tweets sentiment independent of prices and one file per date range and frequency

# Sentiment score columns produced by the tweet preprocessing (VADER + TextBlob).
sentiment_cols = VADER_COLUMNS + TEXTBLOB_COLUMNS

# Use a dedicated name for the tweets output path: rebinding the notebook-wide
# `save_path` here would make a later re-run of the price-feature cell write
# its CSV to the tweets location.
tweets_save_path = 'data/preprocess/twitter.csv'

# Cache file name carries the date range: <root>_<start>_-_<end><ext>
cache_root, cache_ext = os.path.splitext(tweets_save_path)
save_final_path = f'{cache_root}_{start_date}_-_{end_date}{cache_ext}'

if os.path.exists(save_final_path):
    # Reuse the cached, already aggregated sentiment frame (tab separated).
    tweets_df = pd.read_csv(save_final_path, sep='\t', index_col='timestamp')
    # The index is read back as text; restore it as datetimes.
    tweets_df.index = pd.to_datetime(tweets_df.index)

else:
    print("Start tweetsPreprocess")
    tweets_df = tweetsPreprocess(
        tweets_path,
        freq=freq,
        sentiment_cols=sentiment_cols,
        # sentiment_cols=['Compound', 'Polarity'],
        aggregate_cols=['replies', 'likes', 'retweets'], # TODO: Also by volume of tweets??
        start_date=start_date,
        end_date=end_date,
        nrows=100000,
        chunksize=5e5,
        save_path=tweets_save_path,
        write_files=False
    )

# Engagement aggregates are dropped; only the remaining columns are kept as
# candidate features.
remove_cols = [
    'replies_sum',
    'replies_mean',
    'likes_sum',
    'likes_mean',
    'retweets_sum',
    'retweets_mean',
]
tweets_df = tweets_df.drop(remove_cols, axis=1)

In [10]:
# Left-join the tweet features onto the price rows by index, then discard the
# index so the combined frame is addressed purely by position.
data = (
    prices_df
    .merge(tweets_df, how='left', left_index=True, right_index=True)
    .reset_index(drop=True)
)

In [11]:
# Collect the columns of every range group flagged for normalization, followed
# by all tweet columns.
# NOTE(review): the next cell rebinds FEATURE_COLUMNS from scratch, so this
# list is currently discarded - confirm which selection is intended.
FEATURE_COLUMNS = [
    col
    for group in ranges_dict.values()
    if group['normalize']
    for col in group['cols']
]
FEATURE_COLUMNS.extend(tweets_df.columns)

In [12]:
# Keep only the columns whose name contains one of the signal markers
# (_XA_, _XB_, _A_, _B_). The four substring tests are expressed as a single
# regular-expression alternation.
signal_columns = prices_df.columns.str.contains('_XA_|_XB_|_A_|_B_')
FEATURE_COLUMNS = prices_df.columns[signal_columns].tolist()

In [13]:
if USE_PRICE_RANGE_COLUMNS:
    # NOTE(review): POSITION_AS_OBSERVATION is first assigned in a later cell
    # of this notebook; enabling this branch on a fresh kernel would raise a
    # NameError unless that cell has been run before this one.
    price_range_cols = ranges_dict['prices']['cols']

    diff_cols = len(price_range_cols) - len(FEATURE_COLUMNS) - int(POSITION_AS_OBSERVATION)
    print(f'Difference of {diff_cols} columns between prices cols and normalized cols')
    print('In order to use Group Normalization Layer with 2 groups, both groups should be equal and sorted to be one first and then the other.')

    if diff_cols > 0:
        # More price columns than feature columns: drop the listed column(s)
        # from the price group.
        remove_cols = ['LR_14']
        print(f'The following columns are going to be removed: {remove_cols}')
        prices_cols = [col for col in price_range_cols if col not in remove_cols]
    else:
        prices_cols = price_range_cols

    # Price columns go first, followed by the previously selected features.
    FEATURE_COLUMNS = prices_cols + FEATURE_COLUMNS

# The price column must always be present; prepend it only when the feature
# list does not already contain it.
ALL_COLS = ([] if PRICE_COLUMN in FEATURE_COLUMNS else [PRICE_COLUMN]) + FEATURE_COLUMNS

# Restrict the data to the price column plus the feature columns.
data = data.loc[:, ALL_COLS]

In [14]:
# Sanity checks before the data is split: no infinite and no missing values.
# `axis` is passed by keyword because DataFrame.any no longer accepts it
# positionally in pandas >= 2.0. On failure the offending rows are shown as
# the assertion message.
inf_rows = np.isinf(data).any(axis=1)
assert not inf_rows.any(), data[inf_rows]
assert not data.isnull().any().any()

In [15]:
# Number of data rows that make up one reporting unit. With the value below a
# unit is 12 * 30 * 24 rows, i.e. a 360-day year of hourly rows.
# unit_factor = 60 * 24 * 30  # months
unit_factor = 12 * 30 * 24  # years
print(f'Data for {len(data) / unit_factor:.3f} units')

Data for 2.969 units


In [16]:
# Durations are expressed in multiples of `unit_factor` rows.
train_time = 2
gap_time = 1/12
# Whatever remains after the training span and the two gaps is shared equally
# between validation and test.
valid_time = test_time = (len(data.index) / unit_factor - train_time - 2 * gap_time) / 2

# Row boundaries: train | gap | valid | gap | test
train_end = int(train_time * unit_factor)
valid_start = train_end + int(gap_time * unit_factor)
valid_end = valid_start + int(valid_time * unit_factor)
test_start = valid_end + int(gap_time * unit_factor)
test_end = test_start + int(test_time * unit_factor)

# Positional slices of the full frame (all columns kept).
train = data.iloc[:train_end]
valid = data.iloc[valid_start:valid_end]
test = data.iloc[test_start:test_end]

In [17]:
from RL_env.stock_env import RLStocksEnv, REVENUE_REWARD, PRICE_REWARD

# Environment configuration shared by the train / validation / test environments.
# TODO: Steps scheduling, starting from low number of steps to high
# Episode lengths iterated over, in this order, by the training loop further down.
steps_schedule = [4, 8, 16, 32, 64, 128, 256]
# Episode length for the first environment build; the training loop re-binds
# this name on every schedule step.
steps_per_episode = 64 # steps_schedule[-1]
# Observation window length; also used later as the training sequence length.
window_size = 1
# Passed to the environments as `position_as_observation`.
POSITION_AS_OBSERVATION = True
# Passed to the environment factory as `constant_step`.
CONSTANT_STEP = False

num_parallel_environments = 1

# Reward settings forwarded unchanged to the environments.
reward_type = REVENUE_REWARD
max_step_reward = 0
max_final_reward = 1

# Seed forwarded to environment creation and to training.
SEED = 12345

#### ONLY FOR TESTING OVERFITING

# steps_per_episode = 5
# factor = 2
# # factor = 20
# train = train[0:steps_per_episode*factor]
# valid = valid[0:steps_per_episode*factor]
# test = test[0:steps_per_episode*factor]

##############################################

In [18]:
from utils import generateSplitEnvs

# Build the train / validation / test TF environments from the three splits.
tf_env, eval_tf_env, test_tf_env = generateSplitEnvs(
    train,
    valid,
    test,
    window_size,
    steps_per_episode,
    FEATURE_COLUMNS,
    # reward shaping
    reward_type=reward_type,
    max_step_reward=max_step_reward,
    max_final_reward=max_final_reward,
    # episode / observation layout
    position_as_observation=POSITION_AS_OBSERVATION,
    constant_step=CONSTANT_STEP,
    is_training=True,
    # execution
    num_parallel_environments=num_parallel_environments,
    seed=SEED,
)

In [19]:
from absl import logging
# Available in newer TF-Agents versions only:
# from tf_agents.system import multiprocessing

# Show INFO-level absl log messages (training progress is reported through absl).
logging.set_verbosity(logging.INFO)
# tf.logging.set_verbosity(tf.logging.INFO)
# Make sure TensorFlow 2.x behavior is enabled.
tf.compat.v1.enable_v2_behavior()

# Available in newer TF-Agents versions only:
# multiprocessing.enable_interactive_mode()

In [20]:
# Unit in which each supported agent type is driven: per step or per episode.
STEP, EPISODE = 'step', 'episode'

# Agent type trained by this notebook.
agent = 'PPO'

agent_unit = dict(
    DQN=STEP,
    PPO=EPISODE,
    REINFORCE=EPISODE,
)
unit = agent_unit[agent]

In [21]:
from tensorflow.keras.optimizers import Adam, SGD
from tf_agents.utils import common

# Params for train
# Iteration budget handed to train_eval (once per entry of the step schedule).
num_iterations = 10000000
# TODO: Adapt values to the step scheduling (replay_buffer_capacity, num_eval_episodes and intervals if too small)

train_steps_per_iteration = 1
# Scales with the number of parallel environments.
collect_per_iteration = 3 * num_parallel_environments

# TODO: Improve learning rate with schedule and on e-greedy too
batch_size = 32
learning_rate = 6e-5 # previously tried: 3e-4
optimizer = Adam(learning_rate=learning_rate) # alternative: SGD(learning_rate=learning_rate)
gradient_clipping = 5

# Agent-specific hyper-parameters. Only the branch matching `agent` defines its
# names, so e.g. `initial_collect_steps` exists only when agent == 'DQN'.
if agent == 'DQN':
    # TODO: Use another kind of policy, e.g. Boltzmann?
    epsilon_greedy = 0.1

    target_update_tau = 0.05
    target_update_period = 5

    initial_collect_steps = num_iterations // 1000 # previously: 1000

    n_step_update = 1

    td_errors_loss_fn = common.element_wise_huber_loss # alternative: common.element_wise_squared_loss

    gamma = 0.99
    reward_scale_factor = 1.0

elif agent == 'PPO':
    
    importance_ratio_clipping = 0.2
    
    # The values after '#' are the alternatives previously tried.
    kl_cutoff_factor = 0 # 2.0
    kl_cutoff_coef = 1000.0
    initial_adaptive_kl_beta = 0 # 1.0
    adaptive_kl_target = 0.01
    adaptive_kl_tolerance = 0.3

    normalize_observations=True
    normalize_rewards=True
    reward_norm_clipping=10.0 # Not used if normalize_rewards=False
    use_gae=True
    lambda_value=1 # 0.95
    discount_factor=1 # TODO: Rethink on how to implement discount factor because reward by prices is accumulative

    entropy_regularization = 0
    policy_l2_reg = 0
    value_function_l2_reg = 0
    # NOTE: shared_vars_l2_reg and value_clipping are defined here, but the
    # matching PPOAgent arguments are commented out where the agent is built.
    shared_vars_l2_reg = 0
    value_pred_loss_coef = 0.5
    use_td_lambda_return = False
    log_prob_clipping = 0.0
    value_clipping = None
    num_epochs = 25

use_tf_functions = True

# Params for summaries and logging
summary_interval = 400
summaries_flush_secs = 10
# Never use a summary interval shorter than one episode.
summary_interval = max(summary_interval, steps_per_episode)
log_interval = summary_interval * 1

debug_summaries = True
summarize_grads_and_vars = True
check_numerics = False

# Params for eval
num_eval_seeds = 1
eval_interval = summary_interval * 2

# Params for checkpoints (all expressed as multiples of the eval interval)
train_checkpoint_interval = eval_interval * 20
policy_checkpoint_interval = eval_interval * 10
rb_checkpoint_interval = eval_interval * 40

In [22]:
# !rmdir /s /q .\\logs\\dqn

In [25]:
# True: train from scratch into a new timestamped run directory.
# False: point at the fixed, previously trained run below instead.
TRAIN_MODEL = True

# Run identifier: current timestamp for a fresh training run, otherwise the
# hard-coded identifier of the run to reload.
run_name = (
    datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    if TRAIN_MODEL
    else '20200529-095708'
)

# Built with os.path.join (instead of a literal backslash) so the log directory
# is valid on every platform: logs/<agent>/<run_name>.
root_dir = os.path.join('logs', agent, run_name)

In [26]:
from tf_agents.metrics import tf_metrics

from utils import AgentEarlyStopping

# Directory layout of the run: <root_dir>/train, <root_dir>/eval and
# <root_dir>/policy_saved_model.
root_dir = os.path.expanduser(root_dir)
train_dir = os.path.join(root_dir, 'train')
eval_dir = os.path.join(root_dir, 'eval')
saved_model_dir = os.path.join(root_dir, 'policy_saved_model')

# The training writer is installed as the default summary writer.
train_summary_writer = tf.summary.create_file_writer(
    train_dir, flush_millis=summaries_flush_secs * 1000)
train_summary_writer.set_as_default()

# Metrics updated during collection; only the average return is enabled.
step_metrics = []
train_metrics = step_metrics + [
    # tf_metrics.NumberOfEpisodes(),
    # tf_metrics.EnvironmentSteps(),
    tf_metrics.AverageReturnMetric(batch_size=num_parallel_environments),
    # tf_metrics.AverageEpisodeLengthMetric(),
    # tf_metrics.ChosenActionHistogram(dtype=tf.int32),
]

# Evaluation summaries go to their own writer (not set as default).
eval_summary_writer = tf.summary.create_file_writer(
    eval_dir, flush_millis=summaries_flush_secs * 1000)

eval_metrics = [
    tf_metrics.AverageReturnMetric(buffer_size=1),
    # tf_metrics.AverageEpisodeLengthMetric(buffer_size=1)
]

# Early-stopping callback configured to monitor 'AverageReturn' in 'max' mode.
# NOTE(review): exact meaning of patience/warmup is defined in utils.AgentEarlyStopping.
eval_metrics_callback = AgentEarlyStopping(
    monitor='AverageReturn', min_delta=0.0001, patience=15, warmup=45, verbose=1, mode='max'
)

# Step counter later passed to the agent as `train_step_counter`.
global_step = tf.compat.v1.train.get_or_create_global_step()

In [27]:
# Define the agent networks: a Q-network for DQN, actor + value networks for PPO.
# A recurrent variant is used when the training sequence length is > 1.

train_sequence_length = window_size

# dropout_layer = (0.2,0.2,0.2,0.2,0.2)
dropout_layer = None
activation_fn = tf.nn.leaky_relu # alternatives: tf.keras.activations.relu, tf.keras.activations.tanh

# --- Layer sizes ----------------------------------------------------------
if agent == 'DQN':
    if train_sequence_length > 1:
        input_fc_layer_params = (8,)
        lstm_size = (16,)
        output_fc_layer_params = (8,)

    else:
        fc_layer_params = (100,)

elif agent == 'PPO':
    if train_sequence_length > 1:
        actor_fc_layers = (32,64,128,64,32)
        actor_lstm_size = (32,64,32,)
        actor_output_fc_layer = (32,64,32,)

        value_fc_layers = (32,64,32,)
        value_lstm_size = (16,)
        value_output_fc_layers = (32,64,32,)
    else:
        actor_fc_layers = (512,1024,2048,1024,512,)

        value_fc_layers = (512,1024,2048,1024,512,)


# --- Network construction -------------------------------------------------
if agent == 'DQN':
    from tf_agents.networks import q_network
    from tf_agents.networks import q_rnn_network

    if train_sequence_length > 1:
        q_net = q_rnn_network.QRnnNetwork(
            tf_env.observation_spec(),
            tf_env.action_spec(),
            input_fc_layer_params=input_fc_layer_params,
            lstm_size=lstm_size,
            output_fc_layer_params=output_fc_layer_params
        )
    else:
        q_net = q_network.QNetwork(
            tf_env.observation_spec(),
            tf_env.action_spec(),
            fc_layer_params=fc_layer_params,
            dropout_layer_params=dropout_layer,
        )
        # Feed-forward network: the training sequence length follows the n-step setting.
        train_sequence_length = n_step_update

    if train_sequence_length != 1 and n_step_update != 1:
        raise NotImplementedError(
            'Currently not supporting n-step updates with stateful networks (i.e., RNNs)')

elif agent == 'PPO':
    from tf_agents.networks import actor_distribution_network
    from tf_agents.networks import actor_distribution_rnn_network
    from tf_agents.networks import value_network
    from tf_agents.networks import value_rnn_network

    if train_sequence_length > 1:
        actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
            tf_env.observation_spec(),
            tf_env.action_spec(),
            input_fc_layer_params=actor_fc_layers,
            input_dropout_layer_params=dropout_layer,
            lstm_size=actor_lstm_size,
            activation_fn=activation_fn,
            output_fc_layer_params=actor_output_fc_layer)
        value_net = value_rnn_network.ValueRnnNetwork(
            tf_env.observation_spec(),
            input_fc_layer_params=value_fc_layers,
            input_dropout_layer_params=dropout_layer,
            lstm_size=value_lstm_size,
            activation_fn=activation_fn, # the library default is already relu
            # Use the value network's own output layers. The actor's
            # `actor_output_fc_layer` was passed here before, which left
            # `value_output_fc_layers` unused.
            output_fc_layer_params=value_output_fc_layers)
    else:
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            tf_env.observation_spec(),
            tf_env.action_spec(),
            fc_layer_params=actor_fc_layers,
            dropout_layer_params=dropout_layer,
            activation_fn=activation_fn)
        value_net = value_network.ValueNetwork(
            tf_env.observation_spec(),
            fc_layer_params=value_fc_layers,
            dropout_layer_params=dropout_layer,
            activation_fn=activation_fn)

In [28]:
# TODO: Adapt for using step or episodes as unit to then can switch easily between TF-Agents
# Compare here: https://github.com/tensorflow/agents/blob/master/tf_agents/agents/ppo/examples/v2/train_eval_clip_agent.py
from tf_agents.agents.dqn import dqn_agent
from tf_agents.agents.ppo import ppo_agent # TODO: Use ppo_clip_agent which is the proposed above
from tf_agents.drivers import dynamic_step_driver, dynamic_episode_driver
from tf_agents.eval import metric_utils
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.utils import common

from tf_agents.policies import policy_saver

from tensorflow.compat.v2 import summary
from tensorflow import equal as tf_equal

from utils import (
    train_eval,
    evaluate
)

# Build the agent and train it once per entry of `steps_schedule`.
# Summaries are only recorded on steps that are a multiple of `summary_interval`.
with summary.record_if(
        lambda: tf_equal(global_step % summary_interval, 0)):

    if agent == 'DQN':
        # TODO(b/127301657): Decay epsilon based on global step, cf. cl/188907839
        tf_agent = dqn_agent.DqnAgent(
            tf_env.time_step_spec(),
            tf_env.action_spec(),
            q_network=q_net,
            epsilon_greedy=epsilon_greedy,
            n_step_update=n_step_update,
            target_update_tau=target_update_tau,
            target_update_period=target_update_period,
            optimizer=optimizer,
            td_errors_loss_fn=td_errors_loss_fn,
            gamma=gamma,
            reward_scale_factor=reward_scale_factor,
            gradient_clipping=gradient_clipping,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            check_numerics=check_numerics,
            train_step_counter=global_step)
    elif agent == 'PPO':
        # TODO: Use ppo_clip_agent which is the proposed above
        # tf_agent = ppo_clip_agent.PPOClipAgent(
        tf_agent = ppo_agent.PPOAgent(
            tf_env.time_step_spec(),
            tf_env.action_spec(),
            optimizer=optimizer,
            actor_net=actor_net,
            value_net=value_net,
            importance_ratio_clipping=importance_ratio_clipping,
            kl_cutoff_factor=kl_cutoff_factor,
            kl_cutoff_coef=kl_cutoff_coef,
            initial_adaptive_kl_beta=initial_adaptive_kl_beta,
            adaptive_kl_target=adaptive_kl_target,
            adaptive_kl_tolerance=adaptive_kl_tolerance,
            lambda_value=lambda_value,
            discount_factor=discount_factor,
            entropy_regularization=entropy_regularization,
            policy_l2_reg=policy_l2_reg,
            value_function_l2_reg=value_function_l2_reg,
            # shared_vars_l2_reg=shared_vars_l2_reg,
            value_pred_loss_coef=value_pred_loss_coef,
            normalize_observations=normalize_observations,
            use_gae=use_gae,
            use_td_lambda_return=use_td_lambda_return,
            normalize_rewards=normalize_rewards,
            reward_norm_clipping=reward_norm_clipping,
            log_prob_clipping=log_prob_clipping,
            gradient_clipping=gradient_clipping,
            # value_clipping=value_clipping,
            num_epochs=num_epochs,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            check_numerics=check_numerics,
            train_step_counter=global_step)
    else:
        raise NotImplementedError('Other agents than DQN and PPO are not yet implemented')

    policy_loaded = False
    # NOTE: the loop variable deliberately re-binds the notebook-level
    # `steps_per_episode`; the evaluation cells below read whatever value it
    # is left with when this cell finishes (or is interrupted).
    for steps_per_episode in steps_schedule:

        # When only reloading a trained policy, one pass is enough.
        if not TRAIN_MODEL and policy_loaded:
            break

        logging.info(
            f'Steps per episode equal to {steps_per_episode}'
        )

        # Rebuild the environments for the current episode length.
        tf_env, eval_tf_env, test_tf_env = generateSplitEnvs(
            train,
            valid,
            test,
            window_size,
            steps_per_episode,
            FEATURE_COLUMNS,
            reward_type=reward_type,
            max_final_reward=max_final_reward,
            max_step_reward=max_step_reward,
            num_parallel_environments=num_parallel_environments,
            position_as_observation=POSITION_AS_OBSERVATION,
            # Use the configured flag (it was hard-coded to False here) so this
            # call agrees with the initial environment build.
            constant_step=CONSTANT_STEP,
            is_training=True,
            seed=SEED,
        )

        # Number of evaluation episodes: upper frame bound of the evaluation
        # environment divided by the episode length.
        num_eval_episodes = eval_tf_env.envs[0].frame_bound[-1] // eval_tf_env.envs[0].steps_per_episode
        for metric in eval_metrics:
            metric.batch_size = num_eval_episodes

        # Log the schedule value before and after each stage.
        summary.scalar(
            name='step_scheduling', data=steps_per_episode, step=global_step)

        # Early stopping starts from a clean state for every schedule stage.
        eval_metrics_callback.reset()

        train_eval(
            tf_agent,
            num_iterations,
            batch_size,
            tf_env,
            eval_tf_env,
            train_metrics,
            step_metrics,
            eval_metrics,
            global_step,
            steps_per_episode,
            num_parallel_environments,
            collect_per_iteration,
            train_steps_per_iteration,
            train_dir,
            saved_model_dir,
            eval_summary_writer,
            num_eval_episodes,
            num_eval_seeds=num_eval_seeds,
            eval_metrics_callback=eval_metrics_callback,
            train_sequence_length=train_sequence_length,
            # Only defined in the DQN hyper-parameter branch.
            initial_collect_steps=initial_collect_steps if agent=='DQN' else None,
            log_interval=log_interval,
            eval_interval=eval_interval,
            policy_checkpoint_interval=policy_checkpoint_interval,
            train_checkpoint_interval=train_checkpoint_interval,
            rb_checkpoint_interval=rb_checkpoint_interval,
            train_model=TRAIN_MODEL,
            use_tf_functions=use_tf_functions,
            eval_early_stopping=True,
            seed=SEED
        )

        policy_loaded = True

        summary.scalar(
            name='step_scheduling', data=steps_per_episode, step=global_step)

INFO:absl:Steps per episode equal to 4
INFO:absl:No checkpoint available at logs\PPO\20200709-084133\train
INFO:absl:No checkpoint available at logs\PPO\20200709-084133\train\policy
INFO:absl:Initial eval metric
INFO:absl: 
		 AverageReturn = 0.0
INFO:absl:Starting training...
INFO:absl:step = 400, loss = 0.212587
INFO:absl:3.490 steps/sec
INFO:absl:collect_time = 1.181, train_time = 113.358, summary_time = 0.076
INFO:absl:step = 800, loss = 0.936127
INFO:absl:39.526 steps/sec
INFO:absl:collect_time = 0.689, train_time = 9.404, summary_time = 0.027
INFO:absl: 
		 AverageReturn = 0.0
INFO:absl:Calculate Evaluation lasts 13.521 s
INFO:absl:step = 1200, loss = 0.044658
INFO:absl:38.680 steps/sec
INFO:absl:collect_time = 0.749, train_time = 9.559, summary_time = 0.033
INFO:absl:step = 1600, loss = 5.457369
INFO:absl:38.038 steps/sec
INFO:absl:collect_time = 0.766, train_time = 9.711, summary_time = 0.039
INFO:absl: 
		 AverageReturn = 0.0
INFO:absl:Calculate Evaluation lasts 14.636 s
INFO:

KeyboardInterrupt: 

In [28]:
from tf_agents.utils import common

# Restore the policy weights saved at a specific training step.
best_step = 339200
ckpt_dir = os.path.join(train_dir, 'policy')
eval_policy = tf_agent.policy

policy_checkpointer = common.Checkpointer(
    ckpt_dir=ckpt_dir, policy=eval_policy, global_step=global_step
)

# The Checkpointer's underlying (private) checkpoint object is used directly
# to restore the explicitly named checkpoint file.
best_dir = os.path.join(ckpt_dir, f'ckpt-{best_step}')
policy_checkpointer._checkpoint.restore(best_dir)

INFO:absl:Checkpoint available: logs\PPO\20200529-095708\train\policy\ckpt-344000


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1fa8f0ade48>

In [29]:
# Final evaluation of the current agent policy on the validation environment.
results = evaluate(
    eval_metrics,
    eval_tf_env,
    tf_agent.policy,
    num_eval_episodes,
    num_eval_seeds,
    global_step,
    eval_summary_writer,
    summary_prefix='Metrics',
)

INFO:absl: 
		 AverageReturn = 0.0


In [None]:
from tensorflow.compat.v2 import summary

In [23]:
# Final evaluation of the current agent policy on the validation environment
# (same call as the earlier evaluation cell).
results = evaluate(
    eval_metrics,
    eval_tf_env,
    tf_agent.policy,
    num_eval_episodes,
    num_eval_seeds,
    global_step,
    eval_summary_writer,
    summary_prefix='Metrics',
)

INFO:absl: 
		 AverageReturn = 0.09264443814754486


In [32]:
def _make_test_env(df, episode_steps, is_training):
    """Build one evaluation environment over ``df``.

    All environments in this cell share the notebook configuration (window
    size, feature columns, reward settings) and use ``constant_step=True``;
    only the episode length and the ``is_training`` flag differ.

    Args:
        df: data frame the environment runs on.
        episode_steps: value passed as ``steps_per_episode``.
        is_training: value passed as ``is_training``.

    Returns:
        The constructed ``RLStocksEnv``.
    """
    # RLStocksEnv is the environment class this notebook imports. The cells
    # used to reference `OwnStocksEnv`, which is never imported or defined
    # here (presumably an older name of the same class) and therefore raises
    # a NameError on a fresh kernel.
    return RLStocksEnv(
        df=df,
        window_size=window_size,
        frame_bound=(window_size, len(df)),
        steps_per_episode=episode_steps,
        constant_step=True,
        is_training=is_training,
        feature_columns=FEATURE_COLUMNS,
        position_as_observation=POSITION_AS_OBSERVATION,
        reward_type=reward_type,
        max_final_reward=max_final_reward,
        max_step_reward=max_step_reward,
    )


# Data split the comparison below is run on.
env_data = test

# Label -> environment, consumed by runAllTestEnv in the following cells.
all_envs = {}

# One single episode spanning the whole split.
full_env = _make_test_env(env_data, len(env_data) - window_size, is_training=False)
all_envs['Full eval'] = full_env

#TODO: For the is_training=True we have to make that all executions are using same cases
# Episodes of the current `steps_per_episode` length.
step_env = _make_test_env(env_data, steps_per_episode, is_training=True)
all_envs[f'Eval step of {steps_per_episode}'] = step_env

# Episodes ten times longer.
large_step_env = _make_test_env(env_data, 10 * steps_per_episode, is_training=True)
all_envs[f'Eval step of {10*steps_per_episode}'] = large_step_env

# Episodes ten times shorter, only when that still leaves more than one step.
if int(0.1 * steps_per_episode) > 1:
    small_step_env = _make_test_env(env_data, int(0.1 * steps_per_episode), is_training=True)
    all_envs[f'Eval step of {int(0.1 * steps_per_episode)}'] = small_step_env

In [33]:
from RL_env.stock_env import runAllTestEnv

In [34]:
# Reference run: actions sampled at random from the environment's action space.
runAllTestEnv(
    all_envs,
    select_action_func=full_env.action_space.sample,
    deterministic_policy=False,
);

Testing enviorment Full eval:
Total rewards: -998.46 ± 937.645 (mean ± std. dev. of 21 iterations)
Total profits: -15.46% ± 15.421% (mean ± std. dev. of 21 iterations)
Total revenue ratio: 0.00% ± 0.005% (mean ± std. dev. of 21 iterations)
--------------------------------------------------
Testing enviorment Eval step of 4:
Total rewards: -0.22 ± 38.521 (mean ± std. dev. of 866 iterations)
Total profits: -0.00% ± 0.803% (mean ± std. dev. of 866 iterations)
Total revenue ratio: 19.27% ± 29.181% (mean ± std. dev. of 866 iterations)
--------------------------------------------------
Testing enviorment Eval step of 40:
Total rewards: -32.79 ± 177.569 (mean ± std. dev. of 86 iterations)
Total profits: -0.46% ± 3.136% (mean ± std. dev. of 86 iterations)
Total revenue ratio: 8.57% ± 14.218% (mean ± std. dev. of 86 iterations)
--------------------------------------------------


In [35]:
# Applying long term policy (buy at initial and do not sell) on env
from gym_anytrading.envs import Actions 

def always_buy_func():
    """Return the Buy action unconditionally (buy-and-hold reference policy)."""
    buy_action = Actions.Buy.value
    return buy_action

runAllTestEnv(
    all_envs,
    select_action_func=always_buy_func,
);

Testing enviorment Full eval:
Total rewards: -2344.58 ± 0.000 (mean ± std. dev. of 1 iterations)
Total profits: -36.92% ± 0.000% (mean ± std. dev. of 1 iterations)
Total revenue ratio: 0.00% ± 0.000% (mean ± std. dev. of 1 iterations)
--------------------------------------------------
Testing enviorment Eval step of 4:
Total rewards: -0.64 ± 52.889 (mean ± std. dev. of 866 iterations)
Total profits: -0.01% ± 1.168% (mean ± std. dev. of 866 iterations)
Total revenue ratio: 25.06% ± 32.908% (mean ± std. dev. of 866 iterations)
--------------------------------------------------
Testing enviorment Eval step of 40:
Total rewards: -59.79 ± 236.158 (mean ± std. dev. of 86 iterations)
Total profits: -0.95% ± 4.572% (mean ± std. dev. of 86 iterations)
Total revenue ratio: 11.50% ± 15.855% (mean ± std. dev. of 86 iterations)
--------------------------------------------------


In [36]:
# Applying baseline policy on env
# Manual policy used as baseline
from gym_anytrading.envs import Positions, Actions

# rsi_col = 'RSI_14'
# rsi_col = 'RSI_336'
# rsi_index = full_env.feature_columns.index(rsi_col)

# RSI usually is between 0 and 100, here is normalized between -1 and 1
# The baseline strategy is buy at 30 and sell at 70 otherwise hold
# def select_baseline_action(observation, rsi_thresh_buy=-0.6, rsi_thresh_sell=0.4, rsi_index=rsi_index):
#     # Use only last observation
#     obs = observation[-1]

#     position_value = int(obs[-1])
#     rsi = obs[rsi_index]

#     if position_value == Positions.Short.value and rsi <= rsi_thresh_buy:
#         action = Actions.Buy.value
#     elif position_value == Positions.Long.value and rsi >= rsi_thresh_sell:
#         action = Actions.Sell.value
#     else:
#         # Hold
#         # if it was in short remain in short because is selling
#         # if it was in long remain in long because is buying
#         action = position_value
    
#     return action

# Signal-based baseline: act on precomputed buy / sell / trend signal columns
# of the latest observation, otherwise keep the current position.
def select_baseline_action(observation, buy_index, sell_index, trend_index):
    """Choose the baseline action from the most recent observation row.

    Args:
        observation: sequence of observation rows; only the last row is used.
            Its last element is read as the current position value.
        buy_index: position of the buy signal inside the row.
        sell_index: position of the sell signal inside the row.
        trend_index: position of the trend signal inside the row.

    Returns:
        ``Actions.Buy.value`` when currently short, the buy signal is set and
        the trend signal is not; ``Actions.Sell.value`` when currently long
        and both the sell and trend signals are set; otherwise the current
        position value, which keeps the position unchanged.
    """
    latest = observation[-1]

    held_position = int(latest[-1])
    wants_buy = latest[buy_index]
    wants_sell = latest[sell_index]
    trending = latest[trend_index]

    if held_position == Positions.Short.value and wants_buy and not trending:
        return Actions.Buy.value

    if held_position == Positions.Long.value and wants_sell and trending:
        return Actions.Sell.value

    # Hold: repeating the current position value as the action keeps a short
    # position short and a long position long.
    return held_position

# Signal columns used by the baseline; all carry the same frequency suffix.
signal_suffix = 'hour'
buy_col = f'RSI_14_B_20_{signal_suffix}'
sell_col = f'RSI_14_A_80_{signal_suffix}'
trend_col = f'MACD_12_26_9_A_0_{signal_suffix}'

# Translate the column names into their positions inside the observation row.
feature_position = full_env.feature_columns.index

runAllTestEnv(
    all_envs,
    select_action_func=select_baseline_action,
    use_observation=True,
    buy_index=feature_position(buy_col),
    sell_index=feature_position(sell_col),
    trend_index=feature_position(trend_col),
);

Testing enviorment Full eval:
Total rewards: -2290.72 ± 0.000 (mean ± std. dev. of 1 iterations)
Total profits: -40.12% ± 0.000% (mean ± std. dev. of 1 iterations)
Total revenue ratio: 0.00% ± 0.000% (mean ± std. dev. of 1 iterations)
--------------------------------------------------
Testing enviorment Eval step of 4:
Total rewards: 0.18 ± 9.993 (mean ± std. dev. of 866 iterations)
Total profits: 0.00% ± 0.182% (mean ± std. dev. of 866 iterations)
Total revenue ratio: 1.84% ± 12.708% (mean ± std. dev. of 866 iterations)
--------------------------------------------------
Testing enviorment Eval step of 40:
Total rewards: -14.01 ± 106.598 (mean ± std. dev. of 86 iterations)
Total profits: -0.14% ± 1.721% (mean ± std. dev. of 86 iterations)
Total revenue ratio: 0.98% ± 4.123% (mean ± std. dev. of 86 iterations)
--------------------------------------------------


In [37]:
def select_TFEnv_action(TFEnv, policy, done, time_step=None, policy_state=None):
    """Query ``policy`` for one action and advance ``TFEnv`` by that action.

    Args:
        TFEnv: TF environment to step.
        policy: policy exposing ``action(time_step, policy_state)``.
        done: accepted for call compatibility; the incoming value is not read.
        time_step: current time step handed to the policy.
        policy_state: current policy state handed to the policy.

    Returns:
        Tuple ``(action, done, next_time_step, policy_state)``: the first
        element of the chosen action batch, whether the first element of the
        new time step's discount equals 0, the new time step, and the policy
        state returned by the policy.
    """
    policy_step = policy.action(time_step, policy_state)

    # TODO(b/134487572): TF2 while_loop seems to either ignore
    # parallel_iterations or doesn't properly propagate control dependencies
    # from one step to the next. Without this dependency the environment
    # step is called in parallel.
    with tf.control_dependencies(tf.nest.flatten([time_step])):
        stepped = TFEnv.step(policy_step.action)

    chosen_action = policy_step.action.numpy()[0]
    # A discount of 0 on the new time step is treated as the end of the episode.
    episode_over = stepped.discount.numpy()[0] == 0

    return chosen_action, episode_over, stepped, policy_step.state

In [38]:
from tf_agents.environments import tf_py_environment, parallel_py_environment
from tf_agents.environments.gym_wrapper import GymWrapper

# Wrap every gym environment so it can be driven by a TF-Agents policy,
# keeping the same labels as in `all_envs`.
all_tf_envs = {
    env_label: tf_py_environment.TFPyEnvironment(GymWrapper(gym_env))
    for env_label, gym_env in all_envs.items()
}

In [39]:
# Run the trained agent's policy on every wrapped environment.
runAllTestEnv(
    all_tf_envs,
    select_action_func=select_TFEnv_action,
    use_model=True,
    isTFEnv=True,
    policy=tf_agent.policy,
)

Testing enviorment Full eval:
Total rewards: -2027.51 ± 0.000 (mean ± std. dev. of 1 iterations)
Total profits: -33.98% ± 0.000% (mean ± std. dev. of 1 iterations)
Total revenue ratio: 0.00% ± 0.000% (mean ± std. dev. of 1 iterations)
--------------------------------------------------
Testing enviorment Eval step of 4:
Total rewards: -0.28 ± 51.763 (mean ± std. dev. of 866 iterations)
Total profits: -0.00% ± 1.158% (mean ± std. dev. of 866 iterations)
Total revenue ratio: 25.33% ± 32.873% (mean ± std. dev. of 866 iterations)
--------------------------------------------------
Testing enviorment Eval step of 40:
Total rewards: -44.97 ± 196.577 (mean ± std. dev. of 86 iterations)
Total profits: -0.75% ± 4.221% (mean ± std. dev. of 86 iterations)
Total revenue ratio: 11.25% ± 15.863% (mean ± std. dev. of 86 iterations)
--------------------------------------------------
