In [None]:
# keras-rl raises an error when TensorFlow runs eagerly, so eager execution
# is disabled here, in the first cell, before anything else is built.
import tensorflow as tf
tf.compat.v1.disable_eager_execution()
# Last expression of the cell, so its value is shown as the cell output:
# the name of the GPU device TensorFlow found
# (presumably an empty string when there is none — verify).
tf.test.gpu_device_name()

In [None]:
import pandas as pd
import pandas_ta
import numpy as np

from gym import spaces
from gym_anytrading.envs import TradingEnv, StocksEnv, Actions, Positions 
# from gym_anytrading.datasets import FOREX_EURUSD_1H_ASK, STOCKS_GOOGL
import matplotlib.pyplot as plt

In [None]:
from features import (
    generateTAFeatures,
    classifyColsByRanges,
    normalizeFeatures,
)

import os
import pickle

# Column of `data` that holds the traded price.
PRICE_COLUMN = 'close'
# Whether the position history is appended to every observation.
POSITION_AS_OBSERVATION = True

# The paths are assembled with os.path.join so that the notebook also runs on
# non-Windows systems; on Windows the resulting strings equal the former
# hard-coded 'data\\...' literals.
ranges_dict_path = os.path.join('data', 'ranges_dict.pickle')

data = pd.read_csv(os.path.join('.', 'data', 'featured_prices.csv'), sep='\t')
data = data.drop('Timestamp', axis=1)
# NOTE: pickle.load can execute arbitrary code. Only load a ranges_dict file
# that was produced by this project (see the commented-out generation code
# that follows in this cell).
with open(ranges_dict_path, 'rb') as f:
    ranges_dict = pickle.load(f)

# data_csv = '.\\data\\prices_freq-min_2019-01-01_2019-03-28.csv'
# df = pd.read_csv(data_csv, sep='\t', index_col='Timestamp', parse_dates=True)
# cols = ['open', 'high', 'low', 'close', 'volume']
# df = df[cols]

# df = generateTAFeatures(df, [], None, True)
# ranges_dict = classifyColsByRanges(df)
# data = normalizeFeatures(df, ranges_dict)
# data.to_csv('data\\featured_prices.csv', sep='\t', index_label='Timestamp')

# with open(ranges_dict_path, 'wb') as f:
#     pickle.dump(ranges_dict, f)

In [None]:
# Gather, group after group, the columns of every group of `ranges_dict`
# that is flagged with 'normalize'.
FEATURE_COLUMNS = [
    column
    for group in ranges_dict.values()
    if group['normalize']
    for column in group['cols']
]

In [None]:
# Surplus of price columns over the normalized columns (plus one for the
# position column when it is part of the observation).
diff_cols = len(ranges_dict['prices']['cols']) - len(FEATURE_COLUMNS) - int(POSITION_AS_OBSERVATION)
print(f'Difference of {diff_cols} columns between prices cols and normalized cols')
print('In order to use Group Normalization Layer with 2 groups, both groups should be equal and sorted to be one first and then the other.')

if diff_cols > 0:
    # NOTE(review): exactly one hard-coded column is removed, whatever the
    # value of diff_cols — confirm this still balances both groups.
    remove_cols = ['LR_14']
    print(f'The following columns are going to be removed: {remove_cols}')
    prices_cols = [col for col in ranges_dict['prices']['cols'] if col not in remove_cols]
else:
    prices_cols = ranges_dict['prices']['cols']

# Price columns first, normalized columns afterwards (the order the
# group normalization relies on, see the message printed above).
FEATURE_COLUMNS = prices_cols + FEATURE_COLUMNS

# Make sure that PRICE_COLUMN is in data. It has to be wrapped in a list:
# a bare string cannot be concatenated with the list of feature columns
# (that raised a TypeError whenever the price column was not a feature).
ALL_COLS = ([] if PRICE_COLUMN in FEATURE_COLUMNS else [PRICE_COLUMN]) + FEATURE_COLUMNS

# Set the columns used in data: PRICE_COLUMN + FEATURE_COLUMNS
data = data[ALL_COLS]

In [None]:
# Fail fast when any value is infinite; the assertion message shows the
# offending rows. `axis` is passed by keyword because recent pandas releases
# no longer accept it as a positional argument of DataFrame.any.
inf_rows = np.isinf(data).any(axis=1)
assert not inf_rows.any(), data[inf_rows]

In [None]:
# Drop the columns in which every value is NaN
all_nan_mask = data.isnull().all()
remove_cols = data.columns[all_nan_mask.to_numpy()]
if len(remove_cols):
    data = data.drop(columns=remove_cols)
    print(f'The following columns have been removed: {list(remove_cols)}')

In [None]:
# Drop every row that contains at least one NaN
rows_with_nan = data.isnull().any(axis=1)
print(f'Dropping {rows_with_nan.sum()} rows because of NaN values')
data = data[~rows_with_nan]

In [None]:
# Number of rows that make up one time unit: 60 * 24 * 30 rows are one 30-day
# month, assuming one row per minute — TODO confirm the sampling frequency.
unit_factor = 60*24*30 # months
# Length of the cleaned data set expressed in those units.
print(f'Data for {len(data.index) / unit_factor:.3f} units')

In [None]:
# Durations in units of `unit_factor` rows: the training span, an unused gap
# after it, and everything that is left for testing.
train_time, gap_time = 2, 0.3
test_time = len(data.index) / unit_factor - train_time - gap_time

# Row offsets that delimit the consecutive segments train | gap | test
train_end = int(train_time * unit_factor)
test_start = train_end + int(gap_time * unit_factor)
test_end = test_start + int(test_time * unit_factor)

train, test = data.iloc[:train_end], data.iloc[test_start:test_end]

In [None]:
# Episode length handed to the OwnEnv instances of this notebook as their
# `steps_per_episode` argument.
steps_per_episode = 120

In [None]:
def prices_process_data(env):
    """Extract the price series and the feature matrix an environment works on.

    Installed on ``OwnEnv`` as its ``_process_data`` method.

    # Argument
        env: Environment exposing ``df``, ``frame_bound`` and ``window_size``

    # Returns
        Tuple ``(prices, signal_features)`` of numpy arrays; both cover the
        rows from ``frame_bound[0] - window_size`` up to ``frame_bound[1]``.
    """
    rows = slice(env.frame_bound[0] - env.window_size, env.frame_bound[1])
    price_values = env.df[PRICE_COLUMN].to_numpy()
    feature_values = env.df.loc[:, FEATURE_COLUMNS].to_numpy()
    return price_values[rows], feature_values[rows]

# TODO: Normalize the reward so that it is comparable between runs, independent of the data being processed
# TODO: Plot training info during training to be able to track it
class OwnEnv(StocksEnv):
    """``StocksEnv`` with episodes of limited length and a single final reward.

    What this class changes with respect to its parent:

    * prices and features are extracted by ``prices_process_data``;
    * in training mode ``reset`` selects a sub-range of the frame as episode;
    * ``step`` returns a reward of 0 until the episode is done;
    * the position history can be appended to the observation as one more column;
    * both trade fees are set to 0.
    """

    _process_data = prices_process_data

    def __init__(self, df, window_size, frame_bound, steps_per_episode, is_training, position_as_observation=True, constant_step=False, min_steps_per_episode=2, seed=None):
        """Create the environment.

        # Arguments
            df, window_size, frame_bound: Forwarded to ``StocksEnv``
            steps_per_episode: Episode length. With ``constant_step=False`` it is
                the exclusive upper bound of the length drawn on every reset.
            is_training: When True, ``reset`` picks the range of the episode
            position_as_observation: Append the position history to the observation
            constant_step: Keep ``steps_per_episode`` fixed instead of drawing it
            min_steps_per_episode: Smallest episode length that can be drawn
            seed: Passed to ``self.seed``

        # Raises
            ValueError: If ``min_steps_per_episode`` is not positive, or if a random
                episode length is requested with ``steps_per_episode`` smaller
                than ``min_steps_per_episode``.
        """
        super().__init__(df, window_size, frame_bound)

        if min_steps_per_episode <= 0:
            raise ValueError('min_steps_per_episode must be bigger than 0')
        if not constant_step and steps_per_episode < min_steps_per_episode:
            # Report the inconsistent bounds here instead of failing inside
            # the random draw of the first reset.
            raise ValueError('steps_per_episode must not be smaller than min_steps_per_episode')

        self.seed(seed)
        self.steps_per_episode = steps_per_episode
        self.max_steps_per_episode = steps_per_episode
        self.min_steps_per_episode = min_steps_per_episode
        self.is_training = is_training

        self.trade_fee_bid_percent = 0.0 # 0.01  # unit
        self.trade_fee_ask_percent = 0.0 # 0.005  # unit

        self.position_as_observation = position_as_observation
        # One row per tick of the window; one column per feature plus,
        # optionally, the position column.
        self.shape = (window_size, self.signal_features.shape[1] + int(position_as_observation))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32)

        self.constant_step = constant_step

    def reset(self, start_tick=None):
        """Start a new episode.

        # Argument
            start_tick: First tick of the episode. Only used in training mode;
                when None a random tick is drawn.

        # Returns
            Whatever the parent ``reset`` returns.
        """
        if not self.constant_step:
            span = self.max_steps_per_episode - self.min_steps_per_episode
            # randint(0) raises a ValueError, so when both bounds coincide the
            # only possible length is used without drawing.
            extra_steps = self.np_random.randint(span) if span > 0 else 0
            self.steps_per_episode = extra_steps + self.min_steps_per_episode

        if self.is_training:
            if start_tick is None:
                # Random start, clipped so that a whole episode fits in before
                # the end of the frame.
                self._start_tick = min(
                    self.np_random.randint(self.frame_bound[1] - 1 - self.steps_per_episode) + self.window_size,
                    self.frame_bound[1] - self.steps_per_episode - 1
                )
            else:
                self._start_tick = start_tick
            # The episode never runs past the last tick of the frame.
            self._end_tick = min(
                self._start_tick + self.steps_per_episode,
                self.frame_bound[1] - 1
            )

        return super().reset()

    def step(self, action):
        """Advance one tick and replace the reward of the parent class.

        The reward is 0 until the episode is done. On the last step it is

        * ``revenue / max_possible_revenue`` when the best achievable revenue is
          positive and the achieved revenue is not negative, 0 when it is negative;
        * 0 when the best achievable revenue is negative;
        * the plain revenue otherwise.

        ``revenue`` is ``info['total_profit'] - 1``. The ``info`` dict still
        carries the values computed by the parent class.

        # Returns
            ``(observation, reward, done, info)``
        """
        observation, _, done, info = super().step(action)

        # TODO: Check if better use only final reward or step_rewards
        reward = 0
        if done:
            max_possible_revenue = self.max_possible_profit() - 1
            revenue = info['total_profit'] - 1
            if max_possible_revenue > 0:
                reward = revenue / max_possible_revenue if revenue >= 0 else 0
            elif max_possible_revenue < 0:
                reward = 0
            else:
                reward = revenue
            # TODO: Should `info` be modified to report this reward?

        return observation, reward, done, info

    def _get_observation(self):
        """Return the features of the last ``window_size`` ticks.

        With ``position_as_observation`` the positions held during those ticks
        are appended as last column; entries of the position history that are
        None are encoded as 0.
        """
        features = self.signal_features[(self._current_tick - self.window_size):self._current_tick]

        if not self.position_as_observation:
            return features

        recent_positions = self._position_history[-self.window_size:]
        positions = np.expand_dims(
            np.array([0 if position is None else position.value for position in recent_positions]),
            axis=1
        )
        return np.append(features, positions, axis=1)

window_size = 3

#### ONLY FOR TESTING OVERFITTING

# data = data[0:steps_per_episode*2]

##############################################

# Settings that the training and the test environment have in common
shared_env_kwargs = dict(
    window_size=window_size,
    steps_per_episode=steps_per_episode,
    position_as_observation=POSITION_AS_OBSERVATION,
)

train_env = OwnEnv(df=train, frame_bound=(window_size, len(train)), is_training=True, **shared_env_kwargs)

test_env = OwnEnv(df=test, frame_bound=(window_size, len(test)), is_training=False, **shared_env_kwargs)

#env = gym.make('forex-v0', frame_bound=(50, 100), window_size=10)
# env = gym.make('stocks-v0', frame_bound=(50, 100), window_size=10)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, LSTM, Input, Reshape, Cropping2D
from tensorflow.keras.regularizers import L1L2

from tensorflow_addons.layers import GroupNormalization

# TODO: Look at initialization/normalization to start with 50/50 model
# Next, we build a very simple model.
activation = 'relu'  # activation of the hidden Dense layers
regularizer = None  # L1L2(0.1, 0.1)

model = Sequential()
if window_size > 1:
    # The agent memory stacks `window_size` observations, hence the extra
    # leading dimension in front of the observation shape.
    model.add(Input((window_size, ) + train_env.observation_space.shape))
    # NOTE(review): the negative crop appears intended to keep only the last
    # of the stacked observations — confirm that the installed Keras version
    # accepts negative cropping values.
    model.add(Cropping2D(cropping=((-1, 0), (0, 0))))
    model.add(Reshape((window_size, train_env.observation_space.shape[1])))
    # model.add(LayerNormalization(axis=1, center=True, scale=True))
    # Two groups along the feature axis (the columns are ordered for this
    # when FEATURE_COLUMNS is assembled).
    model.add(GroupNormalization(groups=2, axis=2))
    model.add(LSTM(8, kernel_regularizer=regularizer, return_sequences=True))
    model.add(LSTM(8, kernel_regularizer=regularizer))

else:
    model.add(Flatten(input_shape=(window_size,) + train_env.observation_space.shape))
    model.add(Dense(64, activation=activation, kernel_regularizer=regularizer))
    # model.add(Dense(1024, activation=activation, kernel_regularizer=regularizer))
    model.add(Dense(64, activation=activation, kernel_regularizer=regularizer))

# One linear output per action.
model.add(Dense(train_env.action_space.n, kernel_regularizer=regularizer))
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

In [None]:
import datetime
import os  # needed for os.path.join below; it was not imported anywhere before

# One log directory per run, named after the time the cell was executed
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, update_freq='batch')

callbacks = [tensorboard_callback]
# TensorBoard logging is currently switched off: delete the next line to hand
# the callback to `dqn.fit`.
callbacks = []

In [None]:
from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy, SoftmaxPolicy
from rl.memory import SequentialMemory

from tensorflow.keras.optimizers import Adam

# At most 100000 steps, but never more than the training data provides.
# The same number is later used as the amount of training steps.
memory_steps = min(100000, len(train_env.df) - 1)

# Finally, we configure and compile our agent. You can use every built-in
# tensorflow.keras optimizer and even the metrics!
memory = SequentialMemory(limit=memory_steps, window_length=window_size)
policy = BoltzmannQPolicy()
# policy = EpsGreedyQPolicy()
dqn = DQNAgent(
    model=model,
    nb_actions=train_env.action_space.n,
    memory=memory,
    target_model_update=1e-2,
    nb_steps_warmup=window_size,
    policy=policy,
    gamma=1,
    processor=None,
)
dqn.compile(Adam(learning_rate=1e-3, clipvalue=0.1), metrics=['mae'])

In [None]:
# Okay, now it's time to learn something! Rendering is switched off
# (visualize=False) because it slows down training quite a lot. You can always
# safely abort the training prematurely using Ctrl + C.
history = dqn.fit(train_env, nb_steps=memory_steps, visualize=False, verbose=2, callbacks=callbacks)

In [None]:
# Reward of every training episode as recorded in the history returned by
# `dqn.fit`; the DataFrame copy is used for the rolling mean of the plot.
rewards = history.history['episode_reward']
rewards_df = pd.DataFrame(rewards)

In [None]:
# Episode rewards together with their rolling mean over 25 episodes
x = range(len(rewards))
fig, ax = plt.subplots()
ax.plot(x, rewards)
ax.plot(x, rewards_df.rolling(25).mean())
plt.show()

In [None]:
env_data = test


def _build_eval_env(steps, is_training):
    """Create an OwnEnv over `env_data` whose episodes have the fixed length `steps`."""
    return OwnEnv(
        df=env_data,
        window_size=window_size,
        frame_bound=(window_size, len(env_data)),
        steps_per_episode=steps,
        constant_step=True,
        is_training=is_training,
        position_as_observation=POSITION_AS_OBSERVATION,
    )


# The whole data set; the alternative length would be steps_per_episode
full_env = _build_eval_env(len(env_data) - window_size, is_training=False)

#TODO: For the is_training=True we have to make that all executions are using same cases
step_env = _build_eval_env(steps_per_episode, is_training=True)
large_step_env = _build_eval_env(10 * steps_per_episode, is_training=True)
small_step_env = _build_eval_env(int(0.1 * steps_per_episode), is_training=True)

# Display name -> environment, in the order in which they are evaluated
all_envs = {
    'Full test': full_env,
    f'Test step of {steps_per_episode}': step_env,
    f'Test step of {10*steps_per_episode}': large_step_env,
    f'Test step of {int(0.1 * steps_per_episode)}': small_step_env,
}

In [None]:
def runAllTestEnv(all_envs, select_action_func, iterations=None, use_steps=False, use_observation=False, use_model=False, **kwargs):
    """Run `runTestEnv` on several environments, printing a header for each.

    # Arguments
        all_envs: Dict mapping a display name to an environment, or a list of
            environments (which are named ``Env_0``, ``Env_1``, ...)
        select_action_func, iterations, use_steps, use_observation, use_model,
        **kwargs: Forwarded unchanged to `runTestEnv`

    # Raises
        ValueError: If `all_envs` is neither a dict nor a list
    """
    if type(all_envs) is list:
        all_envs = {f'Env_{position}': env for position, env in enumerate(all_envs)}
    elif type(all_envs) is not dict:
        raise ValueError('all_envs should be dictionary of name and enviorment or a list of enviorments')

    run_options = dict(
        iterations=iterations,
        use_steps=use_steps,
        use_observation=use_observation,
        use_model=use_model,
    )
    for env_name, env in all_envs.items():
        print(f'Testing enviorment {env_name}:')
        runTestEnv(env, select_action_func, **run_options, **kwargs)
        print('-'*50)

def runTestEnv(env, select_action_func, iterations=None, use_steps=False, use_observation=False, use_model=False, **kwargs):
    """Play episodes on `env` with the given policy and print summary statistics.

    The first episode is reset with ``start_tick=env.window_size``; every
    following episode with the tick at which the previous one finished.

    # Arguments
        env: Environment to run
        select_action_func: Policy. With `use_model` it is called as
            ``f(observation, recent_observations, recent_terminals, done, **kwargs)``;
            otherwise with the keywords ``observation`` (if `use_observation`) and
            ``step`` (if `use_steps`) followed by ``**kwargs``.
        iterations: Number of episodes. Defaults to the number of episodes of
            length ``env.steps_per_episode`` that fit into the frame when
            ``env.is_training`` is set, to 1 otherwise.
        use_steps: Pass the step counter of the episode to the policy
        use_observation: Pass the current observation to the policy
        use_model: Use the calling convention for a trained model
        **kwargs: Extra keyword arguments for the policy

    # Returns
        Tuple ``(total_rewards, total_profits)`` of lists with the values of
        ``info`` at the end of every episode
    """
    if iterations is None:
        if env.is_training:
            iterations = int((env.frame_bound[1] - env.frame_bound[0]) / env.steps_per_episode)
        else:
            iterations = 1

    total_rewards, total_profits = [], []
    start_tick = env.window_size

    for _ in range(iterations):
        observation = env.reset(start_tick=start_tick)
        # Buffers handed to a model policy; they start empty in every episode.
        recent_observations, recent_terminals = [], []
        done = False
        step = 0
        while not done:
            if use_model:
                action = select_action_func(observation, recent_observations, recent_terminals, done, **kwargs)
            else:
                policy_inputs = {}
                if use_observation:
                    policy_inputs['observation'] = observation
                if use_steps:
                    policy_inputs['step'] = step
                action = select_action_func(**policy_inputs, **kwargs)

            observation, _, done, info = env.step(action)
            step += 1

        # The next episode continues where this one stopped.
        start_tick = env._current_tick

        total_rewards.append(info['total_reward'])
        total_profits.append(info['total_profit'])

    print(f'Total rewards: {np.mean(total_rewards):.2f} ± {np.std(total_rewards):.3f} (mean ± std. dev. of {iterations} iterations)')
    print(f'Total profits: {(np.mean(total_profits) - 1):.2%} ± {np.std(total_profits):.3%} (mean ± std. dev. of {iterations} iterations)')

    return total_rewards, total_profits

In [None]:
# Apply a random policy on every env: each action is drawn with the `sample`
# method of the action space of `full_env`, which is used for all of them.
runAllTestEnv(all_envs, select_action_func=full_env.action_space.sample);

In [None]:
# Applying long term policy (buy at the first step and never sell) on env

def always_buy_func():
    """Return the Buy action, whatever the state of the environment is."""
    return  Actions.Buy.value

runAllTestEnv(all_envs, select_action_func=always_buy_func);

In [None]:
# Applying baseline policy on env

# Manual, rule-based policy used as baseline

# Name of the feature column that holds the RSI indicator
rsi_col = 'RSI_14'
# rsi_col = 'Close_rsi'
# Position of that column within FEATURE_COLUMNS; the environments build their
# feature matrix from FEATURE_COLUMNS, so it is presumably also the position
# of the RSI within an observation row — verify.
rsi_index=FEATURE_COLUMNS.index(rsi_col)

# RSI usually is between 0 and 100, here is normalized between -1 and 1
# The baseline strategy is buy at 30 and sell at 70 otherwise hold
def select_baseline_action(observation, rsi_thresh_buy=-0.6, rsi_thresh_sell=0.4, rsi_index=rsi_index):
    """Choose an action from the RSI value of the most recent observation row.

    The last column of that row is read as the current position, so the
    observation has to include the position column.

    # Arguments
        observation: Observation of the environment; only its last row is used
        rsi_thresh_buy: Buy when short and the RSI is at or below this value
        rsi_thresh_sell: Sell when long and the RSI is at or above this value
        rsi_index: Column of the RSI within an observation row

    # Returns
        The value of the chosen action
    """
    latest_row = observation[-1]
    held_position = int(latest_row[-1])
    rsi_value = latest_row[rsi_index]

    if held_position == Positions.Short.value and rsi_value <= rsi_thresh_buy:
        return Actions.Buy.value
    if held_position == Positions.Long.value and rsi_value >= rsi_thresh_sell:
        return Actions.Sell.value

    # Hold: the value of the position is returned as the action, which keeps a
    # short position short and a long position long.
    # NOTE(review): relies on Actions and Positions sharing their numeric
    # values — verify against gym_anytrading.
    return held_position

# NOTE(review): the thresholds passed here (0.3 / 0.7) replace the defaults of
# select_baseline_action (-0.6 / 0.4) — confirm that they are expressed on the
# same scale as the normalized RSI feature.
runAllTestEnv(all_envs, select_action_func=select_baseline_action, use_observation=True, rsi_thresh_buy=0.3, rsi_thresh_sell=0.7);

In [None]:
import numpy as np
from rl.memory import zeroed_observation

# Applying trained policy on env

def get_recent_state(current_observation, recent_observations, recent_terminals, window_length):
        """Return list of last observations

        # Argument
            current_observation (object): Last observation

        # Returns
            A list of the last observations
        """
        # This code is slightly complicated by the fact that subsequent observations might be
        # from different episodes. We ensure that an experience never spans multiple episodes.
        # This is probably not that important in practice but it seems cleaner.
        state = [current_observation]
        idx = len(recent_observations) - 1
        for offset in range(0, window_length - 1):
            current_idx = idx - offset
            current_terminal = recent_terminals[current_idx - 1] if current_idx - 1 >= 0 else False
            if current_idx < 0 or current_terminal:
                # The previously handled observation was terminal, don't add the current one.
                # Otherwise we would leak into a different episode.
                break
            state.insert(0, recent_observations[current_idx])
        while len(state) < window_length:
            state.insert(0, zeroed_observation(state[0]))
        return state

def select_model_action(observation, recent_observations, recent_terminals, done, window_size):
    """Return the action for which `model` predicts the highest value.

    The model input is built from the current observation and the history in
    `recent_observations` / `recent_terminals`. Both lists are extended in
    place with the current observation and `done`.
    """
    stacked = get_recent_state(observation, recent_observations, recent_terminals, window_size)
    batch = np.expand_dims(stacked, axis=0)
    # Recorded only after the state has been assembled, which already
    # contains the current observation.
    recent_observations.append(observation)
    recent_terminals.append(done)
    predicted_values = model.predict(batch)
    return np.argmax(predicted_values)

# Evaluate the model on every env; `use_model=True` makes runTestEnv pass the
# observation history to the policy, and `window_size` is forwarded to it.
runAllTestEnv(all_envs, select_action_func=select_model_action, use_model=True, window_size=window_size);

In [None]:
# Once training has finished, the final weights are written to disk,
# replacing a file from an earlier run.
weights_file = 'dqn_{}_weights.h5f'.format('prices')
dqn.save_weights(weights_file, overwrite=True)