## Install TensorTrade

In [1]:
if not "cdDone" in globals():
    %cd -q ..
    cdDone = True

In [2]:
# !python3 -m pip install git+https://github.com/nsarang/tensortrade.git --force

## Setup

In [3]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline


import re
import sys
import time
import pandas as pd
pd.options.mode.use_inf_as_na = True

import numpy as np
from datetime import datetime, timedelta, timezone
from tenacity import retry, retry_if_exception_type, stop_after_attempt
import pytz


In [4]:
import asyncio
import ccxt

# import ccxt.async_support as ccxt

creds = {
    "binance": {
        "apiKey": "jxlzo1mxQ1PDckz4aYgH2WDgFxpJjBu47r3OB4vyLyZkEeyJ4xjOM6m32mvsIgmu",
        "secret": "EffQgaLRPl52q0YEpVKcIHDeqyrFBQWm2K1Er99egbQ1c75X7fDREg4UtzhSaCJM",
    },
    "ftx": {
        "apiKey": "4HO0ffan2qCuTHI06w-Wt-1Bj74WHWYeq4L4-5Ga",
        "secret": "jY85jbW05BWNCkhkathvKfUSl6lGdNgyZXrWnL3W",
    }
}


exchange = ccxt.binance(
    {
        **creds["binance"],
        "enableRateLimit": True,
        # 'options': {
        #     'defaultType': 'spot', // spot, future, margin
        # },
    }
)

## Data Util

In [5]:
@retry(retry=retry_if_exception_type(ccxt.NetworkError), stop=stop_after_attempt(3))
def get_historical_data(
    symbol,
    exchange,
    timeframe,
    start_date=None,
    limit=500,
    max_per_page=500,
    backup_fp=None,
):
    """Get historical OHLCV for a symbol pair

    Decorators:
        retry

    Args:
        symbol (str): Contains the symbol pair to operate on i.e. BURST/BTC
        exchange (str): Contains the exchange to fetch the historical data from.
        timeframe (str): A string specifying the ccxt time unit i.e. 5m or 1d.
        start_date (int, optional): Timestamp in milliseconds.
        max_periods (int, optional): Defaults to 100. Maximum number of time periods
          back to fetch data for.

    Returns:
        list: Contains a list of lists which contain timestamp, open, high, low, close, volume.
    """

    try:
        if timeframe not in exchange.timeframes:
            raise ValueError(
                "{} does not support {} timeframe for OHLCV data. Possible values are: {}".format(
                    exchange, timeframe, list(exchange.timeframes)
                )
            )
    except AttributeError:
        self.logger.error(
            "%s interface does not support timeframe queries! We are unable to fetch data!",
            exchange,
        )
        raise AttributeError(sys.exc_info())

    timeframe_regex = re.compile("([0-9]+)([a-zA-Z])")
    timeframe_matches = timeframe_regex.match(timeframe)
    time_quantity = timeframe_matches.group(1)
    time_period = timeframe_matches.group(2)
    timedelta_values = {
        "m": "minutes",
        "h": "hours",
        "d": "days",
        "w": "weeks",
        "M": "months",
        "y": "years",
    }

    timedelta_args = {timedelta_values[time_period]: int(time_quantity)}
    single_frame = timedelta(**timedelta_args)

    if not start_date:
        start_datetime = datetime.now() - (limit * single_frame)
        start_date = int(start_datetime.timestamp() * 1000)

    stop_limit = limit or np.inf

    try:
        historical_data = []
        cursor = int(start_date)
        while True:
            ohlcv = exchange.fetch_ohlcv(
                symbol, timeframe=timeframe, since=cursor, limit=limit
            )
            historical_data += ohlcv
            if not ohlcv:
                break
            if len(historical_data) >= stop_limit:
                historical_data = historical_data[:limit]
                break
            cursor = ohlcv[-1][0] + 1
    except:
        if historical_data and backup_fp:
            convert_to_dataframe(historical_data).to_csv(backup_fp)

    if not historical_data:
        raise ValueError("No historical data provided returned by exchange.")

    #     if len(historical_data) != total:
    #         raise ValueError("Gaps detected in historical data.")

    # Sort by timestamp in ascending order
    historical_data.sort(key=lambda d: d[0])

    return historical_data

In [6]:
def timestamp_to_datetime(
    timestamp, timezone=pytz.timezone("America/Montreal"), to_str=False
):
    time = datetime.fromtimestamp(timestamp, timezone)
    if to_str:
        time = time.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z"
    return time


def convert_to_dataframe(historical_data):
    """Converts historical data matrix to a pandas dataframe.

    Args:
        historical_data (list): A matrix of historical OHCLV data.

    Returns:
        pandas.DataFrame: Contains the historical data in a pandas dataframe.
    """

    dataframe = pd.DataFrame(historical_data)
    dataframe.transpose()

    #     print(dataframe.head())
    dataframe.columns = ["timestamp", "open", "high", "low", "close", "volume"]
    dataframe["datetime"] = dataframe.timestamp.apply(
        lambda x: timestamp_to_datetime(x / 1000)
    )

    dataframe.set_index("datetime", inplace=True, drop=True)
    dataframe.drop("timestamp", axis=1, inplace=True)

    return dataframe

**Notes**

- .replace(tzinfo=timezone.utc) wont't change the time!
- datetime.timestamp() automatically converts to UTC

In [7]:
since = datetime.strptime("2020-10-01", "%Y-%m-%d").timestamp() * 1000

In [8]:
# symbol = "ETH/USDT"
# filename = symbol.replace("/", "-") + ".csv"

# df = convert_to_dataframe(
#     get_historical_data(
#         symbol,
#         exchange,
#         timeframe="5m",
#         start_date=since,
#         limit=100,
# #         backup_fp=filename,
#     )
# )
# # df.to_csv(filename)

## Indicator Util

In [9]:
def shift(values: np.ndarray, periods: int, axis, fill_value) -> np.ndarray:
    new_values = values

    if periods == 0 or values.size == 0:
        return new_values.copy()

    # make sure array sent to np.roll is c_contiguous
    f_ordered = values.flags.f_contiguous
    if f_ordered:
        new_values = new_values.T
        axis = new_values.ndim - axis - 1

    if np.prod(new_values.shape):
        new_values = np.roll(new_values, periods, axis=axis)

    axis_indexer = [slice(None)] * values.ndim
    if periods > 0:
        axis_indexer[axis] = slice(None, periods)
    else:
        axis_indexer[axis] = slice(periods, None)
    new_values[tuple(axis_indexer)] = fill_value

    # restore original order
    if f_ordered:
        new_values = new_values.T

    return new_values


def crossing(a, b):
    a_plus = shift(a, 1, axis=0, fill_value=0)
    b_plus = shift(b, 1, axis=0, fill_value=0)
    cross = np.where(
        (a <= b) & (a_plus >= b_plus),
        1,
        np.where(((a >= b) & (a_plus <= b_plus)), -1, 0),
    )
    return cross


def smooth_range(series, period, mult):
    wper = period * 2 - 1
    diff = (series - series.shift(1, fill_value=0)).abs()
    average = ta.EMA(diff, period)
    smoothed = ta.EMA(average, wper) * mult
    smoothed = pd.Series(smoothed, index=series.index)
    return smoothed


def filter_range(series, smoothrng):
    result = series.shift(1, fill_value=0)
    for time, (close, smth) in enumerate(zip(series, smoothrng)):
        prev = result.iloc[time]
        if time == 0 or ((close >= prev - smth) and (close <= prev + smth)):
            continue

        if close > prev + smth:
            prev = close - smth
        else:
            prev = close + smth
        result.iloc[time] = prev
    return result

In [10]:
def SWING_CALLS(df):
    ema = ta.EMA(df.close, 5)
    sma = ta.SMA(df.close, 50)
    rsi = ta.RSI(df.close, 14)

    color = np.where(
        (rsi >= 85) | (rsi <= 15),
        "YELLOW",
        np.where(df.low > sma, "LIME", np.where(df.high < sma, "RED", "YELLOW")),
    )

    buyexit = rsi > 80
    sellexit = rsi < 30

    sellcall = (crossing(sma, ema) > 0) & (df.open > df.close)
    buycall = (crossing(sma, ema) < 0) & (df.high > sma)

    return buyexit, sellexit, sellcall, buycall


def Range_Filter_Buy_Sell(df, period=100, range_multiplier=3):
    # Smooth Average Range
    smoothed = smooth_range(df.close, period, range_multiplier)

    # Range Filter
    filtered = filter_range(df.close, smoothed)

    buycall = (
        (df.close > filtered)
        & (df.close > df.close.shift(1))
        & (filtered > filtered.shift(1))
    )
    sellcall = (
        (df.close < filtered)
        & (df.close < df.close.shift(1))
        & (filtered < filtered.shift(1))
    )
    return buycall, sellcall


def calculate_profit(ohlvc, buycall, sellcall, start_from=100, trade_fee=0.1):
    money = 1
    asset = 0
    last_buy = ohlvc.iloc[start_from]["close"]
    trade_cost = 0
    trade_fee /= 100
    for time, (buy, sell) in enumerate(zip(buycall, sellcall)):
        if time < start_from:
            continue

        if buy and money and (time != len(ohlvc) - 1):
            trade_cost += money * trade_fee
            money *= 1 - trade_fee
            asset = money / ohlvc.iloc[time]["close"]
            money = 0
            last_buy = ohlvc.iloc[time]["close"]

        elif (sell or (time == len(ohlvc) - 1)) and asset:
            money = asset * ohlvc.iloc[time]["close"]
            trade_cost += money * trade_fee
            money *= 1 - trade_fee
            asset = 0

    return money, trade_cost

## Data

### Load

In [11]:
import pandas as pd
import tensortrade as tt
import tensortrade.env.default as default

from tensortrade.data.cdd import CryptoDataDownload
from tensortrade.feed.core import Stream, DataFeed
from tensortrade.oms.exchanges import Exchange, ExchangeOptions
from tensortrade.oms.services.execution.simulated import execute_order
from tensortrade.oms.instruments import USD, BTC, ETH
from tensortrade.oms.wallets import Wallet, Portfolio
from tensortrade.agents import DQNAgent, A2CAgent

In [12]:
# cdd = CryptoDataDownload()
# data = cdd.fetch("Coinbase", "USD", "BTC", "1h")


df = pd.read_csv("data/BTC-USDT.csv")
date = df["datetime"]
date = date.apply(lambda x: x.rsplit("-", 1)[0].split(".")[0]) # remove ms and UTC offset (.%f%z)
date = pd.to_datetime(date)
df["date"] = date
df = df.sort_values("date")
df.head()

Unnamed: 0,datetime,open,high,low,close,volume,date
0,2017-08-17 00:00:00-04:00,4261.48,4280.56,4261.48,4261.48,2.189061,2017-08-17 00:00:00
1,2017-08-17 00:05:00-04:00,4261.48,4261.48,4261.48,4261.48,0.0,2017-08-17 00:05:00
2,2017-08-17 00:10:00-04:00,4261.48,4261.48,4261.48,4261.48,0.0,2017-08-17 00:10:00
3,2017-08-17 00:15:00-04:00,4261.48,4264.88,4261.48,4261.48,0.484666,2017-08-17 00:15:00
4,2017-08-17 00:20:00-04:00,4264.88,4266.29,4264.88,4266.29,2.32857,2017-08-17 00:20:00


In [13]:
import ta

data = ta.add_all_ta_features(
    df, open="open", high="high", low="low", close="close", volume="volume"
)


invalid value encountered in double_scalars


invalid value encountered in double_scalars



In [14]:
data.columns

Index(['datetime', 'open', 'high', 'low', 'close', 'volume', 'date',
       'volume_adi', 'volume_obv', 'volume_cmf', 'volume_fi', 'momentum_mfi',
       'volume_em', 'volume_sma_em', 'volume_vpt', 'volume_nvi', 'volume_vwap',
       'volatility_atr', 'volatility_bbm', 'volatility_bbh', 'volatility_bbl',
       'volatility_bbw', 'volatility_bbp', 'volatility_bbhi',
       'volatility_bbli', 'volatility_kcc', 'volatility_kch', 'volatility_kcl',
       'volatility_kcw', 'volatility_kcp', 'volatility_kchi',
       'volatility_kcli', 'volatility_dcl', 'volatility_dch', 'trend_macd',
       'trend_macd_signal', 'trend_macd_diff', 'trend_sma_fast',
       'trend_sma_slow', 'trend_ema_fast', 'trend_ema_slow', 'trend_adx',
       'trend_adx_pos', 'trend_adx_neg', 'trend_vortex_ind_pos',
       'trend_vortex_ind_neg', 'trend_vortex_ind_diff', 'trend_trix',
       'trend_mass_index', 'trend_cci', 'trend_dpo', 'trend_kst',
       'trend_kst_sig', 'trend_kst_diff', 'trend_ichimoku_conv',
       't

### Create features with the feed module

In [15]:
def rsi(price: Stream[float], period: float) -> Stream[float]:
    r = price.diff()
    upside = r.clamp_min(0).abs()
    downside = r.clamp_max(0).abs()
    rs = upside.ewm(alpha=1 / period).mean() / downside.ewm(alpha=1 / period).mean()
    return 100*(1 - (1 + rs) ** -1)


def macd(price: Stream[float], fast: float, slow: float, signal: float) -> Stream[float]:
    fm = price.ewm(span=fast, adjust=False).mean()
    sm = price.ewm(span=slow, adjust=False).mean()
    md = fm - sm
    signal = md - md.ewm(span=signal, adjust=False).mean()
    return signal

In [16]:
# features = [
#     Stream.source(list(data[c]), dtype="float").rename(data[c].name)
#     for c in data.columns[2:]
# ]

# close = Stream.select(features, lambda s: s.name == "close")

In [17]:
# class Listener:
#     def on_next(self, value):
#         print(value)

# close.attach(Listener())

In [18]:
# from tensortrade.feed.core import Stream
# ss = Stream.source([1, 2, 3, 4, 5], dtype="float")

# ff = DataFeed([ss.rolling(2).mean()])
# ff.next()

In [19]:
# features = [
#     close.ewm(span=14).mean().rename("ema"),
#     close.ewm(alpha=1).mean().rename("sma"),
#     close.log().diff().rename("lr"),
#     rsi(close, period=20).rename("rsi"),
#     macd(close, fast=10, slow=50, signal=5).rename("macd")
# ]

# feed = DataFeed(features)
# feed.compile()

In [20]:
# import json

# for i in range(5):
#     obsv = feed.next()
#     print(json.dumps(obsv, indent=4))

## Env

In [21]:
features = data[[x for x in data.columns if not x.startswith("date")]]
features = features.pct_change()
features = features.fillna(0)

In [22]:
commission = 0.005
window_size = 200

feed = DataFeed(
    [
        Stream.source(list(features[c]), dtype="float").rename(features[c].name)
        for c in features.columns
    ]
)

renderer_feed = DataFeed(
    [
        Stream.source(list(data["date"])).rename("date"),
        Stream.source(list(data["open"]), dtype="float").rename("open"),
        Stream.source(list(data["high"]), dtype="float").rename("high"),
        Stream.source(list(data["low"]), dtype="float").rename("low"),
        Stream.source(list(data["close"]), dtype="float").rename("close"),
        Stream.source(list(data["volume"]), dtype="float").rename("volume"),
    ]
)

exchange_opts = ExchangeOptions(commission=commission)
coinbase = Exchange("coinbase", service=execute_order, options=exchange_opts)(
    Stream.source(list(data["close"]), dtype="float").rename("USD/BTC")
)

cash = Wallet(coinbase, 10000 * USD)
asset = Wallet(coinbase, 0 * BTC)
portfolio = Portfolio(USD, [cash, asset])


reward_scheme = default.rewards.SimpleProfit()
action_scheme = default.actions.SimpleOrders()


env = default.create(
    feed=feed,
    renderer_feed=renderer_feed,
    renderer=default.renderers.PlotlyTradingChart(display=False, save_format="html"),
    portfolio=portfolio,
    action_scheme=action_scheme,
    reward_scheme=reward_scheme,
    window_size=window_size,
    min_periods=window_size,
    #         max_allowed_loss=0.5,
)

In [35]:
# # for i in range(1000):
# #     env.observer.observe(env)
 
# env.observer.observe(env)

In [36]:
# from tensortrade.agents import VPGAgent
# import tensortrade.agents.vpg.core as vcore

# env_fn = lambda: env
# agent = VPGAgent(
#     env_fn,
#     exp_name="test",
#     actor_critic=vcore.CNNActorCritic,
#     ac_kwargs=dict(device="cuda:0"),
#     steps_per_epoch=250,
#     train_v_iters=50,
#     epochs=1000
# )
# agent.train(render_interval=3, save_path="agents/")

## Ray

In [37]:
import ray
import numpy as np

from ray import tune
from ray.tune.registry import register_env

import tensortrade.env.default as default
from tensortrade.feed.core import DataFeed, Stream
from tensortrade.oms.instruments import Instrument
from tensortrade.oms.exchanges import Exchange, ExchangeOptions
from tensortrade.oms.services.execution.simulated import execute_order
from tensortrade.oms.wallets import Wallet, Portfolio


def create_env(config):
    features = config["features"].copy()
    data = config["data"].copy()
    feed = DataFeed(
        [
            Stream.source(list(features[c]), dtype="float").rename(features[c].name)
            for c in features.columns
        ]
    )

    renderer_feed = DataFeed(
        [
            Stream.source(list(data["date"])).rename("date"),
            Stream.source(list(data["open"]), dtype="float").rename("open"),
            Stream.source(list(data["high"]), dtype="float").rename("high"),
            Stream.source(list(data["low"]), dtype="float").rename("low"),
            Stream.source(list(data["close"]), dtype="float").rename("close"),
            Stream.source(list(data["volume"]), dtype="float").rename("volume"),
        ]
    )

    exchange_opts = ExchangeOptions(commission=config["commission"])
    coinbase = Exchange("coinbase", service=execute_order, options=exchange_opts)(
        Stream.source(list(data["close"]), dtype="float").rename("USD/BTC")
    )

    cash = Wallet(coinbase, 10000 * USD)
    asset = Wallet(coinbase, 0 * BTC)
    portfolio = Portfolio(USD, [cash, asset])

    reward_scheme = default.rewards.SimpleProfit(window_size=config["window_size"])
    action_scheme = default.actions.SimpleOrders()

    env = default.create(
        feed=feed,
        #         renderer_feed=renderer_feed,
        #         renderer=default.renderers.PlotlyTradingChart(display=False, save_format="html"),
        portfolio=portfolio,
        action_scheme=action_scheme,
        reward_scheme=reward_scheme,
        window_size=config["window_size"],
        min_periods=config["window_size"],
        #         max_allowed_loss=0.5,
    )
    return env

In [38]:
# data_norm = data.copy()

# z_score = lambda x: (x - x.mean()) / x.std(ddof=0)
# abs_max = lambda x: x / x.abs().quantile(0.9)
# data_norm[data_norm.columns[2:]] = data_norm[data_norm.columns[2:]].apply(abs_max)
# data_norm = data_norm.fillna(0)

In [39]:
env_config = {
    "data": data,
    "features": features,
    "commission": 0.005,
    "window_size": 50,
}
# environment = create_env(env_config)

In [40]:
# from stable_baselines.common.policies import MlpLnLstmPolicy
# from stable_baselines import PPO2

# policy = MlpLnLstmPolicy
# # params = { "learning_rate": 1e-5 }

# agent = PPO2(policy, environment, nminibatches=1)

In [41]:
# agent.learn(total_timesteps=10000)

In [46]:
ray.init()
register_env("TradingEnv", create_env)

RuntimeError: Maybe you called ray.init twice by accident? This error can be suppressed by passing in 'ignore_reinit_error=True' or by calling 'ray.shutdown()' prior to 'ray.init()'.

In [48]:
agents.ppo.DEFAULT_CONFIG

{'num_workers': 2,
 'num_envs_per_worker': 1,
 'rollout_fragment_length': 200,
 'sample_batch_size': -1,
 'batch_mode': 'truncate_episodes',
 'num_gpus': 0,
 'train_batch_size': 4000,
 'model': {'conv_filters': None,
  'conv_activation': 'relu',
  'fcnet_activation': 'tanh',
  'fcnet_hiddens': [256, 256],
  'free_log_std': False,
  'no_final_linear': False,
  'vf_share_layers': True,
  'use_lstm': False,
  'max_seq_len': 20,
  'lstm_cell_size': 256,
  'lstm_use_prev_action_reward': False,
  'state_shape': None,
  'framestack': True,
  'dim': 84,
  'grayscale': False,
  'zero_mean': True,
  'custom_model': None,
  'custom_model_config': {},
  'custom_action_dist': None,
  'custom_preprocessor': None,
  'custom_options': -1},
 'optimizer': {},
 'gamma': 0.99,
 'horizon': None,
 'soft_horizon': False,
 'no_done_at_end': False,
 'env_config': {},
 'env': None,
 'normalize_actions': False,
 'clip_rewards': None,
 'clip_actions': True,
 'preprocessor_pref': 'deepmind',
 'lr': 5e-05,
 'monito

In [51]:
from ray.rllib import agents

config = agents.ppo.DEFAULT_CONFIG.copy()
config.update(
    {
        "env": "TradingEnv",
        "env_config": env_config,
        "log_level": "WARN",
        "framework": "torch",
        #         "ignore_worker_failures": True,
        "num_workers": 4,
        "num_gpus": 2,
    }
)

agent = agents.ppo.PPOTrainer(config=config, env="TradingEnv")

[2m[36m(pid=22738)[0m Instructions for updating:
[2m[36m(pid=22738)[0m non-resource variables are not supported in the long term
[2m[36m(pid=22721)[0m Instructions for updating:
[2m[36m(pid=22721)[0m non-resource variables are not supported in the long term
[2m[36m(pid=22693)[0m Instructions for updating:
[2m[36m(pid=22693)[0m non-resource variables are not supported in the long term
[2m[36m(pid=22758)[0m Instructions for updating:
[2m[36m(pid=22758)[0m non-resource variables are not supported in the long term
[2m[36m(pid=22738)[0m 
[2m[36m(pid=22738)[0m 
[2m[36m(pid=22721)[0m 
[2m[36m(pid=22721)[0m 
[2m[36m(pid=22693)[0m 
[2m[36m(pid=22693)[0m 
[2m[36m(pid=22758)[0m 
[2m[36m(pid=22758)[0m 


In [52]:
agent.train()

{'episode_reward_max': -0.03425316733032879,
 'episode_reward_min': -32.364811487707875,
 'episode_reward_mean': -0.13789486113065083,
 'episode_len_mean': 2.0470829068577276,
 'episodes_this_iter': 1954,
 'policy_reward_min': {},
 'policy_reward_max': {},
 'policy_reward_mean': {},
 'custom_metrics': {},
 'hist_stats': {'episode_reward': [-32.19168565248159,
   -0.048791587473961284,
   -0.14637476242188385,
   -0.048791587473961284,
   -0.048791587473961284,
   -0.048791587473961284,
   -0.048791587473961284,
   -0.048791587473961284,
   -0.048791587473961284,
   -0.048791587473961284,
   -0.048791587473961284,
   -0.048791587473961284,
   -0.048791587473961284,
   -0.048791587473961284,
   -0.048791587473961284,
   -0.048791587473961284,
   -0.048791587473961284,
   -0.048791587473961284,
   -0.048791587473961284,
   -0.048791587473961284,
   -0.048791587473961284,
   -0.048791587473961284,
   -0.048791587473961284,
   -0.048791587473961284,
   -0.048791587473961284,
   -0.048791587

In [31]:
# import random
# from ray.tune.schedulers import PopulationBasedTraining

# # Postprocess the perturbed config to ensure it's still valid
# def explore(config):
#     # ensure we collect enough timesteps to do sgd
#     if config["train_batch_size"] < config["sgd_minibatch_size"] * 2:
#         config["train_batch_size"] = config["sgd_minibatch_size"] * 2
#     # ensure we run at least one sgd iter
#     if config["num_sgd_iter"] < 1:
#         config["num_sgd_iter"] = 1
#     return config


# pbt = PopulationBasedTraining(
#     time_attr="time_total_s",
#     metric="episode_reward_mean",
#     mode="max",
#     perturbation_interval=100,
#     resample_probability=0.25,
#     # Specifies the mutations of these hyperparams
#     hyperparam_mutations={
#         "lambda": lambda: random.uniform(0.9, 1.0),
#         "clip_param": lambda: random.uniform(0.01, 0.5),
#         "lr": [1e-3, 5e-4, 1e-4, 5e-5, 1e-5],
#         "num_sgd_iter": lambda: random.randint(1, 30),
#         "sgd_minibatch_size": lambda: random.randint(32, 4096),
#         "train_batch_size": lambda: random.randint(2000, 16000),
#     },
#     custom_explore_fn=explore,
# )

# tune.run(
#     "PPO",
#     checkpoint_freq=10,
#     name="show_me_the_money_test",
#     #     scheduler=pbt,
#     #     num_samples=8,
#     config={
#         "env": "TradingEnv",
#         "env_config": env_config,
#         "framework": "tf",
# #         "log_level": "DEBUG",
#         "kl_coeff": 1.0,
# #         "num_workers": 8,
#         "num_gpus": 1,
#         #         "model": {
#         #             "free_log_std": True
#         #         },
#         # These params are tuned from a fixed starting value.
#         "lambda": 0.95,
#         "clip_param": 0.2,
#         "lr": 1e-4,
#         # These params start off randomly drawn from a set.
#         "num_sgd_iter": tune.sample_from(lambda spec: random.choice([10, 20, 30])),
#         "sgd_minibatch_size": tune.sample_from(
#             lambda spec: random.choice([32, 128, 512, 2048])
#         ),
#         "train_batch_size": tune.sample_from(
#             lambda spec: random.choice([1000, 2000, 4000])
#         ),
#     },
# )

In [32]:
analysis = tune.run(
    "PPO",
    stop={"episode_reward_mean": 100},
    config={
        "env": "TradingEnv",
        "env_config": env_config,
        "log_level": "WARN",
        "framework": "tf",
#         "ignore_worker_failures": True,
        "num_workers": 4,
        "num_gpus": 2,
#         "clip_rewards": True,
        "lr": 1e-4,
#         "lr_schedule": [
# #             [0, 1e-1],
# #             [int(1e2), 1e-2],
# #             [int(1e3), 1e-3],
#             [int(1e4), 1e-4],
#             [int(1e5), 1e-5],
#             [int(1e6), 1e-6],
#             [int(1e7), 1e-7],
#         ],
#         "gamma": 0,
#         "observation_filter": "MeanStdFilter",
#         "lambda": 0.72,
#         "vf_loss_coeff": 0.5,
#         "entropy_coeff": 0.01,
    },
#     local_dir="./ray",
#     resources_per_trial={"cpu": 4, "gpu": 1},
    reuse_actors=True,
    checkpoint_at_end=True,
    global_checkpoint_period=np.inf
)

2020-11-08 15:28:17,850	INFO resource_spec.py:231 -- Starting Ray with 75.93 GiB memory available for workers and up to 36.54 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-11-08 15:28:19,216	INFO services.py:1193 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m
[2m[36m(pid=22736)[0m Instructions for updating:
[2m[36m(pid=22736)[0m non-resource variables are not supported in the long term


Trial name,status,loc
PPO_TradingEnv_f2954_00000,RUNNING,


[2m[36m(pid=22736)[0m 2020-11-08 15:28:42,387	INFO trainer.py:605 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution
[2m[36m(pid=22736)[0m 2020-11-08 15:28:42,387	INFO trainer.py:632 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=22798)[0m Instructions for updating:
[2m[36m(pid=22798)[0m non-resource variables are not supported in the long term
[2m[36m(pid=22803)[0m Instructions for updating:
[2m[36m(pid=22803)[0m non-resource variables are not supported in the long term
[2m[36m(pid=22697)[0m Instructions for updating:
[2m[36m(pid=22697)[0m non-resource variables are not supported in the long term
[2m[36m(pid=22696)[0m Instructions for updating:
[2m[36m(pid=22696)[0m non-resource variables are not supported in the long term
[2m[36m(pid=22736)[0m 2020-11-08 15:28:57,677	INFO trainable.py:251 -- Trainable.setup took 18.324 seconds. If your train

Result for PPO_TradingEnv_f2954_00000:
  custom_metrics: {}
  date: 2020-11-08_15-29-21
  done: false
  episode_len_mean: 2.0253164556962027
  episode_reward_max: -0.044111006657483354
  episode_reward_mean: -0.12648918699068065
  episode_reward_min: -32.29843022738116
  episodes_this_iter: 1975
  episodes_total: 1975
  experiment_id: ca850159fb35464ea22b7a512bd2a74c
  experiment_tag: '0'
  hostname: gra899
  info:
    learner:
      default_policy:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 9.999999747378752e-05
        entropy: 3.0136678218841553
        entropy_coeff: 0.0
        kl: 0.030988000333309174
        model: {}
        policy_loss: -0.24930539727210999
        total_loss: 0.26197585463523865
        vf_explained_var: 0.8822211027145386
        vf_loss: 0.5050836205482483
    num_steps_sampled: 4000
    num_steps_trained: 4000
  iterations_since_restore: 1
  node_ip: 10.29.85.73
  num_healthy_workers: 4
  off_policy_estimator: {}
  perf:
    cpu_util_percent



Trial name,status,loc,iter,total time (s),ts,reward
PPO_TradingEnv_f2954_00000,RUNNING,10.29.85.73:22736,1,23.7719,4000,-0.126489


Result for PPO_TradingEnv_f2954_00000:
  custom_metrics: {}
  date: 2020-11-08_15-29-40
  done: false
  episode_len_mean: 1.0
  episode_reward_max: -0.044111006657483354
  episode_reward_mean: -0.06165849903349898
  episode_reward_min: -0.0787172587220103
  episodes_this_iter: 4000
  episodes_total: 5975
  experiment_id: ca850159fb35464ea22b7a512bd2a74c
  experiment_tag: '0'
  hostname: gra899
  info:
    learner:
      default_policy:
        cur_kl_coeff: 0.30000001192092896
        cur_lr: 9.999999747378752e-05
        entropy: 2.9884085655212402
        entropy_coeff: 0.0
        kl: 0.025158191099762917
        model: {}
        policy_loss: -0.23904326558113098
        total_loss: -0.2096727043390274
        vf_explained_var: -1.0
        vf_loss: 0.021823106333613396
    num_steps_sampled: 8000
    num_steps_trained: 8000
  iterations_since_restore: 2
  node_ip: 10.29.85.73
  num_healthy_workers: 4
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 8.958333333333334
    gp



Trial name,status,loc,iter,total time (s),ts,reward
PPO_TradingEnv_f2954_00000,RUNNING,10.29.85.73:22736,2,38.1565,8000,-0.0616585


Result for PPO_TradingEnv_f2954_00000:
  custom_metrics: {}
  date: 2020-11-08_15-30-01
  done: false
  episode_len_mean: 1.0
  episode_reward_max: -0.044111006657483354
  episode_reward_mean: -0.06165849903349898
  episode_reward_min: -0.0787172587220103
  episodes_this_iter: 4000
  episodes_total: 9975
  experiment_id: ca850159fb35464ea22b7a512bd2a74c
  experiment_tag: '0'
  hostname: gra899
  info:
    learner:
      default_policy:
        cur_kl_coeff: 0.44999998807907104
        cur_lr: 9.999999747378752e-05
        entropy: 2.965888261795044
        entropy_coeff: 0.0
        kl: 0.02458980865776539
        model: {}
        policy_loss: -0.22442153096199036
        total_loss: -0.2077280730009079
        vf_explained_var: -1.0
        vf_loss: 0.005628045182675123
    num_steps_sampled: 12000
    num_steps_trained: 12000
  iterations_since_restore: 3
  node_ip: 10.29.85.73
  num_healthy_workers: 4
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 8.93478260869565
    gpu



Trial name,status,loc,iter,total time (s),ts,reward
PPO_TradingEnv_f2954_00000,RUNNING,10.29.85.73:22736,3,54.5699,12000,-0.0616585


Result for PPO_TradingEnv_f2954_00000:
  custom_metrics: {}
  date: 2020-11-08_15-30-24
  done: false
  episode_len_mean: 1.0
  episode_reward_max: -0.044111006657483354
  episode_reward_mean: -0.06165849903349898
  episode_reward_min: -0.0787172587220103
  episodes_this_iter: 4000
  episodes_total: 13975
  experiment_id: ca850159fb35464ea22b7a512bd2a74c
  experiment_tag: '0'
  hostname: gra899
  info:
    learner:
      default_policy:
        cur_kl_coeff: 0.675000011920929
        cur_lr: 9.999999747378752e-05
        entropy: 2.9495604038238525
        entropy_coeff: 0.0
        kl: 0.021628987044095993
        model: {}
        policy_loss: -0.20630142092704773
        total_loss: -0.1902754008769989
        vf_explained_var: -1.0
        vf_loss: 0.0014264353085309267
    num_steps_sampled: 16000
    num_steps_trained: 16000
  iterations_since_restore: 4
  node_ip: 10.29.85.73
  num_healthy_workers: 4
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 9.328000000000001
    



Trial name,status,loc,iter,total time (s),ts,reward
PPO_TradingEnv_f2954_00000,RUNNING,10.29.85.73:22736,4,73.0176,16000,-0.0616585


Result for PPO_TradingEnv_f2954_00000:
  custom_metrics: {}
  date: 2020-11-08_15-30-50
  done: false
  episode_len_mean: 1.0
  episode_reward_max: -0.044111006657483354
  episode_reward_mean: -0.06165849903349898
  episode_reward_min: -0.0787172587220103
  episodes_this_iter: 4000
  episodes_total: 17975
  experiment_id: ca850159fb35464ea22b7a512bd2a74c
  experiment_tag: '0'
  hostname: gra899
  info:
    learner:
      default_policy:
        cur_kl_coeff: 1.0125000476837158
        cur_lr: 9.999999747378752e-05
        entropy: 2.93463134765625
        entropy_coeff: 0.0
        kl: 0.02068272791802883
        model: {}
        policy_loss: -0.2127881795167923
        total_loss: -0.1911633312702179
        vf_explained_var: -0.9764094948768616
        vf_loss: 0.0006836322136223316
    num_steps_sampled: 20000
    num_steps_trained: 20000
  iterations_since_restore: 5
  node_ip: 10.29.85.73
  num_healthy_workers: 4
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 8.93928571



Trial name,status,loc,iter,total time (s),ts,reward
PPO_TradingEnv_f2954_00000,RUNNING,10.29.85.73:22736,5,94.324,20000,-0.0616585


Result for PPO_TradingEnv_f2954_00000:
  custom_metrics: {}
  date: 2020-11-08_15-31-17
  done: false
  episode_len_mean: 1.0
  episode_reward_max: -0.044111006657483354
  episode_reward_mean: -0.06165849903349898
  episode_reward_min: -0.0787172587220103
  episodes_this_iter: 4000
  episodes_total: 21975
  experiment_id: ca850159fb35464ea22b7a512bd2a74c
  experiment_tag: '0'
  hostname: gra899
  info:
    learner:
      default_policy:
        cur_kl_coeff: 1.5187499523162842
        cur_lr: 9.999999747378752e-05
        entropy: 2.9247846603393555
        entropy_coeff: 0.0
        kl: 0.01923118531703949
        model: {}
        policy_loss: -0.24136170744895935
        total_loss: -0.2118106186389923
        vf_explained_var: -0.485330194234848
        vf_loss: 0.0003437565464992076
    num_steps_sampled: 24000
    num_steps_trained: 24000
  iterations_since_restore: 6
  node_ip: 10.29.85.73
  num_healthy_workers: 4
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 9.200000



Trial name,status,loc,iter,total time (s),ts,reward
PPO_TradingEnv_f2954_00000,RUNNING,10.29.85.73:22736,6,115.839,24000,-0.0616585


Result for PPO_TradingEnv_f2954_00000:
  custom_metrics: {}
  date: 2020-11-08_15-31-44
  done: false
  episode_len_mean: 1.0
  episode_reward_max: -0.044111006657483354
  episode_reward_mean: -0.06165849903349898
  episode_reward_min: -0.0787172587220103
  episodes_this_iter: 4000
  episodes_total: 25975
  experiment_id: ca850159fb35464ea22b7a512bd2a74c
  experiment_tag: '0'
  hostname: gra899
  info:
    learner:
      default_policy:
        cur_kl_coeff: 1.5187499523162842
        cur_lr: 9.999999747378752e-05
        entropy: 2.917729139328003
        entropy_coeff: 0.0
        kl: 0.01885765977203846
        model: {}
        policy_loss: -0.22041568160057068
        total_loss: -0.19156573712825775
        vf_explained_var: 0.05072556436061859
        vf_loss: 0.0002098778641084209
    num_steps_sampled: 28000
    num_steps_trained: 28000
  iterations_since_restore: 7
  node_ip: 10.29.85.73
  num_healthy_workers: 4
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 9.54999



Trial name,status,loc,iter,total time (s),ts,reward
PPO_TradingEnv_f2954_00000,RUNNING,10.29.85.73:22736,7,138.598,28000,-0.0616585


Result for PPO_TradingEnv_f2954_00000:
  custom_metrics: {}
  date: 2020-11-08_15-32-14
  done: false
  episode_len_mean: 1.0
  episode_reward_max: -0.044111006657483354
  episode_reward_mean: -0.06165849903349898
  episode_reward_min: -0.0787172587220103
  episodes_this_iter: 4000
  episodes_total: 29975
  experiment_id: ca850159fb35464ea22b7a512bd2a74c
  experiment_tag: '0'
  hostname: gra899
  info:
    learner:
      default_policy:
        cur_kl_coeff: 1.5187499523162842
        cur_lr: 9.999999747378752e-05
        entropy: 2.906156063079834
        entropy_coeff: 0.0
        kl: 0.019808780401945114
        model: {}
        policy_loss: -0.22781768441200256
        total_loss: -0.19754701852798462
        vf_explained_var: 0.1633036881685257
        vf_loss: 0.00018606946105137467
    num_steps_sampled: 32000
    num_steps_trained: 32000
  iterations_since_restore: 8
  node_ip: 10.29.85.73
  num_healthy_workers: 4
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 9.2333



Trial name,status,loc,iter,total time (s),ts,reward
PPO_TradingEnv_f2954_00000,RUNNING,10.29.85.73:22736,8,163.859,32000,-0.0616585


Result for PPO_TradingEnv_f2954_00000:
  custom_metrics: {}
  date: 2020-11-08_15-32-43
  done: false
  episode_len_mean: 1.0
  episode_reward_max: -0.044111006657483354
  episode_reward_mean: -0.06165849903349898
  episode_reward_min: -0.0787172587220103
  episodes_this_iter: 4000
  episodes_total: 33975
  experiment_id: ca850159fb35464ea22b7a512bd2a74c
  experiment_tag: '0'
  hostname: gra899
  info:
    learner:
      default_policy:
        cur_kl_coeff: 1.5187499523162842
        cur_lr: 9.999999747378752e-05
        entropy: 2.8905768394470215
        entropy_coeff: 0.0
        kl: 0.02421150915324688
        model: {}
        policy_loss: -0.2346608191728592
        total_loss: -0.19768841564655304
        vf_explained_var: 0.09161016345024109
        vf_loss: 0.00020118649990763515
    num_steps_sampled: 36000
    num_steps_trained: 36000
  iterations_since_restore: 9
  node_ip: 10.29.85.73
  num_healthy_workers: 4
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 9.4285



Trial name,status,loc,iter,total time (s),ts,reward
PPO_TradingEnv_f2954_00000,RUNNING,10.29.85.73:22736,9,188.778,36000,-0.0616585


Result for PPO_TradingEnv_f2954_00000:
  custom_metrics: {}
  date: 2020-11-08_15-33-14
  done: false
  episode_len_mean: 1.0
  episode_reward_max: -0.044111006657483354
  episode_reward_mean: -0.06165849903349898
  episode_reward_min: -0.0787172587220103
  episodes_this_iter: 4000
  episodes_total: 37975
  experiment_id: ca850159fb35464ea22b7a512bd2a74c
  experiment_tag: '0'
  hostname: gra899
  info:
    learner:
      default_policy:
        cur_kl_coeff: 2.278125047683716
        cur_lr: 9.999999747378752e-05
        entropy: 2.887251138687134
        entropy_coeff: 0.0
        kl: 0.021725798025727272
        model: {}
        policy_loss: -0.21904334425926208
        total_loss: -0.16936667263507843
        vf_explained_var: 0.1778380423784256
        vf_loss: 0.0001826071529649198
    num_steps_sampled: 40000
    num_steps_trained: 40000
  iterations_since_restore: 10
  node_ip: 10.29.85.73
  num_healthy_workers: 4
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 9.77857



Trial name,status,loc,iter,total time (s),ts,reward
PPO_TradingEnv_f2954_00000,RUNNING,10.29.85.73:22736,10,214.549,40000,-0.0616585


Result for PPO_TradingEnv_f2954_00000:
  custom_metrics: {}
  date: 2020-11-08_15-33-45
  done: false
  episode_len_mean: 1.0
  episode_reward_max: -0.044111006657483354
  episode_reward_mean: -0.06165849903349898
  episode_reward_min: -0.0787172587220103
  episodes_this_iter: 4000
  episodes_total: 41975
  experiment_id: ca850159fb35464ea22b7a512bd2a74c
  experiment_tag: '0'
  hostname: gra899
  info:
    learner:
      default_policy:
        cur_kl_coeff: 3.417187452316284
        cur_lr: 9.999999747378752e-05
        entropy: 2.881641149520874
        entropy_coeff: 0.0
        kl: 0.01824815385043621
        model: {}
        policy_loss: -0.18896375596523285
        total_loss: -0.12642009556293488
        vf_explained_var: 0.15114478766918182
        vf_loss: 0.0001862721110228449
    num_steps_sampled: 44000
    num_steps_trained: 44000
  iterations_since_restore: 11
  node_ip: 10.29.85.73
  num_healthy_workers: 4
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 9.39285



Trial name,status,loc,iter,total time (s),ts,reward
PPO_TradingEnv_f2954_00000,RUNNING,10.29.85.73:22736,11,241.219,44000,-0.0616585


KeyboardInterrupt: 