Install dependencies

In [None]:
!sudo apt-get install -y xvfb ffmpeg
!pip install 'gym==0.10.11'
!pip install 'imageio==2.4.0'
!pip install PILLOW
!pip install 'pyglet==1.3.2'
!pip install pyvirtualdisplay
!pip install --pre tf-agents[reverb]

Import dependencies

In [3]:
from __future__ import absolute_import, division, print_function
import base64
import imageio
import IPython
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image
import pyvirtualdisplay
import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import q_network, q_rnn_network
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common

In [4]:
# Turn on TensorFlow 2.x behavior through the compat.v1 API before any
# TensorFlow objects are created by the cells below.
tf.compat.v1.enable_v2_behavior()

StockMarket Environment Class

In [5]:
import abc
from enum import Enum
import tensorflow as tf
import numpy as np
from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts
import pandas as pd
# Same call as in the earlier setup cell, repeated at the top of the
# environment-definition cell.
tf.compat.v1.enable_v2_behavior()


class StockMarket(py_environment.PyEnvironment):
  """Single-asset trading environment replaying historical closing prices.

  On every step the agent sees the last `window_size` closing prices, each
  expressed as a relative change against the first price of that window, and
  picks one of two actions:

    * 0 (sell): reward is `price[t] - price[t + 1]`
    * 1 (buy):  reward is `price[t + 1] - price[t]`

  An episode walks through the selected price range once, advancing one price
  per step, and terminates on the step that consumes the last price.
  """

  def __init__(self, dataset, start_date, end_date, window_size):
    """Builds the environment from a slice of `dataset`.

    Args:
      dataset: object supporting `dataset[start_date:end_date]['Close'].values`
        (the notebook passes a pandas DataFrame indexed by date).
      start_date: lower bound of the slice taken from `dataset`.
      end_date: upper bound of the slice taken from `dataset`.
      window_size: number of consecutive prices in one observation.

    Raises:
      ValueError: if `window_size` is smaller than 1, or if the selected range
        holds fewer than `window_size + 1` prices (one full window plus the
        following price needed to compute the first reward).
    """
    # Let the base class set up its own bookkeeping before ours.
    super().__init__()

    if window_size < 1:
      raise ValueError(
          '`window_size` should be at least 1, got {}.'.format(window_size))

    self._prices = dataset[start_date:end_date]['Close'].values
    if len(self._prices) < window_size + 1:
      raise ValueError(
          'Need at least {} prices between {} and {} for window_size={}, '
          'got {}.'.format(window_size + 1, start_date, end_date, window_size,
                           len(self._prices)))

    self._window_size = window_size
    # First index with a full window of history behind it.
    self._start_index = window_size - 1
    # Last index that still has a following price for the reward.
    self._end_index = len(self._prices) - 2
    self._current_index = self._start_index

    self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
    # NOTE(review): the [-1, 1] bounds are declared but not enforced; an
    # observation exceeds 1 whenever a price more than doubles relative to the
    # first price in its window.
    self._observation_spec = array_spec.BoundedArraySpec(
        shape=(self._window_size,), dtype=np.float32, minimum=-1, maximum=1,
        name='observation')
    self._state = self._get_observation()
    self._episode_ended = False

  def _get_observation(self):
    """Returns the current window as float32 relative changes.

    Each entry is `price / first_price_in_window - 1`, so the first entry is
    always 0.
    """
    window_start = self._current_index - self._window_size + 1
    window = self._prices[window_start:self._current_index + 1]
    base_price = float(window[0])
    return np.array(
        [float(price) / base_price - 1 for price in window], dtype=np.float32)

  def action_spec(self):
    """Returns the action spec: a scalar int32 in {0, 1}."""
    # action = 0 --> sell
    # action = 1 --> buy
    return self._action_spec

  def observation_spec(self):
    """Returns the observation spec: a float32 vector of length `window_size`."""
    return self._observation_spec

  def _reset(self):
    """Rewinds to the first full window and returns the initial time step."""
    self._episode_ended = False
    self._current_index = self._start_index
    self._state = self._get_observation()
    return ts.restart(self._state)

  def _step(self, action):
    """Applies `action`, advances one price and returns the next time step.

    If the previous step ended the episode, the action is ignored and the
    environment is reset instead.
    """
    if self._episode_ended:
      return self.reset()

    # The reward compares the current price with the next one, so it has to be
    # computed before the index advances.
    reward = self._calculate_reward(action)
    is_final_step = self._current_index == self._end_index

    self._current_index += 1
    self._state = self._get_observation()

    if is_final_step:
      self._episode_ended = True
      return ts.termination(self._state, reward)
    return ts.transition(self._state, reward, discount=1.0)

  def _calculate_reward(self, action):
    """Returns the one-step price move, signed according to `action`.

    Raises:
      ValueError: if `action` is neither 0 nor 1.
    """
    price_now = self._prices[self._current_index]
    price_next = self._prices[self._current_index + 1]
    if action == 0:
      # Selling pays off when the price falls.
      return price_now - price_next
    if action == 1:
      # Buying pays off when the price rises.
      return price_next - price_now
    raise ValueError('`action` should be 0 or 1.')


Hyperparameters

In [6]:
# Number of iterations of the training loop (one collect + one train call each).
num_iterations = 1000000 # @param {type:"integer"}

# Steps collected by the initial-collection driver before training starts.
initial_collect_steps = 10000  # @param {type:"integer"} 
# Environment steps collected by the collect driver on every training iteration.
collect_steps_per_iteration = 1  # @param {type:"integer"}
# Capacity passed to the replay buffer as `max_length`.
replay_buffer_max_length = 1000000  # @param {type:"integer"}

# Number of samples per batch drawn from the replay buffer.
batch_size = 64  # @param {type:"integer"}
# Learning rate handed to the Adam optimizer.
learning_rate = 1e-3  # @param {type:"number"}
# The training loss is printed every `log_interval` training steps.
log_interval = 200  # @param {type:"integer"}

# Episodes run by the evaluation driver per evaluation.
num_eval_episodes = 10  # @param {type:"integer"}
# An evaluation is run every `eval_interval` training steps.
eval_interval = 1000  # @param {type:"integer"}

Dataset

In [None]:
from google.colab import drive

# Mount Google Drive; the CSV is read from the mounted path below.
drive.mount('/content/drive')

# Read the CSV with parsed dates, keep only the date and closing-price columns,
# and index the rows by date.
dataset = pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/googl.us.csv',
    parse_dates=['Date'],
)[['Date', 'Close']].set_index('Date')

Create Python environments

In [8]:
# One environment per date range: the first range is used for training, the
# second for evaluation. Both use a 30-price observation window.
train_py_env, eval_py_env = (
    StockMarket(
        dataset=dataset,
        start_date=first_day,
        end_date=last_day,
        window_size=30,
    )
    for first_day, last_day in (
        ('2015-01-01', '2016-11-30'),
        ('2017-01-01', '2017-11-10'),
    )
)


Convert Python environments to TensorFlow environments

In [9]:
# Wrap each Python environment in a TFPyEnvironment, training first.
train_env, eval_env = (
    tf_py_environment.TFPyEnvironment(py_env)
    for py_env in (train_py_env, eval_py_env)
)

Create DQN

In [10]:
def _rescale_observation(obs):
  """Maps an observation from the spec's [minimum, maximum] range onto [0, 1]."""
  obs_spec = train_py_env.time_step_spec().observation
  shifted = tf.cast(obs, np.float32) - obs_spec.minimum
  return shifted / (obs_spec.maximum - obs_spec.minimum)


preprocessing = tf.keras.layers.Lambda(_rescale_observation)

# Recurrent Q-network: rescaled observation -> two LSTM layers -> two dense
# layers -> Q-values.
q_net = q_rnn_network.QRnnNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    preprocessing_layers=preprocessing,
    input_fc_layer_params=None,
    lstm_size=(30, 30),
    output_fc_layer_params=(512, 128),
    activation_fn=tf.keras.activations.relu,
)

Create Agent

In [11]:
# Counter handed to the agent as `train_step_counter`; the training loop reads
# it to decide when to log and when to evaluate.
train_step_counter = tf.Variable(0)
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

agent = dqn_agent.DqnAgent(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter,
)
agent.initialize()

Define policy

In [12]:
# Policy used while collecting training experience.
collect_policy = agent.collect_policy
# Policy used by the evaluation driver.
eval_policy = agent.policy
# Random policy over the training environment's specs, used to seed the replay
# buffer.
random_policy = random_tf_policy.RandomTFPolicy(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
)

Replay buffer

In [13]:
# Replay buffer laid out after the agent's collect data spec, with one slot per
# environment in the training batch.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    max_length=replay_buffer_max_length,
    batch_size=train_env.batch_size,
    data_spec=agent.collect_data_spec,
)

replay_observer and data-collection helper functions

In [14]:
# Observer list handed to the step drivers: each collected trajectory is passed
# to `replay_buffer.add_batch`.
replay_observer = [replay_buffer.add_batch]

def collect_step(environment, policy, buffer):
  """Takes one step in `environment` using `policy` and stores it in `buffer`.

  Args:
    environment: environment providing `current_time_step()` and `step()`.
    policy: policy whose `action(time_step)` result exposes `.action`.
    buffer: replay buffer providing `add_batch()`.
  """
  before = environment.current_time_step()
  decision = policy.action(before)
  after = environment.step(decision.action)

  # Pack (time step, action, next time step) into a trajectory and store it.
  buffer.add_batch(trajectory.from_transition(before, decision, after))

def collect_data(env, policy, buffer, steps):
  """Calls `collect_step` `steps` times with the same env, policy and buffer."""
  for _step_index in range(steps):
    collect_step(environment=env, policy=policy, buffer=buffer)

# Put 100 random-policy steps from the training environment into the replay
# buffer.
collect_data(train_env, random_policy, replay_buffer, steps=100)

Create collect data driver

In [15]:
from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver
from tf_agents.metrics import tf_metrics

# Average-return metric updated by the collect driver during training.
train_metrics = [tf_metrics.AverageReturnMetric()]

# Steps the training environment with the collect policy; every trajectory goes
# to the replay buffer and to the training metric.
collect_driver = DynamicStepDriver(
    env=train_env,
    policy=collect_policy,
    observers=replay_observer + train_metrics,
    num_steps=collect_steps_per_iteration,
)

Create initial-collection driver (collects random data)

In [16]:
# Steps the training environment with the random policy for
# `initial_collect_steps` steps; trajectories go to the replay buffer only.
init_driver = DynamicStepDriver(
    env=train_env,
    policy=random_policy,
    observers=replay_observer,
    num_steps=initial_collect_steps,
)

Create evaluate driver (test agent on eval_env)

In [17]:
from tf_agents.drivers.dynamic_episode_driver import DynamicEpisodeDriver
from tf_agents.metrics import tf_metrics

# Average-return metric updated by the evaluation driver.
eval_metrics = [tf_metrics.AverageReturnMetric()]

# Runs `num_eval_episodes` episodes of the evaluation environment with the
# evaluation policy.
eval_driver = DynamicEpisodeDriver(
    env=eval_env,
    policy=eval_policy,
    observers=eval_metrics,
    num_episodes=num_eval_episodes,
)

Generate init data using init_driver

In [None]:
# Fill the replay buffer with `initial_collect_steps` random-policy steps
# before training starts.
init_driver.run()

Create dataset

In [None]:
# Named `replay_dataset` rather than `dataset`: `dataset` already holds the
# price DataFrame that the environment-creation cell reads, and rebinding it
# here would break that cell on re-run.
replay_dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=2).prefetch(3)

# The training loop pulls one sampled batch per iteration from this iterator.
iterator = iter(replay_dataset)
print(iterator)

Train

In [None]:
# Wrap the three hot entry points with `common.function` before the loop.
agent.train = common.function(agent.train)
collect_driver.run = common.function(collect_driver.run)
eval_driver.run = common.function(eval_driver.run)

# Average returns recorded at every evaluation, in evaluation order.
returns_train = []
returns_eval = []

# Driver state carried from one iteration to the next.
time_step = None
policy_state = agent.collect_policy.get_initial_state(train_env.batch_size)

agent.train_step_counter.assign(0)
for _ in range(num_iterations):
  # Collect a few steps with collect_policy; the driver's observers store them
  # in the replay buffer.
  time_step, policy_state = collect_driver.run(time_step, policy_state)

  # Draw one batch from the replay buffer and update the agent's network.
  experience, unused_info = next(iterator)
  train_loss = agent.train(experience).loss

  step = agent.train_step_counter.numpy()

  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss))

  if step % eval_interval != 0:
    continue

  # Evaluation step: run the eval driver, then report and record both metrics.
  eval_driver.run()
  train_return = train_metrics[0].result()
  eval_return = eval_metrics[0].result()
  print('step = {0}: Average Return (train) = {1}'.format(step, train_return))
  print('step = {0}: Average Return (eval) = {1}'.format(step, eval_return))
  returns_eval.append(eval_return)
  returns_train.append(train_return)

Visualization

In [None]:
# Results are recorded at training steps eval_interval, 2 * eval_interval, ...
# The x axis is derived from the number of recorded points, so it always
# matches the data, including when training stopped early or `num_iterations`
# is not a multiple of `eval_interval`.
iterations = [eval_interval * k for k in range(1, len(returns_eval) + 1)]

# Recorded metric results are converted to plain floats for plotting.
plt.plot(iterations, [float(value) for value in returns_train])
plt.plot(iterations, [float(value) for value in returns_eval])
plt.ylabel('Average Return')
plt.xlabel('Iterations')
plt.legend(['train', 'eval'], loc='upper right')
plt.show()