Install dependencies

In [None]:
# System packages installed through apt: xvfb and ffmpeg.
!sudo apt-get install -y xvfb ffmpeg
# Python packages. gym, imageio and pyglet are pinned to exact versions;
# PILLOW, pyvirtualdisplay and tf-agents are left unpinned.
!pip install 'gym==0.10.11'
!pip install 'imageio==2.4.0'
!pip install PILLOW
!pip install 'pyglet==1.3.2'
!pip install pyvirtualdisplay
# --pre allows a pre-release build of tf-agents (with the reverb extra).
!pip install --pre tf-agents[reverb]

Import dependencies


In [76]:
import tensorflow as tf
import numpy as np
import keras
import matplotlib.pyplot as plt
from tf_agents.environments import py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts
import pandas as pd
from collections import deque
from google.colab import drive
from keras.layers import LSTM, Dense, Activation
tf.compat.v1.enable_v2_behavior()


Environment Class

In [77]:
class StockMarket(py_environment.PyEnvironment):
  """Trading environment that replays a slice of daily closing prices.

  At every step the agent observes the last `window_size` closing prices,
  expressed as relative change from the first price in the window, and picks
  one of two actions (0 = sell, 1 = buy). The reward is the price move over
  the next day, signed according to the chosen action.
  """

  def __init__(self, dataset, start_date, end_date, window_size):
    """Builds the environment from a date-indexed price table.

    Args:
      dataset: table sliced as `dataset[start_date:end_date]['Close']`.
      start_date: first label of the slice.
      end_date: last label of the slice.
      window_size: number of consecutive prices in one observation.

    Raises:
      ValueError: if `window_size` is smaller than 1, or the slice holds
        fewer than `window_size + 1` prices (one full window plus the next
        price needed to compute a reward).
    """
    super().__init__()
    if window_size < 1:
      raise ValueError('`window_size` should be at least 1.')

    self._prices = dataset[start_date:end_date]['Close'].values
    if len(self._prices) < window_size + 1:
      raise ValueError(
          'The selected date range holds {} prices; at least {} are needed '
          'for a window of {}.'.format(
              len(self._prices), window_size + 1, window_size))

    self._window_size = window_size
    # First index that has a full window of history behind it.
    self._start_index = window_size - 1
    # Last index from which a reward can still be computed: the reward
    # reads the price at index + 1.
    self._end_index = len(self._prices) - 2
    self._current_index = self._start_index

    self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
    self._observation_spec = array_spec.BoundedArraySpec(
        shape=(self._window_size,), dtype=np.float32, minimum=-1, maximum=1,
        name='observation')
    self._state = self._get_observation()
    self._episode_ended = False

  def _get_observation(self):
    """Returns the current price window as changes relative to its first price."""
    first = self._current_index - self._window_size + 1
    window = self._prices[first:self._current_index + 1]
    base_price = float(window[0])
    return [(float(price) / base_price) - 1 for price in window]

  def action_spec(self):
    """Returns the action spec: 0 means sell, 1 means buy."""
    return self._action_spec

  def observation_spec(self):
    """Returns the spec of the normalized price window."""
    return self._observation_spec

  def _reset(self):
    """Rewinds to the first full window and returns the initial time step."""
    self._episode_ended = False
    self._current_index = self._start_index
    self._state = self._get_observation()
    return ts.restart(np.array(self._state, dtype=np.float32))

  def _step(self, action):
    """Applies `action`, advances one price and returns the next time step.

    Stepping an already finished episode starts a new one instead.
    """
    if self._episode_ended:
      return self.reset()

    # The reward must be computed before the index moves forward.
    reward = self._calculate_reward(action)
    is_final_step = self._current_index == self._end_index

    self._current_index += 1
    self._state = self._get_observation()
    observation = np.array(self._state, dtype=np.float32)

    if is_final_step:
      self._episode_ended = True
      return ts.termination(observation, reward)
    return ts.transition(observation, reward, discount=1.0)

  def _calculate_reward(self, action):
    """Returns the next-day price move, signed by the chosen action.

    Selling (0) earns `current - next`; buying (1) earns `next - current`.

    Raises:
      ValueError: if `action` is neither 0 nor 1.
    """
    current_price = self._prices[self._current_index]
    next_price = self._prices[self._current_index + 1]
    price_drop = current_price - next_price

    step_reward = 0
    if action == 0:
      step_reward += price_drop
    elif action == 1:
      step_reward -= price_drop
    else:
      raise ValueError('`action` should be 0 or 1.')

    return step_reward

Import dataset and make environment

In [78]:
# Mount Google Drive so the CSV stored there can be read.
drive.mount('/content/drive')

# Keep only the closing price, indexed by the parsed 'Date' column.
dataset = (
    pd.read_csv('/content/drive/My Drive/Colab Notebooks/googl.us.csv', parse_dates=['Date'])
    .loc[:, ['Date', 'Close']]
    .set_index('Date')
)

# Environment over the selected date range with 30-price observations.
env = StockMarket(
    dataset=dataset,
    start_date='2015-01-01',
    end_date='2016-11-30',
    window_size=30,
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Q-Network

In [79]:
input_shape = [30]  # length of one observation window
n_outputs = 2  # one Q-value per action (sell, buy)

# Recurrent Q-network: the LSTM input takes sequences of any length with one
# feature per step, and the final Dense layer emits `n_outputs` values.
model = keras.models.Sequential([
    LSTM(units=30, return_sequences=True, input_shape=(None, 1)),
    Dense(units=32, activation='linear'),
    LSTM(units=30, return_sequences=False),
    Dense(n_outputs),
])

Replay buffer

In [80]:
# Bounded FIFO of experience tuples; once full, the oldest entries drop out.
replay_memory = deque(maxlen=10 ** 6)

epsilon_greedy_policy Function

In [81]:
def epsilon_greedy_policy(state, epsilon=0):
    """Picks an action with an epsilon-greedy rule over the global `model`.

    Args:
      state: 1-D observation (any window length).
      epsilon: probability of choosing a uniformly random action.

    Returns:
      0 or 1: a random action with probability `epsilon`, otherwise the index
      of the largest value predicted by `model` for `state`.
    """
    if np.random.rand() < epsilon:
        return np.random.randint(2)

    # Feed the observation as a batch of one sequence with one feature per
    # step; -1 lets the window length follow the observation instead of
    # being fixed at 30.
    batch = np.asarray(state).reshape((1, -1, 1))
    Q_values = model.predict(batch)
    return np.argmax(Q_values[0])

sample_experiences Function

In [82]:
def sample_experiences(batch_size):
    """Draws `batch_size` experiences (with replacement) from `replay_memory`.

    Returns:
      Four arrays, one per field of the stored tuples, in the order
      (states, actions, rewards, next_states).
    """
    picks = np.random.randint(len(replay_memory), size=batch_size)
    sampled = [replay_memory[pick] for pick in picks]

    # Regroup the sampled tuples field by field.
    columns = []
    for field in range(4):
        columns.append(np.array([transition[field] for transition in sampled]))

    states, actions, rewards, next_states = columns
    return states, actions, rewards, next_states


play_one_step Function

In [83]:
def play_one_step(env, state, epsilon):
    """Takes one epsilon-greedy step in `env` and records it in `replay_memory`.

    Returns:
      (next_state, reward): the observation returned by the step and its
      reward converted to a Python float.
    """
    chosen_action = epsilon_greedy_policy(state, epsilon)
    outcome = env.step(chosen_action)
    following_state = outcome.observation
    step_reward = float(outcome.reward)

    transition = (state, chosen_action, step_reward, following_state)
    replay_memory.append(transition)
    return following_state, step_reward

training_step Function

In [84]:
# Hyperparameters for the DQN update.
batch_size = 32        # experiences sampled per gradient step
discount_rate = 0.95   # weight of the bootstrapped next-state value
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
optimizer = keras.optimizers.Adam(learning_rate=1e-3)
loss_fn = keras.losses.mean_squared_error

def training_step(batch_size):
    """Runs one DQN gradient step on a batch sampled from the replay buffer.

    Targets are `reward + discount_rate * max_a Q(next_state, a)`, with the
    next-state values predicted by `model` itself.

    Args:
      batch_size: number of experiences to sample.
    """
    states, actions, rewards, next_states = sample_experiences(batch_size)

    # Feed both state batches as (batch, window, 1). The sizes are taken
    # from the sampled data, so any `batch_size` and window length work.
    states = states.reshape(len(states), -1, 1)
    next_states = next_states.reshape(len(next_states), -1, 1)

    next_Q_values = model.predict(next_states)
    max_next_Q_values = np.max(next_Q_values, axis=1)
    # NOTE: the replay tuples carry no terminal flag, so every target
    # bootstraps from the next state, including end-of-episode transitions.
    target_Q_values = rewards + discount_rate * max_next_Q_values
    # Column vector, matching the (batch, 1) shape of Q_values below.
    target_Q_values = target_Q_values.reshape(-1, 1)

    # One-hot mask keeps only the Q-value of the action actually taken.
    mask = tf.one_hot(actions, n_outputs)
    with tf.GradientTape() as tape:
        all_Q_values = model(states)
        Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))


Training loop

In [None]:
rewards = []  # total reward collected in each episode
best_score = 0
for episode in range(600):
  obs = env.reset().observation
  # Exploration rate decays linearly from 1 to a floor of 0.01.
  epsilon = max(1 - episode / 600, 0.01)
  step, ret = 0, 0
  while True:
    obs, reward = play_one_step(env, obs, epsilon)
    ret += reward
    step += 1
    print("\rEpisode: {}, Steps: {}, eps: {:.3f} , return: {}".format(episode, step, epsilon, ret), end="")
    # Check the time step produced by the step just taken. Checking the one
    # from before the step lets the loop step a finished episode once more,
    # which resets the environment and stores a spurious transition.
    if env.current_time_step().is_last():
      rewards.append(ret)
      break
  # One gradient step per episode, after 50 warm-up episodes.
  if episode > 50:
    training_step(batch_size)

visualize 

In [None]:
plt.figure(figsize=(8, 4))
# Plot the per-episode totals collected in `rewards`, not the scalar `reward`
# left over from the last environment step.
plt.plot(rewards)
plt.xlabel("Episode", fontsize=14)
plt.ylabel("Sum of rewards", fontsize=14)
plt.show()

Double DQN 

In [59]:
# Hyperparameters for the Double DQN update.
batch_size = 32        # experiences sampled per gradient step
discount_rate = 0.95   # weight of the bootstrapped next-state value
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
optimizer = keras.optimizers.Adam(learning_rate=1e-3)
loss_fn = keras.losses.mean_squared_error

# Target network: same architecture as `model`, starting from the same weights.
target = keras.models.clone_model(model)
target.set_weights(model.get_weights())

def training_step(batch_size):
  """Runs one Double DQN gradient step on a batch from the replay buffer.

  The online `model` selects the best next action and the `target` network
  supplies that action's value for the bootstrapped target.

  Args:
    batch_size: number of experiences to sample.
  """
  states, actions, rewards, next_states = sample_experiences(batch_size)

  # Feed both state batches as (batch, window, 1). The sizes are taken from
  # the sampled data, so any `batch_size` and window length work.
  states = states.reshape(len(states), -1, 1)
  next_states = next_states.reshape(len(next_states), -1, 1)

  next_Q_values = model.predict(next_states)
  best_next_actions = np.argmax(next_Q_values, axis=1)
  next_mask = tf.one_hot(best_next_actions, n_outputs).numpy()
  next_best_Q_values = (target.predict(next_states) * next_mask).sum(axis=1)
  # NOTE: the replay tuples carry no terminal flag, so every target
  # bootstraps from the next state, including end-of-episode transitions.
  target_Q_values = rewards + discount_rate * next_best_Q_values
  # Column vector, matching the (batch, 1) shape of Q_values below. Left as
  # (batch,), it would broadcast against (batch, 1) into a (batch, batch)
  # error matrix and the loss would compare every target with every
  # prediction.
  target_Q_values = target_Q_values.reshape(-1, 1)

  # One-hot mask keeps only the Q-value of the action actually taken.
  mask = tf.one_hot(actions, n_outputs)
  with tf.GradientTape() as tape:
    all_Q_values = model(states)
    Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)
    loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))
  grads = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(grads, model.trainable_variables))

In [None]:
rewards = []  # total reward collected in each episode
best_score = 0

for episode in range(600):
  obs = env.reset().observation
  # Exploration rate decays linearly from 1 to a floor of 0.01.
  epsilon = max(1 - episode / 600, 0.01)
  step, ret = 0, 0
  while True:
    obs, reward = play_one_step(env, obs, epsilon)
    ret += reward
    step += 1
    print("\rEpisode: {}, Steps: {}, eps: {:.3f} , return: {}".format(episode, step, epsilon, ret), end="")
    # Check the time step produced by the step just taken. Checking the one
    # from before the step lets the loop step a finished episode once more,
    # which resets the environment and stores a spurious transition.
    if env.current_time_step().is_last():
      rewards.append(ret)
      break
  # One gradient step per episode, after 50 warm-up episodes.
  if episode > 50:
    training_step(batch_size)
  # Copy the online weights into the target network every 50 episodes.
  if episode % 50 == 0:
    target.set_weights(model.get_weights())

Dueling DQN

In [None]:
K = keras.backend

# The network takes the same (window, 1) layout that the policy and the
# training step feed to the model, sized from the environment's observation
# spec instead of a fixed 4-feature input.
window_length = env.observation_spec().shape[0]
input_states = keras.layers.Input(shape=[window_length, 1])
flat_states = keras.layers.Flatten()(input_states)
hidden1 = keras.layers.Dense(32, activation="elu")(flat_states)
hidden2 = keras.layers.Dense(32, activation="elu")(hidden1)

# Dueling heads: one scalar state value plus one advantage per action, with
# advantages shifted so that the best action has advantage 0.
state_values = keras.layers.Dense(1)(hidden2)
raw_advantages = keras.layers.Dense(n_outputs)(hidden2)
advantages = raw_advantages - K.max(raw_advantages, axis=1, keepdims=True)
Q_values = state_values + advantages
model = keras.models.Model(inputs=[input_states], outputs=[Q_values])

# Target network: same architecture as `model`, starting from the same weights.
target = keras.models.clone_model(model)
target.set_weights(model.get_weights())

In [None]:
# Hyperparameters for the Dueling DQN update.
batch_size = 32        # experiences sampled per gradient step
discount_rate = 0.95   # weight of the bootstrapped next-state value
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
optimizer = keras.optimizers.Adam(learning_rate=1e-2)
loss_fn = keras.losses.Huber()

def training_step(batch_size):
    """Runs one Double-DQN-style gradient step for the dueling network.

    The online `model` selects the best next action and the `target` network
    supplies that action's value for the bootstrapped target.

    Args:
      batch_size: number of experiences to sample.
    """
    states, actions, rewards, next_states = sample_experiences(batch_size)

    # Feed both state batches in the same (batch, window, 1) layout. The
    # sizes are taken from the sampled data, so any `batch_size` and window
    # length work.
    states = states.reshape(len(states), -1, 1)
    next_states = next_states.reshape(len(next_states), -1, 1)

    next_Q_values = model.predict(next_states)
    best_next_actions = np.argmax(next_Q_values, axis=1)
    next_mask = tf.one_hot(best_next_actions, n_outputs).numpy()
    next_best_Q_values = (target.predict(next_states) * next_mask).sum(axis=1)
    # NOTE: the replay tuples carry no terminal flag, so every target
    # bootstraps from the next state, including end-of-episode transitions.
    target_Q_values = rewards + discount_rate * next_best_Q_values
    # Column vector, matching the (batch, 1) shape of Q_values below.
    target_Q_values = target_Q_values.reshape(-1, 1)

    # One-hot mask keeps only the Q-value of the action actually taken.
    mask = tf.one_hot(actions, n_outputs)
    with tf.GradientTape() as tape:
        all_Q_values = model(states)
        Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

In [None]:
rewards = []  # total reward collected in each episode
best_score = 0

for episode in range(600):
  obs = env.reset().observation
  # Exploration rate decays linearly from 1 to a floor of 0.01.
  epsilon = max(1 - episode / 600, 0.01)
  step, ret = 0, 0
  while True:
    obs, reward = play_one_step(env, obs, epsilon)
    ret += reward
    step += 1
    print("\rEpisode: {}, Steps: {}, eps: {:.3f} , return: {}".format(episode, step, epsilon, ret), end="")
    # Check the time step produced by the step just taken. Checking the one
    # from before the step lets the loop step a finished episode once more,
    # which resets the environment and stores a spurious transition.
    if env.current_time_step().is_last():
      rewards.append(ret)
      break
  # One gradient step per episode, after 50 warm-up episodes.
  if episode > 50:
    training_step(batch_size)
  # Copy the online weights into the target network every 200 episodes.
  if episode % 200 == 0:
    target.set_weights(model.get_weights())
