- Try a simple stock market RL algo. 
- Construct a stock market env, keep original Actor-critic PG.


For a simple time series stock, we define:
- A, [Buy = 1, Hold = 0, Sell = -1]
- State Market, [O, H, L, C, V, P(portifolioProce)]
- Reawrd is calculated as holding return each trading day

In [1]:
import tushare as ts
%matplotlib inline


import itertools
import matplotlib
import numpy as np
import sys
import tensorflow as tf
import collections

from tensorflow.python.ops import rnn_cell


from lib import plotting

matplotlib.style.use('ggplot')

In [2]:
df = ts.get_k_data(code='000001', start='2000-01-01', index=True)

In [3]:
df = df.ix[:, 1:-1]*0.01
df['ma5'] = df.close.rolling(5).mean()

df.dropna(inplace=True)

df.head()

Unnamed: 0,open,close,high,low,volume,ma5
4,15.3171,15.4511,15.4672,15.064,312535.0,14.6834
5,15.4768,14.7978,15.4771,14.6876,219246.0,14.83022
6,14.7376,14.3802,14.8928,14.35,152229.0,14.8869
7,14.3745,14.2444,14.4407,14.1881,86129.2,14.8079
8,14.2622,14.0885,14.3347,14.0171,74470.1,14.5924


In [4]:
df.volume = df.volume / df.volume.max() * df.high.max()
df.head()

Unnamed: 0,open,close,high,low,volume,ma5
4,15.3171,15.4511,15.4672,15.064,2.232999,14.6834
5,15.4768,14.7978,15.4771,14.6876,1.566468,14.83022
6,14.7376,14.3802,14.8928,14.35,1.087645,14.8869
7,14.3745,14.2444,14.4407,14.1881,0.615376,14.8079
8,14.2622,14.0885,14.3347,14.0171,0.532074,14.5924


In [5]:
df.max()

open      60.57430
close     60.92060
high      61.24040
low       60.40710
volume    61.24040
ma5       59.94984
dtype: float64

In [6]:
class portfilio:
    def __init__(self, initial=10000, trans=0.0005):
        self.initial = initial
        self.balence = initial
        self.trans = trans
        self.record = [initial]
        
        self.hold = 0
        self.done = False
        self.total = initial
        self.reward = 0
    def order(self, o_type, close, amount=10):
        if o_type == 1: # sell
            if self.hold >= amount:
                self.balence += (1-self.trans)*(close*amount)
                self.hold -= amount
        elif o_type == 2: # buy
            cash_required = (1+self.trans)*(close*amount)
            if self.balence >= cash_required:
                self.balence -= (1+self.trans)*(close*amount)
                self.hold += amount
        

                
        self.total = self.balence + self.hold*close
        self.record += [self.total]
        if self.total <=5000:
            self.done = True
        #print('Balence: {0}, Holds: {1}, Total: {2}, Earn {3}'.format(self.balence,self.hold,self.total,-self.record[0]+self.total))
            

In [7]:
class Market:
    def __init__(self, df):
        self.market = df.values
        self.market_g = self._market_g()
        self.port = portfilio()
        
        self.num_state = 6
        self.num_action = 3
        
        
    
    
    def reset(self):
        self.market_g = self._market_g()
        self.port = portfilio()
        
        self.reward = self.port.total
        
        state = []
        for _ in range(5):
            o,h,l,c,v, m5 = next(self.market_g)
            
            self.c = c
            state += [[o,h,l,c,v, m5]]#self.port.balence]]
        self.state = state
        return state
        
    def _market_g(self):
        for o,h,l,c,v,m5 in self.market:
            yield o,h,l,c,v,m5
            
    def step(self, action):
        if not self.port.done:
            self.port.order(action, self.c)
            
            self.state.pop(0)
            
            o,h,l,c,v,m5 = next(self.market_g)
            self.state += [[o,h,l,c,v, m5]]# self.port.balence]]
            
            self.c = c
            
            
            reward = self.reward - self.port.total
            self.reward = self.port.total
            return self.state, reward, self.port.done
        else:
            print('bankrupt!')
        
        

In [8]:
class PolicyEstimator:
    def __init__(self, learning_rate = 0.0001, scope='PolicyEstimator'):
        num_units = 100
        num_layers = 3
        num_actions = 3
        time_steps = 5
        num_features = 6
        
        with tf.variable_scope(scope):
            self.state = tf.placeholder(tf.float32, [1, time_steps, num_features])
            self.action = tf.placeholder(tf.int32)
            self.target = tf.placeholder(tf.float32)

            cell_fn = rnn_cell.BasicLSTMCell(num_units=num_units, state_is_tuple=True)
            cell = rnn_cell.MultiRNNCell([cell_fn]*num_layers, state_is_tuple=True)


            state = tf.unpack(tf.transpose(self.state, perm=[1, 0, 2]))

            outputs, state_rnn = tf.nn.rnn(cell, state, dtype=tf.float32)

            weight = tf.Variable(tf.truncated_normal([num_units, num_actions], stddev=0.1))
            bias = tf.Variable(tf.constant(0.1, shape=[3]))

            self.prediction = tf.nn.softmax(tf.matmul(outputs[-1], weight) + bias)
            self.action_prob = tf.gather(tf.squeeze(self.prediction), self.action)

            self.loss = - tf.log(self.action_prob) * self.target


            self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step())

    def predict(self, state, sess=None):
        sess = sess or tf.get_default_session()
        return sess.run(self.prediction, { self.state: [state] })[0]

    def update(self, state, target, action, sess=None):
        sess = sess or tf.get_default_session()
        feed_dict = { self.state: [state], self.target: target, self.action: action  }
        _, loss = sess.run([self.train_op, self.loss], feed_dict)

        return loss

In [9]:
class ValueEstimator:
    def __init__(self, learning_rate = 0.0001, scope='ValueEstimator'):
        num_units = 100
        num_layers = 3

        time_steps = 5
        num_features = 6
        
        
        with tf.variable_scope(scope):
            self.state = tf.placeholder(tf.float32, [1, time_steps, num_features])
            self.target = tf.placeholder(tf.float32)

            cell_fn = rnn_cell.BasicLSTMCell(num_units=num_units, state_is_tuple=True)
            cell = rnn_cell.MultiRNNCell([cell_fn]*num_layers, state_is_tuple=True)


            state = tf.unpack(tf.transpose(self.state, perm=[1, 0, 2]))

            outputs, state_rnn = tf.nn.rnn(cell, state, dtype=tf.float32)

            weight = tf.Variable(tf.truncated_normal([num_units, 1], stddev=0.1))
            bias = tf.Variable(tf.constant(0.1, shape=[1]))


            self.value_estimate = tf.squeeze(tf.matmul(outputs[-1], weight) + bias)
            self.loss = tf.squared_difference(self.value_estimate, self.target)


            self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step())

    def predict(self, state, sess=None):
        sess = sess or tf.get_default_session()
        return sess.run(self.value_estimate, { self.state: [state] })

    def update(self, state, target, sess=None):
        sess = sess or tf.get_default_session()
        feed_dict = { self.state: [state], self.target: target }
        _, loss = sess.run([self.train_op, self.loss], feed_dict)
        return loss

In [10]:
df.max()

open      60.57430
close     60.92060
high      61.24040
low       60.40710
volume    61.24040
ma5       59.94984
dtype: float64

In [11]:
def actor_critic(env, estimator_policy, estimator_value, num_episodes, discount_factor=1.0):
    """
    Actor Critic Algorithm. Optimizes the policy 
    function approximator using policy gradient.
    
    Args:
        env: OpenAI environment.
        estimator_policy: Policy Function to be optimized 
        estimator_value: Value function approximator, used as a baseline
        num_episodes: Number of episodes to run for
        discount_factor: Time-discount factor
    
    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))    
    
    Transition = collections.namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])
    
    for i_episode in range(num_episodes):
        # Reset the environment and pick the fisrst action
        state = env.reset()
        
        episode = []
        
        # One step in the environment
        for t in itertools.count():
            if t > 4000: break
            # Take a step
            action_probs = estimator_policy.predict(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done = env.step(action)
            
            # Keep track of the transition
            episode.append(Transition(
              state=state, action=action, reward=reward, next_state=next_state, done=done))
            
            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t
            
            # Calculate TD Target
            value_next = estimator_value.predict(next_state)
            td_target = reward + discount_factor * value_next
            td_error = td_target - estimator_value.predict(state)
            
            # Update the value estimator
            lp = estimator_value.update(state, td_target)
            
            # Update the policy estimator
            # using the td error as our advantage estimate
            le = estimator_policy.update(state, td_error, action)
            
            # Print out which step we're on, useful for debugging.
            print("\rStep {} @ Episode {}/{} ({}), PG loss: {}, E loss: {}".format(
                    t, i_episode + 1, num_episodes, stats.episode_rewards[i_episode - 1], lp, le), end="")

            if done:
                break
                
            state = next_state
    
    return stats

In [12]:
env = Market(df)

tf.reset_default_graph()

global_step = tf.Variable(0, name="global_step", trainable=False)
policy_estimator = PolicyEstimator(scope='2541')
value_estimator = ValueEstimator(scope='220553')

with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    # Note, due to randomness in the policy the number of episodes you need to learn a good
    # policy may vary. ~300 seemed to work well for me.
    stats = actor_critic(env, policy_estimator, value_estimator, 9000)

Step 1233 @ Episode 340/9000 (-5608.10696360003), PG loss: 648.6074829101562, E loss: 22.615041732788086

KeyboardInterrupt: 

In [None]:
plotting.plot_episode_stats(stats, smoothing_window=10)

In [None]:
stats

In [None]:

df.head()