# Q Function
* estimates the expected (discounted) future reward obtainable from a given state
* Q(s, a) is the expected future value of taking action **a** in state **s**
* in DQN, a **neural network is used to approximate Q(s, a)**, i.e. the expected future reward of each action

# Classes
* Environment
* Agent
* Runner

## Environment

In [1]:
import numpy as np
# Toy action labels and holding periods (1 through 5) used to preview the action grid.
actions = np.array(list('abc'))
days_to_holds = np.arange(1, 6)

In [2]:
# Cartesian product: every action label paired with every holding period,
# grouped by action (all periods for the first action, then the next, ...).
options = [
    (label, hold_days)
    for label in actions
    for hold_days in days_to_holds
]

In [3]:
# Notebook display: show the full list of (action, days) pairs.
options

[('a', 1),
 ('a', 2),
 ('a', 3),
 ('a', 4),
 ('a', 5),
 ('b', 1),
 ('b', 2),
 ('b', 3),
 ('b', 4),
 ('b', 5),
 ('c', 1),
 ('c', 2),
 ('c', 3),
 ('c', 4),
 ('c', 5)]

In [4]:
class Action:
    """One discrete choice the agent can make.

    Attributes:
        act: the kind of trade (the environment compares it against its
            BUY / SELL / SKIP labels).
        days: how many rows of price data the action spans; the environment
            advances its cursor by this amount after the step.
        value: multiplier applied to the per-step reward when the
            environment updates its deposit.
    """

    def __init__(self, act, days, value):
        self.act = act
        self.days = days
        self.value = value

    def __repr__(self):
        # Readable form for debug prints instead of the default
        # "<Action object at 0x...>".
        return '{}(act={!r}, days={!r}, value={!r})'.format(
            type(self).__name__, self.act, self.days, self.value)

In [5]:
import numpy as np
import pandas_datareader as pdr
import datetime

# Labels for the three kinds of trade an Action can represent.
BUY = 'buy'
SELL = 'sell'
SKIP = 'skip'

class Environment:
    """Trading environment that replays historical closing prices.

    The state is the `window` most recent 'Close' values before the current
    cursor position.  Each step applies one entry of `action_space`, moves
    the cursor forward by that action's `days`, and adds
    ``reward * action.value`` to the running deposit.
    """

    max_days_to_hold = 5
    min_days_to_hold = 1
    default_stock_value = 100

    def __init__(self,
                 ticker,
                 initial_deposit=100000,
                 from_date=datetime.datetime(2007, 1, 1),
                 to_date=datetime.datetime(2017, 1, 1),
                 window=20):
        """Download price history for `ticker` and build the action space.

        Args:
            ticker: symbol passed to the price-data reader.
            initial_deposit: deposit restored by every `reset()`.
            from_date: first date requested from the reader.
            to_date: last date requested from the reader.
            window: number of past closing prices that make up a state.
        """
        self.initial_deposit = initial_deposit
        self.window = window
        # NOTE(review): get_data_google depends on the Google Finance
        # endpoint; newer pandas-datareader releases may no longer support
        # it -- confirm against the installed version.
        self.data = pdr.get_data_google(ticker, from_date, to_date)
        self.data_length = len(self.data)

        self.min_date = self.data.index.min()
        self.max_date = self.data.index.max()

        # Every trade kind combined with every allowed holding period.
        # Plain str/int values keep numpy scalar types out of Action and
        # out of current_index.
        holding_periods = range(Environment.min_days_to_hold,
                                Environment.max_days_to_hold + 1)
        self.action_space = [
            Action(act, days, Environment.default_stock_value)
            for act in (BUY, SELL, SKIP)
            for days in holding_periods
        ]
        self.reset()

    def reset(self):
        """Restore the initial deposit, rewind the cursor and return the first state."""
        self.deposit = self.initial_deposit
        # Start at `window` so the first state has a full window behind it.
        self.current_index = self.window
        return self.state()

    def score(self):
        """Return the current deposit."""
        return self.deposit

    def enough_data_provided(self):
        """Return True while at least `max_days_to_hold` rows remain from the cursor."""
        return self.current_index + Environment.max_days_to_hold <= self.data_length

    def _current_price(self):
        """Return the closing price at the cursor."""
        return self.data.iloc[self.current_index]['Close']

    def state(self):
        """Return the `window` closing prices immediately before the cursor."""
        return self.data.iloc[self.current_index - self.window:self.current_index]['Close']

    def state_size(self):
        """Return the number of prices in a state (the window length)."""
        return self.window

    def action_size(self):
        """Return the number of entries in the action space."""
        return len(self.action_space)

    def step(self, action_idx: int):
        """Apply the action at `action_idx` and advance the cursor.

        Args:
            action_idx: index into `action_space`.

        Returns:
            A tuple ``(next_state, reward, done, info)``.  `reward` is the
            price difference over the action's window (sign depends on
            BUY/SELL, 0 for SKIP); `done` is True once
            `enough_data_provided()` no longer holds; `info` is always None.

        Raises:
            IndexError: if `action_idx` is out of range or no price rows
                remain at the cursor.
            ValueError: if the selected action has an unknown `act`.
        """
        action = self.action_space[action_idx]

        prices = self.data.iloc[self.current_index:self.current_index + action.days]['Close']
        if prices.empty:
            raise IndexError(
                'no price data left at index {}; call reset() first'.format(self.current_index))
        # NOTE(review): the window holds exactly `days` rows, so with
        # days == 1 the first and last price are the same row and the
        # reward is always 0 -- confirm this is the intended meaning of a
        # one-day hold.
        first_day_price = prices.iloc[0]
        last_day_price = prices.iloc[-1]

        if action.act == BUY:
            reward = last_day_price - first_day_price
        elif action.act == SELL:
            reward = first_day_price - last_day_price
        elif action.act == SKIP:
            reward = 0
        else:
            # Previously fell through and failed later with an unbound `reward`.
            raise ValueError('unknown action type: {!r}'.format(action.act))

        self.current_index += action.days
        self.deposit += reward * action.value

        next_state = self.state()
        # Flag the terminal transition so learners do not bootstrap past
        # the end of the data (this used to be hard-coded to False).
        done = not self.enough_data_provided()
        return next_state, reward, done, None

## Agent

In [6]:
import random
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.losses import mean_squared_error

class Agent:
    """Double-DQN agent with an online network, a target network and replay memory.

    Actions are chosen epsilon-greedily from the online network; `replay`
    trains the online network on sampled transitions, taking the value of
    the next state from the target network.
    """

    def __init__(self, state_size, action_size):
        """Create both networks and synchronise the target with the online one.

        Args:
            state_size: number of inputs of the network.
            action_size: number of outputs (one Q-value per action).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # oldest transitions are dropped first
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.1
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target_model()

    def _build_model(self):
        """Build and compile the Q-network: two hidden ReLU layers, linear output."""
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        # NOTE(review): newer Keras releases name this argument
        # `learning_rate` instead of `lr` -- confirm against the installed
        # version before upgrading.
        model.compile(loss=mean_squared_error,
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return an action index: random with probability epsilon, else greedy."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        """Train the online network on `batch_size` sampled transitions.

        Does nothing (and leaves epsilon untouched) while the memory holds
        fewer than `batch_size` transitions; previously `random.sample`
        raised ValueError in that case.  After a training pass epsilon is
        multiplied by `epsilon_decay` as long as it is above `epsilon_min`.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = self.model.predict(state)
            if done:
                # Terminal transition: nothing to bootstrap from.
                target[0][action] = reward
            else:
                # The online network picks the next action, the target
                # network supplies its value.
                a = self.model.predict(next_state)[0]
                t = self.target_model.predict(next_state)[0]
                target[0][action] = reward + self.gamma * t[np.argmax(a)]
            self.model.fit(state, target, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """Load online-network weights from `name`."""
        self.model.load_weights(name)

    def save(self, name):
        """Save online-network weights to `name`."""
        self.model.save_weights(name)

Using TensorFlow backend.


## Runner

In [12]:
# Build the environment for AAPL and read off the sizes the network needs.
env = Environment('AAPL')
state_size, action_size = env.state_size(), env.action_size()
print('Action size: {}, state size: {}'.format(action_size, state_size))

Action size: 15, state size: 20


In [13]:
# Training hyper-parameters: number of passes over the data and replay batch size.
EPISODES = 1000
batch_size = 32
# Agent sized to the environment's state window and action grid.
agent = Agent(state_size, action_size)

In [None]:
# Training loop: one episode is a single pass over the price history.
for e in range(EPISODES):
    # reset() restores the deposit and returns the first window of closing prices.
    state = env.reset()
    # Turn the price Series into a single-row array of shape (1, state_size).
    state = state.values.reshape([1, state_size])
    # Keep stepping while the environment reports enough remaining rows.
    while env.enough_data_provided():
        action_idx = agent.act(state)
        next_state, reward, done, _ = env.step(action_idx) # build these parameters into the NN model
        # reward = reward if not done else -10
        next_state = next_state.values.reshape([1, state_size])
        # Store the transition for later replay.
        agent.remember(state, action_idx, reward, next_state, done)
        state = next_state
        
    # Sync the target network with the online network once per episode.
    agent.update_target_model()
    # The epsilon printed here is the value before this episode's replay() decays it.
    print("episode: {}/{}, score: {}, e: {:.2}".format(e, EPISODES, env.score(), agent.epsilon))
    # One replay pass of batch_size sampled transitions per episode.
    agent.replay(batch_size)

episode: 0/1000, score: 101075.0, e: 1.0
episode: 1/1000, score: 99849.0, e: 0.99
episode: 2/1000, score: 101415.0, e: 0.99
episode: 3/1000, score: 102948.0, e: 0.99
episode: 4/1000, score: 97690.0, e: 0.98
episode: 5/1000, score: 98715.0, e: 0.98
episode: 6/1000, score: 99067.0, e: 0.97
episode: 7/1000, score: 100385.0, e: 0.97
episode: 8/1000, score: 96429.0, e: 0.96
episode: 9/1000, score: 93648.0, e: 0.96
episode: 10/1000, score: 97289.0, e: 0.95
episode: 11/1000, score: 108147.0, e: 0.95
episode: 12/1000, score: 92214.0, e: 0.94
episode: 13/1000, score: 97949.0, e: 0.94
episode: 14/1000, score: 95725.0, e: 0.93
episode: 15/1000, score: 101349.0, e: 0.93
episode: 16/1000, score: 97201.0, e: 0.92
episode: 17/1000, score: 108441.0, e: 0.92
episode: 18/1000, score: 93589.0, e: 0.91
episode: 19/1000, score: 99275.0, e: 0.91
episode: 20/1000, score: 103179.0, e: 0.9
episode: 21/1000, score: 91162.0, e: 0.9
episode: 22/1000, score: 99363.0, e: 0.9
episode: 23/1000, score: 106407.0, e: 0.