# Q Function
* used to estimate how valuable it is to take a given action in a given state
* Q(s,a) is the expected future value obtained by taking action **a** in state **s**
* in DQN, we use a **neural network to approximate the Q function**

# Classes
* Environment
* Agent
* Runner

## Environment

In [1]:
import numpy as np
import pandas_datareader as pdr
import datetime

# Names of the actions an agent can take; Environment.action_space lists them
# in the order the agent's integer action indices refer to.
BUY = 'buy'
SELL = 'sell'
SKIP = 'skip'


class Environment:
    """Trading environment that replays historical daily 'Close' prices.

    The observation is the ``window`` closing prices that precede
    ``current_index``. Each step applies one action over a holding period,
    adds the resulting profit or loss to ``deposit`` and moves
    ``current_index`` forward by the holding period.
    """

    # Longest allowed holding period; enough_data_provided() keeps this many
    # rows in reserve so that any valid step still has prices to read.
    max_days_to_hold = 5

    def __init__(self,
                 ticker,
                 initial_deposit=100000,
                 from_date=datetime.datetime(2007, 1, 1),
                 to_date=datetime.datetime(2017, 1, 1),
                 window=20):
        """Download the price history for ``ticker`` and reset the episode.

        Args:
            ticker: symbol passed to the data reader.
            initial_deposit: deposit every episode starts with.
            from_date: first date requested from the data reader.
            to_date: last date requested from the data reader.
            window: number of past closing prices that make up one state.
        """
        self.initial_deposit = initial_deposit
        self.window = window
        # NOTE(review): this relies on the Google Finance reader of
        # pandas_datareader - confirm it is still available in the installed
        # version before running.
        self.data = pdr.get_data_google(ticker, from_date, to_date)
        self.data_length = len(self.data)

        self.min_date = self.data.index.min()
        self.max_date = self.data.index.max()

        self.action_space = [BUY, SELL, SKIP]
        self.reset()

    def reset(self):
        """Restore the initial deposit, rewind to the first full window and return its state."""
        self.deposit = self.initial_deposit
        self.current_index = self.window
        return self.state()

    def score(self):
        """Return the current deposit."""
        return self.deposit

    def enough_data_provided(self):
        """Return True while at least ``max_days_to_hold`` rows remain from ``current_index``."""
        return self.current_index + Environment.max_days_to_hold <= self.data_length

    def _current_price(self):
        """Return the closing price of the row at ``current_index``."""
        return self.data.iloc[self.current_index]['Close']

    def state(self):
        """Return the ``window`` closing prices that precede ``current_index``."""
        return self.data.iloc[self.current_index - self.window:self.current_index]['Close']

    def state_size(self):
        """Return the length of a state, i.e. ``window``."""
        return self.window

    def action_size(self):
        """Return the number of available actions."""
        return len(self.action_space)

    def step(self, action_tuple):
        """Apply one action and advance the environment.

        Args:
            action_tuple: indexable ``(action_index, value, days_to_hold)``
                where ``action_index`` selects an entry of ``action_space``,
                ``value`` is the number of shares (always positive) and
                ``days_to_hold`` is between 1 and ``max_days_to_hold``.

        Returns:
            ``(next_state, reward, done, None)``. ``reward`` is the per-share
            price difference between the first and the last of the
            ``days_to_hold`` rows starting at ``current_index`` (so it is
            always zero for ``days_to_hold == 1``), negated for SELL and 0
            for SKIP. ``done`` is True once ``enough_data_provided()`` no
            longer holds after the step.

        Raises:
            ValueError: if ``days_to_hold`` is outside
                ``1..max_days_to_hold`` or the action is not recognised.
        """
        action = self.action_space[action_tuple[0]]  # BUY, SELL, ...
        value = action_tuple[1]                      # number of stocks, always positive
        days_to_hold = action_tuple[2]               # 1..max_days_to_hold

        # enough_data_provided() only reserves max_days_to_hold rows, so a
        # longer (or empty) holding period must be rejected up front.
        if not 1 <= days_to_hold <= Environment.max_days_to_hold:
            raise ValueError(
                'days_to_hold must be between 1 and {}, got {}'.format(
                    Environment.max_days_to_hold, days_to_hold))

        df = self.data.iloc[self.current_index: self.current_index + days_to_hold]['Close']
        first_day_price = df.iloc[0]
        last_day_price = df.iloc[-1]

        if action == BUY:
            reward = last_day_price - first_day_price
        elif action == SELL:
            reward = first_day_price - last_day_price
        elif action == SKIP:
            reward = 0
        else:
            raise ValueError('unknown action: {!r}'.format(action))

        self.current_index += days_to_hold
        self.deposit += reward * value

        next_state = self.state()
        # Mark the transition as terminal when no further step is possible,
        # so learners do not bootstrap beyond the end of the data.
        done = not self.enough_data_provided()
        return next_state, reward, done, None

## Agent

In [2]:
import random
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.losses import mean_squared_error

class Agent:
    """Epsilon-greedy deep Q-learning agent with a separate target network."""

    def __init__(self, state_size, action_size):
        """Create the online and target networks and an empty replay memory.

        Args:
            state_size: number of input features of the network.
            action_size: number of actions, i.e. network outputs.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # oldest transitions are dropped first
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.99
        self.learning_rate = 0.001
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target_model()

    def _build_model(self):
        """Build and compile the network that maps a state to one Q-value per action."""
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        # NOTE(review): newer Keras releases spell this argument
        # `learning_rate` - confirm against the installed version.
        model.compile(loss=mean_squared_error,
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return an action index: random with probability epsilon, else the greedy one."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        """Fit the online network on ``batch_size`` sampled transitions, then decay epsilon.

        Raises:
            ValueError: from ``random.sample`` when ``batch_size`` exceeds
                the number of stored transitions.
        """
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = self.model.predict(state)
            if done:
                target[0][action] = reward
            else:
                # The online network picks the next action, the target
                # network supplies its value.
                a = self.model.predict(next_state)[0]
                t = self.target_model.predict(next_state)[0]
                target[0][action] = reward + self.gamma * t[np.argmax(a)]
            self.model.fit(state, target, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            # Clamp so the multiplicative decay cannot undershoot the floor.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def load(self, name):
        """Load the online network's weights from ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save the online network's weights to ``name``."""
        self.model.save_weights(name)

Using TensorFlow backend.


## Runner

In [3]:
# Build the environment for the AAPL ticker and take the network's input and
# output dimensions from it.
env = Environment('AAPL')
state_size, action_size = env.state_size(), env.action_size()

In [None]:
# Training settings used by the loop below.
EPISODES = 5000   # number of passes over the price history
batch_size = 32   # transitions sampled per replay call
done = False

# The agent's networks are sized from the environment's dimensions.
agent = Agent(state_size, action_size)

In [None]:
for e in range(EPISODES):
    # Each episode starts from the initial deposit and the first full window;
    # the state is reshaped into a single-row batch for the network.
    state = np.reshape(env.reset().values, (1, state_size))
    while env.enough_data_provided():
        action = agent.act(state)
        # Position size (100) and holding period (3) are fixed for now.
        # TODO: build these parameters into the NN model
        next_state, reward, done, _ = env.step((action, 100, 3))
        next_state = np.reshape(next_state.values, (1, state_size))
        agent.remember(state, action, reward, next_state, done)
        state = next_state

    # Sync the target network once per episode, then report progress.
    agent.update_target_model()
    print("episode: {}/{}, score: {}, e: {:.2}".format(e, EPISODES, env.score(), agent.epsilon))

    # Train only once the memory holds more than one batch of transitions.
    if batch_size < len(agent.memory):
        agent.replay(batch_size)

episode: 0/5000, score: 100714.0, e: 1.0
episode: 1/5000, score: 101178.0, e: 0.99
episode: 2/5000, score: 101858.0, e: 0.98
episode: 3/5000, score: 97909.0, e: 0.97
episode: 4/5000, score: 96943.0, e: 0.96
episode: 5/5000, score: 96432.0, e: 0.95
episode: 6/5000, score: 101860.0, e: 0.94
episode: 7/5000, score: 98105.0, e: 0.93
episode: 8/5000, score: 105870.0, e: 0.92
episode: 9/5000, score: 97333.0, e: 0.91
episode: 10/5000, score: 96074.0, e: 0.9
episode: 11/5000, score: 93226.0, e: 0.9
episode: 12/5000, score: 94052.0, e: 0.89
episode: 13/5000, score: 99426.0, e: 0.88
episode: 14/5000, score: 100624.0, e: 0.87
episode: 15/5000, score: 98252.0, e: 0.86
episode: 16/5000, score: 98694.0, e: 0.85
episode: 17/5000, score: 100350.0, e: 0.84
episode: 18/5000, score: 100209.0, e: 0.83
episode: 19/5000, score: 101952.0, e: 0.83
episode: 20/5000, score: 97315.0, e: 0.82
episode: 21/5000, score: 102063.0, e: 0.81
episode: 22/5000, score: 103399.0, e: 0.8
episode: 23/5000, score: 99994.0, e: 

episode: 192/5000, score: 107726.0, e: 0.15
episode: 193/5000, score: 100529.0, e: 0.14
episode: 194/5000, score: 106020.0, e: 0.14
episode: 195/5000, score: 91049.0, e: 0.14
episode: 196/5000, score: 99422.0, e: 0.14
episode: 197/5000, score: 100104.0, e: 0.14
episode: 198/5000, score: 101461.0, e: 0.14
episode: 199/5000, score: 92552.0, e: 0.14
episode: 200/5000, score: 94928.0, e: 0.13
episode: 201/5000, score: 93185.0, e: 0.13
episode: 202/5000, score: 98748.0, e: 0.13
episode: 203/5000, score: 94170.0, e: 0.13
episode: 204/5000, score: 98590.0, e: 0.13
episode: 205/5000, score: 101407.0, e: 0.13
episode: 206/5000, score: 98502.0, e: 0.13
episode: 207/5000, score: 101324.0, e: 0.12
episode: 208/5000, score: 100128.0, e: 0.12
episode: 209/5000, score: 99340.0, e: 0.12
episode: 210/5000, score: 98577.0, e: 0.12
episode: 211/5000, score: 100696.0, e: 0.12
episode: 212/5000, score: 102645.0, e: 0.12
episode: 213/5000, score: 98156.0, e: 0.12
episode: 214/5000, score: 101654.0, e: 0.12


episode: 377/5000, score: 101464.0, e: 0.023
episode: 378/5000, score: 104731.0, e: 0.022
episode: 379/5000, score: 104422.0, e: 0.022
episode: 380/5000, score: 104783.0, e: 0.022
episode: 381/5000, score: 105440.0, e: 0.022
episode: 382/5000, score: 104741.0, e: 0.022
episode: 383/5000, score: 104138.0, e: 0.021
episode: 384/5000, score: 107424.0, e: 0.021
episode: 385/5000, score: 104524.0, e: 0.021
episode: 386/5000, score: 105567.0, e: 0.021
episode: 387/5000, score: 105689.0, e: 0.02
episode: 388/5000, score: 105013.0, e: 0.02
episode: 389/5000, score: 104013.0, e: 0.02
episode: 390/5000, score: 105830.0, e: 0.02
episode: 391/5000, score: 106064.0, e: 0.02
episode: 392/5000, score: 107007.0, e: 0.019
episode: 393/5000, score: 105943.0, e: 0.019
episode: 394/5000, score: 106120.0, e: 0.019
episode: 395/5000, score: 105518.0, e: 0.019
episode: 396/5000, score: 105517.0, e: 0.019
episode: 397/5000, score: 104718.0, e: 0.019
episode: 398/5000, score: 105806.0, e: 0.018
episode: 399/50