<a href="https://colab.research.google.com/github/komo135/forex/blob/master/actor_critic_duel_agent_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd

import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [0]:
# Run this cell to mount Google Drive.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn import preprocessing

from collections import deque
import random


class Actor:
    """Policy network producing one score per discrete action.

    The input placeholder feeds five parallel width-2 convolutions whose
    outputs are concatenated, flattened and split in half. One half drives a
    per-action head, the other a single-value head; the two are combined as
    ``value + (action - mean(action))``.

    Attributes:
        X: Input placeholder of shape ``input_size``.
        logits: Combined output tensor with ``output_size`` columns.
    """

    def __init__(self, name, input_size, output_size):
        with tf.variable_scope(name):
            self.X = tf.placeholder(tf.float32, input_size)

            # Every branch reads self.X directly (parallel, not stacked).
            branches = [
                tf.layers.conv1d(self.X, filters, 2, padding="same", activation=tf.nn.relu)
                for filters in (16, 32, 48, 62, 80)
            ]
            flat = tf.layers.flatten(tf.keras.layers.concatenate(branches))

            # First half feeds the per-action head, second half the value head.
            action_half, value_half = tf.split(flat, 2, 1)
            action_scores = tf.layers.dense(action_half, output_size)
            value = tf.layers.dense(value_half, 1)

            mean_score = tf.reduce_mean(action_scores, axis=1, keep_dims=True)
            self.logits = value + tf.subtract(action_scores, mean_score)


class Critic:
    """Value network that scores a (state, actor output) pair.

    The state goes through the same convolution / split / two-head layout as
    the actor. The rectified result is added to the actor output fed through
    ``Y``, then passed through a 64-unit hidden layer to a single output.

    Attributes:
        X: State placeholder of shape ``input_size``.
        Y: Placeholder for the actor output, shape ``(None, output_size)``.
        REWARD: Regression target placeholder, shape ``(None, 1)``.
        logits: Scalar prediction per sample.
        cost: Mean squared error between ``REWARD`` and ``logits``.
        optimizer: Adam step minimising ``cost``.
    """

    def __init__(self, name, input_size, output_size, learning_rate):
        with tf.variable_scope(name):
            self.X = tf.placeholder(tf.float32, input_size)
            self.Y = tf.placeholder(tf.float32, (None, output_size))
            self.REWARD = tf.placeholder(tf.float32, (None, 1))

            # Every branch reads self.X directly (parallel, not stacked).
            branches = [
                tf.layers.conv1d(self.X, filters, 2, padding="same", activation=tf.nn.relu)
                for filters in (16, 32, 48, 62, 80)
            ]
            flat = tf.layers.flatten(tf.keras.layers.concatenate(branches))

            action_half, value_half = tf.split(flat, 2, 1)
            action_scores = tf.layers.dense(action_half, output_size)
            value = tf.layers.dense(value_half, 1)
            centered = tf.subtract(action_scores,
                                   tf.reduce_mean(action_scores, axis=1, keep_dims=True))
            state_scores = value + centered

            # Mix the state features with the actor's output before regressing.
            mixed = tf.nn.relu(state_scores) + self.Y
            hidden = tf.layers.dense(mixed, 128 // 2, activation=tf.nn.relu)
            self.logits = tf.layers.dense(hidden, 1)

            squared_error = tf.square(self.REWARD - self.logits)
            self.cost = tf.reduce_mean(squared_error)
            self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)


class Agent:
    """Actor-critic trading agent with target networks and a replay buffer."""

    # Step size given to every Adam optimizer built for this agent.
    LEARNING_RATE = 0.001
    # Upper bound on the number of transitions sampled per training step.
    BATCH_SIZE = 32
    # Number of discrete actions (width of the actor's output).
    OUTPUT_SIZE = 3
    # Probability of choosing a random action; overwritten during training.
    EPSILON = 0.5
    # Rate of the exponential epsilon schedule, applied per epoch index.
    DECAY_RATE = 0.005
    # Floor of the epsilon schedule.
    MIN_EPSILON = 0.1
    # Discount applied to the target critic's estimate when bootstrapping.
    GAMMA = 0.99
    # Replay buffer of (state, action, reward, new_state, dead) tuples.
    # NOTE: defined on the class, so it is shared by every instance that does
    # not rebind it.
    MEMORIES = deque()
    # Maximum number of transitions kept in the replay buffer.
    MEMORY_SIZE = 5000
    # Target networks are overwritten whenever (T_COPY + 1) % COPY == 0.
    COPY = 1000
    # Number of training steps taken so far.
    T_COPY = 0

    def __init__(self, path, window_size, skip):
        """Load the data, build the online/target networks and open a session.

        Args:
            path: CSV file read by ``_preproc``.
            window_size: Look-back length passed to the time-series generator.
            skip: Step between consecutive time indices in the train/eval loops.
        """
        self.path = path
        self.window_size = window_size
        self.skip = skip
        # Give each agent its own replay buffer. The class-level deque would
        # otherwise carry transitions over from earlier Agent instances
        # (e.g. when the notebook cell is re-run).
        self.MEMORIES = deque()
        self._preproc()
        self.state_size = (None, 1, self.df.shape[-1])
        tf.reset_default_graph()
        self.actor = Actor('actor-original', self.state_size, self.OUTPUT_SIZE)
        self.actor_target = Actor('actor-target', self.state_size, self.OUTPUT_SIZE)
        self.critic = Critic('critic-original', self.state_size, self.OUTPUT_SIZE, self.LEARNING_RATE)
        self.critic_target = Critic('critic-target', self.state_size, self.OUTPUT_SIZE, self.LEARNING_RATE)
        # d(critic output) / d(actor output): the signal used to update the actor.
        self.grad_critic = tf.gradients(self.critic.logits, self.critic.Y)
        self.actor_critic_grad = tf.placeholder(tf.float32, [None, self.OUTPUT_SIZE])
        # The scope filter is a prefix match, so a bare 'actor' would also pick
        # up the 'actor-target' variables, which have no gradient path from
        # self.actor.logits. Restrict it to the online actor.
        weights_actor = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor-original')
        # Negated so that the optimizer ascends the critic's estimate.
        self.grad_actor = tf.gradients(self.actor.logits, weights_actor, -self.actor_critic_grad)
        grads = zip(self.grad_actor, weights_actor)
        self.optimizer = tf.train.AdamOptimizer(self.LEARNING_RATE).apply_gradients(grads)
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())

    def _preproc(self):
        """Build the state series ``self.df`` and the price series ``self.trend``.

        Reads the CSV at ``self.path``, min-max scales the
        Close/ema4/ema8/ema16 columns, runs them through the Keras model stored
        on Drive, and keeps the step-to-step differences of the predictions as
        states. ``self.trend`` holds the trailing ``Open`` prices, trimmed to
        the same length as ``self.df``.
        """
        frame = pd.read_csv(self.path)
        features = frame[["Close", "ema4", "ema8", "ema16"]]
        scaled = preprocessing.MinMaxScaler().fit_transform(features)

        model = tf.keras.models.load_model("drive/My Drive/ema8.hdf5")
        gen = tf.keras.preprocessing.sequence.TimeseriesGenerator(
            scaled, scaled, self.window_size, batch_size=1000)
        pred = model.predict_generator(gen).reshape((-1, 1, 4))

        # First differences along time: deltas[i] == pred[i + 1] - pred[i].
        deltas = np.diff(pred, axis=0)

        opens = np.asanyarray(frame[["Open"]])
        # Keep the last len(deltas) prices. Slicing from an explicit offset
        # (instead of a negative index) stays correct when len(deltas) == 0.
        offset = max(len(opens) - len(deltas), 0)

        self.df = deltas
        self.trend = opens[offset:]

    def _assign(self, from_name, to_name):
        """Copy every trainable variable under scope ``from_name`` into ``to_name``.

        The assign ops are built once per (from, to) pair and reused on later
        calls. Creating fresh ops on every sync would keep adding nodes to the
        graph for as long as training runs.

        Args:
            from_name: Variable scope to read from.
            to_name: Variable scope to overwrite.
        """
        ops_cache = getattr(self, '_assign_ops', None)
        if ops_cache is None:
            ops_cache = self._assign_ops = {}

        key = (from_name, to_name)
        if key not in ops_cache:
            from_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=from_name)
            to_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=to_name)
            ops_cache[key] = [dst.assign(src) for src, dst in zip(from_w, to_w)]

        # Each op writes a distinct variable, so one batched run is equivalent
        # to running them one by one.
        self.sess.run(ops_cache[key])

    def _memorize(self, state, action, reward, new_state, dead):
        self.MEMORIES.append((state, action, reward, new_state, dead))
        if len(self.MEMORIES) > self.MEMORY_SIZE:
            self.MEMORIES.popleft()

    def _select_action(self, state):
        if np.random.rand() < self.EPSILON:
            action = np.random.randint(self.OUTPUT_SIZE)
        else:
            prediction = self.sess.run(self.actor.logits, feed_dict={self.actor.X: [state]})[0]
            action = np.argmax(prediction)
        return action

    def _construct_memories_and_train(self, replay):
        """Run one actor update and one critic update on a sampled batch.

        Args:
            replay: Sequence of (state, action, reward, new_state, dead) tuples.

        Returns:
            The critic's mean-squared-error cost for this batch.
        """
        states = np.array([a[0] for a in replay])
        new_states = np.array([a[3] for a in replay])
        Q = self.sess.run(self.actor.logits, feed_dict={self.actor.X: states})
        # NOTE(review): the target actor is evaluated on `states` while the
        # target critic below is fed `new_states`; confirm this pairing is
        # intended.
        Q_target = self.sess.run(self.actor_target.logits, feed_dict={self.actor_target.X: states})

        # Actor step: push the actor output along the critic's gradient.
        grads = self.sess.run(self.grad_critic, feed_dict={self.critic.X: states, self.critic.Y: Q})[0]
        self.sess.run(self.optimizer, feed_dict={self.actor.X: states, self.actor_critic_grad: grads})

        # Force a float array so the in-place bootstrap add below cannot fail
        # on an integer-typed reward batch.
        rewards = np.array([a[2] for a in replay], dtype=np.float64).reshape((-1, 1))
        rewards_target = self.sess.run(self.critic_target.logits,
                                       feed_dict={self.critic_target.X: new_states, self.critic_target.Y: Q_target})
        for i, transition in enumerate(replay):
            # Bootstrap only non-terminal transitions. The flag must come from
            # transition i itself (the original read replay[0] for every row).
            if not transition[-1]:
                rewards[i] += self.GAMMA * rewards_target[i]

        cost, _ = self.sess.run([self.critic.cost, self.critic.optimizer],
                                feed_dict={self.critic.X: states, self.critic.Y: Q, self.critic.REWARD: rewards})
        return cost

    def get_state(self, t):
        return self.df[t]

    def buy(self, spread, pip_cost, sl):
        position = 3
        pip = 0
        spread = spread / pip_cost
        loscut = False

        for t in range(len(self.trend) - 28000, len(self.trend), self.skip):
            state = self.get_state(t)
            action = self._select_action(state)
            '''
                0 == Hold
                1 = buy
                2 = sell
                3 = start
                '''
            if action == 1:
                if position == 3:
                    states = self.trend[t] + spread
                    position = 1
                elif position == 2:
                    pip += ((states - self.trend[t]) * pip_cost)
                    states = self.trend[t] + spread
                    position = 1

            elif action == 2:
                if position == 3:
                    states = self.trend[t] - spread
                    position = 2
                elif position == 1:
                    pip += ((self.trend[t] - states) * pip_cost)
                    states = self.trend[t] - spread
                    position = 2

            # state = next_state
        return pip

    def train(self, iterations, checkpoint, spread, pip_cost):
        """Train on a long/short pip-based reward over the last 28000 rows.

        Action 0 opens or flips to a long position, action 1 opens or flips to
        a short position, any other action holds. The reward stored for every
        step is the running mean of the closed-trade pips in the current epoch.

        Args:
            iterations: Number of epochs.
            checkpoint: Unused; kept for interface compatibility.
            spread: Spread; divided by ``pip_cost`` to express it in price units.
            pip_cost: Multiplier turning a price difference into pips.
        """
        # Convert once. The original did this inside the epoch loop, so the
        # spread shrank by a factor of pip_cost on every epoch.
        spread = spread / pip_cost
        # Clamp so a series shorter than 28000 rows starts at index 0.
        start = max(len(self.trend) - 28000, 0)
        cost = float('nan')  # stays NaN if an epoch performs no training step

        for i in range(iterations):
            position = 3  # 3 = flat, 1 = long, 2 = short
            total_pip = 0.0
            max_pip = 0.0
            pip = []
            mean_pip = 0.0
            done = False
            entry = None  # entry price of the open position, spread included
            # Start from the state matching the first price the loop trades on
            # (the original always started from index 0).
            state = self.get_state(start)

            for t in range(start, len(self.trend) - 1, self.skip):
                if (self.T_COPY + 1) % self.COPY == 0:
                    self._assign('actor-original', 'actor-target')
                    self._assign('critic-original', 'critic-target')

                action = self._select_action(state)
                # NOTE(review): the next state is always t + 1 even when
                # self.skip > 1; confirm this is intended.
                next_state = self.get_state(t + 1)
                price = self.trend[t]

                closed = None
                if action == 0:
                    if position == 2:
                        closed = (entry - price) * pip_cost
                    if position != 1:
                        entry = price + spread
                        position = 1
                elif action == 1:
                    if position == 1:
                        closed = (price - entry) * pip_cost
                    if position != 2:
                        entry = price - spread
                        position = 2

                if closed is not None:
                    # Per-trade loss is capped at 40 pips.
                    if closed <= -40:
                        closed = -40
                    pip.append(closed)
                    total_pip += closed

                # np.float was removed from NumPy; the builtin is equivalent.
                total_pip = float(total_pip)
                max_pip = max(max_pip, total_pip)

                # Stop the epoch once the drawdown from the running peak
                # exceeds 200 pips. The original compared the (always
                # non-negative) drawdown with -200, so this never triggered.
                if max_pip - total_pip > 200:
                    total_pip = -300
                    done = True

                # Explicit floor on the epoch total, kept from the original.
                if total_pip < -300:
                    done = True

                if pip:
                    mean_pip = np.mean(np.asanyarray(pip))
                self._memorize(state, action, mean_pip, next_state, done)
                state = next_state

                batch_size = min(len(self.MEMORIES), self.BATCH_SIZE)
                replay = random.sample(self.MEMORIES, batch_size)
                cost = self._construct_memories_and_train(replay)
                self.T_COPY += 1
                self.EPSILON = self.MIN_EPSILON + (1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i)
                if done:
                    break

            # Share of closed trades with a positive result; NaN when no trade
            # was closed (avoids the empty-mean RuntimeWarning).
            trade_accuracy = np.mean(np.asanyarray(pip) > 0) if pip else float('nan')
            print('trade accuracy = ', trade_accuracy)
            print('epoch: %d, total rewards: %f, cost: %f, mean rewards: %f' % (i + 1, total_pip, cost, mean_pip))

    def train1(self, iterations, checkpoint, initial_money):
        """Train on a buy/sell inventory reward over the last 28000 rows.

        Action 0 buys one unit if it is affordable, action 1 sells the oldest
        unit held, any other action holds. The reward stored for every step is
        the cash balance relative to ``initial_money``.

        Args:
            iterations: Number of epochs.
            checkpoint: Unused; kept for interface compatibility.
            initial_money: Cash available at the start of every epoch.
        """
        # Clamp so a series shorter than 28000 rows starts at index 0.
        start = max(len(self.trend) - 28000, 0)
        cost = float('nan')  # stays NaN if an epoch performs no training step

        for i in range(iterations):
            total_profit = 0.0
            inventory = deque()  # FIFO of purchase prices
            # Start from the state matching the first price the loop trades on
            # (the original always started from index 0).
            state = self.get_state(start)
            starting_money = initial_money

            for t in range(start, len(self.trend) - 1, self.skip):
                if (self.T_COPY + 1) % self.COPY == 0:
                    self._assign('actor-original', 'actor-target')
                    self._assign('critic-original', 'critic-target')

                action = self._select_action(state)
                next_state = self.get_state(t + 1)

                # self.trend rows are one-element arrays; unwrap to a plain
                # scalar so that rewards and terminal flags stay scalars and a
                # replay batch never mixes floats with arrays.
                price = np.asarray(self.trend[t]).item()

                if action == 0 and starting_money >= price:
                    inventory.append(price)
                    starting_money -= price
                elif action == 1 and inventory:
                    bought_price = inventory.popleft()
                    total_profit += price - bought_price
                    starting_money += price

                invest = (starting_money - initial_money) / initial_money

                self._memorize(state, action, invest, next_state, starting_money < initial_money)
                state = next_state

                batch_size = min(len(self.MEMORIES), self.BATCH_SIZE)
                replay = random.sample(self.MEMORIES, batch_size)
                cost = self._construct_memories_and_train(replay)
                self.T_COPY += 1
                self.EPSILON = self.MIN_EPSILON + (1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i)

            # '%.3f' replaces the original '%f.3', which printed a stray '.3'
            # after the full-precision number.
            print('epoch: %d, total rewards: %.3f, cost: %f, total money: %f' % (i + 1, total_profit, cost,
                                                                                 starting_money))


In [0]:
# Run configuration: look-back window, step between time indices, batch size.
window_size, skip, batch_size = 50, 1, 32
agent = Agent(path="drive/My Drive/audpred.csv", window_size=window_size, skip=skip)

In [0]:
# Pip-based training run.
agent.train(iterations=2000, checkpoint=10, spread=10, pip_cost=1000)
# Alternative inventory-based run:
# agent.train1(iterations = 200, checkpoint = 10, initial_money = 10000)