![QuantConnect Logo](https://cdn.quantconnect.com/web/i/logo-small.png)
## Welcome to The QuantConnect Research Page
#### Refer to this page for documentation https://www.quantconnect.com/docs#Introduction-to-Jupyter
#### Contribute to this template file https://github.com/QuantConnect/Lean/blob/master/Jupyter/BasicQuantBookTemplate.ipynb

In [1]:
%matplotlib inline
# Imports
from clr import AddReference
AddReference("System")
AddReference("QuantConnect.Common")
AddReference("QuantConnect.Jupyter")
AddReference("QuantConnect.Indicators")
from System import *
from QuantConnect import *
from QuantConnect.Data.Custom import *
from QuantConnect.Data.Market import TradeBar, QuoteBar
from QuantConnect.Data.Consolidators import QuoteBarConsolidator
from QuantConnect.Jupyter import *
from QuantConnect.Indicators import *
from QuantConnect.Indicators.CandlestickPatterns import *
from datetime import datetime, timedelta
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
import random
import collections
import pdb

In [4]:
class Agent:
    ''' Drives the training: plays episodes in the Environment, stores the resulting
    transitions in the ReplayMemory and periodically asks the NetworkHandler to train
    and to synchronise its two networks. '''
    
    def __init__(self, barsPerSequence, resolution = Resolution.Hour):
        # barsPerSequence: number of bars per observation window (shared by environment and network)
        # resolution: bar resolution handed to the Environment
        
        self.nSteps = 0 # number of actions chosen so far, over all episodes
        self.currentEpisode = 0
        
        # epsilon-greedy exploration probability, decreased linearly once per episode
        self.randomProbabilityStart = 0.40
        self.randomProbabilityMin = 0.00
        self.randomProbabilityEpisodesToMin = 400
        self.randomProbability = self.randomProbabilityStart
        
        # minimum number of bars a position is forced to stay open, decreased linearly once per episode
        self.minHoldTimeBarsStart = 20
        self.minHoldTimeBarsFinal = 1
        self.minHoldTimeBarsEpisodesToFinal = 200
        self.minHoldTimeBars = self.minHoldTimeBarsStart
        
        self.discountFactor = 0.95
        # use the caller's window length so that network input and environment observations agree
        self.barsPerSequence = barsPerSequence
        self.environment = Environment(barsPerSequence, resolution)
        self.batchSize = 48
        self.numInputSignals = 4  
        self.stepSizeTraining = 10
        self.stepsUntilFirstTraining = self.batchSize + 50
        self.stepSizeCopyingWeights = 50
        self.stepsUntilFirstCopy = self.stepsUntilFirstTraining + 50
        self.numActions = self.environment.getNumberOfActions()
        self.replayMemory = ReplayMemory(2000)
        self.networkHandler = NetworkHandler(self.numInputSignals, self.barsPerSequence, self.numActions)
    
    def run(self, numEpisodes, session):
        # plays numEpisodes further episodes using the given tensorflow session;
        # the variables are only initialized on the very first call, so training can be continued
        
        self.networkHandler.session = session
        if self.currentEpisode == 0:
            tf.global_variables_initializer().run(session = self.networkHandler.session)

        for iEpisode in range(self.currentEpisode, self.currentEpisode + numEpisodes):

            holdTimeBars = 0
            self.currentEpisode += 1
            numRoisBeforeEpisode = len(self.environment.roiHistory)

            observation, possibleActions, isDone = self.environment.reset()
            nextAction = self.networkHandler.chooseAction(observation, possibleActions, self.randomProbability)
            self.nSteps += 1

            while not isDone:

                if self.environment.currentSituation > 0: holdTimeBars += 1

                observation, possibleActions, isDone = self.environment.step(nextAction)
                if holdTimeBars < self.minHoldTimeBars:
                    # minimum hold time not reached yet: only allow holding the open position
                    if self.environment.currentSituation == 1: possibleActions = [3]
                    if self.environment.currentSituation == 2: possibleActions = [4]

                nextAction = self.networkHandler.chooseAction(observation, possibleActions, self.randomProbability)
                self.nSteps += 1

                if self.isTrainingRequested():
                    self.networkHandler.trainNetwork(self.replayMemory, self.batchSize, self.discountFactor)
                if self.isCopyRequested():
                    self.networkHandler.copyWeightsToQNet()

            for sarsSample in self.environment.getSarsSamples():
                self.replayMemory.add(sarsSample)

            self.decayRandomProb(iEpisode)
            self.decayHoldTime(iEpisode)

            # the environment only appends a ROI when a position was closed; do not report a stale one
            if len(self.environment.roiHistory) > numRoisBeforeEpisode:
                print("episode " + str(iEpisode+1) + " finished. ROI: " \
                      + str( round(100*self.environment.roiHistory[-1], 2)) )
            else:
                print("episode " + str(iEpisode+1) + " finished without closing a position.")
    
    def isTrainingRequested(self):
        # training is due every stepSizeTraining steps after the initial waiting period,
        # but only once the memory can fill a whole batch (sampleMemories raises otherwise)
        
        enoughSamples = len(self.replayMemory.experienceMemory) >= self.batchSize
        return self.nSteps > self.stepsUntilFirstTraining \
            and self.nSteps % self.stepSizeTraining == 0 \
            and enoughSamples
    
    def isCopyRequested(self):
        # copying the weights is due every stepSizeCopyingWeights steps after the initial waiting period
        
        return self.nSteps > self.stepsUntilFirstCopy and self.nSteps % self.stepSizeCopyingWeights == 0
        
    def decayRandomProb(self, nEpisode):
        # lowers the exploration probability by one linear step, bounded by randomProbabilityMin
        # (nEpisode is accepted for interface compatibility but not used)
        
        decayGradient = (self.randomProbabilityStart - self.randomProbabilityMin) / self.randomProbabilityEpisodesToMin
        self.randomProbability = max(self.randomProbability - decayGradient, self.randomProbabilityMin)
    
    def decayHoldTime(self, nEpisode):
        # lowers the minimum hold time by one linear step, bounded by minHoldTimeBarsFinal
        # (nEpisode is accepted for interface compatibility but not used)
        
        holdTimeGradient = (self.minHoldTimeBarsStart - self.minHoldTimeBarsFinal) / self.minHoldTimeBarsEpisodesToFinal
        self.minHoldTimeBars = max(self.minHoldTimeBars - holdTimeGradient, self.minHoldTimeBarsFinal)

class ReplayMemory:
    ''' Bounded first-in-first-out store of experience samples.
    Every sample is a [observation, action, reward, observationNext, done] list. '''
    
    NUM_SAMPLE_FIELDS = 5 # observation, action, reward, observationNext, done
    
    def __init__(self, length):
        # the deque silently drops the oldest sample once `length` samples are stored
        self.experienceMemory = collections.deque(maxlen = length)
        
    def sampleMemories(self, batchSize):
        # draws batchSize distinct samples at random and returns them column-wise as five
        # 1-D object arrays: observations, actions, rewards, next observations and done flags.
        # raises ValueError if fewer than batchSize samples are stored
        
        lengthMemory = len(self.experienceMemory)
        if lengthMemory < batchSize: raise ValueError("not enough samples to create a batch of the desired size")
        sampledIndices = np.random.permutation(lengthMemory)[0 : batchSize]
        
        # The fields of a sample have different shapes (nested observation vs. scalar action),
        # so the batch is filled field by field into an explicit object array. Letting np.array
        # infer the shape of such ragged data raises a ValueError on recent numpy versions.
        sampledMemory = np.empty((len(sampledIndices), self.NUM_SAMPLE_FIELDS), dtype = object)
        for row, memoryIndex in enumerate(sampledIndices):
            sample = self.experienceMemory[int(memoryIndex)]
            for column in range(self.NUM_SAMPLE_FIELDS):
                sampledMemory[row, column] = sample[column]
    
        observationBatch = sampledMemory[:, 0]
        actionBatch = sampledMemory[:, 1]
        rewardBatch = sampledMemory[:, 2]
        observationNextBatch = sampledMemory[:, 3]
        doneBatch = sampledMemory[:, 4] 
        
        return observationBatch, actionBatch, rewardBatch, observationNextBatch, doneBatch
        
    def add(self, sample):
        # sample is expected to be a [observation, action, reward, observationNext, done] list 
        
        self.experienceMemory.append(sample)
    
    def isEmpty(self):
        return len(self.experienceMemory) == 0
    
class NetworkHandler:
    ''' Owns the two DeepQNet instances ('QNet' and 'TargetNet'), the shared input placeholder
    and the training operation.
    NOTE: in this implementation the optimizer updates 'TargetNet' (its output is used in the loss),
    while 'QNet' is used for choosing actions and receives the weights via copyWeightsToQNet. '''
    
    def __init__(self, numInputSignals, barsPerSequence, numActions):
        self.learningRate = 0.001
        self.numInputSignals = numInputSignals
        self.barsPerSequence = barsPerSequence
        self.numActions = numActions
        self.session = None # must be set by the caller before any network is evaluated
        self.inputs = None
        self.qNet = None
        self.targetNet = None
        self.copyOperation = None # built on the first call of copyWeightsToQNet and reused afterwards
        self.initNetworks()
        self.defineOptimization()   
                
    def initNetworks(self):
        # builds both networks on a fresh default graph, fed by one common placeholder
        
        tf.reset_default_graph()
        self.inputs = tf.placeholder(tf.float32, [None, self.numInputSignals, self.barsPerSequence])
        self.qNet = DeepQNet(self.inputs, self.numActions, 'QNet')
        self.qNet.buildNetwork()
        self.targetNet = DeepQNet(self.inputs, self.numActions, 'TargetNet')
        self.targetNet.buildNetwork()
        
    def chooseAction(self, observation, possibleActions, randomChance = 0):
        # chooses the best action according to epsilon greedy strategy:
        # with probability randomChance (clamped to [0, 1]) a random possible action,
        # otherwise the possible action with the highest Q-value of the QNet
        
        randomChance = min( 1, max(0, randomChance) ) 
        if randomChance > random.uniform(0, 1):
            randomActionOfPossibleActions = random.randint(0, len(possibleActions) -1)
            return possibleActions[randomActionOfPossibleActions]
        
        qValuesAction = self.qNet.output.eval(session = self.session, feed_dict = { self.inputs: [observation] } )
        bestActionOfPossibleActions = np.argmax(np.take(qValuesAction, possibleActions))
        return possibleActions[bestActionOfPossibleActions]
            
    def defineOptimization(self):
        # defines the mean squared error between the fed target Q-values and the
        # TargetNet's Q-value of the performed action, minimized with Adam
        
        self.actionPerformed = tf.placeholder(tf.int32, shape = (None,))
        qValuePrediction = tf.reduce_sum( self.targetNet.output * tf.one_hot(self.actionPerformed, self.numActions), axis = -1, keepdims = True )
        self.qValueTarget = tf.placeholder( tf.float32, shape = (None, 1) )
        optimizer = tf.train.AdamOptimizer(learning_rate = self.learningRate)
        self.loss = tf.reduce_mean( tf.square(self.qValueTarget - qValuePrediction) )
        self.trainingOp = optimizer.minimize(self.loss)
    
    def trainNetwork(self, replayMemory, batchSize, gamma):
        # performs one training step on a random batch of the replay memory and returns the loss.
        # gamma is the discount factor applied to the Q-value of the next observation
        
        observationBatch, actionBatch, rewardBatch, observationNextBatch, doneBatch = replayMemory.sampleMemories(batchSize)
        
        lstmInput = observationBatch.tolist()
        lstmInputNext = observationNextBatch.tolist()
        actionBatch = actionBatch.tolist()
        rewards = np.asarray(rewardBatch.tolist(), dtype = np.float32)
        notDone = 1.0 - np.asarray(doneBatch.tolist(), dtype = np.float32)
        
        # Evaluate both networks once and compute the targets with numpy. Creating the
        # argmax / one_hot ops here would add new nodes to the graph on every training step.
        qValuesNextActionQNet, qValuesNextActionTargetNet = self.session.run( \
                [self.qNet.output, self.targetNet.output], feed_dict = { self.inputs: lstmInputNext } )
        bestNextAction = np.argmax(qValuesNextActionQNet, axis = -1) # QNet chooses the action ...
        qValueBestNextAction = qValuesNextActionTargetNet[np.arange(len(bestNextAction)), bestNextAction] # ... TargetNet provides its Q-value
        qValueTargetBatch = rewards + (gamma * qValueBestNextAction) * notDone # no future value after the final step
        
        trainLoss, _ = self.session.run( [self.loss, self.trainingOp], feed_dict = { self.inputs: lstmInput, \
                self.qValueTarget: np.expand_dims(qValueTargetBatch, axis = -1), self.actionPerformed: actionBatch} )
        return trainLoss
    
    def copyWeightsToQNet(self):
        # copies the weights of the target network to the main q network
        
        if self.copyOperation is None:
            # build the assign ops only once; rebuilding them on every call would grow the graph
            copyOperations = [ tf.assign( networkVariable, self.targetNet.networkVariables[variableName] ) \
                               for variableName, networkVariable in self.qNet.networkVariables.items() ]
            self.copyOperation = tf.group(*copyOperations)
        
        self.copyOperation.run(session = self.session)

class DeepQNet:
    ''' One Q-network: an LSTM over the bars of the observation followed by two fully connected
    layers which output one Q-value per action. All variables live in the scope networkName. '''
    
    def __init__(self, networkInput, numActions, networkName):
        # networkInput: input tensor; NetworkHandler passes a [batchSize, numInputSignals, barsPerSequence] placeholder
        # numActions: number of Q-values the network outputs
        # networkName: name of the variable scope of this network
        
        self.networkName = networkName
        self.lstmUnits = 16
        self.neuronsFcn1 = 64
        self.numOutputs = numActions
        self.networkInput = networkInput
        self.output = None # set by buildNetwork: [batchSize, numActions]
        self.networkVariables = None # set by buildNetwork: trainable variables keyed by their name inside the scope
    
    def buildNetwork(self):
        # 4x16 LSTM --> 16x64 FCN --> ReLu --> 64x7 FCN
        
        with tf.variable_scope(self.networkName) as scope:

            # The input arrives as [batchSize, numInputSignals, barsPerSequence], but dynamic_rnn
            # expects [batchSize, time, features]. Without the transpose the LSTM would iterate
            # over the signals and treat the bars as features.
            sequenceInput = tf.transpose(self.networkInput, [0, 2, 1]) # shape: [batchSize, sequenceLength, numInputSignals]
            
            lstmCell = tf.contrib.rnn.LSTMCell(self.lstmUnits)
            wrappedLstmCell = tf.contrib.rnn.DropoutWrapper(cell = lstmCell, output_keep_prob = 0.8)
            outputs, _ = tf.nn.dynamic_rnn(wrappedLstmCell, sequenceInput, dtype = tf.float32)  # shape: [batchSize, sequenceLength, lstmUnits]
            lastSequenceOutputs = outputs[:, -1, :] # [batchSize, lstmUnits] of the last sequence sample
            
            weightsInitFcn1 = tf.truncated_normal_initializer( stddev = math.sqrt( 2 / (self.lstmUnits + self.neuronsFcn1) ) )
            biasInitFcn1 = tf.constant_initializer(0.05)
            fcn1 = tf.contrib.layers.fully_connected( lastSequenceOutputs, num_outputs = self.neuronsFcn1, \
                                                     weights_initializer = weightsInitFcn1, biases_initializer = biasInitFcn1)
            
            weightsInitFcn2 = tf.truncated_normal_initializer( stddev = math.sqrt( 2 / (self.neuronsFcn1 + self.numOutputs) ) )
            biasInitFcn2 = tf.constant_initializer(0.05)
            # no activation on the output layer: Q-values are unbounded
            self.output = tf.contrib.layers.fully_connected( fcn1, num_outputs = self.numOutputs, activation_fn = None, \
                                                            weights_initializer = weightsInitFcn2, biases_initializer = biasInitFcn2, scope = None )
            
            # strip the scope name so that the variables of two networks can be matched by key
            self.networkVariables = { trainable_var.name[ len(scope.name): ] : trainable_var for trainable_var in tf.trainable_variables(scope=scope.name) }

class Environment:
    ''' Provides the observations, performs the actions and yields the rewards.
    Each time a new episode is started via the reset method, a random forex is selected at a random time,
    given the time boundary conditions and selected portfolio of forex.
    An episode ends when a position is closed or when the sample has no more data. '''
    
    def __init__(self, barsPerSequence, resolution):
        # barsPerSequence: number of bars of one observation
        # resolution: bar resolution handed to the Sampler
        
        self.actionSpace = {0: self.observeMarket, 1: self.orderLong, 2: self.orderShort, \
                            3: self.holdLong, 4: self.holdShort, 5: self.closeLong, 6: self.closeShort}
        self.sampler = Sampler(resolution)
        self.barsPerSequence = barsPerSequence
        self.currentSample = None
        self.possibleNextActions = [0, 1, 2]
        self.sampleObservationHistory = []
        self.sampleActionHistory = []
        self.sampleRewardHistory = [] # only filled once a position is closed
        self.sampleDoneHistory = []
        self.roiHistory = [] # one entry per closed position, kept over all episodes
        self.currentSituation = 0 # 0: no open position, 1: open long position, 2: open short position
   
    def reset(self):
        # use this method to initialize a new episode
        # returns the first observation, the possible actions and isDone = False
        
        self.possibleNextActions = [0, 1, 2]
        self.sampleObservationHistory = []
        self.sampleActionHistory = []
        self.sampleRewardHistory = []
        self.sampleDoneHistory = []
        self.currentSituation = 0
        self.currentSample = self.sampler.getSample(self.barsPerSequence)
        iter(self.currentSample) # init the iteration
        self.sampleObservationHistory.append( next(self.currentSample) )
        
        return self.getLastObservation(), self.possibleNextActions, False
    
    def step(self, action):
        # each step returns a new observation, the possible next actions and information if the episode is done
        
        if self.isSampleDone():
            isDone = True
        else:
            try:
                self.actionSpace[action]() # performs the action
                self.sampleObservationHistory.append( next(self.currentSample) )
                isDone = self.sampleDoneHistory[-1]
            except StopIteration:
                # the sample has no more data; the action is recorded but has no following observation
                isDone = True
        
        return self.getLastObservation(), self.possibleNextActions, isDone
    
    def getNumberOfActions(self):
        return len(self.actionSpace)
    
    def getSarsSamples(self):
        # returns all the (s,a,r,s')-samples from the last episode
        
        # Only transitions with a reward and a following observation are complete. If the sample ran
        # out of data before a position was closed there are no rewards at all, and the last action
        # has no following observation; indexing by the number of actions would fail in both cases.
        nTransitions = min( len(self.sampleActionHistory), len(self.sampleRewardHistory), \
                            len(self.sampleDoneHistory), len(self.sampleObservationHistory) - 1 )
        
        return [ [ self.sampleObservationHistory[i], self.sampleActionHistory[i], self.sampleRewardHistory[i], \
                   self.sampleObservationHistory[i+1], self.sampleDoneHistory[i] ] for i in range(max(nTransitions, 0)) ]
    
    def getLastObservation(self):
        return self.sampleObservationHistory[-1]
    
    def isSampleDone(self):
        # the episode is finished as soon as the last performed action closed a position
        
        if not self.sampleActionHistory:
            return False
        return self.sampleActionHistory[-1] in (5, 6)
    
    def observeMarket(self):
        # action 0: stay out of the market
        self.sampleActionHistory.append(0)
        self.sampleDoneHistory.append(False)
        self.possibleNextActions = [0, 1, 2]
        self.currentSituation = 0
    
    def orderLong(self):
        # action 1: open a long position; afterwards it can only be held or closed
        self.sampleActionHistory.append(1)
        self.sampleDoneHistory.append(False)
        self.possibleNextActions = [3, 5]
        self.currentSituation = 1
    
    def orderShort(self):
        # action 2: open a short position; afterwards it can only be held or closed
        self.sampleActionHistory.append(2)
        self.sampleDoneHistory.append(False)
        self.possibleNextActions = [4, 6]
        self.currentSituation = 2
    
    def holdLong(self):
        # action 3: keep the long position open
        self.sampleActionHistory.append(3)
        self.sampleDoneHistory.append(False)
        self.possibleNextActions = [3, 5]
        self.currentSituation = 1
        
    def holdShort(self):
        # action 4: keep the short position open
        self.sampleActionHistory.append(4)
        self.sampleDoneHistory.append(False)
        self.possibleNextActions = [4, 6]
        self.currentSituation = 2
        
    def closeLong(self):
        # action 5: close the long position, which ends the episode
        self.sampleActionHistory.append(5)
        self.closePosition()
        self.possibleNextActions = [0, 1, 2]
        self.currentSituation = 0
    
    def closeShort(self):
        # action 6: close the short position, which ends the episode
        self.sampleActionHistory.append(6)
        self.closePosition() 
        self.possibleNextActions = [0, 1, 2]
        self.currentSituation = 0
    
    def closePosition(self):
        # marks the episode as done and calculates the rewards of all its actions as well as its ROI;
        # must be called after the closing action was appended to the action history
        
        self.sampleDoneHistory.append(True)
        rewarder = Rewarder(self.currentSample, 12)
        self.sampleRewardHistory = rewarder.getRewards(self.sampleActionHistory)
        self.roiHistory.append(rewarder.getRoi(self.sampleActionHistory))

class Rewarder:
    # calculates the reward for any action of a finished episode, i.e. an action history which
    # contains exactly one order (action 1 or 2) followed by the matching close (action 5 or 6)
    
    def __init__(self, dataset, sphereSize):
        # dataset: provides numObtainBars, priceAsk, priceBid and getPeakPrice
        
        self.dataset = dataset
        self.sphereSize = sphereSize # defines how many neighbour bars we observe to estimate the goodness of the action
        self.rewardFunctions = {0: self.rewardObserveMarket, 1: self.rewardOpenLong, 2: self.rewardOpenShort, \
                                3: self.rewardHoldLong, 4: self.rewardHoldShort, 5: self.rewardCloseLong, 6: self.rewardCloseShort}
        # the following values are set by getOrderAndClosePosition
        self.orderPosition = None
        self.closePosition = None
        self.lengthPositionOpen = None
        self.orderType = None
        
    def getRewards(self, actionHistory):
        # returns one clipped reward per action of the action history
        
        self.getOrderAndClosePosition(actionHistory)
        startPosition = self.dataset.numObtainBars - 1 # position of the first action inside the sample
        
        return [ self.rewardFunctions[action](startPosition + offset) for offset, action in enumerate(actionHistory) ]
        
    def rewardObserveMarket(self, position):
        # calculates the reward for waiting before opening a position
        
        return 0
        
    def rewardOpenLong(self, position):
        # calculates the reward for opening a long position:
        # half of the realized roi minus the roi missed by not buying at the lowest ask price around the order
        
        bestOpeningPrice = self.dataset.getPeakPrice("min", self.orderPosition - self.sphereSize, self.closePosition)
        realOpeningPrice = self.dataset.priceAsk[self.orderPosition - 1]
        realClosingPrice = self.dataset.priceBid[self.closePosition - 1]
        
        roi = 100 * (realClosingPrice - realOpeningPrice) / realOpeningPrice
        missedRoi = 100 * (realOpeningPrice - bestOpeningPrice) / realOpeningPrice
        
        # scaled like all other position rewards (previously divided by the hold length only)
        return self.clipRewards( self._scaleByHoldLength(0.5*roi - missedRoi) )
    
    def rewardOpenShort(self, position):
        # calculates the reward for opening a short position:
        # half of the realized roi minus the roi missed by not selling at the highest bid price around the order
        
        bestOpeningPrice = self.dataset.getPeakPrice("max", self.orderPosition - self.sphereSize, self.closePosition)
        realOpeningPrice = self.dataset.priceBid[self.orderPosition - 1]
        realClosingPrice = self.dataset.priceAsk[self.closePosition - 1]
        
        roi = -100 * (realClosingPrice - realOpeningPrice) / realOpeningPrice
        missedRoi = -100 * (realOpeningPrice - bestOpeningPrice) / realOpeningPrice
        
        return self.clipRewards( self._scaleByHoldLength(0.5*roi - missedRoi) )
    
    def rewardHoldLong(self, position):
        # calculates the reward for waiting inside an open long position:
        # positive if the mean of the highest bid and the lowest ask until the close is above the current bid
        
        currentSellPrice = self.dataset.priceBid[position - 1]
        bestFutureSellPrice = self.dataset.getPeakPrice("max", position - 1, self.closePosition)
        worstFutureSellPrice = self.dataset.getPeakPrice("min", position - 1, self.closePosition)
        meanFutureSellPrice = (bestFutureSellPrice + worstFutureSellPrice) / 2
        
        rawReward = 100 * 2 * (meanFutureSellPrice - currentSellPrice) / currentSellPrice
        return self.clipRewards( self._scaleByHoldLength(rawReward) )
    
    def rewardHoldShort(self, position):
        # calculates the reward for waiting inside an open short position:
        # positive if the mean of the lowest ask and the highest bid until the close is below the current ask
        
        currentBuyPrice = self.dataset.priceAsk[position - 1]
        bestFutureBuyPrice = self.dataset.getPeakPrice("min", position, self.closePosition)
        worstFutureBuyPrice = self.dataset.getPeakPrice("max", position, self.closePosition)
        meanFutureBuyPrice = (bestFutureBuyPrice + worstFutureBuyPrice) / 2
        
        rawReward = -100 * 2 * (meanFutureBuyPrice - currentBuyPrice) / currentBuyPrice
        return self.clipRewards( self._scaleByHoldLength(rawReward) )

    def rewardCloseLong(self, position):
        # calculates the reward for closing a long position:
        # half of the realized roi minus the roi missed by not selling at the highest bid price around the close
        
        bestClosingPrice = self.dataset.getPeakPrice("max", self.orderPosition, self.closePosition + self.sphereSize)
        realClosingPrice = self.dataset.priceBid[self.closePosition -1]
        realOpeningPrice = self.dataset.priceAsk[self.orderPosition - 1]
        
        roi = 100 * (realClosingPrice - realOpeningPrice) / realOpeningPrice
        missedRoi = 100 * (bestClosingPrice - realClosingPrice) / realOpeningPrice
        
        return self.clipRewards( self._scaleByHoldLength(0.5*roi - missedRoi) )
    
    def rewardCloseShort(self, position):
        # calculates the reward for closing a short position:
        # half of the realized roi minus the roi missed by not buying back at the lowest ask price around the close
        
        bestClosingPrice = self.dataset.getPeakPrice("min", self.orderPosition, self.closePosition + self.sphereSize)
        realClosingPrice = self.dataset.priceAsk[self.closePosition -1]
        realOpeningPrice = self.dataset.priceBid[self.orderPosition - 1]
        
        roi = -100 * (realClosingPrice - realOpeningPrice) / realOpeningPrice
        # relative to the opening price like the roi itself, consistent with the other rewards
        missedRoi = -100 * (bestClosingPrice - realClosingPrice) / realOpeningPrice
        
        return self.clipRewards( self._scaleByHoldLength(0.5*roi - missedRoi) )
        
    def getRoi(self, actionHistory):
        # calculates the relative return on investment from the action history of the sample
        # long: buy at ask, sell at bid; short: sell at bid, buy back at ask
        
        self.getOrderAndClosePosition(actionHistory)
        
        if self.orderType == 1:
            openingPrice = self.dataset.priceAsk[self.orderPosition - 1]
            closingPrice = self.dataset.priceBid[self.closePosition - 1]
            roi = (closingPrice - openingPrice) / openingPrice
        else:
            openingPrice = self.dataset.priceBid[self.orderPosition - 1]
            closingPrice = self.dataset.priceAsk[self.closePosition - 1]
            roi = -(closingPrice - openingPrice) / openingPrice
        
        return roi
    
    def getOrderAndClosePosition(self, actionHistory):
        # stores the positions of the order and of the close inside the sample, the number of bars
        # between them and if the order was a long (1) or short (-1) as attributes of this object.
        # raises IndexError if the action history contains no order or no close
        
        actionHistory = np.array(actionHistory)
        
        orderLongPos = np.where(actionHistory == 1)[0].tolist()
        orderShortPos = np.where(actionHistory == 2)[0].tolist()
        closeLongPos = np.where(actionHistory == 5)[0].tolist()
        closeShortPos = np.where(actionHistory == 6)[0].tolist()
        
        orderPos = orderLongPos + orderShortPos
        closingPos = closeLongPos + closeShortPos
        
        sampleOffset = self.dataset.numObtainBars - 1
        self.orderPosition = orderPos[0] + sampleOffset
        self.closePosition = closingPos[0] + sampleOffset
        self.lengthPositionOpen = self.closePosition - self.orderPosition
        self.orderType = 1 if orderLongPos else -1
    
    def _scaleByHoldLength(self, rawReward):
        # common scaling of all position rewards: multiplies by 24 / (bars the position was open)
        
        return rawReward * (24/self.lengthPositionOpen)
    
    @staticmethod
    def clipRewards(reward):
        # uses a sigmoid function to clip rewards from -1 to +1
        # 2 / (1 + exp(-14*reward)) - 1 equals tanh(7*reward); tanh does not overflow for large negative rewards
        
        return np.tanh(7 * reward)

class Sampler:
    # returns a random dataset given the boundaries defined in the __init__ method
    
    # length of one bar in minutes for minute (2), hour (3) and daily (4) resolution
    RESOLUTION_MINUTES = {2: 1, 3: 60, 4: 1440}
    
    def __init__(self, resolution):
        if resolution < 2: raise ValueError("Only minute (2), hour (3) and daily (4) data")
        
        self.qb = QuantBook()
        self.forexList = ["EURUSD", "GBPUSD", "USDJPY", "USDCAD", "AUDUSD", "USDCHF", \
                          "NZDUSD", "EURGBP", "EURJPY", "AUDJPY", "GBPJPY", "EURCHF"]
        self.MINIMUM_YEAR = 2008
        self.MAXIMUM_YEAR = 2015
        self.resolution = resolution
        self.MAX_RANDOMIZED_SAMPLE_LENGTH = {2:1440, 3:24, 4:0} # depending on resolution
        self.NUM_SAMPLES = 720
        self.WARUMUP_BUFFER_SAMPLES = 50
        self.DAYS_MONTH = {1:31, 2:28, 3:31, 4:30, 5:31, 6:30, \
                           7:31, 8:31, 9:30, 10:31, 11:30, 12:31}
        self.rsi = None
        self.macd = None
        self.standardDeviation = None
        self.adx = None
        self.addTradingObjects()
        self.addIndicators()
        
    def addTradingObjects(self):
        # registers every forex symbol of the portfolio at the QuantBook
        for fxSymbol in self.forexList:
            self.qb.AddForex(fxSymbol)
        
    def addIndicators(self):
        # creates the indicators which are evaluated for every sample
        self.rsi = RelativeStrengthIndex(14)
        self.macd = MovingAverageConvergenceDivergence(12, 26, 9)
        self.standardDeviation = StandardDeviation(26)
        self.adx = AverageDirectionalIndex("ADX", 18)
    
    def getSample(self, snapshotLength):
        # returns a Dataset of a random forex symbol ending at a random date;
        # draws again until the price history of the drawn symbol and date is not empty
        
        while True:
            forexSymbol = self.getRandomForexSymbol()
            datetimeEnd = self.getRandomDatetime()
            self.qb.SetStartDate(datetimeEnd)
            randomSampleLength = random.randint(0, self.MAX_RANDOMIZED_SAMPLE_LENGTH[self.resolution]) # randomize the sample length to not start at the same time for each sample
            sampleLength = self.NUM_SAMPLES + randomSampleLength
            dataframeLength = sampleLength + self.WARUMUP_BUFFER_SAMPLES
            
            price = self.qb.History([forexSymbol], dataframeLength, self.resolution)
            if not price.empty: break
            
        sample = Dataset(sampleLength, snapshotLength)
        sample.setPrice(price) # reuse the history requested above instead of requesting it a second time
        sample.setRsi( self.qb.Indicator(self.rsi, forexSymbol, dataframeLength, self.resolution) )
        sample.setAdx( self.qb.Indicator(self.adx, forexSymbol, dataframeLength, self.resolution) )
        sample.setMacd( self.qb.Indicator(self.macd, forexSymbol, dataframeLength, self.resolution), \
                       self.qb.Indicator(self.standardDeviation, forexSymbol, dataframeLength, self.resolution) )
        
        return sample
        
    def getRandomForexSymbol(self):
        nSymbols = len(self.forexList)
        randomForexNumber = random.randint(0, nSymbols-1)
        return self.forexList[randomForexNumber]
    
    def getRandomDatetime(self, noMaximumLimitation = False):
        # returns a random date (at midnight) between MINIMUM_YEAR and MAXIMUM_YEAR,
        # or up to now if noMaximumLimitation is set; the result is never later than now
        
        datetimeNow = datetime.now()
        maximumYear = datetimeNow.year if noMaximumLimitation else self.MAXIMUM_YEAR            
        randomYear = random.randint(self.MINIMUM_YEAR, maximumYear)
        randomMonth = random.randint(1, 12) if randomYear != datetimeNow.year else random.randint(1, datetimeNow.month)
        nDaysMonth = self.DAYS_MONTH[randomMonth] if not (self.isLeapYear(randomYear) and randomMonth == 2) else 29
        randomDay = random.randint(1, nDaysMonth)
        
        randomDatetime = datetime(randomYear, randomMonth, randomDay)
        return min(randomDatetime, datetimeNow)
    
    def getSamplesRangeTimedelta(self):
        # returns the time span covered by NUM_SAMPLES + WARUMUP_BUFFER_SAMPLES bars of the resolution
        
        resolutionMinutes = self.RESOLUTION_MINUTES[self.resolution]
        minimumMinutes = resolutionMinutes * (self.NUM_SAMPLES + self.WARUMUP_BUFFER_SAMPLES)
        return timedelta(minutes = minimumMinutes)
    
    @staticmethod
    def isLeapYear(year):
        # gregorian rule: every 4th year, except full centuries which are not divisible by 400
        return year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)

class Dataset:
    ''' 
    The Dataset class is a wrapper for a data sample. Once the required dataframes are set,
    we can loop through the data, revealing us a snapshot of the last observed datapoints.
    The amount of datapoints revealed in one snapshot can be adjusted with the variable numObtainBars.
    Every step we iterate over a dataset the window steps one datapoint forward (as in a queue).
    '''
    
    def __init__(self, numBars, numObtainBars):
        self.length = numBars # defines how many bars with the beginning of the last datapoint we'll ingest from the dataframes
        self.numObtainBars = numObtainBars # defines how many last bars we should return when the iterator is called
        self.priceAsk = []
        self.priceBid = []
        self.rsi = []
        self.macdLine = []
        self.macdHistogram = []
        self.adx = []
        self.iterPosition = 0
        
    def __iter__(self):
        # restarts the iteration; the first snapshot returned by __next__ covers the bars [0, numObtainBars)
        self.iterPosition = self.numObtainBars - 1
        return self
    
    def __next__(self):
        # moves the window one bar forward and returns its snapshot;
        # raises StopIteration once the window would exceed the length of the dataset
        self.iterPosition += 1
        if self.iterPosition > self.length: raise StopIteration 

        return self.createSnapshot(self.iterPosition - self.numObtainBars, self.iterPosition)
    
    def getPosition(self):
        return self.iterPosition
    
    def getPeakPrice(self, maxMin, startPoint, endPoint):
        # returns the minimum ask price ("min") or the maximum bid price ("max") of the bars [startPoint, endPoint).
        # raises ValueError for an unknown codeword or an empty range
        
        if maxMin == "min":
            prices, pickPeak = self.priceAsk, min
        elif maxMin == "max":
            prices, pickPeak = self.priceBid, max
        else:
            raise ValueError("maxMin must be defined by codewords max or min")
        
        # a start before the first bar means "from the first bar on"; a negative slice index
        # would silently count from the end of the data instead
        window = prices[max(startPoint, 0):endPoint]
        if len(window) == 0: raise ValueError("no bars between startPoint and endPoint")
        return pickPeak(window)
    
    def createSnapshot(self, startPoint, endPoint):
        # returns [rsi, macdLine, macdHistogram, adx], each restricted to the bars [startPoint, endPoint)
        window = slice(startPoint, endPoint)
        return [ self.rsi[window], self.macdLine[window], self.macdHistogram[window], self.adx[window] ]
    
    def setPrice(self, priceDataframe):
        # keeps the last `length` ask and bid close prices of the dataframe
        self.priceAsk = priceDataframe['askclose'].values[-self.length:]
        self.priceBid = priceDataframe['bidclose'].values[-self.length:]
    
    def setRsi(self, rsiDataframe):
        self.rsi = rsiDataframe['relativestrengthindex'].values[-self.length:]
    
    def setAdx(self, adxDataframe):
        self.adx = adxDataframe['averagedirectionalindex'].values[-self.length:]
    
    def setMacd(self, macdDataframe, stddevDataframe):
        # stores the macd line and the macd histogram, both divided by the standard deviation
        # of the same bar, to norm the data due to comparison between different price levels
        macdLine = macdDataframe['movingaverageconvergencedivergence'].values[-self.length:]
        macdHistogram = macdDataframe['histogram'].values[-self.length:]
        standardDeviation = stddevDataframe['standarddeviation'].values[-self.length:]
        self.macdLine = [macd/stddev for macd, stddev in zip(macdLine, standardDeviation)]
        self.macdHistogram = [histogram/stddev for histogram, stddev in zip(macdHistogram, standardDeviation)]

In [5]:
# Create an agent observing 24 bars per sequence and train it for 50 episodes.
# The session is created after the agent because the agent's NetworkHandler resets the default graph.
agent = Agent(barsPerSequence = 24)
session = tf.Session()
agent.run(numEpisodes = 50, session = session)

episode 1 finished. ROI: 0.13
episode 2 finished. ROI: -0.19
episode 3 finished. ROI: -0.77
episode 4 finished. ROI: 0.38
episode 5 finished. ROI: -0.29
episode 6 finished. ROI: -0.09
episode 7 finished. ROI: -0.06
episode 8 finished. ROI: -0.84
episode 9 finished. ROI: 0.59
episode 10 finished. ROI: -0.14
episode 11 finished. ROI: -0.21
episode 12 finished. ROI: 0.06
episode 13 finished. ROI: 0.24
episode 14 finished. ROI: 0.11
episode 15 finished. ROI: 0.38
episode 16 finished. ROI: -1.22
episode 17 finished. ROI: 0.27
episode 18 finished. ROI: -0.1
episode 19 finished. ROI: 0.44
episode 20 finished. ROI: -0.46
episode 21 finished. ROI: 1.18
episode 22 finished. ROI: -0.5
episode 23 finished. ROI: 1.0
episode 24 finished. ROI: 0.89
episode 25 finished. ROI: 0.24
episode 26 finished. ROI: -0.11
episode 27 finished. ROI: 0.47
episode 28 finished. ROI: -0.23
episode 29 finished. ROI: -0.39
episode 30 finished. ROI: -0.18
episode 31 finished. ROI: -1.0
episode 32 finished. ROI: -0.54
epi