In [155]:
# FROM https://github.com/llSourcell/deep_q_learning/blob/master/03_PlayingAgent.ipynb


# INITIALIZATION: libraries, parameters, network...

from keras.models import Sequential      # One layer after the other
from keras.layers import Dense, Flatten  # Dense layers are fully connected layers, Flatten layers flatten out multidimensional inputs
from collections import deque            # For storing moves 

import numpy as np
import gym                                # To train our network
env = gym.make('MountainCar-v0')          # Choose game (any in the gym should work)

import random     # For sampling batches from the observations


# Create network. Input is two consecutive game states, output is Q-values of the possible moves.
model = Sequential()
model.add(Dense(20, input_shape=(2,) + env.observation_space.shape, init='uniform', activation='relu'))
model.add(Flatten())       # Flatten input so as to have no problems with processing
model.add(Dense(18, init='uniform', activation='relu'))
model.add(Dense(10, init='uniform', activation='relu'))
model.add(Dense(env.action_space.n, init='uniform', activation='linear'))    # Same number of outputs as possible actions

model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

# Parameters
D = deque()                                # Register where the actions will be stored

observetime = 500                          # Number of timesteps we will be acting on the game and observing results
epsilon = 0.7                              # Probability of doing a random move
gamma = 0.9                                # Discounted future reward. How much we care about steps further in time
mb_size = 500                               # Learning minibatch size



In [156]:
# FIRST STEP: Knowing what each action does (Observing)

observation = env.reset()                     # Game begins
obs = np.expand_dims(observation, axis=0)     # (Formatting issues) Making the observation the first element of a batch of inputs 
state = np.stack((obs, obs), axis=1)
done = False
for t in range(observetime):
    if np.random.rand() <= epsilon:
        action = np.random.randint(0, env.action_space.n, size=1)[0]
    else:
        Q = model.predict(state)          # Q-values predictions
        action = np.argmax(Q)             # Move with highest Q-value is the chosen one
    observation_new, reward, done, info = env.step(action)     # See state of the game, reward... after performing the action
    obs_new = np.expand_dims(observation_new, axis=0)          # (Formatting issues)
    state_new = np.append(np.expand_dims(obs_new, axis=0), state[:, :1, :], axis=1)     # Update the input with the new state of the game
    D.append((state, action, reward, state_new, done))         # 'Remember' action and consequence
    state = state_new         # Update state
    if done:
        env.reset()           # Restart game if it's finished
        obs = np.expand_dims(observation, axis=0)     # (Formatting issues) Making the observation the first element of a batch of inputs 
        state = np.stack((obs, obs), axis=1)
print('Observing Finished')

Observing Finished


In [157]:
# SECOND STEP: Learning from the observations (Experience replay)

minibatch = random.sample(D, mb_size)                              # Sample some moves

inputs_shape = (mb_size,) + state.shape[1:]
inputs = np.zeros(inputs_shape)
targets = np.zeros((mb_size, env.action_space.n))

for i in range(0, mb_size):
    state = minibatch[i][0]
    action = minibatch[i][1]
    reward = minibatch[i][2]
    state_new = minibatch[i][3]
    done = minibatch[i][4]
    
# Build Bellman equation for the Q function
    inputs[i:i+1] = np.expand_dims(state, axis=0)
    targets[i] = model.predict(state)
    Q_sa = model.predict(state_new)
    
    if done:
        targets[i, action] = reward
    else:
        targets[i, action] = reward + gamma * np.max(Q_sa)

# Train network to output the Q function
    model.train_on_batch(inputs, targets)
print('Learning Finished')

Learning Finished


In [158]:
# THIRD STEP: Play!

observation = env.reset()
obs = np.expand_dims(observation, axis=0)
state = np.stack((obs, obs), axis=1)
done = False
tot_reward = 0.0
while not done:
    env.render()                    # Uncomment to see game running
    Q = model.predict(state)        
    action = np.argmax(Q)         
    observation, reward, done, info = env.step(action)
    obs = np.expand_dims(observation, axis=0)
    state = np.append(np.expand_dims(obs, axis=0), state[:, :1, :], axis=1)    
    tot_reward += reward
print('Game ended! Total reward: {}'.format(reward))
env.close()

Game ended! Total reward: -1.0


# New algorithm that works for real (from https://keon.io/deep-q-learning/):

In [83]:
# -*- coding: utf-8 -*-
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

In [84]:
EPISODES = 60
outofscreen_malus = 50

In [85]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            # 1-Compute target, which is the highest expected reward
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            # 2-Replace output taken by action with highest expected value
            # Note that then (output - replaced_output) equals (highest_expected_value - actual_value)
            target_f = self.model.predict(state)
            target_f[0][action] = target
            # By fitting model with self.model.predict(state) and target_f, we actually adapt the weights so that 
            # the difference between output taken by action and maximum reward is minimized!
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)

In [86]:
# Train algorithm:

if __name__ == "__main__":
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    # agent.load("./save/cartpole-dqn.h5")
    done = False
    batch_size = 32

    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for time in range(500):
            # env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -outofscreen_malus
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                print("episode: {}/{}, score: {}, e: {:.2}"
                      .format(e+1, EPISODES, time, agent.epsilon))
                break
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
#agent.save("./save/cartpole-dqn.h5")

episode: 1/60, score: 12, e: 1.0
Instructions for updating:
Use tf.cast instead.
episode: 2/60, score: 24, e: 0.98
episode: 3/60, score: 18, e: 0.89
episode: 4/60, score: 15, e: 0.83
episode: 5/60, score: 14, e: 0.77
episode: 6/60, score: 16, e: 0.71
episode: 7/60, score: 42, e: 0.58
episode: 8/60, score: 13, e: 0.54
episode: 9/60, score: 8, e: 0.52
episode: 10/60, score: 7, e: 0.5
episode: 11/60, score: 14, e: 0.47
episode: 12/60, score: 10, e: 0.44
episode: 13/60, score: 8, e: 0.43
episode: 14/60, score: 37, e: 0.35
episode: 15/60, score: 42, e: 0.29
episode: 16/60, score: 32, e: 0.24
episode: 17/60, score: 36, e: 0.2
episode: 18/60, score: 38, e: 0.17
episode: 19/60, score: 30, e: 0.15
episode: 20/60, score: 32, e: 0.12
episode: 21/60, score: 43, e: 0.1
episode: 22/60, score: 69, e: 0.071
episode: 23/60, score: 68, e: 0.05
episode: 24/60, score: 113, e: 0.028
episode: 25/60, score: 141, e: 0.014
episode: 26/60, score: 139, e: 0.01
episode: 27/60, score: 87, e: 0.01
episode: 28/60, s

In [90]:
import time
state = env.reset()
state = np.reshape(state, [1, state_size])
done = False
epoch = 0
while not done:
    env.render()
    time.sleep(0.001)
    action = agent.act(state)
    next_state, reward, done, _ = env.step(action)
    next_state = np.reshape(next_state, [1, state_size])
    state = next_state
    epoch += 1
    if done:
        print("score: {}"
              .format(epoch))
        break
env.close()

score: 154


# Same code but with different loss, with Huber loss, which is more robust:

In [1]:
# -*- coding: utf-8 -*-
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras import backend as K

import tensorflow as tf

Using TensorFlow backend.


In [2]:
EPISODES = 5
outofscreen_malus = 100

In [3]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.99
        self.learning_rate = 0.001
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target_model()

    """Huber loss for Q Learning
    References: https://en.wikipedia.org/wiki/Huber_loss
                https://www.tensorflow.org/api_docs/python/tf/losses/huber_loss
    """

    def _huber_loss(self, y_true, y_pred, clip_delta=1.0):
        error = y_true - y_pred
        cond  = K.abs(error) <= clip_delta

        squared_loss = 0.5 * K.square(error)
        quadratic_loss = 0.5 * K.square(clip_delta) + clip_delta * (K.abs(error) - clip_delta)

        return K.mean(tf.where(cond, squared_loss, quadratic_loss))

    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss=self._huber_loss,
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def update_target_model(self):
        # copy weights from model to target_model
        self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = self.model.predict(state)
            if done:
                target[0][action] = reward
            else:
                t = self.target_model.predict(next_state)[0]
                target[0][action] = reward + self.gamma * np.amax(t)
            self.model.fit(state, target, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)

In [4]:
if __name__ == "__main__":
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    # agent.load("./save/cartpole-ddqn.h5")
    done = False
    batch_size = 32

    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for time in range(500):
            # env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -outofscreen_malus
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                agent.update_target_model()
                print("episode: {}/{}, score: {}, e: {:.2}"
                      .format(e+1, EPISODES, time, agent.epsilon))
                break
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)

Instructions for updating:
Colocations handled automatically by placer.
episode: 1/5, score: 18, e: 1.0
episode: 2/5, score: 27, e: 0.87
episode: 3/5, score: 29, e: 0.65
episode: 4/5, score: 13, e: 0.57
episode: 5/5, score: 23, e: 0.45


In [8]:
import time
state = env.reset()
state = np.reshape(state, [1, state_size])
done = False
epoch = 0
while not done:
    env.render()
    time.sleep(0.1)
    action = agent.act(state)
    next_state, reward, done, _ = env.step(action)
    next_state = np.reshape(next_state, [1, state_size])
    state = next_state
    epoch += 1
    if done:
        print("score: {}"
              .format(epoch))
        break
env.close()

score: 8


# Now we are going to upgrade the algorithm by implementing a few tips, given by https://www.freecodecamp.org/news/improvements-in-deep-q-learning-dueling-double-dqn-prioritized-experience-replay-and-fixed-58b130cc5682/

## First of, fixed Q-target (cf Deepmind):
As we update network, our Q-target, that is the quality of the selected action is also updated, so we are chasing after a value that is constantly changing. This can lead to oscillations during the training. An option is to have a training network and a target network, initialized to be the same in the beginning. Then at each step, training network is updated while target network stays the same. Periodically, after a defined number of time steps, weights of training network are transfered to target network
--> ACTUALLY THIS ONE WAS ALREADY IMPLEMENTED

## Double deep Q-networks:
At the beginning of the training, false positive are possible due to noisy maximum Q-value. This can lead to over-estimation of Q-value and therefore biased training. We can use a double estmation: the training network evaluates the best action possible, then target network evaluates the Q-value for taking this action at this state

## Dueling deep Q-networks:
Sometimes there is no real point in computing actions Q-values from a state, because the whole state is shit. A dueling DQN allows to separate the network between a state analysis and an actions analysis before joining these two streams for the final layer. Dueling DQN will intuitively be able to learn which states are shit without having to compute the effects of each action at each state. As we want to be able to identify influence of state and action we will not sum the two, but rather use the following formula: Q(s,a) = V(s) + (A(s,a) - 1/|A| * sum A(s,a'))

## Prioritized experience replay:
During training, experiences are taken uniformly. But this means experiences with a high Q-value difference but in low number are less likely to be taken. We can remedy that by sampling with a probability of being sampled equal to the difference between the Q-values (NB: these differences will be updated but as we use a FIFO deque to store experiences, old differences get rid of after some time so it is not a problem).
A few adjustements are made to this difference (https://pemami4911.github.io/paper-summaries/deep-rl/2016/01/26/prioritizing-experience-replay.html), and sampling can be done using an unsorted sum-tree (https://jaromiru.com/2016/11/07/lets-make-a-dqn-double-learning-and-prioritized-experience-replay/), which works as a dichotomy algorithm

In [162]:
# -*- coding: utf-8 -*-
import random
import gym
import numpy as np
from collections import deque
from keras.models import Model
from keras.layers import Input, Dense, Add, Subtract, Concatenate, Lambda
from keras.optimizers import Adam
from keras import backend as K

import tensorflow as tf

In [193]:
class DQNAgent:
    def __init__(self, state_size, action_size, fixedQ=False, double=False, PER=False):
        self.state_size = state_size
        self.action_size = action_size
        # Parameters for better but longer network:
        self.fixedQ = fixedQ
        self.double = double
        self.PER = PER
        if self.PER:
            self.IS_weight = 0.01
            self.IS_weight_rise = 1.05
            self.IS_weight_max = 1
            self.randomness = 0.4
            self.no_null = 0.001
        self.memory_Qdiff = deque(maxlen=2000)
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.99
        self.learning_rate = 0.001
        self.model = self._build_model()
        # 1 - start - Fixed Q-target
        self.target_model = self._build_model()
        self.update_target_model()
        # 1 - end - Fixed Q-target

    """Huber loss for Q Learning
    References: https://en.wikipedia.org/wiki/Huber_loss
                https://www.tensorflow.org/api_docs/python/tf/losses/huber_loss
    """

    def _huber_loss(self, y_true, y_pred, clip_delta=1.0):
        error = y_true - y_pred
        cond  = K.abs(error) <= clip_delta

        squared_loss = 0.5 * K.square(error)
        quadratic_loss = 0.5 * K.square(clip_delta) + clip_delta * (K.abs(error) - clip_delta)

        return K.mean(tf.where(cond, squared_loss, quadratic_loss))

    def _build_model(self):
        # Neural Net for Deep-Q learning Model        
        X_input = Input(shape=(self.state_size,))
        X = Dense(24, activation='relu')(X_input)
        X = Dense(24, activation='relu')(X)
        # 3 - start - Dueling DQN
        Xv = Dense(24, activation='elu')(X)
        Xv = Dense(1, activation='linear')(Xv)
        Xv = Concatenate()([Xv for i in range(self.action_size)])
        Xa = Dense(24, activation='elu')(X)
        Xa = Dense(self.action_size, activation='linear')(Xa)
        Xa = Lambda(lambda x: x - K.mean(Xa))(Xa)
        X = Add()([Xv, Xa])
        # 3 - end - Dueling DQN
        model = Model(inputs=X_input, outputs=X)
        model.compile(loss=self._huber_loss,
                      optimizer=Adam(lr=self.learning_rate))
        return model

    # 1 - start - Fixed Q-target
    def update_target_model(self):
        # copy weights from model to target_model
        self.target_model.set_weights(self.model.get_weights())
    # 1 - end - Fixed Q-target

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        # 4 - start - Prioritized experience replay
        if self.PER:
            target = self.model.predict(state)[0][action]
            if done:
                Qval = reward
            else:
                t = self.target_model.predict(next_state)[0]
                if self.double:
                    tm = self.model.predict(next_state)[0]
                    tm_action = np.argmax(tm)
                    t_action = t[tm_action]
                else:
                    t_action = np.amax(t)
                Qval = reward + self.gamma * t_action
            self.memory_Qdiff.append(Qval-target)
        # 4 - end - Prioritized experience replay

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action
    
    # This next one is for actions after training where we want exploitation and no random due to exploration:
    def act_IRL(self, state):
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        # 4 - start - Prioritized experience replay
        if self.PER:
            minibatch = self.sumtree(batch_size)
        else:
            minibatch = random.sample(self.memory, batch_size)
        # 4 - end - Prioritized experience replay
        for state, action, reward, next_state, done in minibatch:
            target = self.model.predict(state)
            if done:
                target[0][action] = reward
            else:
                t = self.target_model.predict(next_state)[0]
                if self.double:
                    # 2 - start - Double DQN
                    tm = self.model.predict(next_state)[0] # Prediction for next state
                    tm_action = np.argmax(tm) # Action taken
                    t_action = t[tm_action] # Target associated to action taken
                    # 2 - end - Double DQN
                else:
                    t_action = np.amax(t)
                target[0][action] = reward + self.gamma * t_action
            self.model.fit(state, target, epochs=1, verbose=0)
            if not self.fixedQ:
                self.update_target_model()
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        if self.PER:
            if self.IS_weight > self.IS_weight_max:
                self.IS_weight = self.IS_weight_max
            elif self.IS_weight < self.IS_weight_max:
                self.IS_weight *= self.IS_weight_rise
            
    # 4 - start - Prioritized experience replay
    def sumtree(self, batch_size):
        # Define probabilities to sample:
        array_samp = np.abs(np.array(self.memory_Qdiff))
        proba_samp = np.power(array_samp+self.no_null, self.randomness) / np.sum(np.power(array_samp+self.no_null, self.randomness))
        proba_samp = np.power(1/(len(proba_samp)*proba_samp), self.IS_weight) * proba_samp
        proba_samp = proba_samp / np.sum(proba_samp)
        # Unsorted sum-tree to sample:
        minibatch = list()
        for i in range(batch_size):
            rand_samp = random.random()
            first_sel = 0
            end_sel = len(proba_samp) - 1
            middle_sel = int(end_sel/2)
            while True:
                if np.sum(proba_samp[0:middle_sel+1]) <= rand_samp:
                    if np.sum(proba_samp[0:middle_sel+2]) > rand_samp:
                        minibatch.append(self.memory[middle_sel+1])
                        break
                    first_sel = middle_sel
                else:
                    if rand_samp < proba_samp[0]:
                        minibatch.append(self.memory[0])
                        break
                    elif np.sum(proba_samp[0:middle_sel]) <= rand_samp:
                        minibatch.append(self.memory[middle_sel])
                        break
                    end_sel = middle_sel + 1
                middle_sel = int((end_sel-first_sel)/2) + first_sel
        return minibatch
    # 4 - end - Prioritized experience replay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)

In [194]:
EPISODES = 50
outofscreen_malus = 100


if __name__ == "__main__":
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size, PER=True)
    # agent.load("./save/cartpole-ddqn.h5")
    done = False
    batch_size = 32

    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for time in range(500):
            # env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -outofscreen_malus
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                # 1 - start - Fixed Q-target
                agent.update_target_model()
                # 1 - end - Fixed Q-target
                print("episode: {}/{}, score: {}, e: {:.2}"
                      .format(e+1, EPISODES, time, agent.epsilon))
                break
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)

episode: 1/50, score: 18, e: 1.0
episode: 2/50, score: 25, e: 0.89
episode: 3/50, score: 20, e: 0.72
episode: 4/50, score: 9, e: 0.66
episode: 5/50, score: 40, e: 0.44
episode: 6/50, score: 67, e: 0.23
episode: 7/50, score: 68, e: 0.11
episode: 8/50, score: 94, e: 0.044
episode: 9/50, score: 62, e: 0.024
episode: 10/50, score: 72, e: 0.012
episode: 11/50, score: 89, e: 0.0099
episode: 12/50, score: 103, e: 0.0099
episode: 13/50, score: 35, e: 0.0099
episode: 14/50, score: 38, e: 0.0099
episode: 15/50, score: 79, e: 0.0099
episode: 16/50, score: 29, e: 0.0099
episode: 17/50, score: 51, e: 0.0099
episode: 18/50, score: 56, e: 0.0099
episode: 19/50, score: 50, e: 0.0099
episode: 20/50, score: 120, e: 0.0099
episode: 21/50, score: 15, e: 0.0099
episode: 22/50, score: 38, e: 0.0099
episode: 23/50, score: 27, e: 0.0099
episode: 24/50, score: 87, e: 0.0099
episode: 25/50, score: 138, e: 0.0099
episode: 26/50, score: 144, e: 0.0099
episode: 27/50, score: 18, e: 0.0099
episode: 28/50, score: 18

In [86]:
import time
state = env.reset()
state = np.reshape(state, [1, state_size])
done = False
epoch = 0
while not done:
    env.render()
    time.sleep(0.01)
    action = agent.act_IRL(state)
    next_state, reward, done, _ = env.step(action)
    next_state = np.reshape(next_state, [1, state_size])
    state = next_state
    epoch += 1
    if done:
        print("score: {}"
              .format(epoch))
        break
env.close()

score: 89


In [59]:
# -*- coding: utf-8 -*-
import random
import gym
import numpy as np
from collections import deque
from keras.models import Model
from keras.layers import Input, Dense, Add, Subtract, Concatenate, BatchNormalization
from keras.optimizers import Adam
from keras import backend as K

import tensorflow as tf


X_input = Input(shape=(3,))
X1 = Dense(3, activation='relu')(X_input)
X2 = Lambda(lambda x: x - K.mean(X1))(X1)
model = Model(inputs=X_input, outputs=[X1, X2])
Inp = np.array([1, 100, -10])
print(model.predict(Inp.reshape(1, 3)))
print(model.get_weights())
temp = np.dot(Inp, model.get_weights()[0])
print(temp)
temp = temp * (temp > 0)
print(temp, temp - np.mean(temp))

[array([[ 1.4990888, 54.68012  , 29.258003 ]], dtype=float32), array([[-26.979982 ,  26.201048 ,   0.7789326]], dtype=float32)]
[array([[ 0.5203731 ,  0.251148  , -0.5022273 ],
       [ 0.05490851,  0.49811053,  0.23003411],
       [ 0.4512136 , -0.46179175, -0.67568207]], dtype=float32), array([0., 0., 0.], dtype=float32)]
[ 1.49908853 54.6801188  29.25800467]
[ 1.49908853 54.6801188  29.25800467] [-26.97998214  26.20104814   0.778934  ]


In [170]:
batch_size = 32
for i in range(batch_size):
    proba_samp = np.random.rand(15)
    proba_samp = proba_samp / np.sum(proba_samp)
    # Unsorted sum-tree to sample:
    print(np.cumsum(proba_samp))
    rand_samp = random.random()
    print(rand_samp)
    first_sel = 0
    end_sel = len(proba_samp) - 1
    middle_sel = int(end_sel/2)
    while True:
        if np.sum(proba_samp[0:middle_sel+1]) <= rand_samp:
            if np.sum(proba_samp[0:middle_sel+2]) > rand_samp:
                print(middle_sel+1)
                break
            first_sel = middle_sel
        elif np.sum(proba_samp[0:middle_sel+1]) > rand_samp:
            if proba_samp[0] > rand_samp:
                print(0)
                break
            elif np.sum(proba_samp[0:middle_sel]) <= rand_samp:
                print(middle_sel)
                break
            end_sel = middle_sel + 1
        middle_sel = int((end_sel-first_sel)/2) + first_sel
        #print(first_sel, middle_sel, end_sel, rand_samp)
        #print(np.cumsum(proba_samp)[middle_sel-1:middle_sel+2])

[0.00791717 0.0168565  0.11991364 0.23900601 0.34653985 0.37891076
 0.43218354 0.49957527 0.5822315  0.64666872 0.72784883 0.7984135
 0.89831963 0.96904642 1.        ]
0.5811313679412535
8
[0.01676241 0.03416264 0.04656094 0.05730124 0.23668796 0.23930462
 0.38021629 0.46614756 0.59806309 0.77450875 0.78647649 0.81800521
 0.84428393 0.89480457 1.        ]
0.23736743406980065
5
[0.00821581 0.12230818 0.14336769 0.21201671 0.28305316 0.34939459
 0.44059203 0.55391343 0.64056189 0.69595006 0.80659201 0.8845502
 0.95356537 0.95490499 1.        ]
0.926350172264023
12
[0.05255921 0.12571889 0.28420254 0.31025959 0.33795938 0.36757822
 0.52402066 0.55202311 0.59316153 0.60062338 0.73371988 0.74023833
 0.79076051 0.94136576 1.        ]
0.2044220214094079
2
[0.02972535 0.16439583 0.20736382 0.24978783 0.33352903 0.39658575
 0.47043968 0.48817094 0.49343799 0.56040198 0.68193032 0.76235318
 0.90479855 0.9559224  1.        ]
0.3549371375461178
5
[0.05433734 0.12105218 0.16198144 0.23825933 0.2434

0