In [1]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow import keras
import os
import datetime
from gym import wrappers
from particle_envs.make_env import make_env

In [2]:
class MyModel(tf.keras.Model):
    def __init__(self, num_states, hidden_units, num_actions):
        super(MyModel, self).__init__()
        self.input_layer = tf.keras.layers.InputLayer(input_shape=(num_states,))
        self.hidden_layers = []
        for output in hidden_units:
            self.hidden_layers.append(Dense(output, activation='tanh', kernel_initializer='RandomNormal'))
        self.output_layer = Dense(num_actions, activation='linear', kernel_initializer='RandomNormal')
        
    @tf.function
    def call(self, inputs):
        z = self.input_layer(inputs)
        for layer in self.hidden_layers:
            z = layer(z)
        output = self.output_layer(z)
        return output

In [3]:
class DQN:
    def __init__(self, num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr):
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.optimizer = tf.optimizers.Adam(lr)
        self.gamma = gamma
        self.model = MyModel(num_states, hidden_units, num_actions)
        self.experience = {'s': [], 'a': [], 'r': [], 's2': [], 'done': []}  # done (bool): is current state terminal
        self.max_experiences = max_experiences
        self.min_experiences = min_experiences
         
    def predict(self, inputs):
        return self.model(np.atleast_2d(inputs.astype('float32')))
    
    def train(self, TargetNet):
        if len(self.experience['s']) < self.min_experiences:
            return 0, 0
        ids = np.random.randint(low=0, high=len(self.experience['s']), size=self.batch_size)
        states = np.asarray([self.experience['s'][i] for i in ids])
        actions = np.asarray([self.experience['a'][i] for i in ids])
        rewards = np.asarray([self.experience['r'][i] for i in ids])
        states_next = np.asarray([self.experience['s2'][i] for i in ids])
        dones = np.asarray([self.experience['done'][i] for i in ids])
        value_next = np.max(TargetNet.predict(states_next), axis=1)
        actual_values = np.where(dones, rewards, rewards + self.gamma*value_next)
        
        with tf.GradientTape() as tape:
            selected_action_values = tf.math.reduce_sum(
                self.predict(states) * actions)
            loss = tf.math.reduce_mean(tf.square(actual_values - selected_action_values))
    
        variables = self.model.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))
        return loss, selected_action_values
        
    def get_action(self, states, epsilon):
        if np.random.random() < epsilon:
            return np.random.choice(self.num_actions)
        else:
            return np.argmax(self.predict(np.atleast_2d(states))[0])


    def add_experience(self, exp):
        if len(self.experience['s']) >= self.max_experiences:
            for key in self.experience.keys():
                self.experience[key].pop(0)
        for key, value in exp.items():
            self.experience[key].append(value)

    def copy_weights(self, TrainNet):
        variables1 = self.model.trainable_variables
        variables2 = TrainNet.model.trainable_variables
        for v1, v2 in zip(variables1, variables2):
            v1.assign(v2.numpy())

In [4]:
def play_game(env, TrainNet, TargetNet, epsilon, copy_step):
    rewards = 0
    iter = 0
    done = False
    observations = env.reset()[0]
    losses = list()
    episode_length = 200
    for i in range(episode_length):
#         env.render()
        action = TrainNet.get_action(observations, epsilon)
        # Convert action to one-hot
        action = [np.identity(env.action_space[0].n, dtype=int)[action]]
        prev_observations = observations
        observations, reward, done, _ = env.step(action)
        observations, reward, done = observations[0], reward[0], done[0]
        rewards += reward
#         if done:
#             reward = -200
#             env.reset()

        exp = {'s': prev_observations, 'a': action, 'r': reward, 's2': observations, 'done': done}
        TrainNet.add_experience(exp)
        loss, selected_action_values = TrainNet.train(TargetNet)
        if isinstance(loss, int):
            losses.append(loss)
        else:
            losses.append(loss.numpy())
        iter += 1
        if iter % copy_step == 0:
            TargetNet.copy_weights(TrainNet)
    return rewards, np.mean(losses), np.mean(selected_action_values)

In [5]:
def make_video(env, TrainNet):
    env = wrappers.Monitor(env, os.path.join(os.getcwd(), "videos"), force=True)
    rewards = 0
    steps = 0
    done = False
    observation = env.reset()
    while not done:
        action = TrainNet.get_action(observation, 0)
        observation, reward, done, _ = env.step(action)
        steps += 1
        rewards += reward
    print("Testing steps: {} rewards {}: ".format(steps, rewards))

In [6]:
def main():
    env = make_env('simple')
    gamma = 0.99
    copy_step = 25
    num_states = len(env.observation_space[0].sample())
    num_actions = env.action_space[0].n
    hidden_units = [200, 200]
    max_experiences = 10000
    min_experiences = 100
    batch_size = 32
    lr = 1e-2
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = 'logs/dqn/' + current_time
    summary_writer = tf.summary.create_file_writer(log_dir)
    TrainNet = DQN(num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr)
    TargetNet = DQN(num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr)
    N = 10000
    total_rewards = np.empty(N)
    epsilon = 0.99
    decay = 0.9999
    min_epsilon = 0.1
    for n in range(N):
        epsilon = max(min_epsilon, epsilon * decay)
        total_reward, losses, action_values_avg = play_game(env, TrainNet, TargetNet, epsilon, copy_step)
        total_rewards[n] = total_reward
        avg_rewards = total_rewards[max(0, n - 100):(n + 1)].mean()
        with summary_writer.as_default():
            tf.summary.scalar('episode reward', total_reward, step=n)
            tf.summary.scalar('running avg reward(100)', avg_rewards, step=n)
            tf.summary.scalar('average loss)', losses, step=n)
            tf.summary.scalar('average Q value', action_values_avg, step=n)
        if True:  # n % 100 == 0:
            print("episode:", n, "episode reward:", total_reward, "epsilon:", epsilon, "avg reward (last 100):", avg_rewards,
                  "episode loss: ", losses)
    print("avg reward for last 100 episodes:", avg_rewards)
#     make_video(env, TrainNet)
    env.close()

In [None]:
if __name__ == '__main__':
    main()



episode: 0 episode reward: -2351.87633023031 epsilon: 0.989901 avg reward (last 100): -2351.87633023031 episode loss:  8305.412998809814
episode: 1 episode reward: -647.5958264123548 epsilon: 0.9898020099 avg reward (last 100): -1499.7360783213323 episode loss:  127.67553
episode: 2 episode reward: -1190.1174313025242 epsilon: 0.98970302969901 avg reward (last 100): -1396.5298626483964 episode loss:  64.92725
episode: 3 episode reward: -2470.472923902548 epsilon: 0.9896040593960401 avg reward (last 100): -1665.0156279619341 episode loss:  108.05304
episode: 4 episode reward: -105.87776936564642 epsilon: 0.9895050989901005 avg reward (last 100): -1353.1880562426766 episode loss:  107.6651
episode: 5 episode reward: -305.3991402700159 epsilon: 0.9894061484802015 avg reward (last 100): -1178.5565702472331 episode loss:  47.257256
episode: 6 episode reward: -303.20406160689305 epsilon: 0.9893072078653534 avg reward (last 100): -1053.5062118700419 episode loss:  41.080708
episode: 7 episode

episode: 58 episode reward: -347.6349974948461 epsilon: 0.9841759067610987 avg reward (last 100): -712.7626867144248 episode loss:  500109.16
episode: 59 episode reward: -559.7136222204308 epsilon: 0.9840774891704226 avg reward (last 100): -710.2118689728582 episode loss:  17065.604
episode: 60 episode reward: -428.8934599013523 epsilon: 0.9839790814215056 avg reward (last 100): -705.6000917749647 episode loss:  6173.777
episode: 61 episode reward: -556.5338238425292 epsilon: 0.9838806835133634 avg reward (last 100): -703.1957971308932 episode loss:  3552.835
episode: 62 episode reward: -2149.9601510146526 epsilon: 0.9837822954450122 avg reward (last 100): -726.1603106846036 episode loss:  3507.86
episode: 63 episode reward: -533.8637285024191 epsilon: 0.9836839172154677 avg reward (last 100): -723.155676588007 episode loss:  2910.9897
episode: 64 episode reward: -719.60263152219 epsilon: 0.9835855488237462 avg reward (last 100): -723.1010143562253 episode loss:  4726.5645
episode: 65 

episode: 116 episode reward: -240.95243512497453 epsilon: 0.9784839246036026 avg reward (last 100): -751.4733026592149 episode loss:  17089.043
episode: 117 episode reward: -798.3238900213445 epsilon: 0.9783860762111423 avg reward (last 100): -751.0234917326062 episode loss:  21784.418
episode: 118 episode reward: -2366.5598113345413 epsilon: 0.9782882376035211 avg reward (last 100): -771.1707313549467 episode loss:  46893.07
episode: 119 episode reward: -101.26876202032082 epsilon: 0.9781904087797608 avg reward (last 100): -756.136486106569 episode loss:  51557.71
episode: 120 episode reward: -454.90507641180085 epsilon: 0.9780925897388829 avg reward (last 100): -740.811868688305 episode loss:  16288.855
episode: 121 episode reward: -591.8490304296564 epsilon: 0.9779947804799091 avg reward (last 100): -735.4972792918635 episode loss:  148630.9
episode: 122 episode reward: -1209.1377645877562 epsilon: 0.9778969810018611 avg reward (last 100): -741.0760919674146 episode loss:  59248.37


episode: 174 episode reward: -51.077051056809545 epsilon: 0.9728248620295451 avg reward (last 100): -813.2620592649732 episode loss:  28855.922
episode: 175 episode reward: -182.16674009318507 epsilon: 0.9727275795433421 avg reward (last 100): -793.7308622023437 episode loss:  52334.246
episode: 176 episode reward: -654.7284875258223 epsilon: 0.9726303067853878 avg reward (last 100): -795.9185614341517 episode loss:  12923.022
episode: 177 episode reward: -777.2928674275222 epsilon: 0.9725330437547093 avg reward (last 100): -792.5705267532194 episode loss:  10089.935
episode: 178 episode reward: -1462.5248318296815 epsilon: 0.9724357904503338 avg reward (last 100): -805.5104682102435 episode loss:  15493.063
episode: 179 episode reward: -451.1738670240759 epsilon: 0.9723385468712887 avg reward (last 100): -806.3296261358021 episode loss:  16217.531
episode: 180 episode reward: -1619.758144582676 epsilon: 0.9722413130166017 avg reward (last 100): -800.0040899269808 episode loss:  76254.