In [1]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow import keras
import os
import datetime
from gym import wrappers
from particle_envs.make_env import make_env

In [2]:
class MyModel(tf.keras.Model):
    """Feed-forward Keras model that maps a state vector to one output per action.

    Args:
        num_states: size of the input feature vector.
        hidden_units: iterable of layer widths; one ReLU Dense layer is created per entry.
        num_actions: width of the final linear Dense layer.
    """

    def __init__(self, num_states, hidden_units, num_actions):
        super().__init__()
        self.input_layer = tf.keras.layers.InputLayer(input_shape=(num_states,))
        # Layers are created in the same order as hidden_units so that the
        # variable ordering of the model stays stable.
        self.hidden_layers = [
            Dense(width, activation='relu', kernel_initializer='RandomNormal')
            for width in hidden_units
        ]
        self.output_layer = Dense(num_actions, activation='linear', kernel_initializer='RandomNormal')

    @tf.function
    def call(self, inputs):
        """Run the input through the input layer, every hidden layer, then the output layer."""
        activations = self.input_layer(inputs)
        for hidden in self.hidden_layers:
            activations = hidden(activations)
        return self.output_layer(activations)

In [3]:
class DQN:
    """Q-learning agent: a Q-value model, an Adam optimizer and a replay buffer.

    The replay buffer (`self.experience`) is a dict of parallel lists keyed by
    's' (state), 'a' (action), 'r' (reward), 's2' (next state) and 'done'
    (bool: whether the transition ended the episode). Index i across all five
    lists describes one transition.

    Stored actions are expected to be one-hot vectors of length `num_actions`,
    with or without a wrapping length-1 list; both layouts are flattened to
    shape (batch, num_actions) when a batch is sampled.
    """

    def __init__(self, num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr):
        """
        Args:
            num_states: size of the state vector fed to the model.
            num_actions: number of discrete actions (model output width).
            hidden_units: iterable of hidden layer widths passed to MyModel.
            gamma: discount factor applied to the next-state value.
            max_experiences: replay buffer capacity; the oldest entry is dropped when full.
            min_experiences: train() is a no-op until the buffer holds this many transitions.
            batch_size: number of transitions sampled per training step.
            lr: learning rate for the Adam optimizer.
        """
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.optimizer = tf.optimizers.Adam(lr)
        self.gamma = gamma
        self.model = MyModel(num_states, hidden_units, num_actions)
        self.experience = {'s': [], 'a': [], 'r': [], 's2': [], 'done': []}  # done (bool): is current state terminal
        self.max_experiences = max_experiences
        self.min_experiences = min_experiences

    def predict(self, inputs):
        """Return the model output for `inputs`, promoted to a float32 batch of at least 2 dims."""
        return self.model(np.atleast_2d(np.asarray(inputs, dtype='float32')))

    def _sample_batch(self):
        """Sample `batch_size` transitions uniformly (with replacement) from the buffer.

        Returns:
            (states, actions, rewards, states_next, dones) as numpy arrays.
            `actions` is float32 with shape (batch_size, num_actions);
            `rewards` is float32 and `dones` is bool, both of shape (batch_size,).
        """
        buffer = self.experience
        ids = np.random.randint(low=0, high=len(buffer['s']), size=self.batch_size)
        count = len(ids)
        states = np.asarray([buffer['s'][i] for i in ids], dtype='float32')
        # Flatten any wrapping dimension so each row lines up with one row of
        # Q-values; otherwise (B, 1, A) would broadcast against (B, A) to (B, B, A).
        actions = np.asarray([buffer['a'][i] for i in ids], dtype='float32').reshape(count, self.num_actions)
        rewards = np.asarray([buffer['r'][i] for i in ids], dtype='float32').reshape(count)
        states_next = np.asarray([buffer['s2'][i] for i in ids], dtype='float32')
        dones = np.asarray([buffer['done'][i] for i in ids], dtype=bool).reshape(count)
        return states, actions, rewards, states_next, dones

    def train(self, TargetNet):
        """Run one gradient step on a sampled batch.

        Args:
            TargetNet: object whose `predict(states)` returns next-state action
                values of shape (batch, num_actions); used for the bootstrap target.

        Returns:
            (loss, mean_selected_q): scalar tensors, or the ints (0, 0) when the
            buffer is empty or holds fewer than `min_experiences` transitions.
        """
        buffer_size = len(self.experience['s'])
        if buffer_size == 0 or buffer_size < self.min_experiences:
            return 0, 0
        states, actions, rewards, states_next, dones = self._sample_batch()
        value_next = np.asarray(TargetNet.predict(states_next)).max(axis=1)
        # Transitions flagged done use the reward alone; the rest bootstrap from
        # the target network. Cast to float32 to match the model's output dtype.
        actual_values = np.where(dones, rewards, rewards + self.gamma * value_next).astype('float32')

        with tf.GradientTape() as tape:
            # Reduce over the action axis only, so each sample keeps its own
            # Q(s, a) and is compared against its own target.
            selected_action_values = tf.math.reduce_sum(
                self.predict(states) * actions, axis=1)
            loss = tf.math.reduce_mean(tf.square(actual_values - selected_action_values))

        variables = self.model.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))
        # Return a scalar (as before) that is now the batch-average selected Q-value.
        return loss, tf.math.reduce_mean(selected_action_values)

    def get_action(self, states, epsilon):
        """Epsilon-greedy action: random index with probability `epsilon`, else argmax of predict()."""
        if np.random.random() < epsilon:
            return np.random.choice(self.num_actions)
        else:
            return np.argmax(self.predict(np.atleast_2d(states))[0])

    def add_experience(self, exp):
        """Append one transition dict (keys matching the buffer), evicting the oldest when full."""
        if len(self.experience['s']) >= self.max_experiences:
            for key in self.experience.keys():
                self.experience[key].pop(0)
        for key, value in exp.items():
            self.experience[key].append(value)

    def copy_weights(self, TrainNet):
        """Overwrite this agent's trainable variables with those of `TrainNet`, pairwise in order."""
        variables1 = self.model.trainable_variables
        variables2 = TrainNet.model.trainable_variables
        for v1, v2 in zip(variables1, variables2):
            v1.assign(v2.numpy())

In [4]:
def play_game(env, TrainNet, TargetNet, epsilon, episode_length=200, render=True):
    """Run a fixed number of environment steps, training TrainNet after every step.

    The environment is used through list-valued `reset()`/`step()` results;
    only element 0 of the observation, reward and done lists is consumed, and
    the action is passed to `env.step` as a one-element list holding a one-hot
    vector.

    Args:
        env: environment exposing `reset()`, `step(action)`, `render()` and
            `action_space[0].n`.
        TrainNet: agent providing `get_action`, `add_experience` and `train`.
        TargetNet: passed through to `TrainNet.train`.
        epsilon: exploration rate forwarded to `TrainNet.get_action`.
        episode_length: number of steps to run (default 200, the previous fixed value).
        render: call `env.render()` before every step (default True, as before).

    Returns:
        (total_reward, mean_loss, mean_selected_q) where the last value is the
        mean of whatever the final `TrainNet.train` call returned as its second
        element. With `episode_length == 0` all three are 0.
    """
    rewards = 0
    observations = env.reset()[0]
    losses = []
    # Defined up front so the return statement is valid even if no step runs.
    selected_action_values = 0
    num_actions = env.action_space[0].n
    for _step in range(episode_length):
        if render:
            env.render()
        action_index = TrainNet.get_action(observations, epsilon)
        # Convert action to one-hot
        one_hot = np.identity(num_actions, dtype=int)[action_index]
        prev_observations = observations
        observations, reward, done, _info = env.step([one_hot])
        observations, reward, done = observations[0], reward[0], done[0]
        rewards += reward

        # Store the flat one-hot vector (not the env-facing list wrapper) and
        # record the transition before any reset replaces `observations`.
        exp = {'s': prev_observations, 'a': one_hot, 'r': reward, 's2': observations, 'done': done}
        TrainNet.add_experience(exp)

        if done:
            # Continue from the fresh start state rather than the stale
            # terminal observation.
            observations = env.reset()[0]

        loss, selected_action_values = TrainNet.train(TargetNet)
        if isinstance(loss, int):
            losses.append(loss)
        else:
            losses.append(loss.numpy())
    mean_loss = np.mean(losses) if losses else 0
    return rewards, mean_loss, np.mean(selected_action_values)

In [5]:
def make_video(env, TrainNet):
    """Step the greedy policy for 100 steps through a gym Monitor wrapper.

    The environment is wrapped with `wrappers.Monitor`, writing into a
    "videos" directory under the current working directory (`force=True`).
    Actions are chosen with epsilon 0 and sent to the env as a one-element
    list holding a one-hot vector. The done flag is read but does not stop
    the loop. Prints the step count and accumulated reward; returns nothing.
    """
    video_dir = os.path.join(os.getcwd(), "videos")
    env = wrappers.Monitor(env, video_dir, force=True)
    observation = env.reset()[0]
    rewards = 0
    steps = 0
    while steps < 100:
        greedy_index = TrainNet.get_action(observation, 0)
        # Convert action to one-hot
        one_hot = np.identity(env.action_space[0].n, dtype=int)[greedy_index]
        observation, reward, done, _ = env.step([one_hot])
        observation, reward, done = observation[0], reward[0], done[0]
        rewards += reward
        steps += 1
    print("Testing steps: {} rewards {}: ".format(steps, rewards))

In [6]:
def main():
    """Train a DQN agent on the 'simple' particle environment.

    Builds a training network and a target network with identical
    hyperparameters, plays N episodes with an exponentially decaying epsilon,
    logs per-episode scalars to a TensorBoard summary writer, and every
    `copy_ep` episodes prints progress, saves the training model and copies
    its weights into the target network. The final model is saved at the end
    and the environment is always closed.
    """
    env = make_env('simple')
    gamma = 0.95
    copy_ep = 1000  # episodes between checkpoint + target-network sync
    num_states = len(env.observation_space[0].sample())
    num_actions = env.action_space[0].n
    hidden_units = [24, 24]
    max_experiences = 100000
    min_experiences = 100
    batch_size = 32
    lr = 1e-4
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    test_model_path = './dqn_model/model_{}'.format(current_time)
    log_dir = 'logs/dqn/' + current_time
    summary_writer = tf.summary.create_file_writer(log_dir)
    TrainNet = DQN(num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr)
    TargetNet = DQN(num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr)
    N = 30000
    total_rewards = np.empty(N)
    epsilon = 0.99
    decay = 0.9999
    min_epsilon = 0.1
    # Defined before the loop so the final print is valid even when N == 0.
    avg_rewards = 0.0
    try:
        for n in range(N):
            epsilon = max(min_epsilon, epsilon * decay)
            total_reward, losses, action_values_avg = play_game(env, TrainNet, TargetNet, epsilon)
            total_rewards[n] = total_reward
            # Window of at most 100 episodes ending at n (n-99 .. n inclusive).
            avg_rewards = total_rewards[max(0, n - 99):(n + 1)].mean()
            with summary_writer.as_default():
                tf.summary.scalar('episode reward', total_reward, step=n)
                tf.summary.scalar('running avg reward(100)', avg_rewards, step=n)
                tf.summary.scalar('average loss', losses, step=n)
                tf.summary.scalar('average Q value', action_values_avg, step=n)
            if n % copy_ep == 0:
                print("episode:", n, "episode reward:", total_reward, "epsilon:", epsilon, "avg reward (last 100):", avg_rewards,
                      "episode loss: ", losses)
                TrainNet.model.save(test_model_path+f'_{n}')
                # Copy weights to Target Network
                TargetNet.copy_weights(TrainNet)
        print("avg reward for last 100 episodes:", avg_rewards)

        TrainNet.model.save(test_model_path+'final')
    finally:
        # Release the environment even if training is interrupted or fails.
        env.close()

In [7]:
# Entry point: start training when this code runs as the main module
# (which is also the case when the cell is executed in a notebook kernel).
if __name__ == '__main__':
    main()



episode: 0 episode reward: 289.3912869621781 epsilon: 0.98505 avg reward (last 100): 289.3912869621781 episode loss:  11.520329580307006
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: .\dqn_model\model_20200719-181628\assets
episode: 9 episode reward: 229.62983814065802 epsilon: 0.9415990291611142 avg reward (last 100): 216.77800407716464 episode loss:  24.004927
INFO:tensorflow:Assets written to: .\dqn_model\model_20200719-181628\assets
avg reward for last 100 episodes: 216.77800407716464
INFO:tensorflow:Assets written to: .\dqn_model\model_20200719-181628\assets


In [1]:
# import gym
# import numpy as np
# import tensorflow as tf
# from tensorflow.keras.layers import Input, Dense
# from tensorflow import keras
# import os
# import datetime
# from gym import wrappers
# from particle_envs.make_env import make_env
# import time

# env = make_env('simple')
# TestModel = keras.models.load_model('C:\\Parth\\CS Projects\\multiagent-rl\\dqn_model\\model_20200720-054405_30k_9001', compile=False)
# for i in range(10):
#     obs = env.reset()
#     obs = obs[0]
#     print('Episode {}:'.format(i+1))
#     for j in range(200):
#         env.render()
#         action = np.argmax(TestModel(np.atleast_2d(obs.astype('float32')))[0])
#         action = [np.identity(env.action_space[0].n, dtype=int)[action]]
#         obs, reward, done, info = env.step(action)
#         print(f'Step {j}:',action, obs, reward, done, info)
#         obs, done = obs[0], done[0]
#         time.sleep(0.05)
#         if done:
#             print("Out of boundary")
#             break
# env.close()



Episode 1:
Step 0: [array([0, 1, 0, 0, 0])] [array([ 0.5       ,  0.        ,  0.81311169, -0.09474272])] [-0.6701268061077762] [False] {'n': [{}]}
Step 1: [array([0, 1, 0, 0, 0])] [array([ 0.875     ,  0.        ,  0.72561169, -0.09474272])] [-0.5354885101655391] [False] {'n': [{}]}
Step 2: [array([0, 1, 0, 0, 0])] [array([ 1.15625   ,  0.        ,  0.60998669, -0.09474272])] [-0.3810599472240117] [False] {'n': [{}]}
Step 3: [array([0, 1, 0, 0, 0])] [array([ 1.3671875 ,  0.        ,  0.47326794, -0.09474272])] [-0.23295872797832887] [False] {'n': [{}]}
Step 4: [array([0, 1, 0, 0, 0])] [array([ 1.52539062,  0.        ,  0.32072888, -0.09474272])] [10] [False] {'n': [{}]}
Step 5: [array([0, 1, 0, 0, 0])] [array([ 1.64404297,  0.        ,  0.15632458, -0.09474272])] [10] [False] {'n': [{}]}
Step 6: [array([0, 1, 0, 0, 0])] [array([ 1.73303223,  0.        , -0.01697864, -0.09474272])] [10] [False] {'n': [{}]}
Step 7: [array([0, 1, 0, 0, 0])] [array([ 1.79977417,  0.        , -0.19695606, 

Step 68: [array([0, 0, 0, 1, 0])] [array([ 0.04084459,  0.09964449, -0.32463493, -0.26484938])] [10] [False] {'n': [{}]}
Step 69: [array([0, 0, 0, 0, 1])] [array([ 0.03063345, -0.42526664, -0.32769828, -0.22232272])] [10] [False] {'n': [{}]}
Step 70: [array([0, 0, 0, 1, 0])] [array([ 0.02297508,  0.18105002, -0.32999578, -0.24042772])] [10] [False] {'n': [{}]}
Step 71: [array([0, 0, 0, 0, 1])] [array([ 0.01723131, -0.36421248, -0.33171892, -0.20400647])] [10] [False] {'n': [{}]}
Step 72: [array([0, 0, 0, 1, 0])] [array([ 0.01292348,  0.22684064, -0.33301126, -0.22669053])] [10] [False] {'n': [{}]}
Step 73: [array([0, 0, 0, 0, 1])] [array([ 0.00969261, -0.32986952, -0.33398052, -0.19370358])] [10] [False] {'n': [{}]}
Step 74: [array([0, 0, 0, 1, 0])] [array([ 0.00726946,  0.25259786, -0.33470747, -0.21896337])] [10] [False] {'n': [{}]}
Step 75: [array([0, 0, 1, 0, 0])] [array([-0.4945479 ,  0.18944839, -0.28525268, -0.23790821])] [10] [False] {'n': [{}]}
Step 76: [array([0, 1, 0, 0, 0])

Step 136: [array([0, 1, 0, 0, 0])] [array([ 0.12908907,  0.1420863 , -0.29816159, -0.25211684])] [10] [False] {'n': [{}]}
Step 137: [array([0, 0, 0, 1, 0])] [array([ 0.0968168 ,  0.60656472, -0.30784327, -0.31277331])] [10] [False] {'n': [{}]}
Step 138: [array([0, 0, 0, 0, 1])] [array([ 0.0726126 , -0.04507646, -0.31510453, -0.30826566])] [10] [False] {'n': [{}]}
Step 139: [array([0, 0, 0, 0, 1])] [array([ 0.05445945, -0.53380734, -0.32055047, -0.25488493])] [10] [False] {'n': [{}]}
Step 140: [array([0, 0, 0, 1, 0])] [array([ 0.04084459,  0.09964449, -0.32463493, -0.26484938])] [10] [False] {'n': [{}]}
Step 141: [array([0, 0, 0, 0, 1])] [array([ 0.03063344, -0.42526663, -0.32769828, -0.22232271])] [10] [False] {'n': [{}]}
Step 142: [array([0, 0, 0, 1, 0])] [array([ 0.02297508,  0.18105003, -0.32999578, -0.24042772])] [10] [False] {'n': [{}]}
Step 143: [array([0, 0, 0, 0, 1])] [array([ 0.01723131, -0.36421248, -0.33171892, -0.20400647])] [10] [False] {'n': [{}]}
Step 144: [array([0, 0, 

Step 4: [array([0, 1, 0, 0, 0])] [array([ 0.5       ,  1.02539062, -0.59307317,  0.44827058])] [-0.5526823021472917] [False] {'n': [{}]}
Step 5: [array([0, 1, 0, 0, 0])] [array([ 0.875     ,  0.76904297, -0.68057317,  0.37136628])] [-0.6010927612046478] [False] {'n': [{}]}
Step 6: [array([0, 1, 0, 0, 0])] [array([ 1.15625   ,  0.57678223, -0.79619817,  0.31368806])] [-0.7323317315971456] [False] {'n': [{}]}
Step 7: [array([0, 1, 0, 0, 0])] [array([ 1.3671875 ,  0.43258667, -0.93291692,  0.27042939])] [-0.9434660442056785] [False] {'n': [{}]}
Step 8: [array([0, 1, 0, 0, 0])] [array([ 1.52539062,  0.32444   , -1.08545599,  0.23798539])] [-1.2348517465032447] [True] {'n': [{}]}
Out of boundary
Episode 3:
Step 0: [array([0, 0, 0, 1, 0])] [array([ 0.        ,  0.5       , -0.06639881,  0.37751294])] [10] [False] {'n': [{}]}
Step 1: [array([0, 0, 0, 1, 0])] [array([ 0.        ,  0.875     , -0.06639881,  0.29001294])] [10] [False] {'n': [{}]}
Step 2: [array([0, 1, 0, 0, 0])] [array([ 0.5    

Step 62: [array([0, 1, 0, 0, 0])] [array([ 0.43699057, -0.29650149, -0.33530163, -0.26143751])] [10] [False] {'n': [{}]}
Step 63: [array([0, 0, 1, 0, 0])] [array([-0.17225707, -0.22237611, -0.31807593, -0.2391999 ])] [10] [False] {'n': [{}]}
Step 64: [array([0, 1, 0, 0, 0])] [array([ 0.3708072 , -0.16678209, -0.35515665, -0.22252169])] [10] [False] {'n': [{}]}
Step 65: [array([0, 0, 1, 0, 0])] [array([-0.2218946 , -0.12508656, -0.33296719, -0.21001303])] [10] [False] {'n': [{}]}
Step 66: [array([0, 1, 0, 0, 0])] [array([ 0.33357905, -0.09381492, -0.36632509, -0.20063154])] [10] [False] {'n': [{}]}
Step 67: [array([0, 0, 1, 0, 0])] [array([-0.24981571, -0.07036119, -0.34134352, -0.19359542])] [10] [False] {'n': [{}]}
Step 68: [array([0, 1, 0, 0, 0])] [array([ 0.31263821, -0.05277089, -0.37260734, -0.18831833])] [10] [False] {'n': [{}]}
Step 69: [array([0, 0, 1, 0, 0])] [array([-0.26552134, -0.03957817, -0.34605521, -0.18436051])] [10] [False] {'n': [{}]}
Step 70: [array([0, 0, 0, 1, 0])

Step 133: [array([0, 0, 0, 0, 1])] [array([-0.08401261, -0.39533528, -0.29160259, -0.29108765])] [10] [False] {'n': [{}]}
Step 134: [array([0, 1, 0, 0, 0])] [array([ 0.43699054, -0.29650146, -0.33530164, -0.2614375 ])] [10] [False] {'n': [{}]}
Step 135: [array([0, 0, 1, 0, 0])] [array([-0.17225709, -0.22237609, -0.31807593, -0.23919989])] [10] [False] {'n': [{}]}
Step 136: [array([0, 1, 0, 0, 0])] [array([ 0.37080718, -0.16678207, -0.35515665, -0.22252168])] [10] [False] {'n': [{}]}
Step 137: [array([0, 0, 1, 0, 0])] [array([-0.22189462, -0.12508655, -0.33296719, -0.21001303])] [10] [False] {'n': [{}]}
Step 138: [array([0, 1, 0, 0, 0])] [array([ 0.33357904, -0.09381491, -0.36632509, -0.20063154])] [10] [False] {'n': [{}]}
Step 139: [array([0, 0, 1, 0, 0])] [array([-0.24981572, -0.07036119, -0.34134352, -0.19359542])] [10] [False] {'n': [{}]}
Step 140: [array([0, 1, 0, 0, 0])] [array([ 0.31263821, -0.05277089, -0.37260734, -0.18831833])] [10] [False] {'n': [{}]}
Step 141: [array([0, 0, 

KeyboardInterrupt: 