In [1]:
import os
import numpy as np
from Env.DC_gym import DiscreteGymDC
from Nets.DQN import DQN
from Utils.memory import Memory
import gym
import tensorflow as tf
import matplotlib.pyplot as plt

# Turn on memory growth for every visible GPU so TensorFlow allocates GPU
# memory on demand. Iterating over the list (instead of indexing element 0)
# keeps this cell runnable on CPU-only machines, where the list is empty, and
# applies the same setting to all GPUs when more than one is present.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [2]:
# Create the CartPole-v0 environment used by the memory-prefill, training and
# evaluation cells below.
env = gym.make('CartPole-v0')
env.reset()  # initial reset; the returned observation is not kept
# Size of the discrete action space; passed to the DQN constructor later on.
n_actions = env.action_space.n

In [3]:
"""PARAM"""
total_episodes = 3000  # number of training episodes; also the epsilon annealing horizon
target_updates = 50    # the target network is synced every `target_updates` episodes
mem_length = 500       # max_size given to the replay Memory (also drives the prefill cell)
gamma = 0.99           # discount factor applied to the bootstrapped next-state value
batch_size = 32        # number of transitions sampled from memory per training step

In [4]:
def eps_greedy(start_probability, end_probability, current_episode, total_episodes):
    """Decide whether the agent should exploit (act greedily) on this step.

    Epsilon is the *exploration* probability. It is annealed linearly from
    ``start_probability`` at episode 0 to ``end_probability`` at episode
    ``total_episodes`` and held at ``end_probability`` afterwards.

    Args:
        start_probability: Exploration probability at the first episode.
        end_probability: Exploration probability once annealing has finished.
        current_episode: Index of the episode currently being played.
        total_episodes: Number of episodes over which epsilon is annealed.

    Returns:
        bool: ``True`` to exploit (take the greedy action), ``False`` to
        explore (take a random action).
    """
    # Clamp progress to [0, 1] so epsilon never overshoots either bound.
    progress = min(max(current_episode / total_episodes, 0.0), 1.0)
    epsilon = start_probability + (end_probability - start_probability) * progress
    # Explore with probability epsilon, exploit otherwise. The previous
    # comparison was inverted: it exploited with probability epsilon, so the
    # agent acted greedily most when the network was still untrained.
    return bool(np.random.random() >= epsilon)

In [6]:
"""
# later can turn all of this into a DQN class
env = DiscreteGymDC(os.path.join(os.getcwd(), "Env\Flowsheet2_PR.fsd"))
DQN_model = DQN(env.n_actions, env.State.state.shape).model
target_model = DQN(env.n_actions, env.State.state.shape).model
"""
# Use float64 as the Keras default dtype for the models built below.
tf.keras.backend.set_floatx('float64')
# Online network (trained every step) and target network (used for the
# bootstrapped value in the TD target).
DQN_model = DQN(n_actions, env.observation_space.shape, schedule_lr=True).model
targetDQN_model = DQN(n_actions, env.observation_space.shape).model
# Start the target network as an exact copy of the online network; without
# this the two networks have unrelated random weights until the first sync in
# the training loop.
targetDQN_model.set_weights(DQN_model.get_weights())
# Replay memory holding at most `mem_length` transitions.
memory = Memory(max_size=mem_length)

In [7]:
# Optional architecture diagram of the online network (currently disabled).
#tf.keras.utils.plot_model(DQN_model, show_shapes=True)
None  # bare expression; the cell produces no output

# Populate memory

In [8]:
# Pre-fill the replay memory with transitions collected under randomly sampled
# actions. Collection stops after exactly `mem_length` transitions: the memory
# is created with max_size=mem_length, so playing `mem_length` full episodes
# only gathered experience beyond that size.
# Transitions are stored as tuples, matching what the training loop adds:
# (state, action, reward, next_state, not_done) with not_done = 1 - done.
n_prefilled = 0
while n_prefilled < mem_length:
    state = env.reset()
    done = False
    while not done and n_prefilled < mem_length:
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        memory.add((state, action, reward, next_state, 1 - done))
        n_prefilled += 1
        state = next_state

# Now learn

In [None]:
# Train the online network. Each environment step:
#   1. picks an action (greedy or random, as decided by eps_greedy),
#   2. stores the transition in the replay memory,
#   3. samples a minibatch and fits the online network to TD targets in which
#      the online network selects the next action and the target network
#      evaluates it.
# The target network is synced with the online network every
# `target_updates` episodes.
eps_start, eps_end = 0.95, 0.05   # epsilon schedule bounds (also echoed in the log)
max_steps = 200                   # hard cap on the number of steps per episode
batch_rows = np.arange(batch_size)

history = []  # total reward of every finished episode
for i in range(total_episodes):
    state = env.reset()
    done = False
    k = 0
    total_reward = 0
    while not done:
        k += 1
        # eps_greedy returns True when the greedy action should be taken.
        if eps_greedy(eps_start, eps_end, i, total_episodes):
            action = np.argmax(DQN_model.predict(state[np.newaxis, :]))
        else:
            action = env.action_space.sample()

        # now take action
        next_state, reward, done, info = env.step(action)
        total_reward += reward

        memory.add((state, action, reward, next_state, 1 - done))
        # Advance the current observation. Without this the agent keeps acting
        # on (and storing) the episode's initial state at every step.
        state = next_state

        batch = memory.sample(batch_size)
        state_batch = np.array([each[0] for each in batch])
        action_batch = np.array([each[1] for each in batch])
        reward_batch = np.array([each[2] for each in batch])
        next_state_batch = np.array([each[3] for each in batch])
        # Stored as 1 - done, i.e. 1 for non-terminal and 0 for terminal transitions.
        done_batch = np.array([each[4] for each in batch])

        next_action = np.argmax(DQN_model.predict(next_state_batch), axis=1)
        y = DQN_model.predict(state_batch) # dummy values for actions that aren't taken
        next_q = targetDQN_model.predict(next_state_batch)[batch_rows, next_action]
        # Use the sampled rewards and not-done mask, not the scalar reward/done
        # of the step just taken, so every row gets its own target and
        # terminal transitions do not bootstrap.
        y[batch_rows, action_batch] = reward_batch + done_batch * gamma * next_q

        DQN_model.train_on_batch(x=state_batch, y=y)
        if k > max_steps:
            done = True

        if done:
            history.append(total_reward)

    if i % target_updates == 0:
        targetDQN_model.set_weights(DQN_model.get_weights())
        print(total_reward)
        print(f"epsilon = {eps_start + (eps_end-eps_start)*i/total_episodes}")

26.0
epsilon = 0.95
10.0
epsilon = 0.9349999999999999
10.0
epsilon = 0.9199999999999999
11.0
epsilon = 0.9049999999999999


In [None]:
# Learning curve: total reward of each training episode, in order.
fig, ax = plt.subplots()
ax.plot(history)
ax.set(title="DQN training on CartPole-v0", xlabel="Episode", ylabel="Total reward");

In [None]:
# Evaluate the online network for 10 rendered episodes, always taking the
# greedy action (no exploration), and collect the total reward of each one.
test_hist = []
try:
    for _ in range(10):
        total_reward = 0
        done = False
        state = env.reset()
        while not done:
            env.render()
            action = np.argmax(DQN_model.predict(state[np.newaxis, :]))
            state, reward, done, info = env.step(action)
            total_reward += reward
        test_hist.append(total_reward)
finally:
    # Close the environment (and its render window) even if an episode raises
    # or the cell is interrupted.
    env.close()
test_hist

In [None]:
# Scratch inspection: index of the highest online-network output for each row
# of the last sampled next_state_batch (relies on the training cell's globals).
np.argmax(DQN_model.predict(next_state_batch), axis=1)

In [None]:
# Scratch inspection: the same argmax result wrapped in a one-element list.
[np.argmax(DQN_model.predict(next_state_batch), axis=1)]

In [None]:
# Scratch inspection: raw online-network outputs for next_state_batch.
DQN_model.predict(next_state_batch)

In [None]:
# Scratch inspection: for each row, pick the online-network output at that
# row's argmax column, using the same fancy-indexing pattern as the TD target.
DQN_model.predict(next_state_batch)[np.arange(batch_size), np.argmax(DQN_model.predict(next_state_batch), axis=1)]

In [None]:
# Scratch inspection: rebuild state_batch from element 0 of each transition in
# the last sampled `batch` (a global left over from the training cell).
state_batch = np.array([each[0] for each in batch])

In [None]:
# Scratch inspection: raw online-network outputs for state_batch.
DQN_model.predict(state_batch)

In [None]:
# Scratch inspection: keep the online-network outputs for state_batch in Q and
# display them.
Q = DQN_model.predict(state_batch)
Q

In [None]:
# Scratch inspection: rebuild action_batch from element 1 of each transition
# in the last sampled `batch` and display it.
action_batch = np.array([each[1] for each in batch])
action_batch

In [None]:
# Scratch inspection: for each row of Q, the entry at the column given by that
# row's stored action.
Q[np.arange(batch_size), action_batch]