In [1]:
import numpy as np
import tensorflow as tf

# Define the states and actions
states = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P']
numStates = len(states)
numActions = 4  # ['up', 'down', 'right', 'left']

# Define the transition matrix
R = np.array([
    ['A', 'E', 'B', 'A'],
    ['B', 'F', 'C', 'A'],
    ['C', 'G', 'D', 'B'],
    ['D', 'H', 'D', 'C'],
    ['A', 'I', 'F', 'E'],
    ['B', 'J', 'G', 'E'],
    ['C', 'K', 'H', 'F'],
    ['D', 'L', 'H', 'G'],
    ['E', 'M', 'J', 'I'],
    ['F', 'N', 'K', 'I'],
    ['G', 'O', 'L', 'J'],
    ['H', 'P', 'L', 'K'],
    ['I', 'M', 'N', 'M'],
    ['J', 'N', 'O', 'M'],
    ['K', 'O', 'P', 'N'],
    ['L', 'P', 'P', 'O']
])

listOfHoles = np.array(['F', 'H', 'L', 'M'])

# Function to choose an action using epsilon-greedy policy
def choose_action(state_vector):
    if np.random.uniform(0, 1) < epsilon:
        return np.random.randint(0, numActions)  # Explore
    else:
        q_values = model.predict(state_vector.reshape(1, -1), verbose=0)
        return np.argmax(q_values)  # Exploit

# Convert states to one-hot encoded vectors, ie: state A: (1, 0, 0, 0, 0, ... 0)
def state_to_one_hot(state):
    one_hot = np.zeros(numStates)
    one_hot[states.index(state)] = 1
    return one_hot

In [2]:
inputs = tf.keras.Input(shape=(numStates,))
x = tf.keras.layers.Dense(32, activation="relu")(inputs)
x = tf.keras.layers.Dense(32, activation="relu")(x)
outputs = tf.keras.layers.Dense(numActions, activation="linear")(x)
model = tf.keras.Model(inputs=inputs, outputs=outputs)

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mse')  # Using Adam optimizer with a learning rate and Mean Squared Error loss

In [4]:
# Hyperparameters
gamma = 0.9    # Discount factor
maxSteps = 99  # Maximum steps per episode
epsilon_decay = 0.995  # Epsilon decay rate
epsilon_min = 0.01  # Minimum epsilon value
epsilon = 1.0  # Exploration rate

# Initialize the Q-table with zeros
q_table = np.zeros((numStates, numActions))


# Q-network training algorithm
for episode in range(500):
    state = 'A'  # Start from state A
    state_index = states.index(state)
    state_vector = state_to_one_hot(state)  # One-hot encoded state vector

    total_reward = 0

    for step in range(maxSteps):
        action = choose_action(state_vector)

        next_state = R[state_index, action]
        next_state_index = states.index(next_state)
        next_state_vector = state_to_one_hot(next_state)

        reward = 0
        if next_state == 'P':
            reward = 1
        if next_state in listOfHoles:
            reward = -1

        # Predict Q-values for the current state
        q_values = model.predict(state_vector.reshape(1, -1), verbose=0)

        # Predict Q-values for the next state
        q_values_next = model.predict(next_state_vector.reshape(1, -1), verbose=0)

        # Compute target Q-value
        target_q_value = reward + gamma * np.max(q_values_next)
        q_values[0][action] = target_q_value

        # Update the Q-network
        model.fit(state_vector.reshape(1, -1), q_values, epochs=1, verbose=0)

        # Update the Q-table
        q_table[state_index, action] = q_values[0, action]

        # Transition to the next state
        state = next_state
        state_index = next_state_index
        state_vector = next_state_vector
        total_reward += reward

        # End the episode if the goal state or a terminal state is reached
        if state == 'P':
            break

    # Decay epsilon
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    # Print progress
    if (episode % 100 == 0):
      print(f"Episode {episode + 1}, Total Reward: {total_reward}, Epsilon: {epsilon:.4f}")

print("Training finished.")

Episode 1, Total Reward: 0, Epsilon: 0.9950
Episode 101, Total Reward: -5, Epsilon: 0.6027
Episode 201, Total Reward: 0, Epsilon: 0.3651
Episode 301, Total Reward: 1, Epsilon: 0.2212
Episode 401, Total Reward: 1, Epsilon: 0.1340
Training finished.


In [5]:
#q_table = np.zeros((numStates, numActions))
for i, state in enumerate(states):
    print(f"State: {state}")
    # Convert state to one-hot encoded vector
    #state_vector = np.zeros(numStates)
    #state_vector[i] = 1  # One-hot encode the state
    #print(state_vector)
    state_vector = state_to_one_hot(state)

    # Predict Q-values for this state
    q_values = model.predict(state_vector.reshape(1, -1), verbose=0)

    # Store Q-values in the Q-table
    print(q_values)
    #q_table[i] = q_values

State: A
[[1.3328207 1.51126   1.5010242 1.3186691]]
State: B
[[1.4500897 0.7682275 1.7311196 1.3010823]]
State: C
[[1.5903533 1.8798285 1.6465975 1.4859785]]
State: D
[[1.6033435  0.89933145 1.5336784  1.604383  ]]
State: E
[[1.3380239  1.6823069  0.74121207 1.4308952 ]]
State: F
[[1.4216535 1.9550904 1.7443982 1.3938609]]
State: G
[[1.514033   2.1128945  0.4778389  0.56738067]]
State: H
[[1.5082058 1.3433236 0.7977875 1.8707659]]
State: I
[[1.447344  0.975004  1.8684891 1.5659631]]
State: J
[[0.5895027 2.0089505 2.0927844 1.6538521]]
State: K
[[1.7925892 2.330699  1.3492628 1.8088256]]
State: L
[[0.69394743 2.5482488  1.35448    2.0937092 ]]
State: M
[[1.5784128 0.9652873 2.087463  0.4698567]]
State: N
[[1.7815741 2.0534625 2.2903192 0.7907319]]
State: O
[[2.0120642 2.4000647 2.5784242 2.1199555]]
State: P
[[1.0648654 1.7598311 1.3366537 1.196108 ]]


In [6]:
# After training, you can inspect the Q-table
print("Final Q-table values:")
print(q_table)

Final Q-table values:
[[1.41292155 1.51252019 1.51933265 1.41727674]
 [1.498631   0.74103463 1.7395221  1.33524847]
 [1.67111039 1.91287959 1.68872178 1.50105464]
 [1.69091606 0.82301164 1.55668736 1.69176376]
 [1.43605626 1.67859304 0.74833387 1.56016254]
 [1.49293113 1.86220431 1.79138494 1.50683486]
 [1.67500031 2.08572602 0.5446021  0.61860746]
 [1.60990405 1.32321012 0.76452166 1.79462039]
 [1.51083755 0.91600746 1.88782454 1.75934124]
 [0.49884856 2.11750293 2.0989356  1.70609534]
 [1.85334396 2.33225703 1.28151917 1.88663983]
 [0.78992724 2.57954025 1.37964344 2.15104461]
 [1.68263197 0.90771687 2.09026408 0.81890637]
 [2.00242043 2.18334699 2.35368085 0.89215791]
 [2.18191814 2.43542695 2.58343816 2.12376976]
 [0.         0.         0.         0.        ]]


# Testing with Model.Numpy()

In [7]:
import numpy as np
import tensorflow as tf

# Define the states and actions
states = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P']
numStates = len(states)
numActions = 4  # ['up', 'down', 'right', 'left']

# Define the transition matrix
R = np.array([
    ['A', 'E', 'B', 'A'],
    ['B', 'F', 'C', 'A'],
    ['C', 'G', 'D', 'B'],
    ['D', 'H', 'D', 'C'],
    ['A', 'I', 'F', 'E'],
    ['B', 'J', 'G', 'E'],
    ['C', 'K', 'H', 'F'],
    ['D', 'L', 'H', 'G'],
    ['E', 'M', 'J', 'I'],
    ['F', 'N', 'K', 'I'],
    ['G', 'O', 'L', 'J'],
    ['H', 'P', 'L', 'K'],
    ['I', 'M', 'N', 'M'],
    ['J', 'N', 'O', 'M'],
    ['K', 'O', 'P', 'N'],
    ['L', 'P', 'P', 'O']
])

listOfHoles = np.array(['F', 'H', 'L', 'M'])

# Function to choose an action using epsilon-greedy policy
def choose_action(state_vector):
    if np.random.uniform(0, 1) < epsilon:
        return np.random.randint(0, numActions)  # Explore
    else:
        q_values = model.predict(state_vector.reshape(1, -1), verbose=0)
        return np.argmax(q_values)  # Exploit

# Convert states to one-hot encoded vectors, ie: state A: (1, 0, 0, 0, 0, ... 0)
def state_to_one_hot(state):
    one_hot = np.zeros(numStates)
    one_hot[states.index(state)] = 1
    return one_hot

In [8]:
inputs = tf.keras.Input(shape=(numStates,))
x = tf.keras.layers.Dense(32, activation="relu")(inputs)
x = tf.keras.layers.Dense(32, activation="relu")(x)
outputs = tf.keras.layers.Dense(numActions, activation="linear")(x)
model = tf.keras.Model(inputs=inputs, outputs=outputs)

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mse')  # Using Adam optimizer with a learning rate and Mean Squared Error loss

In [11]:
# Hyperparameters
gamma = 0.9    # Discount factor
maxSteps = 99  # Maximum steps per episode
epsilon_decay = 0.995  # Epsilon decay rate
epsilon_min = 0.01  # Minimum epsilon value
epsilon = 1.0  # Exploration rate

# Initialize the Q-table with zeros
q_table = np.zeros((numStates, numActions))


# Q-network training algorithm
for episode in range(500):
    state = 'A'  # Start from state A
    state_index = states.index(state)
    state_vector = state_to_one_hot(state)  # One-hot encoded state vector

    total_reward = 0

    for step in range(maxSteps):
        action = choose_action(state_vector)

        next_state = R[state_index, action]
        next_state_index = states.index(next_state)
        next_state_vector = state_to_one_hot(next_state)

        reward = 0
        if next_state == 'P':
            reward = 1
        if next_state in listOfHoles:
            reward = -1

        # Predict Q-values for the current state
        #q_values = model.predict(state_vector.reshape(1, -1), verbose=0)

        # Predict Q-values for the next state
        #q_values_next = model.predict(next_state_vector.reshape(1, -1), verbose=0)
        q_values = model(state_vector.reshape(1,-1)).numpy()
        q_values_next = model(next_state_vector.reshape(1,-1)).numpy()

        # Compute target Q-value
        target_q_value = reward + gamma * np.max(q_values_next)
        q_values[0][action] = target_q_value

        # Update the Q-network
        model.fit(state_vector.reshape(1, -1), q_values, epochs=1, verbose=0)

        # Update the Q-table
        q_table[state_index, action] = q_values[0, action]

        # Transition to the next state
        state = next_state
        state_index = next_state_index
        state_vector = next_state_vector
        total_reward += reward

        # End the episode if the goal state or a terminal state is reached
        if state == 'P':
            break

    # Decay epsilon
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    # Print progress
    if (episode % 100 == 0):
      print(f"Episode {episode + 1}, Total Reward: {total_reward}, Epsilon: {epsilon:.4f}")

print("Training finished.")

Episode 1, Total Reward: -4, Epsilon: 0.9950
Episode 101, Total Reward: -3, Epsilon: 0.6027
Episode 201, Total Reward: 0, Epsilon: 0.3651
Episode 301, Total Reward: 1, Epsilon: 0.2212
Episode 401, Total Reward: 1, Epsilon: 0.1340
Training finished.


In [12]:
#q_table = np.zeros((numStates, numActions))
for i, state in enumerate(states):
    print(f"State: {state}")
    # Convert state to one-hot encoded vector
    #state_vector = np.zeros(numStates)
    #state_vector[i] = 1  # One-hot encode the state
    #print(state_vector)
    state_vector = state_to_one_hot(state)

    # Predict Q-values for this state
    q_values = model.predict(state_vector.reshape(1, -1), verbose=0)

    # Store Q-values in the Q-table
    print(q_values)
    #q_table[i] = q_values

State: A
[[1.1262012 1.3194517 1.2075603 1.1374321]]
State: B
[[1.0744698  0.48568296 1.4033614  1.0688598 ]]
State: C
[[1.2371029 1.6719157 1.3061504 1.3156047]]
State: D
[[1.1751027 0.5113915 1.1944841 1.4433489]]
State: E
[[1.1287448  1.4483818  0.46822304 1.2560023 ]]
State: F
[[1.2923993 1.585337  1.6055459 1.1033381]]
State: G
[[1.3204914  1.7781868  0.29287404 0.47019634]]
State: H
[[1.2303267 1.0807898 0.4837227 1.4578196]]
State: I
[[1.3076162 0.6445928 1.6022233 1.4844861]]
State: J
[[0.46431753 1.7911094  1.848902   1.4417734 ]]
State: K
[[1.5465803  2.0447042  0.98030543 1.5636303 ]]
State: L
[[0.48448977 2.2461298  0.9532682  1.6939882 ]]
State: M
[[1.5111339  0.60150796 1.7242556  0.60496473]]
State: N
[[1.5779389  1.755153   2.0412478  0.51644015]]
State: O
[[1.7569519 1.9751118 2.2454386 1.7070776]]
State: P
[[0.6739152 1.3724157 1.0600637 1.0609677]]


In [13]:
# After training, you can inspect the Q-table
print("Final Q-table values:")
print(q_table)

Final Q-table values:
[[1.15917659 1.31634831 1.18085051 1.19001567]
 [1.15775573 0.45434201 1.3830086  1.18859851]
 [1.39777339 1.55836093 1.35365069 1.40368533]
 [1.24285328 0.44459453 1.2852664  1.48885334]
 [1.13642538 1.41032875 0.46954349 1.2989732 ]
 [1.32950115 1.63182855 1.61502826 1.19101334]
 [1.49608231 1.69825029 0.31144345 0.39495715]
 [1.35865843 0.99079388 0.40586913 1.47154796]
 [1.32184458 0.60135967 1.61806345 1.47508609]
 [0.45227489 1.83075523 1.81698179 1.47020042]
 [1.63840806 2.01182604 0.87376481 1.64678192]
 [0.386186   2.15031147 0.87297261 1.77129114]
 [1.50512099 0.63633609 1.74565935 0.59004593]
 [1.61184669 1.83036625 2.02066994 0.60559529]
 [1.79670632 2.01953459 2.23034763 1.80333149]
 [0.         0.         0.         0.        ]]
