In [1]:
import tensorflow as tf




In [5]:
import time
from collections import deque, namedtuple
import numpy as np
import tensorflow as tf
from keras import Sequential
from keras.layers import Dense, Input
from keras.losses import MSE
from keras.optimizers import Adam
from game import SnakeGameAI, Direction, Point
from agent import Agent
import helper

In [6]:
tf.random.set_seed(helper.SEED)

In [7]:
MEMORY_SIZE = 100_000     # size of memory buffer
GAMMA = 0.995             # discount factor
ALPHA = 1e-3              # learning rate  
NUM_STEPS_FOR_UPDATE = 4  # perform a learning update every C time steps


In [8]:
state_size = 11
num_actions = 3

In [9]:
# Store experiences as named tuples
experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])

In [10]:
# Create the Q-Network
q_network = Sequential([
    ### START CODE HERE ### 
    Input(shape=state_size),  
    Dense(64, activation='relu'),
    Dense(64, activation = 'relu'),
    Dense(num_actions, activation = 'linear'),
    ### END CODE HERE ### 
    ])

# Create the target Q^-Network
target_q_network = Sequential([
    ### START CODE HERE ### 
    Input(shape=state_size),  
    Dense(64, activation='relu'),
    Dense(64, activation = 'relu'),
    Dense(num_actions, activation = 'linear'),
    ### END CODE HERE ###
    ])

### START CODE HERE ### 
optimizer = Adam(learning_rate = ALPHA)
### END CODE HERE ###




In [11]:
def compute_loss(experiences, gamma, q_network, target_q_network):
    """ 
    Calculates the loss.
    
    Args:
      experiences: (tuple) tuple of ["state", "action", "reward", "next_state", "done"] namedtuples
      gamma: (float) The discount factor.
      q_network: (tf.keras.Sequential) Keras model for predicting the q_values
      target_q_network: (tf.keras.Sequential) Keras model for predicting the targets
          
    Returns:
      loss: (TensorFlow Tensor(shape=(0,), dtype=int32)) the Mean-Squared Error between
            the y targets and the Q(s,a) values.
    """

    # Unpack the mini-batch of experience tuples
    states, actions, rewards, next_states, done_vals = experiences
    
    # Compute max Q^(s,a)
    max_qsa = tf.reduce_max(target_q_network(next_states), axis=-1)
    
    # Set y = R if episode terminates, otherwise set y = R + γ max Q^(s,a).
    ### START CODE HERE ### 
    y_targets = rewards + gamma * (max_qsa * (1-done_vals))
    ### END CODE HERE ###
    
    # Get the q_values and reshape to match y_targets
    q_values = q_network(states)
    q_values = tf.gather_nd(q_values, tf.stack([tf.range(q_values.shape[0]),
                                                tf.cast(actions, tf.int32)], axis=1))
        
    # Compute the loss
    ### START CODE HERE ### 
    loss = MSE(y_targets, q_values)
    ### END CODE HERE ### 
    
    return loss

In [12]:
@tf.function
def agent_learn(experiences, gamma):
    """
    Updates the weights of the Q networks.
    
    Args:
      experiences: (tuple) tuple of ["state", "action", "reward", "next_state", "done"] namedtuples
      gamma: (float) The discount factor.
    
    """
    
    # Calculate the loss
    with tf.GradientTape() as tape:
        loss = compute_loss(experiences, gamma, q_network, target_q_network)

    # Get the gradients of the loss with respect to the weights.
    gradients = tape.gradient(loss, q_network.trainable_variables)
    
    # Update the weights of the q_network.
    optimizer.apply_gradients(zip(gradients, q_network.trainable_variables))

    # update the weights of target q_network
    helper.update_target_network(q_network, target_q_network)

In [16]:
start = time.time()

num_episodes = 2000
max_num_timesteps = 1500

total_point_history = []
record = 0

num_p_av = 100 # number of total point to use for averaging
epsilon = 1.0 # initial ε value for ε-greedy policy

# Create a memory buffer D with capacity N
memory_buffer = deque(maxlen=MEMORY_SIZE)

# Set the target network weights equal to the Q-Network weights
target_q_network.set_weights(q_network.get_weights())

agent = Agent()
game = SnakeGameAI()

while True:
    
    # reset the environment to the initial state and get the inital state
    state = agent.get_state(game)
    total_points = 0
    t = 0
    
    while True:
        t += 1
        
        state_qn = np.expand_dims(state, axis=0)
        q_values = q_network(state_qn)
        
        # get move
        action = agent.get_action(state)

        # perform move and get new state
        reward, done, score = game.play_step(action)
        next_state = agent.get_state(game)
        
        memory_buffer.append(experience(state, action, reward, next_state, done))
        
        update = helper.check_update_conditions(t, NUM_STEPS_FOR_UPDATE, memory_buffer)
        
        if update:
            
            experiences = helper.get_experiences(memory_buffer)
            
            agent_learn(experiences, GAMMA)
        
        state = next_state.copy()
        total_points += reward
        
        if done:
            game.reset()
            agent.n_games += 1
            if score > record:
                record = score
                agent.model.save()
                
            print('Game', agent.n_games, 'Score', score, 'Record:', record)

#             plot_scores.append(score)
#             total_score += score
#             mean_score = total_score / agent.n_games
#             plot_mean_scores.append(mean_score)
#             plot(plot_scores, plot_mean_scores)
    
    total_point_history.append(total_points)
    av_latest_points = np.mean(total_point_history[-num_p_av:])
    
    print(f"\rEpisode {i+1} | Total point average of the last {num_p_av} episodes: {av_latest_points:.2f}", end="")

    if (i+1) % num_p_av == 0:
        print(f"\rEpisode {i+1} | Total point average of the last {num_p_av} episodes: {av_latest_points:.2f}")

    # We will consider that the environment is solved if we get an
    # average of 200 points in the last 100 episodes.
    if av_latest_points >= 200.0:
        print(f"\n\nEnvironment solved in {i+1} episodes!")
        q_network.save('lunar_lander_model.h5')
        break

tot_time = time.time() - start
print(f"\nTotal Runtime: {tot_time:.2f} s ({(tot_time/60):.2f} min)")

Game 1 Score 0 Record: 0
Game 2 Score 0 Record: 0
Game 3 Score 0 Record: 0
Game 4 Score 0 Record: 0
Game 5 Score 0 Record: 0
Game 6 Score 0 Record: 0
Game 7 Score 0 Record: 0
Game 8 Score 0 Record: 0
Game 9 Score 0 Record: 0
Game 10 Score 0 Record: 0
Game 11 Score 0 Record: 0
Game 12 Score 0 Record: 0
Game 13 Score 0 Record: 0
Game 14 Score 0 Record: 0
Game 15 Score 0 Record: 0
Game 16 Score 0 Record: 0
Game 17 Score 0 Record: 0
Game 18 Score 0 Record: 0
Game 19 Score 0 Record: 0
Game 20 Score 0 Record: 0


KeyboardInterrupt: 

In [1]:
state_qn = np.expand_dims(state, axis=0)

NameError: name 'np' is not defined

In [36]:
state_qn

array([[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1]])

In [10]:
q_values = q_network(state_qn)

In [11]:
q_values

<tf.Tensor: shape=(1, 3), dtype=float32, numpy=array([[-0.22272423,  0.30968338,  0.02527073]], dtype=float32)>

In [33]:
agent = Agent()
game = SnakeGameAI()

In [12]:
agent.get_action(state)

1