In [3]:
# Imported here as well so that this cell also works on a freshly started kernel,
# where the main import cell further down has not been executed yet.
import tensorflow as tf

print(tf.__version__)

1.15.0


In [1]:
import os
import gym
import random
import numpy as np
import tensorflow as tf
from collections import deque
from skimage.color import rgb2gray
from skimage.transform import resize
import tensorflow
from keras.models import Sequential
from keras.layers import Convolution2D, Flatten, Dense
from keras import backend as K

Using TensorFlow backend.


In [2]:
# --- Run configuration ---
TRAIN = True  # The episode loop at the bottom of the notebook only runs when this is True
LOAD_NETWORK = False  # When True, Agent.__init__ calls load() to restore a saved checkpoint
BATCH_SIZE = 32  # Number of transitions sampled from the replay memory per training step
SAVE_INTERVAL = 1000  # A checkpoint is written whenever the step counter is a multiple of this

# States are fed to the network with the stacked frames as the leading (channel) axis
K.set_image_data_format('channels_first')

In [5]:
# Size that every game frame is resized to before it becomes part of a state
FRAME_WIDTH, FRAME_HEIGHT = 84, 84
class Agent():
    
    def __init__(self, num_actions):
        """Build the Q-network, the target network and the training ops, then open a session.

        Args:
            num_actions: Number of discrete actions. Used as the width of the
                network's output layer and as the range for random actions.

        Side effects:
            Opens a ``tf.InteractiveSession``, creates ``SAVE_NETWORK_PATH`` on disk
            when it does not exist yet, initialises all graph variables and, when the
            module-level ``LOAD_NETWORK`` flag is set, calls ``load()``.
        """
        self.ENV_NAME = 'Breakout-v0'  # Game and its version
        self.SAVE_NETWORK_PATH = 'saved_networks/' + self.ENV_NAME
        self.SAVE_SUMMARY_PATH = 'summary/' + self.ENV_NAME
        self.EXPLORATION_STEPS = 1000000  # Steps for linearly decreasing epsilon
        self.GAMMA = 0.99  # Discount factor
        self.INITIAL_EPSILON = 1.0  # Initial value of epsilon in epsilon-greedy
        self.FINAL_EPSILON = 0.1  # Final value of epsilon in epsilon-greedy
        self.TRAIN_INTERVAL = 4  # The agent selects 4 actions between successive updates

        self.num_actions = num_actions
        self.epsilon = self.INITIAL_EPSILON
        # Amount subtracted from epsilon on every annealing step
        self.epsilon_step = (self.INITIAL_EPSILON - self.FINAL_EPSILON) / self.EXPLORATION_STEPS
        self.t = 0  # Global step counter, incremented once per act() call

        # Per-episode statistics; act() resets them at the end of every episode
        self.total_reward = 0
        self.total_q_max = 0
        self.total_loss = 0
        self.duration = 0
        self.episode = 0

        # Replay memory of (state, action, reward, next_state, terminal) tuples
        self.replay_memory = deque()

        # Q-network: the one that is trained and used to pick actions
        self.s, self.q_values, q_network = self.build_model()
        q_network_weights = q_network.trainable_weights

        # Target network: same architecture, only changed through the copy ops below
        self.st, self.target_q_values, target_network = self.build_model()
        target_network_weights = target_network.trainable_weights

        # One assign op per weight, copying the Q-network value into the target network
        self.update_target_network = [
            target_weight.assign(q_weight)
            for target_weight, q_weight in zip(target_network_weights, q_network_weights)
        ]

        # Loss and gradient update operation (only the Q-network weights are trained)
        self.a, self.y, self.loss, self.grads_update = self.build_training_op(q_network_weights)

        self.sess = tf.InteractiveSession()
        self.saver = tf.train.Saver(q_network_weights)

        if not os.path.exists(self.SAVE_NETWORK_PATH):
            os.makedirs(self.SAVE_NETWORK_PATH)

        # tf.initialize_all_variables() is deprecated; this is its replacement
        self.sess.run(tf.global_variables_initializer())

        # Optionally restore previously saved Q-network weights
        if LOAD_NETWORK:
            self.load()

        # Start with the target network equal to the (possibly restored) Q-network
        self.sess.run(self.update_target_network)
        
   
    STATE_LENGTH = 4  # Number of most recent frames stacked together to form one network input
    def build_model(self):
        """Create one convolutional Q-value network and connect it to a new input placeholder.

        Returns:
            A tuple ``(s, q_values, model)``:
            ``s`` is a float32 placeholder of shape
            ``[None, STATE_LENGTH, FRAME_WIDTH, FRAME_HEIGHT]``,
            ``q_values`` is the tensor obtained by applying the model to ``s``
            (the final layer has ``num_actions`` units), and ``model`` is the
            Keras ``Sequential`` model itself.
        """
        state_shape = (self.STATE_LENGTH, FRAME_WIDTH, FRAME_HEIGHT)

        # Keras 2 call form: the kernel size is a tuple and the stride is passed as
        # `strides` (the Keras 1 form used positional rows/cols and `subsample`).
        model = Sequential()
        model.add(Convolution2D(32, (8, 8), strides=(4, 4), activation='relu', input_shape=state_shape))
        model.add(Convolution2D(64, (4, 4), strides=(2, 2), activation='relu'))
        model.add(Convolution2D(64, (3, 3), strides=(1, 1), activation='relu'))
        model.add(Flatten())
        model.add(Dense(512, activation='relu'))
        model.add(Dense(self.num_actions))

        s = tf.placeholder(tf.float32, [None] + list(state_shape))
        q_values = model(s)

        return s, q_values, model

    def build_training_op(self, q_network_weights):
        """Create the placeholders, the clipped-error loss and the RMSProp update op.

        Args:
            q_network_weights: Variables the optimizer is allowed to update.

        Returns:
            A tuple ``(a, y, loss, grads_update)``: the int64 action placeholder,
            the float32 target placeholder, the scalar loss tensor and the op
            that applies one optimizer step to ``q_network_weights``.
        """
        MOMENTUM = 0.95
        LEARNING_RATE = 0.00025  # Learning rate used by RMSProp
        MIN_GRAD = 0.01  # Constant added to the squared gradient in the denominator of the RMSProp update

        a = tf.placeholder(tf.int64, [None])
        y = tf.placeholder(tf.float32, [None])

        # Select the Q-value of the action that was taken by masking with a one-hot vector
        a_one_hot = tf.one_hot(a, self.num_actions, 1.0, 0.0)
        # `axis` replaces the deprecated `reduction_indices` argument
        q_value = tf.reduce_sum(tf.multiply(self.q_values, a_one_hot), axis=1)

        # Clip the error: the loss is quadratic while the error is inside (-1, 1)
        # and linear outside of that region
        error = tf.abs(y - q_value)
        quadratic_part = tf.clip_by_value(error, 0.0, 1.0)
        linear_part = error - quadratic_part
        loss = tf.reduce_mean(0.5 * tf.square(quadratic_part) + linear_part)

        optimizer = tf.train.RMSPropOptimizer(LEARNING_RATE, momentum=MOMENTUM, epsilon=MIN_GRAD)
        grads_update = optimizer.minimize(loss, var_list=q_network_weights)

        return a, y, loss, grads_update

  
    INITIAL_REPLAY_SIZE = 20000  # Steps of purely random play that fill the replay memory before any training happens
    
    def get_action(self, state):
        if self.epsilon >= random.random() or self.t < self.INITIAL_REPLAY_SIZE:
            action = random.randrange(self.num_actions)
        else:
            action = np.argmax(self.q_values.eval(feed_dict={self.s: [np.float32(state / 255.0)]}))

        # Decreasing espsilon over time
        if self.epsilon > self.FINAL_EPSILON and self.t >= self.INITIAL_REPLAY_SIZE:
            self.epsilon -= self.epsilon_step

        return action
    
    def remember(self,state, action, reward, next_state, terminal):
        self.replay_memory.append((state, action, reward, next_state, terminal))
    
    def act(self, state, action, reward, terminal, observation):
        next_state = np.append(state[1:, :, :], observation, axis=0)

        # Clip all positive rewards at 1 and all negative rewards at -1, leaving 0 rewards unchanged
        reward = np.clip(reward, -1, 1)

        # Store transition in replay memory
        NUM_REPLAY_MEMORY = 400000  # Number of replay memory the agent uses for training
        
        self.remember(state, action, reward, next_state, terminal)
        
        if len(self.replay_memory) > NUM_REPLAY_MEMORY:
            self.replay_memory.popleft()

        TARGET_UPDATE_INTERVAL = 10000  # The frequency with which the target network is updated
        if self.t >= self.INITIAL_REPLAY_SIZE:
            # Train network
            if self.t % self.TRAIN_INTERVAL == 0:
                self.replay()

            # Update target network
            if self.t % TARGET_UPDATE_INTERVAL == 0:
                self.sess.run(self.update_target_network)

            # Save network
            if self.t % SAVE_INTERVAL == 0:
                self.save()

        self.total_reward += reward
        self.total_q_max += np.max(self.q_values.eval(feed_dict={self.s: [np.float32(state / 255.0)]}))
        self.duration += 1

        if terminal:
            # Debug
            if self.t < self.INITIAL_REPLAY_SIZE:
                mode = 'random'
            elif self.INITIAL_REPLAY_SIZE <= self.t < self.INITIAL_REPLAY_SIZE + self.EXPLORATION_STEPS:
                mode = 'explore'
            else:
                mode = 'exploit'
            print('EPISODE: {0:6d} / TIMESTEP: {1:8d} / DURATION: {2:5d} / EPSILON: {3:.5f} / TOTAL_REWARD: {4:3.0f} / AVG_MAX_Q: {5:2.4f} / AVG_LOSS: {6:.5f} / MODE: {7}'.format(
                self.episode + 1, self.t, self.duration, self.epsilon,
                self.total_reward, self.total_q_max / float(self.duration),
                self.total_loss / (float(self.duration) / float(self.TRAIN_INTERVAL)), mode))
            with open("BreakoutgameDQN.txt", "a") as f:
                f.write("Simulation {}: Total reward {}  total loss {}\n".format(str(self.episode + 1), self.total_reward, self.total_reward))

            self.total_reward = 0
            self.total_q_max = 0
            self.total_loss = 0
            self.duration = 0
            self.episode += 1

        self.t += 1

        return next_state
    
    # Function for training the network
    def replay(self):
        state_batch = []
        action_batch = []
        reward_batch = []
        next_state_batch = []
        terminal_batch = []
        y_batch = []

        # Sample random minibatch of transition from replay memory
        minibatch = random.sample(self.replay_memory, BATCH_SIZE)
        for data in minibatch:
            state_batch.append(data[0])
            action_batch.append(data[1])
            reward_batch.append(data[2])
            next_state_batch.append(data[3])
            terminal_batch.append(data[4])

        # Convert True to 1, False to 0
        terminal_batch = np.array(terminal_batch) + 0

        target_q_values_batch = self.target_q_values.eval(feed_dict={self.st: np.float32(np.array(next_state_batch) / 255.0)})
        y_batch = reward_batch + (1 - terminal_batch) * self.GAMMA * np.max(target_q_values_batch, axis=1)

        loss, _ = self.sess.run([self.loss, self.grads_update], feed_dict={
            self.s: np.float32(np.array(state_batch) / 255.0),
            self.a: action_batch,
            self.y: y_batch
        })

        self.total_loss += loss
        
    def load(self):
        checkpoint = tf.train.get_checkpoint_state(self.SAVE_NETWORK_PATH)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            
    def save(self):
        save_path = self.saver.save(self.sess, self.SAVE_NETWORK_PATH + '/' + self.ENV_NAME, global_step=self.t)
        print('Successfully saved: ' + save_path)

In [6]:
def preprocess_frame(observation, last_observation):
    """Turn two consecutive raw frames into one resized grayscale uint8 frame.

    Args:
        observation: Most recent raw frame returned by the environment.
        last_observation: Raw frame that preceded it.

    Returns:
        A uint8 array of shape ``(FRAME_WIDTH, FRAME_HEIGHT)``: the element-wise
        maximum of both frames, converted with ``rgb2gray``, resized and
        multiplied by 255.
    """
    merged = np.maximum(observation, last_observation)
    return np.uint8(resize(rgb2gray(merged), (FRAME_WIDTH, FRAME_HEIGHT)) * 255)


if __name__ == "__main__":
    EPISODES = 30000  # times the game is played
    NO_OP_STEPS = 30  # Maximum number of "do nothing" actions to be performed by the agent at the start of an episode

    env = gym.make('Breakout-v0')
    agent = Agent(num_actions=env.action_space.n)

    # Taken from the agent so the stacked state always matches the network input
    STATE_LENGTH = agent.STATE_LENGTH

    try:
        if TRAIN:  # Train mode
            for _ in range(EPISODES):
                terminal = False
                observation = env.reset()

                # Start every episode with a random number (at least one) of no-op steps
                for _ in range(random.randint(1, NO_OP_STEPS)):
                    last_observation = observation
                    observation, _, _, _ = env.step(0)  # Do nothing

                # The initial state is the first processed frame repeated STATE_LENGTH times
                first_frame = preprocess_frame(observation, last_observation)
                state = np.stack([first_frame] * STATE_LENGTH, axis=0)

                while not terminal:
                    last_observation = observation
                    action = agent.get_action(state)
                    observation, reward, terminal, _ = env.step(action)

                    # Add a leading axis so the frame can be appended to the stacked state
                    processed_observation = np.reshape(
                        preprocess_frame(observation, last_observation),
                        (1, FRAME_WIDTH, FRAME_HEIGHT))

                    state = agent.act(state, action, reward, terminal, processed_observation)
    finally:
        # Release the environment even when training is interrupted from the keyboard
        env.close()



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use `tf.global_variables_initializer` instead.
EPISODE:      1 / TIMESTEP:      233 / DURATION:   234 / EPSILON: 1.00000 / TOTAL_REWARD:   1 / AVG_MAX_Q: 0.0497 / AVG_LOSS: 0.00000 / MODE: random
EPISODE:      2 / TIMESTEP:      400 / DURATION:   167 / EPSILON: 1.00000 / TOTAL_REWARD:   0 / AVG_MAX_Q: 0.0484 / AVG_LOSS: 0.00000 / MODE: random
EPISODE:      3 / TIMESTEP:      641 / DURATION:   241 / EPSILON: 1.00000 / TOTAL_REWARD:   1 / AVG_MAX_Q: 0.0493 / AVG_LOSS: 0.00000 / MODE: random
EPISODE:      4 / TIMESTEP:      927 / DURATION:   286 / EPSILON: 1.00000 / TOTAL_REWARD:   2 / AVG_MAX_Q: 0.0465 / AVG_LOSS: 0.00000 / MODE: random
EPISODE:      5 / TIMESTEP:     1095 / DURATION:   168 / EPSILON: 1.00000 / TOTAL_REWARD:   0 / AVG_MAX_Q: 0.0474 / AV

KeyboardInterrupt: 