In [3]:
#importing required libraries.
import tensorflow as tf #comuptational model used to build machine learning models
import numpy as np #general-purpose array-processing package
import gym             #A toolkit for developing and comparing reinforcement learning algorithms.     Retro Environment
import random
import warnings   #alert
from skimage import transform # Help us to preprocess the frames
from skimage.color import rgb2gray # Help us to gray our frames
import matplotlib.pyplot as plt # Display graphs
from collections import deque # Ordered collection with ends
warnings.filterwarnings('ignore')

In [4]:
class GameEnv:
  """
  This Class creates Atari Game Enivroment and provides some preprocessing functions.
  """
  
  def __init__(self, game = 'SpaceInvaders-v0'):
    self.env = gym.make(game)
    self.n_actions = self.env.action_space.n
    self.frame_size = self.env.observation_space.shape
    self.hot_enc_actions = np.array(np.identity(self.n_actions).tolist()) 
    self.stack_size = 4
    self.stacked_frames = deque([np.zeros((110,84), dtype=np.int) for i in range(self.stack_size)], maxlen=self.stack_size)
    self.hyperparameters = {
                           'learning_rate' : 0.00025,
                           'total_episodes' : 5,#was 5 initially
                           'max_steps' : 50000,#was initially 50000
                           'btach_size': 64,
                           'explore_start' : 1,
                           'explore_end' : 0.01,
                           'decay_rate' : 0.00001,
                           'gamma' : 0.9,
                           'pretrain_length' : 64,
                           'memory_size' : 1000000,
                           'state_size' : [110, 84, 4]
                           }
    self.training = False
    self.render = False
    
    
    
#Converting the main frame into a simpler frame ready to be processed by CNN
  def _preprocess_frame(self,frame):
    gray_frame = rgb2gray(frame)
    cropped_frame = gray_frame[8:-12,4:-12]
    
    # Normalize Pixel Values
    normalized_frame = cropped_frame/255.0
    
    # Resize
    # Thanks to Mikołaj Walkowiak
    preprocessed_frame = transform.resize(normalized_frame, [110,84])
    
    return preprocessed_frame # 110x84x1 frame


# arranges the frames into a stack of 4 frames to recognise the temporal differences
  
  def stack_frame(self, state, new_epis = False):
    
    frame = self._preprocess_frame(state)
    
    if new_epis:
      self.stacked_frames  =  deque([frame for _ in range(self.stack_size)], maxlen=self.stack_size)
    else:
      self.stacked_frames.append(frame)
    
    self.stacked_state = np.stack(self.stacked_frames, axis=2)
    return self.stacked_state

In [5]:
class DeepQNN:
  
  def __init__(self, gamenv):
    self.gamenv = gamenv
    self.decay_step = 0
    with tf.variable_scope('DQNN'):
      self._inputs = tf.placeholder(tf.float32, [None, *self.gamenv.hyperparameters['state_size']], name='inputs')
      self._actions = tf.placeholder(tf.float32, [None, self.gamenv.n_actions], name='actions')
      self.target_Q = tf.placeholder(tf.float32, [None], name="target")
      
      self.conv1 = tf.layers.conv2d(inputs = self._inputs, 
                                    filters = 32,
                                    kernel_size = [8,8],
                                    strides = [4,4],
                                    padding = 'VALID',
                                    kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                    name = 'Conv1')
      self.actvf1 = tf.nn.elu(self.conv1, name='Elu1')
      
      self.conv2 = tf.layers.conv2d(inputs = self.conv1, 
                                    filters = 64,
                                    kernel_size = [4,4],
                                    strides = [2,2],
                                    padding = 'VALID',
                                    kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                    name = 'Conv2')
      self.actvf2 = tf.nn.elu(self.conv2, name='Elu2')
      
      self.conv3 = tf.layers.conv2d(inputs = self.conv2, 
                                    filters = 64,
                                    kernel_size = [3,3],
                                    strides = [2,2],
                                    padding = 'VALID',
                                    kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                    name = 'Conv3')
      self.actvf3 = tf.nn.elu(self.conv3, name='Elu3')
      
      self.flatten = tf.contrib.layers.flatten(self.actvf3)
      self.fc = tf.layers.dense(inputs = self.flatten,
                                units = 512,
                                activation = tf.nn.elu,
                                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                name="fc1")
            
      self.output = tf.layers.dense(inputs = self.fc, 
                                   kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                   units = self.gamenv.n_actions, 
                                   activation=None)
      self.Q = tf.reduce_sum(tf.multiply(self.output, self._actions))
            
            # The loss is the difference between our predicted Q_values and the Q_target
            # Sum(Qtarget - Q)^2
      self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q))
            
      self.optimizer = tf.train.AdamOptimizer(self.gamenv.hyperparameters['learning_rate']).minimize(self.loss)
      
      
  def predict_action(self, state, sess):
    hyperp = self.gamenv.hyperparameters
    explore_probability = hyperp['explore_end'] + (hyperp['explore_start'] - hyperp['explore_end']) * np.exp(-hyperp['decay_rate'] * self.decay_step)
    
    if explore_probability > np.random.rand():
      action = self.gamenv.hot_enc_actions[self.gamenv.env.action_space.sample()]
      
    else:
      Qs = sess.run(self.output,feed_dict = {self._inputs:state.reshape((1,*state.shape))})
      action = self.gamenv.hot_enc_actions[np.argmax(Qs)]
      
    return action, explore_probability

In [6]:
class Memory:
  def __init__(self, max_size):
    self.buffer = deque(maxlen = max_size)
    
  def add(self, experience):
    self.buffer.append(experience)
    
  def sample(self, batch_size):
    buffer_len = len(self.buffer)
    index = np.random.choice(np.arange(buffer_len), size = batch_size, replace = False)
    return [self.buffer[i] for i in index]

In [7]:
def pre_populate_memory(memory, gamenv):
  state = gamenv.env.reset()
  state = gamenv.stack_frame(state,new_epis = True)
  for i in range(gamenv.hyperparameters['btach_size']):
    action = gamenv.hot_enc_actions[gamenv.env.action_space.sample()]
    next_state, reward, done, info = gamenv.env.step(np.argmax(action))
    next_state = gamenv.stack_frame(next_state, new_epis = False)
    if done:
      next_state = np.zeros(next_state.shape)
      memory.add((state, action, reward, next_state, done))
      state = gamenv.env.reset()
      state = gamenv.stack_frame(state,new_epis = True)
    else:
      memory.add((state, action, reward, next_state, done))
      state = next_state
  return memory

In [8]:
!pip install gym[atari]



In [9]:

tf.reset_default_graph()
spaceinvaders = GameEnv()
# spaceinvaders.training = True
dqnn = DeepQNN(spaceinvaders)
memory = Memory(spaceinvaders.hyperparameters['memory_size'])


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.conv2d instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use keras.layers.dense instead.


In [10]:
memory = pre_populate_memory(memory, spaceinvaders)

In [11]:
# Setup TensorBoard Writer
writer = tf.summary.FileWriter("./tensorboard/dqn/1")

## Losses
tf.summary.scalar("Loss", dqnn.loss)

write_op = tf.summary.merge_all()

In [12]:

spaceinvaders.training = True
spaceinvaders.render = True

In [None]:
# Saver will help us to save our model
saver = tf.train.Saver()
hyperp = spaceinvaders.hyperparameters
rewards_list = []
if spaceinvaders.training == True:
    with tf.Session() as sess:
        #saver.restore(sess, "./models/model.ckpt")
        # Initialize the variables
        sess.run(tf.global_variables_initializer())

        # Initialize the decay rate (that will use to reduce epsilon) 
#         decay_step = 0
        
        for episode in range(hyperp['total_episodes']):
            # Set step to 0
            step = 0
            
            # Initialize the rewards of the episode
            episode_rewards = []
            
            # Make a new episode and observe the first state
            state = spaceinvaders.env.reset()
            
            # Remember that stack frame function also call our preprocess function.
            state = spaceinvaders.stack_frame(state, True)
            
            while step < hyperp['max_steps']:
                step += 1
                
                #Increase decay_step
                dqnn.decay_step +=1
                
                # Predict the action to take and take it
                action, explore_probability = dqnn.predict_action(state, sess)
                
                #Perform the action and get the next_state, reward, and done information
                next_state, reward, done, _ = spaceinvaders.env.step(np.argmax(action))
                
                if spaceinvaders.render:
                    spaceinvaders.env.render()
                
                # Add the reward to total reward
                episode_rewards.append(reward)
                
                # If the game is finished
                if done:
                    # The episode ends so no next state
                    next_state = np.zeros((110,84), dtype=np.int)
                    
                    next_state = spaceinvaders.stack_frame(next_state, False)

                    # Set step = max_steps to end the episode
                    step = hyperp['max_steps']

                    # Get the total reward of the episode
                    total_reward = np.sum(episode_rewards)

                    print('Episode: {}'.format(episode),
                                  'Total reward: {}'.format(total_reward),
                                  'Explore P: {:.4f}'.format(explore_probability),
                                'Training Loss {:.4f}'.format(loss))

                    rewards_list.append((episode, total_reward))

                    # Store transition <st,at,rt+1,st+1> in memory D
                    memory.add((state, action, reward, next_state, done))

                else:
                    # Stack the frame of the next_state
                    next_state = spaceinvaders.stack_frame(next_state, False)
                
                    # Add experience to memory
                    memory.add((state, action, reward, next_state, done))

                    # st+1 is now our current state
                    state = next_state
                    

                ### LEARNING PART            
                # Obtain random mini-batch from memory
                batch = memory.sample(hyperp['btach_size'])
                states_mb = np.array([each[0] for each in batch], ndmin=3)
                actions_mb = np.array([each[1] for each in batch])
                rewards_mb = np.array([each[2] for each in batch]) 
                next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                dones_mb = np.array([each[4] for each in batch])

                target_Qs_batch = []

                # Get Q values for next_state 
                Qs_next_state = sess.run(dqnn.output, feed_dict = {dqnn._inputs: next_states_mb})
                
                # Set Q_target = r if the episode ends at s+1, otherwise set Q_target = r + gamma*maxQ(s', a')
                for i in range(0, len(batch)):
                    terminal = dones_mb[i]

                    # If we are in a terminal state, only equals reward
                    if terminal:
                        target_Qs_batch.append(rewards_mb[i])
                        
                    else:
                        target = rewards_mb[i] + hyperp['gamma'] * np.max(Qs_next_state[i])
                        target_Qs_batch.append(target)
                        

                targets_mb = np.array([each for each in target_Qs_batch])

                loss, _ = sess.run([dqnn.loss, dqnn.optimizer],
                                        feed_dict={dqnn._inputs: states_mb,
                                                   dqnn.target_Q: targets_mb,
                                                   dqnn._actions: actions_mb})

                # Write TF Summaries
                summary = sess.run(write_op, feed_dict={dqnn._inputs: states_mb,
                                                       dqnn.target_Q: targets_mb,
                                                       dqnn._actions: actions_mb})
                writer.add_summary(summary, episode)
                writer.flush()

            # Save model every 5 episodes
            if episode % 1 == 0:
                save_path = saver.save(sess, "./models/model.ckpt")
                print("Model Saved")

Episode: 0 Total reward: 150.0 Explore P: 0.9929 Training Loss 19.3168
Model Saved
Episode: 1 Total reward: 160.0 Explore P: 0.9853 Training Loss 6.1762
Model Saved
Episode: 2 Total reward: 180.0 Explore P: 0.9787 Training Loss 7.6604
Model Saved
Episode: 3 Total reward: 155.0 Explore P: 0.9718 Training Loss 0.0089
Model Saved
Episode: 4 Total reward: 135.0 Explore P: 0.9660 Training Loss 0.0815
Model Saved
Episode: 5 Total reward: 80.0 Explore P: 0.9623 Training Loss 0.0213
Model Saved
Episode: 6 Total reward: 125.0 Explore P: 0.9575 Training Loss 6.1587
Model Saved
Episode: 7 Total reward: 230.0 Explore P: 0.9490 Training Loss 6.4367
Model Saved
Episode: 8 Total reward: 225.0 Explore P: 0.9412 Training Loss 0.1037
Model Saved
Episode: 9 Total reward: 30.0 Explore P: 0.9376 Training Loss 3.4616
Model Saved
Episode: 10 Total reward: 75.0 Explore P: 0.9329 Training Loss 0.0468
Model Saved
Episode: 11 Total reward: 85.0 Explore P: 0.9277 Training Loss 0.0271
Model Saved
Episode: 12 Total

In [1]:
saver = tf.train.Saver()
step = 0
with tf.Session() as sess:
    total_test_rewards = []
    
    # Load the model
    saver.restore(sess, "./models/model.ckpt")
    
    for episode in range(1):
        total_rewards = 0
        
        state = spaceinvaders.env.reset()
        state = spaceinvaders.stack_frame(state, True)
        
        print("****************************************************")
        print("EPISODE ", episode)
        
        while True:
            # Reshape the state
            state = state.reshape((1, *spaceinvaders.hyperparameters['state_size']))
            # Get action from Q-network 
            # Estimate the Qs values state
            Qs = sess.run(dqnn.output, feed_dict = {dqnn._inputs: state})
            step +=1
            # Take the biggest Q value (= the best action)
            choice = np.argmax(Qs)
#             print(choice)
            action = spaceinvaders.hot_enc_actions[choice]
#             print(choice)
            #Perform the action and get the next_state, reward, and done information
            next_state, reward, done, _ = spaceinvaders.env.step(choice)
            spaceinvaders.env.render()
            
            total_rewards += reward

            if done:
                print ("Score", total_rewards)
                total_test_rewards.append(total_rewards)
                print(step)
                break
                
                
            next_state = spaceinvaders.stack_frame(next_state, False)
            state = next_state
            
    spaceinvaders.env.close()
    del(spaceinvaders)
    spaceinvaders = GameEnv()

NameError: name 'tf' is not defined