In [None]:
#Inspired by and adapted to my needs from https://github.com/ageron
#Based on Mnih 2013, playing atari with deep reinforcement learning

In [None]:
%reset -f
import tensorflow as tf
from importlib import reload
sess = tf.InteractiveSession()
import os
import random
from params import * 
import world 
from copy import copy
import matplotlib.pyplot as plt
from collections import deque
import numpy as np

%matplotlib inline
from network import *

In [None]:

# Tolerance with which the goal state is accepted; see my_world.game_over() for details.
precision = 3
# Re-import the module so that edits to world.py are picked up without a kernel restart.
world = reload(world)
my_world = world.world(
    game_mode='constant_reward',
    goal_reward=50,
    precision=precision,
)

In [None]:
# Action primitives. Every action takes no arguments and mutates `my_world`
# by a single step (the step size is given in pixels).

def increase_height():
    """Increase the height by one pixel (no `ellipse` argument passed)."""
    my_world.change_height(1)


def decrease_height():
    """Decrease the height by one pixel (no `ellipse` argument passed)."""
    my_world.change_height(-1)


def increase_width():
    """Increase the width by one pixel (no `ellipse` argument passed)."""
    my_world.change_width(1)


def decrease_width():
    """Decrease the width by one pixel (no `ellipse` argument passed)."""
    my_world.change_width(-1)


# The "...2" variants pass ellipse=1.
# NOTE(review): presumably this addresses the second ellipse - confirm in world.py.

def increase_height2():
    """Increase the height by one pixel, passing ellipse=1."""
    my_world.change_height(1, ellipse=1)


def decrease_height2():
    """Decrease the height by one pixel, passing ellipse=1."""
    my_world.change_height(-1, ellipse=1)


def increase_width2():
    """Increase the width by one pixel, passing ellipse=1."""
    my_world.change_width(1, ellipse=1)


def decrease_width2():
    """Decrease the width by one pixel, passing ellipse=1."""
    my_world.change_width(-1, ellipse=1)


# Action table used by the network: the index of a network output is the
# index of the action in this list.
actions = [
    increase_height, decrease_height, increase_width, decrease_width,
    increase_height2, decrease_height2, increase_width2, decrease_width2,
]



### What does the model architecture look like?

- The input has dimensions [FRAME_DIM, FRAME_DIM, 1]. In the DQN of Mnih et al., the last 4 frames are stacked; our task is much simpler and does not require learning any correlations between frames, so a single frame (last dimension 1) is sufficient.
- Moreover, in our case, it is sufficient to take $\Phi$ to be the identity function, since we do not need preprocessing for image cropping etc.


- First hidden layer: 16 filters of size 8x8 with stride 4. Activation: ReLU.
- Second hidden layer: 32 filters of size 4x4 with stride 2, again relu (EXCLUDED FOR NOW)
- Final hidden layer: 128 fully connected ReLU units.
- output layer: fully connected, one output for each action. 

In [None]:
# Hyper-parameters forwarded to network() via **dqn_args.
dqn_args   = {"n_out_h1" : 16, "kernel_size_h1":(8,8), "strides_h1" : 4, "padding" : "SAME", \
              "actvt_fct" : tf.nn.relu, "n_out_h2" : 32, "kernel_size_h2":(4,4), "strides_h2" : 2, \
              "n_out_h3" : 128,"initializer" : tf.contrib.layers.variance_scaling_initializer() }


# One network output per action, one network head per target.
num_actions = len(actions)
num_targets = 3
# Each target is the keyword-argument dict handed to my_world.set_target(**target).
targets = [{"vaxis" : vaxis, "haxis" : haxis,"vaxis2":vaxis2,"haxis2":haxis2} for vaxis,haxis,vaxis2,haxis2 in \
           zip(np.linspace(5,25,num_targets),np.linspace(5,20,num_targets),np.linspace(5,20,num_targets),np.linspace(5,20,num_targets))]


learning_rate  = 1*10**(-3)

#Batch of memories drawn from memory buffer
batch_size     = 32 

# start training after memory is somewhat filled.
training_start = 4*batch_size  
# gamma discount factor for future rewards. 
discount_rate  = .9


# True forces a world reset on the first iteration of the training loop.
over = True

# Bookkeeping for the training loop: loss of the last training batch, length of
# the game currently being played, and number of games started so far
# (starts at -1 because the initial reset already counts once).
# np.inf instead of np.infty: the latter alias was removed in NumPy 2.0.
prediction_loss = np.inf
game_length = 0
finished   = -1



num_total_updates = 100000
verbose = True

#Time interval after which the target network gets updated by the online learning network
learn_period = 20


# Length of every finished game and the loop iteration at which it finished.
game_length_over_t = []

time = []
t = 0 

#Epsilon greedy policy: initially fully exploratory then decaying epsilon
eps_min = 0.1
eps_max = 1.0 
exploratory_steps = num_total_updates//6
eps_decay_steps = num_total_updates-exploratory_steps

In [None]:


# `x` is the network input: the current state, i.e. just the current image.
# It is the same for the learning network and for the predicting network, so it is
# defined outside the network scopes and shared.
# NOTE(review): `x` is not defined anywhere in this notebook; presumably it is pulled
# in by one of the star-imports (`network` / `params`) - confirm.
#
# Two networks are built from the same hyper-parameters: "learner" is trained
# online, "predictor" is used to compute the bootstrap targets. Each call returns
# the per-target outputs (indexed by target elsewhere in this notebook), the
# network's variables keyed by name, and a third value that is not used here.

learner_out  , learner_vars, _   = network("learner"  ,**dqn_args,num_actions=num_actions,num_targets=num_targets)
predictor_out, predictor_vars, _ = network("predictor",**dqn_args,num_actions=num_actions,num_targets=num_targets)




In [None]:
# Copying the learner into the predictor is expressed as graph operations:
# one assign op per predictor variable, each reading the learner variable that
# is stored under the same key. Grouping the assign ops yields a single
# operation; running it in a session transfers the current learner parameters
# into the predicting network in one go.
update_operations = [
    target_var.assign(learner_vars[key])
    for key, target_var in predictor_vars.items()
]

update_predictor = tf.group(*update_operations)


### Training step of the network

In [None]:
#Define all the variables that will be used for the training, e.g. actions taken, target, q_value and so on.
# Everything below is built once per target head, so the *_per_target lists are
# indexed by the target the agent is currently trained on.

with tf.variable_scope("training",reuse=tf.AUTO_REUSE):
    
    #Placeholder for batch of actions during training.
    act_idx = tf.placeholder(tf.int32, shape=[None])
    #Placeholder for estimated discounted future reward achieved in a step.
    y = tf.placeholder(tf.float32, shape=[None, 1])
    
    # The q_value of the online learning network for a certain step.
    # The one-hot mask zeroes every output except the one of the action that was
    # actually taken; the sum over axis 1 then picks that single Q-value.
    # NOTE(review): `keep_dims` is the older spelling of `keepdims` and is
    # deprecated in later TensorFlow 1.x releases.
    q_value_per_target = [tf.reduce_sum(output * tf.one_hot(act_idx, num_actions), axis=1, keep_dims=True)\
                         for output in learner_out]
    
    ### As in the paper by Mnih, loss is squared difference between target and estimate. 
    # The per-sample loss is clipped to [0, 100].
    # NOTE(review): clipping the loss value (rather than the error or the gradient)
    # presumably yields a zero gradient for samples whose squared error exceeds 100 - confirm this is intended.
    loss_per_target = [tf.clip_by_value(tf.squared_difference(y,q_value,"loss"),0,100) for q_value in q_value_per_target]
    
    #Keeping track of how many batches the network has seen for training.
    global_step = tf.Variable(0, trainable=False, name='global_step')
    # Optimizer used by Mnih
    optimizer = tf.train.RMSPropOptimizer(learning_rate)
    # One training op per target; running any of them increments the shared global_step.
    training_step_per_target = [optimizer.minimize(loss, global_step=global_step) for loss in loss_per_target]

In [None]:
#Memory for replay
max_memory = 500
#Using deque so that max_length is automatically maintained when pushing new data into the replay buffer.
memory = [deque([],maxlen=max_memory) for _ in range(num_targets)]


In [None]:
def sample_memories(mem,batch_size):
    '''
    Draw `batch_size` distinct transitions uniformly at random from a replay buffer.

    Parameters
    ----------
    mem : sequence of 5-tuples
        Replay buffer whose entries are
        (state, action_idx, reward, next_state, continues), where `continues`
        is False for a transition that ended the game.
    batch_size : int
        Number of transitions to draw; must not exceed len(mem).

    Returns
    -------
    tuple
        (states, actions, rewards, next_states, continues). States and next
        states are stacked along a new leading batch axis, actions is a 1-D
        array, rewards and continues are column vectors of shape (batch_size, 1).

    Raises
    ------
    ValueError
        From random.sample if batch_size is larger than the buffer (or negative).
    '''

    batch = random.sample(mem, batch_size)

    # Transpose the list of transitions into one sequence per field. Building a
    # single array from the raw tuples would mix image arrays with scalars,
    # i.e. a ragged object array, which current NumPy versions refuse to create.
    states, actions, rewards, next_states, continues = zip(*batch)

    return (np.stack(states),
            np.asarray(actions),
            np.asarray(rewards).reshape(-1, 1),
            np.stack(next_states),
            np.asarray(continues).reshape(-1, 1))


def epsilon_greedy(q_values, num_updates):
    '''
    Pick an action index with an epsilon-greedy rule whose epsilon depends on
    the number of training updates performed so far.

    For the first `exploratory_steps` updates epsilon is 1, i.e. every action
    is random. Afterwards epsilon falls linearly from eps_max over
    `eps_decay_steps` updates and is never allowed to drop below eps_min.

    q_values    : Q-value estimates for the current state.
    num_updates : number of updates done so far.
    Returns the index of the chosen action.
    '''

    if num_updates < exploratory_steps:
        # Purely exploratory phase.
        epsilon = 1
    else:
        # Linear decay, floored at eps_min.
        decayed = eps_max - (eps_max-eps_min) * (num_updates-exploratory_steps)/eps_decay_steps
        epsilon = max(eps_min, decayed)

    exploit = not (np.random.rand() < epsilon)
    if exploit:
        # Greedy choice: best estimated action.
        return np.argmax(q_values)
    # Exploration: uniformly random action.
    return np.random.randint(num_actions)



In [None]:
#!rm -r one-ellipse/

In [None]:
# Checkpoint prefix. Because of the trailing slash the checkpoint files live
# inside the directory "two-ellipses/" (e.g. "two-ellipses/.index").
save_path = "two-ellipses/"
# Make sure the checkpoint directory exists; depending on the TensorFlow
# version, saving into a missing parent directory fails.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Created after the whole graph has been built so that all variables are
# initialised and saved.
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [None]:
# Main training loop.
# `start_at` is the value of global_step when this run started (non-zero when
# resuming from a checkpoint); `current_target` selects the network head, the
# replay buffer and the world target that are currently in use.
start_at = 0
current_target = 0
with tf.Session() as sess:
    
    # Resume from a checkpoint if one exists, otherwise start from scratch.
    if os.path.isfile(save_path+".index"):
        saver.restore(sess, save_path)
        start_at = global_step.eval()
        print("Retrieving saved parameters")
    else:
        init.run()
        
    #start with online being the same as predictor network
    update_predictor.run()
    print("Starting learning process - Filling memory.",end="")
    
    while True:
        t +=1
        num_updates = global_step.eval()
        # Stop (and save) once this run has performed its share of updates.
        if num_updates-start_at >= num_total_updates:
            saver.save(sess, save_path)
            break
            
        #Reset world if goal is reached  
        if over:            
            finished +=1
            my_world.restart()
            state = my_world.get_frame()


        #Perform actions according to online network and our custom epsilon greedy policy.
        q_values = learner_out[current_target].eval(feed_dict={x: [state]})
        action_idx = epsilon_greedy(q_values, num_updates-start_at)
        #Perform action
        actions[action_idx]()
        #observe world...
        next_state, reward, over = (my_world.get_frame(),\
                                    my_world.get_reward(),\
                                    my_world.game_over())

        #Place observations in memory. The last field is a "continues" flag:
        #False for the transition that ended the game, so no future reward is
        #bootstrapped from a terminal state.
        memory[current_target].append((state, action_idx, reward, next_state, not over))

        # The observed frame becomes the current state of the next step.
        # (This used to call get_frame() on the `world` *module* instead of
        # the `my_world` instance.)
        # NOTE(review): assumes my_world.get_frame() returns a fresh array and
        # not a view of a buffer that later actions mutate - confirm.
        state = next_state

        
        
        #Keep track of game_length over time
        game_length += 1

        if over:
            game_length_over_t.append(game_length)
            time.append(t)
            game_length = 0

        if np.any([len(mem) < training_start for mem in memory ]):
            

            #skip first "training_start" steps until memory is filled enough.
            #Targets are cycled after every step so that all buffers fill up.
            current_target+=1
            current_target%=num_targets
            my_world.set_target(**targets[current_target])
            continue
        
        
        #Print progress!
        elif verbose and num_updates % 200 == 0:
            print("\r Update {}/{} loss : {:.2f} "
                  " finished games:{} width {:.1f} height {:.1f} reward {:.2f}".format(
             num_updates-start_at, num_total_updates,np.sum(prediction_loss)/batch_size,
             finished,my_world.haxis,my_world.vaxis,reward), end="")
            
            
        # Sample memories and use the target DQN to produce the target Q-Value
        mem_x, mem_action, mem_rewards, mem_next_state, continues = sample_memories(memory[current_target],batch_size)
        
        #For target estimate use the predictor #bootstrappingFutureRewards..
        next_q_values = predictor_out[current_target].eval(feed_dict={x: mem_next_state})
        #Learn according to greedy policy.
        
        max_next_q_values = np.max(next_q_values, axis=1, keepdims=True)
        
        #Target to achieve: reward plus discounted best future value (zero if the game ended).
        y_val = mem_rewards + continues * discount_rate * max_next_q_values

        #Perform training steps and calculate the loss
        _, prediction_loss = sess.run([training_step_per_target[current_target],\
                                       loss_per_target[current_target]], 
                                      feed_dict={x: mem_x, act_idx: mem_action, y: y_val})
        # Update the predicting network every once in a while 
        if num_updates % learn_period == 0:
            update_predictor.run()

        # Every 1000 updates: move on to the next target and save tuned variables.
        if num_updates % 1000 == 0:
            current_target+=1
            current_target%=num_targets
            my_world.set_target(**targets[current_target])
            
            saver.save(sess, save_path)


In [None]:
#Plotting the reward function over parameter space

range_x  = int(FRAME_DIM/2-MARGIN/2)
range_y  = int(FRAME_DIM/2-MARGIN/2)
# All (haxis, vaxis) combinations; only these two attributes of the world are varied.
parameter_space = np.array([(haxis,vaxis) for haxis in np.arange(1,range_x+1)\
                                          for vaxis in np.arange(1,range_y+1)])

# One reward map per target, indexed as [haxis][vaxis]; row/column 0 stay unused.
reward_per_target     = [np.zeros((range_x+1,range_y+1)) for _ in range(len(targets))]

# The target has to be switched for every map. Previously the world stayed on
# targets[0] throughout, so every map was a copy of the first one.
for idx, target in enumerate(targets):
    my_world.set_target(**target)

    for ellipse in parameter_space:
        my_world.haxis = ellipse[0]
        my_world.vaxis = ellipse[1]

        my_world.set_frame(which="first")

        reward_per_target[idx][ellipse[0]][ellipse[1]] = my_world.get_reward()

# Leave the world on the first target, which is the one that is plotted.
my_world.set_target(**targets[0])

plt.imshow(reward_per_target[0][1:,1:],origin="lower")
plt.colorbar()

In [None]:
# Length of each finished training game against the loop iteration at which it ended.
plt.plot(time, game_length_over_t)
plt.xlabel("training loop iteration at which the game ended")
plt.ylabel("game length (steps)");

In [None]:
#TODO refactor this, let network play greedy.

# Let the trained predictor network play 2000 steps greedily (no exploration)
# and record how long each game takes.
game_length = 0
play_game_length_over_t = []
play_time = []
# Always begin with a fresh game instead of continuing from whatever state the
# training loop left behind.
over = True
with tf.Session() as sess:
    
    # Restore from the checkpoint written by the training loop (same prefix).
    if os.path.isfile(save_path+".index"):
        saver.restore(sess, save_path)
        # Make the world's target agree with the network head that is evaluated.
        my_world.set_target(**targets[current_target])
        
        
        # A separate loop variable is used so that the training counter `t` is not overwritten.
        for play_t in range(2000):
            #Reset world if goal is reached  
            if over:            
                finished +=1
                my_world.restart()
                state = my_world.get_frame()
                

            #Act greedily on the predictor network's Q-values. `predictor_out`
            #holds one output per target, so the head has to be selected.
            q_values = predictor_out[current_target].eval(feed_dict={x: [state]})
            action_idx = np.argmax(q_values)
            #Perform action
            actions[action_idx]()
            #observe world...
            # game_over() is called without arguments, as in the training loop;
            # the precision was already handed to the world's constructor.
            state, reward, over = (my_world.get_frame(),\
                                        my_world.get_reward(),\
                                        my_world.game_over())

           
            #Keep track of game_length over time
            game_length += 1

            if over:
                play_game_length_over_t.append(game_length)
                play_time.append(play_t)
                game_length = 0
    else:
        print("No checkpoint found at '{}' - train the network first.".format(save_path))

In [None]:
# Length of each finished greedy game against the play step at which it ended.
plt.plot(play_time,play_game_length_over_t)
plt.xlabel("play step at which the game ended")
plt.ylabel("game length (steps)");