In [3]:

import numpy as np
import pandas as pd
#np.random.seed(1337)
import os

In [5]:

from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam, SGD
import keras
import theano
import theano.tensor as T
from keras import backend as K

class LunarLearner:
    '''
    Q-value approximator for the lunar lander, backed by a small Keras network.

    The network takes an 8-value state and emits 8 outputs. Only the first
    four outputs are ever read as per-action Q-values (see ``squared_error``
    and ``predict_actions``); the remaining four exist so that the target
    array passed to ``fit`` can carry the action mask alongside the targets.
    '''

    def squared_error(self, y_true, y_pred):
        '''
        Custom error function
        Action Mask and predictions are provided in y_true

        y_true[:, 0:4] holds the target Q-value in the column of the action
        that was taken (zeros elsewhere) and y_true[:, 4:8] holds the one-hot
        action mask, as assembled in ``train_learner``. Only the network
        output for the taken action contributes to the loss.
        '''
        slice_true = y_true[:, 0:4]
        slice_pred = y_pred[:, 0:4]
        mask = y_true[:, 4:8]
        # Q-value the network predicted for the action that was taken.
        predicted_q = K.sum(slice_pred * mask, axis=-1)
        # Target Q-value (already zero outside the taken action's column).
        target_q = K.sum(slice_true, axis=-1)
        return K.sum(K.square(target_q - predicted_q), axis=-1)

    def __init__(self, lr=0.001, gamma=1.0, batch_size=200, layer_width=64):
        '''
        Build and compile the network.

        lr          -- learning rate handed to the Adam optimizer
        gamma       -- discount factor applied to the next state's best Q-value
        batch_size  -- number of transitions sampled per training call
        layer_width -- number of units in the single hidden layer
        '''
        self.gamma = gamma
        self.learning_rate = lr
        self.batch_size = batch_size
        self.num_actions = 4
        self.model = Sequential()
        # NOTE(review): `init=` (and `nb_epoch=` in train_learner) look like
        # Keras 1 keyword spellings; confirm against the installed Keras
        # version before renaming them.
        self.model.add(Dense(layer_width, input_dim=8, init='uniform', activation='linear'))
        self.model.add(keras.layers.advanced_activations.LeakyReLU(alpha=0.01))
        self.model.add(Dense(8, init='uniform', activation='linear'))
        opt = Adam(lr=self.learning_rate)
        self.model.compile(optimizer=opt, loss=self.squared_error)

    def train_learner(self, data):
        '''
        Trains the learner from historical observations

        data is a 2-D array with one row per recorded step. The slicing below
        assumes the column layout written by LunarAgentNNBase.record_history:
        game_idx, move_idx, action_idx, is_done, reward, 8 state values,
        cumulative reward.

        Returns None. Does nothing when no valid transition can be sampled.
        '''
        nh, nw = data.shape
        if nh < 2:
            # A transition needs a row and its successor.
            return

        indexes = np.random.choice(nh - 1, self.batch_size)
        next_states = indexes + 1

        # Each sampled row is placed side by side with the row that follows it.
        pn = np.hstack([data[indexes, :], data[next_states, :]])
        pn = pn[pn[:, 0] == pn[:, nw]]  # ensure next state is from the same game
        if len(pn) == 0:
            # Every sampled pair straddled a game boundary; nothing to fit.
            return

        x_curr = pn[:, 5:(nw - 1)]           # state of the first row
        x_next = pn[:, (nw + 5):-1]          # state of the following row
        # Action, reward and done flag are read from the FOLLOWING row: each
        # record stores the action that led to its own observation.
        r = pn[:, (nw + 4):(nw + 5)]
        a = pn[:, (nw + 2):(nw + 3)]
        final = 1. - pn[:, (nw + 3):(nw + 4)]  # 0 when the next row is terminal
        a_vec = np.zeros(shape=(len(a), 4), dtype=float)
        a_vec[np.arange(len(a)), a.astype(int).reshape(-1)] = 1.

        Q_next = self.predict(x_next).reshape((-1, 8))  # one record for each action
        Q_max = np.max(Q_next[:, 0:4], axis=1).reshape(-1, 1)

        # Bootstrapped target; the future term is dropped on terminal steps.
        Q = r + self.gamma * Q_max * final
        Q_target = np.hstack((Q * a_vec, a_vec))  # stack target and mask

        history = LossHistory()
        self.target_actions = np.zeros((1, 1))
        self.model.fit(x_curr, Q_target, nb_epoch=1, batch_size=self.batch_size*4, verbose=0, callbacks=[history])
        #print history.losses[-1]

    def predict_actions(self, X):
        '''Return the four action values for a single state X as a (1, 4) array.'''
        return self.predict(X.reshape(1, -1))[:, 0:4]

    def predict(self, X):
        '''Return the raw network output for a batch of states X.'''
        return self.model.predict(X, batch_size=self.batch_size*4)


class LossHistory(keras.callbacks.Callback):
    '''
    Record Loss history for debugging

    After training, ``losses`` holds one entry per batch: the value stored
    under the 'loss' key of that batch's logs (None when the key is absent).
    '''
    def on_train_begin(self, logs=None):
        # Start from an empty list every time training begins.
        self.losses = []

    def on_batch_end(self, batch, logs=None):
        # Tolerate a missing logs dict instead of sharing a mutable default.
        logs = logs or {}
        self.losses.append(logs.get('loss'))

Using TensorFlow backend.


In [21]:
class LunarAgentNNBase:
    '''
    Epsilon-greedy agent that records every step, keeps a replay buffer of
    past games and delegates value estimation/training to ``learner``.

    ``learner`` must provide ``train_learner(data)`` and
    ``predict_actions(state)``.
    '''

    def __init__(self, learner, datapath, epsilon_change=0.0, epsilon_decay=0.99, epsilon_min=0.1,
                 keep_best=True, normalise_data=False, accelerate=False, properties=None):
        '''
        learner        -- object used for training and action-value prediction
        datapath       -- run name; used as log prefix and results file stem
        epsilon_change -- probability of repeating the previous action instead
                          of drawing a new random one while exploring
        epsilon_decay  -- multiplicative decay applied to epsilon per game
        epsilon_min    -- decay stops once epsilon is at or below this value
        keep_best      -- prune the replay buffer to the best-scoring games
                          (True) or to recent games (False)
        normalise_data -- shift/scale/clip states before recording them
        accelerate     -- enable the extra random-action branch in
                          epsilon_greedy_action
        properties     -- accepted for compatibility; not used
        '''
        self.epsilon = 1.0  # e greedy implementation of q learning
        self.normalise = normalise_data
        self.epsilon_decay = epsilon_decay
        self.epsilon_change = epsilon_change
        self.epsilon_min = epsilon_min
        self.keep_best = keep_best
        self.accelerate = accelerate
        self.history = []          # [game_idx, total reward] of games still in the buffer
        self.average_over = 100    # window of the rolling average in get_progress
        self.current_game = []
        self.previous_game = []
        # self.mem_idx = 0
        self.ENV_STATE, self.ENV_REWARD, self.ENV_DONE = 0, 1, 2
        self.game_idx = 0
        self.move_idx = 0
        self.previous_action = 0
        self.cumulative_reward = 0.0
        self.num_actions = 4
        self.learner = learner
        self.datapath = datapath
        self.decay = 1.0           # factor applied to epsilon at the end of each game
        self.data = None           # replay buffer, one row per recorded step
        self.early_termination = False
        self.best = (0, -1000.)    # (game_idx, total reward) of the best game so far
        self.allhistory = []       # [game_idx, total reward] of every game, never pruned

    def get_progress(self):
        '''
        Gets the rolling average at the end of each game and records in a npy file for plotting
        Each run will have a separate file.

        Returns the mean total reward over the last ``average_over`` games.
        '''
        hist = np.array(self.allhistory)
        basepath = 'results/'
        # Make sure the target directory exists before saving into it.
        os.makedirs(basepath, exist_ok=True)
        np.save(basepath + self.datapath + '-data.npy', hist)
        return np.mean(hist[-self.average_over:, 1])

    def new_game(self):
        '''
        Run at the end of each game to reset parameters

        Logs the finished game, appends its records to the replay buffer,
        prunes the buffer once more than 512 games are held, and applies the
        epsilon decay.
        '''
        self.history.append(np.array([self.game_idx, self.cumulative_reward]))
        self.allhistory.append(np.array([self.game_idx, self.cumulative_reward]))
        print(self.datapath + "G {0}, Len: {1}, eps: {2:0.3f} Rewd: {3}, Best {4} Avg {5}".format(self.game_idx,
                                                                                         len(self.current_game),
                                                                                         self.epsilon,
                                                                                         self.cumulative_reward,
                                                                                         self.best, self.get_progress()))
        # Fill the last column with the reward-to-go: the sum of rewards from
        # each step until the end of the game.
        cg_data = np.array(self.current_game)
        game_rewards = cg_data[:, 4]
        cg_data[:, -1] = np.cumsum(game_rewards[::-1])[::-1]

        self.game_idx += 1
        self.move_idx = 0
        self.previous_action = 0
        self.cumulative_reward = 0.0
        self.previous_game = self.current_game

        # Store cg_data (not a fresh copy of current_game) so the reward-to-go
        # column computed above is actually kept in the buffer.
        if self.data is None:
            self.data = cg_data
        else:
            self.data = np.vstack((self.data, cg_data))

        if len(self.history) > 512:
            games = np.array(self.history)
            if self.keep_best:
                # MEMORY OPTIMISATION - ONLY KEEP BEST GAMES
                games_sorted = games[games[:, 1].argsort()[::-1], :]
                selected_games = games_sorted[:256, :]
            else:
                # ALTERNATE MEMORY OPTIMISATION - ONLY KEEP RECENT GAMES
                # NOTE(review): the original comment here read "drop 10 every
                # 10", but this slice keeps only the entries from index 502
                # onwards. Confirm which behaviour is intended.
                selected_games = games[502:, :]
            filter_data = np.in1d(self.data[:, 0], selected_games[:, 0])
            self.data = self.data[filter_data, :]
            self.history = list(selected_games)

        self.current_game = []
        # decay epsilon
        self.epsilon *= self.decay
        self.early_termination = False

    def record_history(self, action_idx, obs):
        '''
        Keeps a history of observations
        Determine when epsilon decay kicks in

        obs is the 4-tuple (state, reward, terminal_state, info). Returns
        (state, reward, terminal_state), with state normalised when enabled.
        '''
        state, reward, terminal_state, _ = obs

        if self.normalise:
            # Shift, scale, then clip each state component into [-1, 1].
            state = state - np.array([0.0, 0.60, 0.0, 0.75, 0.0, 0.0, 0.5, 0.5])
            state = state / np.array([1., 1., 1., 2., 0.25, 1., 1., 1.])
            state = np.clip(state, -1., 1.)

        # columns = game_idx, move_idx, action_idx, is_done, reward, [px, py, vx, vy, pa, va, l_leg, r_leg ] cumm_reward
        record = np.array([self.game_idx, self.move_idx, action_idx, int(terminal_state), reward] + list(state) + [0.0])
        self.current_game.append(record)

        if self.game_idx > 256:
            if terminal_state:
                # Keep decaying epsilon until it reaches the configured floor.
                if self.epsilon > self.epsilon_min:
                    self.decay = self.epsilon_decay
                else:
                    self.decay = 1.0
            # After the warm-up games, train on the buffer at every step.
            self.learner.train_learner(self.data)

        self.move_idx += 1
        return state, reward, terminal_state

    def _explore(self, fallback_action):
        '''
        Draw a random action, or return fallback_action with probability
        ``epsilon_change``.
        '''
        if np.random.random() > self.epsilon_change:
            return np.random.randint(0, self.num_actions, dtype=int)
        return fallback_action

    def epsilon_greedy_action(self, state):
        '''
        Perform epsilon greedy actions selection
        '''
        if np.random.random() < self.epsilon:
            # EXPLORE
            chosen = self._explore(self.previous_action)
        else:
            # Optional extra exploration (disabled by default); the draw is
            # made unconditionally, as in the original implementation.
            rand = np.random.random()
            if self.accelerate and self.game_idx > 250 and rand < (float(self.game_idx) / 1000.):
                chosen = self._explore(self.previous_action)
            else:
                # EXPLOIT: take the action with the highest predicted value.
                values = self.learner.predict_actions(state)
                chosen = np.argmax(values)

        self.previous_action = chosen
        return chosen

    def next_action(self, obs):
        '''
        Select the next best action

        Records obs against the previously chosen action. Returns
        (action, early_termination); action is 0 when the game just ended.
        '''
        state, reward, terminal_state = self.record_history(self.previous_action, obs)
        self.cumulative_reward += reward
        next_action = 0

        if terminal_state or self.early_termination:
            #self._transform_batch(self.history)
            if self.best[1] < self.cumulative_reward:
                self.best = (self.game_idx, self.cumulative_reward)

            self.new_game()
        else:
            next_action = self.epsilon_greedy_action(state)
            self.previous_action = next_action

        return next_action, self.early_termination

def video_schedule(episode_id):
    '''
    Video-recording predicate: always answers False, so no episode is
    recorded, whatever ``episode_id`` is (disabled for the final run).
    '''
    record_this_episode = False
    return record_this_episode


def run(agent, datapath):
    '''
    Start the environment and simulator

    Plays 20000 LunarLander-v2 games, asking ``agent`` for an action after
    every observation. Monitor output is written under 'results/' + datapath.
    '''
    game_env = gym.make('LunarLander-v2')
    print(os.environ['PATH'])
    game_env = gym.wrappers.Monitor(game_env, 'results/' + datapath, video_callable=video_schedule, force=True)

    try:
        for _ in range(20000):
            orig_state = game_env.reset()
            obs = [orig_state, 0, False, {}]  # format consistent with step
            action_idx, stop = agent.next_action(obs)
            # obs[2] is the done flag; `stop` is the agent's own early-exit request.
            while not obs[2] and not stop:
                obs = game_env.step(action_idx)
                action_idx, stop = agent.next_action(obs)
    finally:
        # Close through the Monitor wrapper itself; the wrapped env has no
        # usable `.monitor` attribute. Doing it in `finally` flushes the
        # monitor files even when a game raises.
        game_env.close()


In [None]:
import gym 

if __name__ == "__main__":
    # Run identifier: passed to the agent and to run() as the datapath.
    name = 'LunarAgentNN_v10x511_LC'
    # Value approximator shared by the agent for training and prediction.
    learner = LunarLearner(lr=0.001, gamma=0.99, batch_size=1000, layer_width=128)
    agent = LunarAgentNNBase(learner=learner, datapath=name, epsilon_min=0.15)
    run(agent=agent, datapath=name)

  '` call to the Keras 2 API: ' + signature)
  '` call to the Keras 2 API: ' + signature)
[2017-03-17 16:15:01,920] Making new env: LunarLander-v2
[2017-03-17 16:15:01,923] Creating monitor directory results/LunarAgentNN_v10x511_LC


/usr/local/bin:/opt/ros/kinetic/bin:/home/mimoralea/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
LunarAgentNN_v10x511_LCG 0, Len: 109, eps: 1.000 Rewd: -312.2994240772602, Best (0, -312.2994240772602) Avg -312.2994240772602
LunarAgentNN_v10x511_LCG 1, Len: 73, eps: 1.000 Rewd: -176.77123691105487, Best (1, -176.77123691105487) Avg -244.53533049415753
LunarAgentNN_v10x511_LCG 2, Len: 83, eps: 1.000 Rewd: -163.1693879710819, Best (2, -163.1693879710819) Avg -217.41334965313231
LunarAgentNN_v10x511_LCG 3, Len: 121, eps: 1.000 Rewd: -196.85558724568688, Best (2, -163.1693879710819) Avg -212.27390905127095
LunarAgentNN_v10x511_LCG 4, Len: 86, eps: 1.000 Rewd: -139.423925888947, Best (4, -139.423925888947) Avg -197.70391241880617
LunarAgentNN_v10x511_LCG 5, Len: 105, eps: 1.000 Rewd: -479.29991837220587, Best (4, -139.423925888947) Avg -244.6365800777061
LunarAgentNN_v10x511_LCG 6, Len: 96, eps: 1.000 Rewd: -231.1053436394664, Best (4



LunarAgentNN_v10x511_LCG 257, Len: 88, eps: 1.000 Rewd: -153.17272930886872, Best (45, -18.413364672624184) Avg -236.12032772771926
LunarAgentNN_v10x511_LCG 258, Len: 124, eps: 0.990 Rewd: -545.0429816012287, Best (45, -18.413364672624184) Avg -239.6551883427978
LunarAgentNN_v10x511_LCG 259, Len: 96, eps: 0.980 Rewd: -285.5809960781469, Best (45, -18.413364672624184) Avg -238.78579169701806
LunarAgentNN_v10x511_LCG 260, Len: 93, eps: 0.970 Rewd: -174.15200924021065, Best (45, -18.413364672624184) Avg -237.47449617765622
LunarAgentNN_v10x511_LCG 261, Len: 107, eps: 0.961 Rewd: -180.65621811355726, Best (45, -18.413364672624184) Avg -237.72253322275083
LunarAgentNN_v10x511_LCG 262, Len: 165, eps: 0.951 Rewd: -29.310737800830907, Best (45, -18.413364672624184) Avg -234.97053938556655
LunarAgentNN_v10x511_LCG 263, Len: 125, eps: 0.941 Rewd: -245.59172235381013, Best (45, -18.413364672624184) Avg -236.10820455786782
LunarAgentNN_v10x511_LCG 264, Len: 83, eps: 0.932 Rewd: -138.77854840300625