In [8]:

import numpy as np
import pandas as pd
#np.random.seed(1337)
import os

In [9]:

from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam, SGD
import keras
import theano
import theano.tensor as T
from keras import backend as K

class LunarLearner:
    '''
    Neural-network Q-function approximator.

    The network takes an 8-value state and has 8 linear outputs. Only the
    first 4 outputs are used as Q-values (one per action); the last 4 exist
    so that the training target can carry a one-hot action mask next to the
    target values (see squared_error).
    '''

    def squared_error(self, y_true, y_pred):
        '''
        Custom error function
        Action Mask and target values are provided in y_true

        y_true[:, 0:4] holds the target Q-value in the slot of the action
        taken (zeros elsewhere) and y_true[:, 4:8] holds the one-hot action
        mask. Only the prediction for the action taken contributes to the
        loss.
        '''
        target_values = y_true[:, 0:4]
        action_mask = y_true[:, 4:8]
        predicted_values = y_pred[:, 0:4]
        # Collapse each row to a single number: the target is already zero
        # outside the chosen action, the prediction is masked to match.
        q_target = K.sum(target_values, axis=-1)
        q_taken = K.sum(predicted_values * action_mask, axis=-1)
        return K.sum(K.square(q_target - q_taken), axis=-1)

    def __init__(self, lr=0.001, gamma=1.0, batch_size=200, layer_width=64):
        '''
        Build and compile the network.

        lr          -- Adam learning rate
        gamma       -- discount factor applied to the next-state value
        batch_size  -- number of transitions sampled per training call
        layer_width -- number of units in the single hidden layer
        '''
        self.gamma = gamma
        self.learning_rate = lr
        self.batch_size = batch_size
        self.num_actions = 4
        self.model = Sequential()
        # Keras 2 spelling (kernel_initializer); the Keras 1 `init` keyword
        # only works through a deprecation shim that warns on every call.
        self.model.add(Dense(layer_width, input_dim=8, kernel_initializer='uniform', activation='linear'))
        self.model.add(keras.layers.advanced_activations.LeakyReLU(alpha=0.01))
        self.model.add(Dense(8, kernel_initializer='uniform', activation='linear'))
        opt = Adam(lr=self.learning_rate)
        self.model.compile(optimizer=opt, loss=self.squared_error)

    def train_learner(self, data):
        '''
        Trains the learner from historical observations

        data is the 2-D history array built by LunarAgentNNBase, one row per
        recorded step with columns
        game_idx, move_idx, action_idx, is_done, reward, 8 state values,
        cumulative reward.
        A row stores the action that *led to* its state, so a transition is
        formed from the state of row i and the action/reward/done flag/state
        of row i + 1.

        Does nothing when there is no usable transition (no data, fewer than
        two rows, or no sampled pair of rows from the same game).
        '''
        if data is None or len(data) < 2:
            return

        nh, nw = data.shape
        indexes = np.random.choice(nh-1, self.batch_size)
        next_states = indexes + 1

        # Put each sampled row next to its successor: columns [0, nw) are the
        # current row, columns [nw, 2*nw) the next row.
        pn = np.hstack([data[indexes, :], data[next_states, :]])
        pn = pn[pn[:, 0] == pn[:, nw]]  # ensure next state is from the same game
        if len(pn) == 0:
            # Every sampled pair straddled a game boundary; fitting on an
            # empty batch would fail inside Keras.
            return

        x_curr = pn[:, 5:(nw-1)]
        x_next = pn[:, (nw+5):-1]
        r = pn[:, (nw+4):(nw+5)]
        a = pn[:, (nw+2):(nw+3)]
        final = 1. - pn[:, (nw+3):(nw+4)]  # 0 when the next state is terminal
        a_vec = np.zeros(shape=(len(a), 4), dtype=float)
        a_vec[np.arange(len(a)), a.astype(int).reshape(-1)] = 1.

        Q_next = self.predict(x_next).reshape((-1, 8))  # one record for each action
        Q_max = np.max(Q_next[:, 0:4], axis=1).reshape(-1, 1)

        Q = r + self.gamma * Q_max*final
        Q_target = np.hstack((Q*a_vec, a_vec))  # stack target and mask

        history = LossHistory()
        # Not read anywhere in this notebook; kept so the attribute still exists.
        self.target_actions = np.zeros((1, 1))
        self.model.fit(x_curr, Q_target, epochs=1, batch_size=self.batch_size*4, verbose=0, callbacks=[history])
        #print history.losses[-1]

    def predict_actions(self, X):
        '''
        Return the 4 action values for a single state as a (1, 4) array.
        '''
        return self.predict(X.reshape(1, -1))[:, 0:4]

    def predict(self, X):
        '''
        Return the raw 8-column network output for a batch of states.
        '''
        return self.model.predict(X, batch_size=self.batch_size*4)


class LossHistory(keras.callbacks.Callback):
    '''
    Record Loss history for debugging

    After training, `losses` holds one entry per batch: the value stored
    under 'loss' in that batch's logs, or None when no logs were supplied.
    '''
    def on_train_begin(self, logs=None):
        # Start from an empty list on every fit() call.
        self.losses = []

    def on_batch_end(self, batch, logs=None):
        # `logs` defaults to None rather than a shared mutable dict; treat a
        # missing dict as empty so the lookup cannot raise.
        self.losses.append((logs or {}).get('loss'))

In [10]:
class LunarAgentNNBase:
    '''
    Epsilon-greedy Q-learning agent that records every step it sees, keeps a
    bounded replay memory of past games and trains `learner` from it.

    The learner must provide `train_learner(data)` and
    `predict_actions(state)`.
    '''

    # Games played with pure exploration before training / epsilon decay start.
    WARMUP_GAMES = 256
    # Replay memory is pruned once more than this many games are remembered.
    MAX_GAMES_IN_MEMORY = 512
    # keep_best=True: number of highest-reward games that survive a prune.
    BEST_GAMES_TO_KEEP = 256
    # keep_best=False: number of oldest games discarded by a prune.
    OLDEST_GAMES_TO_DROP = 10

    def __init__(self, learner, datapath, epsilon_change=0.0, epsilon_decay=0.99, epsilon_min=0.1,
                 keep_best=True, normalise_data=False, accelerate=False, properties=None):
        '''
        learner        -- object used for training and action-value prediction
        datapath       -- run name; used as log prefix and results file name
        epsilon_change -- probability of repeating the previous action
                          instead of drawing a new random one when exploring
        epsilon_decay  -- per-game multiplier applied to epsilon after warm-up
        epsilon_min    -- decay stops once epsilon is no longer above this
        keep_best      -- True: memory keeps the best games; False: the most
                          recent ones
        normalise_data -- shift/scale/clip states before recording them
        accelerate     -- enable the extra late-game random-action branch
        properties     -- accepted for compatibility; not used
        '''
        self.epsilon = 1.0  # e greedy implementation of q learning
        self.normalise = normalise_data
        self.epsilon_decay = epsilon_decay
        self.epsilon_change = epsilon_change
        self.epsilon_min = epsilon_min
        self.keep_best = keep_best
        self.accelerate = accelerate
        self.history = []           # (game_idx, reward) of games still in memory
        self.average_over = 100     # window of the rolling average
        self.current_game = []      # records of the game in progress
        self.previous_game = []
        # self.mem_idx = 0
        self.ENV_STATE, self.ENV_REWARD, self.ENV_DONE = 0, 1, 2
        self.game_idx = 0
        self.move_idx = 0
        self.previous_action = 0
        self.cumulative_reward = 0.0
        self.num_actions = 4
        self.learner = learner
        self.datapath = datapath
        self.decay = 1.0            # multiplier applied to epsilon per game
        self.data = None            # replay memory, one row per recorded step
        self.early_termination = False
        self.best = (0, -1000.)     # (game_idx, reward) of the best game so far
        self.allhistory = []        # (game_idx, reward) of every game played

    def get_progress(self):
        '''
        Gets the rolling average at the end of each game and records in a npy file for plotting
        Each run will have a separate file.
        '''
        hist = np.array(self.allhistory)
        basepath = 'results/'
        # np.save does not create missing directories.
        if not os.path.isdir(basepath):
            os.makedirs(basepath)
        np.save(basepath + self.datapath + '-data.npy', hist)
        result = np.mean(hist[-self.average_over:, 1])
        return result

    def new_game(self):
        '''
        Run at the end of each game to reset parameters

        Logs the finished game, appends its records (with the reward-to-go
        column filled in) to the replay memory, prunes the memory and applies
        the epsilon decay.
        '''
        self.history.append(np.array([self.game_idx, self.cumulative_reward]))
        self.allhistory.append(np.array([self.game_idx, self.cumulative_reward]))
        print(self.datapath + "G {0}, Len: {1}, eps: {2:0.3f} Rewd: {3}, Best {4} Avg {5}".format(self.game_idx,
                                                                                         len(self.current_game),
                                                                                         self.epsilon,
                                                                                         self.cumulative_reward,
                                                                                         self.best, self.get_progress()))
        # Last column = sum of rewards from this step to the end of the game.
        cg_data = np.array(self.current_game)
        game_rewards = cg_data[:, 4]
        reverse_cumsum = np.cumsum(game_rewards[::-1])[::-1]
        cg_data[:, -1] = reverse_cumsum
        self.game_idx += 1
        self.move_idx = 0
        self.previous_action = 0
        self.cumulative_reward = 0.0
        self.previous_game = self.current_game

        # Store cg_data itself: np.array(self.current_game) would be a fresh
        # copy without the reward-to-go column computed above.
        if self.data is None:
            self.data = cg_data
        else:
            self.data = np.vstack((self.data, cg_data))

        '''MEMORY OPTIMISTAION - ONLY KEEP BEST GAMES'''

        if self.keep_best and len(self.history) > self.MAX_GAMES_IN_MEMORY:
            games = np.array(self.history)
            games_sorted = games[games[:, 1].argsort()[::-1], :]  # highest reward first
            selected_games = games_sorted[:self.BEST_GAMES_TO_KEEP, :]
            filter_data = np.in1d(self.data[:, 0], selected_games[:, 0])
            self.data = self.data[filter_data, :]
            self.history = list(selected_games)

        '''ALTERNATE MEMORY OPTIMISATION - ONLY KEEP RECENT GAMES'''

        if not self.keep_best and len(self.history) > self.MAX_GAMES_IN_MEMORY:
            # drop 10 every 10: discard the oldest games, keep the rest.
            # (Slicing from index 502 kept only the newest 11 games.)
            selected_games = np.array(self.history)[self.OLDEST_GAMES_TO_DROP:, :]
            filter_data = np.in1d(self.data[:, 0], selected_games[:, 0])
            self.data = self.data[filter_data, :]
            self.history = list(selected_games)

        self.current_game = []
        #decay epsilon
        self.epsilon *= self.decay
        self.early_termination = False

    def record_history(self, action_idx, obs):
        '''
        Keeps a history of observations
        Determine when epsilon decay kicks in

        obs is (state, reward, terminal_state, info) as returned by the
        environment step. Returns (state, reward, terminal_state), where
        state is the normalised state when normalisation is enabled.
        After the warm-up games the learner is trained once per call.
        '''
        state, reward, terminal_state, _ = obs

        if self.normalise:
            state = state - np.array([0.0, 0.60, 0.0, 0.75, 0.0, 0.0, 0.5, 0.5])
            state = state / np.array([1., 1., 1., 2., 0.25, 1., 1., 1.])
            state = np.clip(state, -1., 1.)

        # columns = game_idx, move_idx, action_idx, is_done, reward, [px, py, vx, vy, pa, va, l_leg, r_leg ] cumm_reward
        record = np.array([self.game_idx, self.move_idx, action_idx, int(terminal_state), reward] + list(state) + [0.0])
        self.current_game.append(record)

        if self.game_idx > self.WARMUP_GAMES and terminal_state:
            # Choose the multiplier new_game() will apply to epsilon.
            if self.epsilon > self.epsilon_min:
                self.decay = self.epsilon_decay
            else:
                self.decay = 1.0

        if self.game_idx > self.WARMUP_GAMES:
            self.learner.train_learner(self.data)

        self.move_idx += 1
        return state, reward, terminal_state

    def epsilon_greedy_action(self, state):
        '''
        Perform epsilon greedy actions selection
        '''
        epsilon_greedy_action = self.previous_action
        rand = np.random.random()
        if rand < self.epsilon:
            """EXPLORE"""
            # With probability epsilon_change the previous action is repeated.
            change_action = np.random.random() > self.epsilon_change
            if change_action:
                epsilon_greedy_action = np.random.randint(0, self.num_actions, dtype=int)  # EXPLORE
        else:
            '''Optional keep previous action (disabled by default)'''
            rand = np.random.random()
            if self.accelerate and self.game_idx > 250 and rand < (float(self.game_idx) / 1000.):
                change_action = np.random.random() > self.epsilon_change
                if change_action:
                    epsilon_greedy_action = np.random.randint(0, self.num_actions, dtype=int)  # EXPLORE
            else:
                """EXPLOIT"""
                # get best action
                values = self.learner.predict_actions(state)
                epsilon_greedy_action = np.argmax(values)

        self.previous_action = epsilon_greedy_action
        return epsilon_greedy_action

    def next_action(self, obs):
        '''
        Select the next best action

        Records obs together with the action that produced it, then either
        closes the game (terminal state) or picks the next action. Returns
        (action, early_termination flag); the action is 0 when the game ended.
        '''
        state, reward, terminal_state = self.record_history(self.previous_action, obs)
        self.cumulative_reward += reward
        next_action = 0

        if terminal_state or self.early_termination:
            if self.best[1] < self.cumulative_reward:
                self.best = (self.game_idx, self.cumulative_reward)

            self.new_game()
        else:
            next_action = self.epsilon_greedy_action(state)
            self.previous_action = next_action

        return next_action, self.early_termination


In [None]:
import gym
import os

# Number of games (episodes) to play in this run.
NUM_GAMES = 20000

learner = LunarLearner(lr=0.001, gamma=0.99, batch_size=1000, layer_width=128)
datapath = 'LunarAgentNN_v10x511_LC'
agent = LunarAgentNNBase(learner, datapath, epsilon_min=0.15)
game_env = gym.make('LunarLander-v2')
print(os.environ['PATH'])
# Record statistics and videos of the run under results/<datapath>.
game_env = gym.wrappers.Monitor(game_env, 'results/' + datapath, force=True)

try:
    for game in range(NUM_GAMES):
        orig_state = game_env.reset()
        obs = [orig_state, 0, False, {}]  # format consistent with step
        # `stop_early` instead of `exit`, which would shadow the builtin.
        action_idx, stop_early = agent.next_action(obs)
        while not obs[2] and not stop_early:
            obs = game_env.step(action_idx)
            action_idx, stop_early = agent.next_action(obs)
finally:
    # Close through the wrapper itself (the old `env.monitor` attribute is
    # not available on a Monitor-wrapped env), and do it even when the long
    # loop is interrupted so the recorded files are finalised.
    game_env.close()

  '` call to the Keras 2 API: ' + signature)
  '` call to the Keras 2 API: ' + signature)
[2017-03-19 21:00:18,297] Making new env: LunarLander-v2
[2017-03-19 21:00:18,309] Finished writing results. You can upload them to the scoreboard via gym.upload('/home/mimoralea/Projects/applied-reinforcement-learning/results/LunarAgentNN_v10x511_LC')
[2017-03-19 21:00:18,311] Clearing 4 monitor files from previous run (because force=True was provided)
[2017-03-19 21:00:18,316] Starting new video recorder writing to /home/mimoralea/Projects/applied-reinforcement-learning/results/LunarAgentNN_v10x511_LC/openaigym.video.1.14428.video000000.mp4


/usr/local/bin:/opt/ros/kinetic/bin:/home/mimoralea/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin


[2017-03-19 21:00:19,737] Starting new video recorder writing to /home/mimoralea/Projects/applied-reinforcement-learning/results/LunarAgentNN_v10x511_LC/openaigym.video.1.14428.video000001.mp4


LunarAgentNN_v10x511_LCG 0, Len: 78, eps: 1.000 Rewd: -359.6988007229631, Best (0, -359.69880072296309) Avg -359.6988007229631


[2017-03-19 21:00:21,796] Starting new video recorder writing to /home/mimoralea/Projects/applied-reinforcement-learning/results/LunarAgentNN_v10x511_LC/openaigym.video.1.14428.video000008.mp4


LunarAgentNN_v10x511_LCG 1, Len: 113, eps: 1.000 Rewd: -435.72027407782883, Best (0, -359.69880072296309) Avg -397.70953740039596
LunarAgentNN_v10x511_LCG 2, Len: 137, eps: 1.000 Rewd: -541.3885846618803, Best (0, -359.69880072296309) Avg -445.6025531542241
LunarAgentNN_v10x511_LCG 3, Len: 83, eps: 1.000 Rewd: -167.1174060019801, Best (3, -167.1174060019801) Avg -375.9812663661631
LunarAgentNN_v10x511_LCG 4, Len: 92, eps: 1.000 Rewd: -489.12654122115333, Best (3, -167.1174060019801) Avg -398.6103213371611
LunarAgentNN_v10x511_LCG 5, Len: 80, eps: 1.000 Rewd: -94.89040848392548, Best (5, -94.890408483925484) Avg -347.9903358616218
LunarAgentNN_v10x511_LCG 6, Len: 114, eps: 1.000 Rewd: -291.01006732785675, Best (5, -94.890408483925484) Avg -339.85029749965537
LunarAgentNN_v10x511_LCG 7, Len: 73, eps: 1.000 Rewd: -152.22230105512665, Best (5, -94.890408483925484) Avg -316.39679794408926
LunarAgentNN_v10x511_LCG 8, Len: 121, eps: 1.000 Rewd: -200.89734903698042, Best (5, -94.89040848392548

[2017-03-19 21:00:24,177] Starting new video recorder writing to /home/mimoralea/Projects/applied-reinforcement-learning/results/LunarAgentNN_v10x511_LC/openaigym.video.1.14428.video000027.mp4


LunarAgentNN_v10x511_LCG 19, Len: 75, eps: 1.000 Rewd: -125.9013924548671, Best (5, -94.890408483925484) Avg -214.7662294719411
LunarAgentNN_v10x511_LCG 20, Len: 134, eps: 1.000 Rewd: -205.82067307212247, Best (5, -94.890408483925484) Avg -214.34025059575924
LunarAgentNN_v10x511_LCG 21, Len: 119, eps: 1.000 Rewd: -143.8418153475589, Best (5, -94.890408483925484) Avg -211.13577626629558
LunarAgentNN_v10x511_LCG 22, Len: 73, eps: 1.000 Rewd: -257.35329271843364, Best (5, -94.890408483925484) Avg -213.14523350334505
LunarAgentNN_v10x511_LCG 23, Len: 66, eps: 1.000 Rewd: -135.47619028067376, Best (5, -94.890408483925484) Avg -209.9090233690671
LunarAgentNN_v10x511_LCG 24, Len: 66, eps: 1.000 Rewd: -143.76117465847605, Best (5, -94.890408483925484) Avg -207.26310942064347
LunarAgentNN_v10x511_LCG 25, Len: 78, eps: 1.000 Rewd: -196.6647371320416, Best (5, -94.890408483925484) Avg -206.8554797172357
LunarAgentNN_v10x511_LCG 26, Len: 68, eps: 1.000 Rewd: -245.12753628657384, Best (5, -94.89040

[2017-03-19 21:00:26,793] Starting new video recorder writing to /home/mimoralea/Projects/applied-reinforcement-learning/results/LunarAgentNN_v10x511_LC/openaigym.video.1.14428.video000064.mp4


LunarAgentNN_v10x511_LCG 63, Len: 92, eps: 1.000 Rewd: -179.71259189463981, Best (56, -76.173076851236772) Avg -204.13728637801796
LunarAgentNN_v10x511_LCG 64, Len: 97, eps: 1.000 Rewd: -373.5951444845675, Best (56, -76.173076851236772) Avg -206.74433034888796
LunarAgentNN_v10x511_LCG 65, Len: 63, eps: 1.000 Rewd: -155.26205004454374, Best (56, -76.173076851236772) Avg -205.96429579882215
LunarAgentNN_v10x511_LCG 66, Len: 65, eps: 1.000 Rewd: -194.35722221666265, Best (56, -76.173076851236772) Avg -205.7910558946108
LunarAgentNN_v10x511_LCG 67, Len: 104, eps: 1.000 Rewd: -641.7258177309571, Best (56, -76.173076851236772) Avg -212.20186121573354
LunarAgentNN_v10x511_LCG 68, Len: 79, eps: 1.000 Rewd: -273.52819498650024, Best (56, -76.173076851236772) Avg -213.09064866168669
LunarAgentNN_v10x511_LCG 69, Len: 78, eps: 1.000 Rewd: -140.05435054694968, Best (56, -76.173076851236772) Avg -212.0472729743333
LunarAgentNN_v10x511_LCG 70, Len: 94, eps: 1.000 Rewd: -129.833316879565, Best (56, -7

[2017-03-19 21:00:29,517] Starting new video recorder writing to /home/mimoralea/Projects/applied-reinforcement-learning/results/LunarAgentNN_v10x511_LC/openaigym.video.1.14428.video000125.mp4


LunarAgentNN_v10x511_LCG 123, Len: 76, eps: 1.000 Rewd: -159.47013321768327, Best (87, -42.27857625971243) Avg -229.52160920985716
LunarAgentNN_v10x511_LCG 124, Len: 79, eps: 1.000 Rewd: -228.03963104870468, Best (87, -42.27857625971243) Avg -230.36439377375945
LunarAgentNN_v10x511_LCG 125, Len: 120, eps: 1.000 Rewd: -305.0144534158062, Best (87, -42.27857625971243) Avg -231.4478909365971
LunarAgentNN_v10x511_LCG 126, Len: 108, eps: 1.000 Rewd: -575.4961692572743, Best (87, -42.27857625971243) Avg -234.7515772663041
LunarAgentNN_v10x511_LCG 127, Len: 83, eps: 1.000 Rewd: -252.30783681587513, Best (87, -42.27857625971243) Avg -234.81812280625806
LunarAgentNN_v10x511_LCG 128, Len: 101, eps: 1.000 Rewd: -231.08929227085173, Best (87, -42.27857625971243) Avg -234.243042009952
LunarAgentNN_v10x511_LCG 129, Len: 114, eps: 1.000 Rewd: -351.05888243391973, Best (87, -42.27857625971243) Avg -234.36503130370818
LunarAgentNN_v10x511_LCG 130, Len: 100, eps: 1.000 Rewd: -228.34098956315202, Best (8

[2017-03-19 21:00:33,162] Starting new video recorder writing to /home/mimoralea/Projects/applied-reinforcement-learning/results/LunarAgentNN_v10x511_LC/openaigym.video.1.14428.video000216.mp4


LunarAgentNN_v10x511_LCG 207, Len: 96, eps: 1.000 Rewd: -490.2356866722901, Best (87, -42.27857625971243) Avg -227.95086860071
LunarAgentNN_v10x511_LCG 208, Len: 79, eps: 1.000 Rewd: -165.5845215260473, Best (87, -42.27857625971243) Avg -227.2236186025114
LunarAgentNN_v10x511_LCG 209, Len: 78, eps: 1.000 Rewd: -136.19734365707632, Best (87, -42.27857625971243) Avg -224.9273999030964
LunarAgentNN_v10x511_LCG 210, Len: 79, eps: 1.000 Rewd: -158.7414863853702, Best (87, -42.27857625971243) Avg -225.1184288004223
LunarAgentNN_v10x511_LCG 211, Len: 94, eps: 1.000 Rewd: -198.20456452324765, Best (87, -42.27857625971243) Avg -225.1571319185722
LunarAgentNN_v10x511_LCG 212, Len: 73, eps: 1.000 Rewd: -217.62619084300832, Best (87, -42.27857625971243) Avg -225.41285629473782
LunarAgentNN_v10x511_LCG 213, Len: 95, eps: 1.000 Rewd: -357.0005856378953, Best (87, -42.27857625971243) Avg -227.6152459567697
LunarAgentNN_v10x511_LCG 214, Len: 83, eps: 1.000 Rewd: -154.65088027338658, Best (87, -42.2785



LunarAgentNN_v10x511_LCG 257, Len: 83, eps: 1.000 Rewd: -153.94511552208417, Best (87, -42.27857625971243) Avg -233.09423391797966
LunarAgentNN_v10x511_LCG 258, Len: 67, eps: 0.990 Rewd: -130.22353124559726, Best (87, -42.27857625971243) Avg -233.24357825935348
LunarAgentNN_v10x511_LCG 259, Len: 112, eps: 0.980 Rewd: -197.99589084179814, Best (87, -42.27857625971243) Avg -231.56120461245652
LunarAgentNN_v10x511_LCG 260, Len: 106, eps: 0.970 Rewd: -72.2637549965067, Best (87, -42.27857625971243) Avg -231.2414784553991
LunarAgentNN_v10x511_LCG 261, Len: 106, eps: 0.961 Rewd: -440.2631941130767, Best (87, -42.27857625971243) Avg -232.6671171403082
LunarAgentNN_v10x511_LCG 262, Len: 74, eps: 0.951 Rewd: -171.8878887952034, Best (87, -42.27857625971243) Avg -232.6149053991085
LunarAgentNN_v10x511_LCG 263, Len: 117, eps: 0.941 Rewd: -191.67048120825157, Best (87, -42.27857625971243) Avg -231.74073968937097
LunarAgentNN_v10x511_LCG 264, Len: 85, eps: 0.932 Rewd: -25.403615113534812, Best (264

[2017-03-19 21:02:22,242] Starting new video recorder writing to /home/mimoralea/Projects/applied-reinforcement-learning/results/LunarAgentNN_v10x511_LC/openaigym.video.1.14428.video000343.mp4


LunarAgentNN_v10x511_LCG 342, Len: 152, eps: 0.426 Rewd: -166.6308020889752, Best (264, -25.403615113534812) Avg -194.58468251419634




LunarAgentNN_v10x511_LCG 343, Len: 115, eps: 0.421 Rewd: -150.2868778629574, Best (264, -25.403615113534812) Avg -194.798922189284
LunarAgentNN_v10x511_LCG 344, Len: 288, eps: 0.417 Rewd: -586.9952855756212, Best (264, -25.403615113534812) Avg -195.21539226298512
LunarAgentNN_v10x511_LCG 345, Len: 148, eps: 0.413 Rewd: -133.51304169606527, Best (264, -25.403615113534812) Avg -195.24710033095275
LunarAgentNN_v10x511_LCG 346, Len: 155, eps: 0.409 Rewd: -119.52399451266854, Best (264, -25.403615113534812) Avg -193.74979162787128
LunarAgentNN_v10x511_LCG 347, Len: 293, eps: 0.405 Rewd: -455.0420618565182, Best (264, -25.403615113534812) Avg -193.78192873146446
LunarAgentNN_v10x511_LCG 348, Len: 191, eps: 0.401 Rewd: -263.24082994588684, Best (264, -25.403615113534812) Avg -193.72868101908819
LunarAgentNN_v10x511_LCG 349, Len: 351, eps: 0.397 Rewd: -190.33558517993038, Best (264, -25.403615113534812) Avg -193.67913990660298
LunarAgentNN_v10x511_LCG 350, Len: 277, eps: 0.393 Rewd: -157.30685

[2017-03-19 21:23:50,578] Starting new video recorder writing to /home/mimoralea/Projects/applied-reinforcement-learning/results/LunarAgentNN_v10x511_LC/openaigym.video.1.14428.video000512.mp4


LunarAgentNN_v10x511_LCG 511, Len: 1001, eps: 0.150 Rewd: -109.49106011265901, Best (264, -25.403615113534812) Avg -79.98239702174652




LunarAgentNN_v10x511_LCG 512, Len: 1001, eps: 0.150 Rewd: -77.0336837224422, Best (264, -25.403615113534812) Avg -80.09336824401142
LunarAgentNN_v10x511_LCG 513, Len: 1001, eps: 0.150 Rewd: -41.99224916933727, Best (264, -25.403615113534812) Avg -79.40698566763042
LunarAgentNN_v10x511_LCG 514, Len: 1001, eps: 0.150 Rewd: -120.44103626565739, Best (264, -25.403615113534812) Avg -80.1445101540137
LunarAgentNN_v10x511_LCG 515, Len: 1001, eps: 0.150 Rewd: -132.8206131448895, Best (264, -25.403615113534812) Avg -81.0154576511456


In [None]:
import io
import json
import base64

from IPython.display import HTML

# Show the most recently recorded episode video inline.
# The Monitor wrapper is bound to `game_env` in the training cell; no `env`
# variable is defined anywhere in this notebook.
video_path, meta_path = game_env.videos[-1]

# Read-only binary mode is enough, and the context manager closes the file.
with io.open(video_path, 'rb') as video_file:
    video = video_file.read()
encoded = base64.b64encode(video)

with open(meta_path) as data_file:
    meta = json.load(data_file)

# The heading is closed with </h2> (the original <h2/> left it unclosed).
html_tag = """
<h2>{0}</h2>
<video width="960" height="540" controls>
    <source src="data:video/mp4;base64,{1}" type="video/mp4" />
</video>"""
strm = html_tag.format('Episode ' + str(meta['episode_id']), encoded.decode('ascii'))
HTML(data=strm)