In [2]:
import numpy as np
import gym
import ppaquette_gym_doom
from ppaquette_gym_doom.wrappers import SetResolution, ToDiscrete
from gym.wrappers import SkipWrapper
from gym import wrappers

# (see https://github.com/ppaquette/gym-doom/blob/master/ppaquette_gym_doom/doom_basic.py)
def create_env(seed=None):
    env_spec = gym.spec('ppaquette/DoomDefendCenter-v0')
    env_spec.id = 'DoomDefendCenter-v0'
    env = env_spec.make()

    if seed is not None:
        env.seed(seed)

    return SetResolution('200x150')(
        SkipWrapper(repeat_count=4)(
        ToDiscrete('minimal')(env)))

env = create_env()
WIDTH, HEIGHT = env.screen_width, env.screen_height

NOOP, SHOOT, RIGHT, LEFT = 0, 1, 2, 3

## Collecting experiences

In [5]:
from collections import namedtuple
import operator

SARE = namedtuple('SAR', ['state', 'action', 'reward', 'end'])


def generate_sares(env, agent, episode_count=100):
    reward = 0
    done = False

    for i in range(episode_count):
        observation = env.reset()
        while True:
            action = agent.act(observation, reward, done)
            new_observation, reward, done, _ = env.step(action)
            yield SARE(observation, action, reward, done)
            
            if done:
                break
            else:
                observation = new_observation

def episode_sares(env, agent, episode_count=100):
    sares = list(generate_sares(env, agent, episode_count))
    print('average reward per episode = {}'.format(
        sum(r for _, _, r, _ in sares) / float(sum(e for _, _, _, e in sares))))
    return sares


def to_experiences(sares, only_n_misses=100):
    experiences = [
        (previous_s, a, r, next_s, end)
        for (previous_s, a, r, end), (next_s, _, _, _) in zip(sares[:-1], sares[1:])
    ]

    # simplistic experience prioritization
    shuffled_exps = experiences if only_n_misses is None\
        else random.choices(experiences, k=only_n_misses) + [e for e in experiences if e[2] > 0]
    random.shuffle(shuffled_exps)

    prev_frames, actions, rewards, next_frames, is_ends = zip(*shuffled_exps)
    prev_frames = np.asarray(prev_frames)
    next_frames = np.asarray(next_frames)
    actions = np.asarray(actions)
    rewards = np.asarray(rewards)
    is_ends = np.asarray(is_ends)

    print('Training on {}/{} positive/total out of {} 1-step experiences with actions distribution {}'.format(
        np.sum(rewards>=0),
        len(rewards),
        len(experiences),
        np.bincount(actions)))
    
    return (prev_frames, next_frames, actions, rewards, is_ends)

## Deep Q-learning

In [6]:
import tensorflow as tf
from keras import backend as K

from keras.layers import Dense, Convolution2D, Flatten, Activation
from keras.models import Sequential
from keras.optimizers import Adam

sess = tf.InteractiveSession()
K.set_session(sess)

def create_q_model(conv1_weights=None, conv2_weights=None, dense1_weights=None, dense2_weights=None):
    model = Sequential()

    model.add(
        Convolution2D(nb_filter=4, nb_row=6, nb_col=6, subsample=(1, 1), border_mode='valid', weights=conv1_weights,
            input_shape=[HEIGHT, WIDTH, 3], dim_ordering='tf'))
    model.add(Activation('relu'))
    model.add(
        Convolution2D(nb_filter=8, nb_row=2, nb_col=2, weights=conv2_weights))
    model.add(Activation('relu'))
    model.add(Flatten())
    model.add(Dense(128, init='normal', weights=dense1_weights))
    model.add(Activation('relu'))
    model.add(Dense(4, init='normal', weights=dense2_weights))
    model.compile(loss='mse', optimizer=Adam())
    
    return model

acting_model = create_q_model()
target_model = create_q_model()

def copy_model(model):
    conv1_weights =  [w.eval() for w in model.layers[0].weights]
    conv2_weights = [w.eval() for w in model.layers[2].weights]
    dense1_weights = [w.eval() for w in model.layers[5].weights]
    dense2_weights = [w.eval() for w in model.layers[7].weights]
    return create_q_model(conv1_weights, conv2_weights, dense1_weights, dense2_weights)

Using TensorFlow backend.


In [7]:
def sares_to_input_targets(model, sares, gamma = .99, reward_clip=5, only_n_misses=100):
    
    prev_frames, next_frames, actions, rewards, is_ends = to_experiences(sares, only_n_misses)
    
    n_samples = len(actions)
    clipped_rewards = np.clip(rewards, -np.inf, reward_clip)
    
    # Transcription of the Q-learning target formula
    targets = clipped_rewards + gamma * (1 - is_ends) * model.predict(next_frames).max(axis=1)

    target_action_rewards = model.predict(prev_frames)
    target_action_rewards[np.arange(n_samples), actions] = targets

    return prev_frames, target_action_rewards

## Training

In [8]:
import random

class EpsilonGreedyQAgent(object):
    def __init__(self, model, epsilon=.1):
        self.model = model
        self.epsilon = epsilon

    def act(self, observation, reward, done):
        if random.uniform(0, 1) <= self.epsilon:
            return random.choice([NOOP, SHOOT, LEFT, RIGHT])
        else:
            return self.model.predict(observation[np.newaxis])[0].argmax()


N_BATCHES = 400
N_BATCHED_EPISODES = 10
UPDATE_TARGET_EVERY_N_BACTHES = 2
MINI_BATCH_SIZE = 32
REWARD_CLIP = 5
ONLY_N_MISSES = 200

env = create_env()
env = wrappers.Monitor(env, directory='tmp/DoomDefendLine_q_learning', force=True, mode='training')

for _ in range(N_BATCHES):
    for _ in range(UPDATE_TARGET_EVERY_N_BACTHES):
        sares = episode_sares(env, EpsilonGreedyQAgent(acting_model, epsilon=.1), N_BATCHED_EPISODES)
        prev_frames, target_action_rewards = sares_to_input_targets(target_model, sares, reward_clip=REWARD_CLIP, only_n_misses=ONLY_N_MISSES)
        acting_model.fit(x=prev_frames, y=target_action_rewards, batch_size=MINI_BATCH_SIZE, nb_epoch=1, verbose=0)
    
    target_model = copy_model(acting_model)


# final greedy episodes
sares = episode_sares(env, EpsilonGreedyQAgent(acting_model, epsilon=0), episode_count=1000)

env.close()
gym.upload('tmp/DoomDefendLine_q_learning', api_key='sk_bNZUvCfkTfabQCoKoKbjFA')

[2017-03-04 18:13:42,555] Clearing 22 monitor files from previous run (because force=True was provided)
[2017-03-04 18:13:42,990] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/DoomDefendLine_q_learning/openaigym.video.0.52850.video000000.mp4
[2017-03-04 18:13:44,139] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/DoomDefendLine_q_learning/openaigym.video.0.52850.video000001.mp4
[2017-03-04 18:13:49,037] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/DoomDefendLine_q_learning/openaigym.video.0.52850.video000008.mp4


average reward per episode = 1.4
Training on 220/224 positive/total out of 666 1-step experiences with actions distribution [  2 159  54   9]
average reward per episode = 0.3
Training on 208/213 positive/total out of 571 1-step experiences with actions distribution [  3 198   5   7]


[2017-03-04 18:14:22,767] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/DoomDefendLine_q_learning/openaigym.video.0.52850.video000027.mp4


average reward per episode = 0.1
Training on 211/211 positive/total out of 569 1-step experiences with actions distribution [  1 198   7   5]
average reward per episode = 0.0
Training on 207/210 positive/total out of 593 1-step experiences with actions distribution [  4 201   3   2]
average reward per episode = 0.4
Training on 211/214 positive/total out of 633 1-step experiences with actions distribution [  7 197   7   3]
average reward per episode = 0.5
Training on 213/215 positive/total out of 626 1-step experiences with actions distribution [  9 196   6   4]


[2017-03-04 18:15:27,529] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/DoomDefendLine_q_learning/openaigym.video.0.52850.video000064.mp4


average reward per episode = 0.4
Training on 212/214 positive/total out of 632 1-step experiences with actions distribution [  3 203   6   2]
average reward per episode = 0.2
Training on 208/212 positive/total out of 594 1-step experiences with actions distribution [  6 196   6   4]
average reward per episode = 0.5
Training on 210/214 positive/total out of 635 1-step experiences with actions distribution [  0 204   5   5]
average reward per episode = 0.3
Training on 208/213 positive/total out of 588 1-step experiences with actions distribution [  5 196   9   3]
average reward per episode = 0.3
Training on 212/213 positive/total out of 587 1-step experiences with actions distribution [ 10 192   5   6]
average reward per episode = 0.2
Training on 210/212 positive/total out of 558 1-step experiences with actions distribution [  1 198   6   7]


[2017-03-04 18:17:05,928] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/DoomDefendLine_q_learning/openaigym.video.0.52850.video000125.mp4


average reward per episode = 0.7
Training on 214/217 positive/total out of 588 1-step experiences with actions distribution [  4 209   3   1]
average reward per episode = 0.3
Training on 207/213 positive/total out of 538 1-step experiences with actions distribution [  6 196   8   3]
average reward per episode = 0.3
Training on 210/213 positive/total out of 598 1-step experiences with actions distribution [  7 197   2   7]
average reward per episode = 0.6
Training on 213/216 positive/total out of 658 1-step experiences with actions distribution [  5 199   9   3]
average reward per episode = 0.0
Training on 207/210 positive/total out of 522 1-step experiences with actions distribution [  4 192   7   7]
average reward per episode = 0.3
Training on 209/213 positive/total out of 584 1-step experiences with actions distribution [  8 203   1   1]
average reward per episode = 0.4
Training on 210/214 positive/total out of 621 1-step experiences with actions distribution [  7 197   8   2]
averag

[2017-03-04 18:19:34,647] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/DoomDefendLine_q_learning/openaigym.video.0.52850.video000216.mp4


average reward per episode = 0.7
Training on 215/217 positive/total out of 624 1-step experiences with actions distribution [  6 204   4   3]
average reward per episode = 0.7
Training on 213/217 positive/total out of 611 1-step experiences with actions distribution [  7 200   5   5]
average reward per episode = 0.2
Training on 206/212 positive/total out of 586 1-step experiences with actions distribution [  0 204   4   4]
average reward per episode = 0.5
Training on 213/215 positive/total out of 613 1-step experiences with actions distribution [  6 191  13   5]
average reward per episode = 0.5
Training on 210/215 positive/total out of 618 1-step experiences with actions distribution [  3 198  10   4]
average reward per episode = 0.3
Training on 207/213 positive/total out of 625 1-step experiences with actions distribution [  1 205   4   3]
average reward per episode = 0.3
Training on 211/213 positive/total out of 581 1-step experiences with actions distribution [  5 196   4   8]
averag

[2017-03-04 18:23:08,483] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/DoomDefendLine_q_learning/openaigym.video.0.52850.video000343.mp4


average reward per episode = 0.3
Training on 207/213 positive/total out of 593 1-step experiences with actions distribution [  8 191  14]
average reward per episode = 0.0
Training on 209/210 positive/total out of 614 1-step experiences with actions distribution [  3 185  15   7]
average reward per episode = 0.6
Training on 216/216 positive/total out of 592 1-step experiences with actions distribution [  6 184  23   3]
average reward per episode = 0.5
Training on 214/215 positive/total out of 619 1-step experiences with actions distribution [  3 195  11   6]
average reward per episode = 0.6
Training on 212/216 positive/total out of 652 1-step experiences with actions distribution [  7 175  30   4]
average reward per episode = 0.6
Training on 212/216 positive/total out of 641 1-step experiences with actions distribution [  6 178  25   7]
average reward per episode = 1.3
Training on 219/223 positive/total out of 624 1-step experiences with actions distribution [  3 192  22   6]
average re

[2017-03-04 18:27:57,053] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/DoomDefendLine_q_learning/openaigym.video.0.52850.video000512.mp4


average reward per episode = 1.1
Training on 219/221 positive/total out of 597 1-step experiences with actions distribution [  6 169  41   5]
average reward per episode = 0.8
Training on 218/218 positive/total out of 595 1-step experiences with actions distribution [  5 164  48   1]
average reward per episode = 0.5
Training on 213/215 positive/total out of 645 1-step experiences with actions distribution [  3 152  55   5]
average reward per episode = 0.5
Training on 213/215 positive/total out of 624 1-step experiences with actions distribution [  5 155  46   9]
average reward per episode = 0.9
Training on 214/218 positive/total out of 611 1-step experiences with actions distribution [  3 161  51   3]
average reward per episode = 1.0
Training on 215/220 positive/total out of 683 1-step experiences with actions distribution [  8 145  57  10]
average reward per episode = 0.3
Training on 207/213 positive/total out of 639 1-step experiences with actions distribution [  6 141  58   8]
averag

[2017-03-04 18:34:15,083] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/DoomDefendLine_q_learning/openaigym.video.0.52850.video000729.mp4


average reward per episode = 1.1
Training on 221/221 positive/total out of 776 1-step experiences with actions distribution [  5 140  73   3]
average reward per episode = 1.5
Training on 224/225 positive/total out of 757 1-step experiences with actions distribution [  6 134  79   6]
average reward per episode = 0.9
Training on 216/219 positive/total out of 697 1-step experiences with actions distribution [  4 133  78   4]
average reward per episode = 0.5
Training on 213/215 positive/total out of 672 1-step experiences with actions distribution [  3 132  74   6]
average reward per episode = 1.8
Training on 227/228 positive/total out of 821 1-step experiences with actions distribution [  6 123  94   5]
average reward per episode = 1.6
Training on 222/225 positive/total out of 776 1-step experiences with actions distribution [  7 131  86   1]
average reward per episode = 1.0
Training on 220/220 positive/total out of 757 1-step experiences with actions distribution [  1 117  96   6]
averag

[2017-03-04 18:43:13,325] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/DoomDefendLine_q_learning/openaigym.video.0.52850.video001000.mp4


average reward per episode = 1.1
Training on 216/221 positive/total out of 726 1-step experiences with actions distribution [  1 113 102   5]
average reward per episode = 0.6
Training on 213/216 positive/total out of 670 1-step experiences with actions distribution [  3 130  77   6]
average reward per episode = 1.2
Training on 219/222 positive/total out of 687 1-step experiences with actions distribution [  2 118  99   3]
average reward per episode = 1.2
Training on 218/221 positive/total out of 720 1-step experiences with actions distribution [  9 115  94   3]
average reward per episode = 1.1
Training on 220/221 positive/total out of 787 1-step experiences with actions distribution [  5 122  89   5]
average reward per episode = 0.9
Training on 219/219 positive/total out of 668 1-step experiences with actions distribution [  5 130  78   6]
average reward per episode = 0.7
Training on 214/217 positive/total out of 728 1-step experiences with actions distribution [  5 134  67  11]
averag

[2017-03-04 19:07:56,006] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/DoomDefendLine_q_learning/openaigym.video.0.52850.video002000.mp4


average reward per episode = 1.3
Training on 221/223 positive/total out of 615 1-step experiences with actions distribution [  6 140  72   5]
average reward per episode = 2.2
Training on 231/232 positive/total out of 788 1-step experiences with actions distribution [  3 133  88   8]
average reward per episode = 2.8
Training on 238/238 positive/total out of 826 1-step experiences with actions distribution [ 11 141  78   8]
average reward per episode = 0.9
Training on 214/219 positive/total out of 569 1-step experiences with actions distribution [  2 142  68   7]
average reward per episode = 2.5
Training on 229/234 positive/total out of 794 1-step experiences with actions distribution [  2 151  79   2]
average reward per episode = 2.0
Training on 228/230 positive/total out of 724 1-step experiences with actions distribution [  6 121  93  10]
average reward per episode = 1.8
Training on 225/228 positive/total out of 744 1-step experiences with actions distribution [  1 151  69   7]
averag

[2017-03-04 19:34:04,017] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/DoomDefendLine_q_learning/openaigym.video.0.52850.video003000.mp4


average reward per episode = 1.7
Training on 224/227 positive/total out of 611 1-step experiences with actions distribution [  8 145  49  25]
average reward per episode = 0.9
Training on 216/219 positive/total out of 633 1-step experiences with actions distribution [  9 126  52  32]
average reward per episode = 0.4
Training on 208/214 positive/total out of 548 1-step experiences with actions distribution [  6 149  41  18]
average reward per episode = 1.4
Training on 221/224 positive/total out of 677 1-step experiences with actions distribution [ 10 137  64  13]
average reward per episode = 1.5
Training on 223/225 positive/total out of 667 1-step experiences with actions distribution [  5 151  53  16]
average reward per episode = 1.2
Training on 219/222 positive/total out of 671 1-step experiences with actions distribution [  8 118  76  20]
average reward per episode = 1.0
Training on 214/220 positive/total out of 602 1-step experiences with actions distribution [  3 142  50  25]
averag

[2017-03-04 20:01:37,022] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/DoomDefendLine_q_learning/openaigym.video.0.52850.video004000.mp4


average reward per episode = 2.0
Training on 226/230 positive/total out of 722 1-step experiences with actions distribution [ 28 133  49  20]
average reward per episode = 2.3
Training on 232/233 positive/total out of 709 1-step experiences with actions distribution [ 24 146  56   7]
average reward per episode = 1.6
Training on 225/226 positive/total out of 697 1-step experiences with actions distribution [ 42 126  46  12]
average reward per episode = 2.3
Training on 229/233 positive/total out of 778 1-step experiences with actions distribution [ 41 112  66  14]
average reward per episode = 1.3
Training on 222/223 positive/total out of 639 1-step experiences with actions distribution [ 21 134  46  22]
average reward per episode = 2.6
Training on 235/236 positive/total out of 804 1-step experiences with actions distribution [ 32 115  70  19]
average reward per episode = 0.7
Training on 212/217 positive/total out of 599 1-step experiences with actions distribution [38 91 52 36]
average re

[2017-03-04 20:30:41,221] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/DoomDefendLine_q_learning/openaigym.video.0.52850.video005000.mp4


average reward per episode = 0.7
Training on 214/217 positive/total out of 619 1-step experiences with actions distribution [ 57 112  27  21]
average reward per episode = 0.9
Training on 218/219 positive/total out of 633 1-step experiences with actions distribution [23 88 62 46]
average reward per episode = 0.6
Training on 213/216 positive/total out of 617 1-step experiences with actions distribution [35 87 57 37]
average reward per episode = 0.3
Training on 208/213 positive/total out of 568 1-step experiences with actions distribution [35 96 38 44]
average reward per episode = 1.9
Training on 225/229 positive/total out of 719 1-step experiences with actions distribution [30 91 66 42]
average reward per episode = 2.4
Training on 230/234 positive/total out of 815 1-step experiences with actions distribution [29 95 81 29]
average reward per episode = 0.4
Training on 210/214 positive/total out of 576 1-step experiences with actions distribution [43 60 66 45]
average reward per episode = 0

[2017-03-04 21:01:22,878] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/DoomDefendLine_q_learning/openaigym.video.0.52850.video006000.mp4


average reward per episode = 1.0
Training on 219/220 positive/total out of 655 1-step experiences with actions distribution [35 94 57 34]
average reward per episode = 0.6
Training on 210/216 positive/total out of 589 1-step experiences with actions distribution [26 88 65 37]
average reward per episode = 0.7
Training on 214/217 positive/total out of 620 1-step experiences with actions distribution [ 39 101  39  38]
average reward per episode = 1.5
Training on 221/225 positive/total out of 670 1-step experiences with actions distribution [ 33 113  49  30]
average reward per episode = 0.9
Training on 217/219 positive/total out of 624 1-step experiences with actions distribution [27 86 60 46]
average reward per episode = 1.3
Training on 221/223 positive/total out of 709 1-step experiences with actions distribution [29 65 76 53]
average reward per episode = 0.5
Training on 209/215 positive/total out of 603 1-step experiences with actions distribution [ 31 126  37  21]
average reward per epi

[2017-03-04 21:37:25,269] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/DoomDefendLine_q_learning/openaigym.video.0.52850.video007000.mp4


average reward per episode = 3.6
Training on 245/245 positive/total out of 924 1-step experiences with actions distribution [ 20 130  87   8]
average reward per episode = 3.6
Training on 244/246 positive/total out of 917 1-step experiences with actions distribution [ 11 125  98  12]
average reward per episode = 4.8
Training on 256/258 positive/total out of 991 1-step experiences with actions distribution [  7 166  82   3]
average reward per episode = 3.3
Training on 242/243 positive/total out of 878 1-step experiences with actions distribution [  7 166  62   8]
average reward per episode = 4.3
Training on 249/253 positive/total out of 958 1-step experiences with actions distribution [  4 148  98   3]
average reward per episode = 3.6
Training on 241/246 positive/total out of 863 1-step experiences with actions distribution [  7 133  98   8]
average reward per episode = 4.3
Training on 253/253 positive/total out of 968 1-step experiences with actions distribution [  6 154  86   7]
averag

[2017-03-04 22:14:09,238] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/DoomDefendLine_q_learning/openaigym.video.0.52850.video008000.mp4
[2017-03-04 22:27:49,702] Finished writing results. You can upload them to the scoreboard via gym.upload('/Users/gui/Dev/rl-study/tmp/DoomDefendLine_q_learning')
[2017-03-04 22:27:49,730] [DoomDefendCenter-v0] Uploading 9000 episodes of training data


average reward per episode = 1.469


[2017-03-04 22:27:56,246] [DoomDefendCenter-v0] Uploading videos of 18 training episodes (1313267 bytes)
[2017-03-04 22:28:09,448] [DoomDefendCenter-v0] Creating evaluation object from tmp/DoomDefendLine_q_learning with learning curve and training video
[2017-03-04 22:28:09,966] 
****************************************************
You successfully uploaded your evaluation on DoomDefendCenter-v0 to
OpenAI Gym! You can find it at:

    https://gym.openai.com/evaluations/eval_eiiH0J8yS1yx4NHrJt0wqQ

****************************************************
