In [2]:
import numpy as np
import gym
import ppaquette_gym_doom
from ppaquette_gym_doom.wrappers import SetResolution, ToDiscrete
from gym.wrappers import SkipWrapper
from gym import wrappers

# Environment definition (actions, rewards, episode end): see https://github.com/ppaquette/gym-doom/blob/master/ppaquette_gym_doom/doom_basic.py
def create_env(seed=None):
    """Build the wrapped DoomBasic environment used throughout the notebook.

    Args:
        seed: optional seed forwarded to ``env.seed``; nothing is seeded when
            it is ``None``.

    Returns:
        The raw environment wrapped, innermost first, by
        ``ToDiscrete('minimal')``, ``SkipWrapper(repeat_count=4)`` and
        ``SetResolution('200x150')``.
    """
    spec = gym.spec('ppaquette/DoomBasic-v0')
    # The spec id is replaced by its un-namespaced form before instantiation.
    spec.id = 'DoomBasic-v0'
    doom_env = spec.make()

    if seed is not None:
        doom_env.seed(seed)

    # Wrapper factories are created outermost first and applied innermost first.
    with_resolution = SetResolution('200x150')
    with_frame_skip = SkipWrapper(repeat_count=4)
    with_discrete_actions = ToDiscrete('minimal')
    return with_resolution(with_frame_skip(with_discrete_actions(doom_env)))

# Module-level environment; its screen size fixes the network input shape.
env = create_env()
WIDTH = env.screen_width
HEIGHT = env.screen_height

# Integer action values passed to `env.step`.
NOOP = 0
SHOOT = 1
RIGHT = 2
LEFT = 3

## Collecting experiences

In [3]:
from collections import namedtuple
import operator

# One environment step: the state the action was taken from, the action, the
# reward returned by the environment and whether that step ended the episode.
# The typename matches the variable name so repr() and pickling resolve `SARE`.
SARE = namedtuple('SARE', ['state', 'action', 'reward', 'end'])


def generate_sares(env, agent, episode_count=100):
    """Play ``episode_count`` episodes and lazily yield one ``SARE`` per step.

    Args:
        env: object exposing ``reset()`` and ``step(action)``, the latter
            returning ``(observation, reward, done, info)``.
        agent: object exposing ``act(observation, reward, done)``.
        episode_count: number of episodes to play.

    Yields:
        ``SARE(state, action, reward, end)`` where ``state`` is the observation
        the action was chosen from and ``end`` is the ``done`` flag returned by
        that step.
    """
    for _ in range(episode_count):
        observation = env.reset()
        # Reset the feedback for every episode: otherwise the first `act` call
        # of an episode would receive the previous episode's last reward
        # together with done=True.
        reward, done = 0, False

        while not done:
            action = agent.act(observation, reward, done)
            new_observation, reward, done, _info = env.step(action)
            yield SARE(observation, action, reward, done)
            observation = new_observation

def episode_sares(env, agent, episode_count=100):
    """Play episodes eagerly, print the average episode reward, return the steps.

    Args:
        env, agent, episode_count: forwarded unchanged to ``generate_sares``.

    Returns:
        The list of ``(state, action, reward, end)`` steps that were generated.
    """
    sares = list(generate_sares(env, agent, episode_count))

    total_reward = sum(sare[2] for sare in sares)
    # Each finished episode contributes exactly one step flagged as `end`.
    n_finished = sum(1 for sare in sares if sare[3])
    # No finished episode (e.g. episode_count=0): report nan instead of
    # raising ZeroDivisionError.
    average = total_reward / float(n_finished) if n_finished else float('nan')

    print('average reward per episode = {}'.format(average))
    return sares

        
def to_experiences(sares, only_n_misses=100):
    """Turn consecutive steps into shuffled 1-step experiences as numpy arrays.

    Every step is paired with the state of the step that follows it in
    ``sares``. For a step flagged ``end`` that "next" state is only a
    placeholder (the following list entry, or the step's own state when it is
    the last entry); callers are expected to mask it with ``is_ends``.

    Args:
        sares: sequence of ``(state, action, reward, end)`` tuples.
        only_n_misses: when not ``None``, train on ``only_n_misses`` experiences
            drawn with replacement from all experiences, plus every experience
            whose reward is strictly positive. When ``None``, use every
            experience once.

    Returns:
        ``(prev_frames, next_frames, actions, rewards, is_ends)`` as numpy
        arrays sharing the same shuffled order.

    Raises:
        ValueError: if ``sares`` yields no experience at all.
    """
    # Local import: the notebook only imports `random` in a later cell.
    import random

    sares = list(sares)
    experiences = [
        (previous_s, a, r, next_s, end)
        for (previous_s, a, r, end), (next_s, _, _, _) in zip(sares[:-1], sares[1:])
    ]
    # The last step has no successor and used to be dropped, losing the final
    # episode's terminal reward. Keep it when it is terminal, reusing its own
    # state as the placeholder next state.
    if sares and sares[-1][3]:
        last_s, last_a, last_r, last_end = sares[-1]
        experiences.append((last_s, last_a, last_r, last_s, last_end))

    if not experiences:
        raise ValueError('to_experiences needs at least one usable step')

    # simplistic experience prioritization
    if only_n_misses is None:
        selected = list(experiences)
    else:
        selected = random.choices(experiences, k=only_n_misses) + [e for e in experiences if e[2] > 0]
    random.shuffle(selected)

    prev_frames, actions, rewards, next_frames, is_ends = zip(*selected)
    prev_frames = np.asarray(prev_frames)
    next_frames = np.asarray(next_frames)
    actions = np.asarray(actions)
    rewards = np.asarray(rewards)
    is_ends = np.asarray(is_ends)

    # "positive" uses the same strict `> 0` test as the selection above.
    print('Training on {}/{} positive/total out of {} 1-step experiences with actions distribution {}'.format(
        np.sum(rewards > 0),
        len(rewards),
        len(experiences),
        np.bincount(actions)))

    return (prev_frames, next_frames, actions, rewards, is_ends)

## Deep Q-learning

In [7]:
import tensorflow as tf
from keras import backend as K

from keras.layers import Dense, Convolution2D, Flatten, Activation
from keras.models import Sequential
from keras.optimizers import Adam

# Open a TensorFlow interactive session and register it as the session used by
# the Keras backend.
sess = tf.InteractiveSession()
K.set_session(sess)

def create_q_model(conv1_weights=None, conv2_weights=None, dense1_weights=None, dense2_weights=None):
    """Build and compile the convolutional Q-network.

    The network takes ``[HEIGHT, WIDTH, 3]`` frames and ends with a 4-unit
    dense layer. It is compiled with an MSE loss and the Adam optimizer.

    Args:
        conv1_weights, conv2_weights, dense1_weights, dense2_weights: optional
            initial weights handed to the corresponding layer's ``weights``
            argument; ``None`` leaves the layer's own initialization in place.

    Returns:
        The compiled ``Sequential`` model.
    """
    # Layer positions matter: other code addresses the weighted layers by index
    # (0, 2, 5 and 7).
    stack = [
        Convolution2D(nb_filter=2, nb_row=6, nb_col=6, subsample=(1, 1), border_mode='valid',
                      weights=conv1_weights, input_shape=[HEIGHT, WIDTH, 3], dim_ordering='tf'),
        Activation('relu'),
        Convolution2D(nb_filter=4, nb_row=2, nb_col=2, subsample=(1, 1), weights=conv2_weights),
        Activation('relu'),
        Flatten(),
        Dense(64, init='normal', weights=dense1_weights),
        Activation('relu'),
        Dense(4, init='normal', weights=dense2_weights),
    ]

    q_model = Sequential()
    for layer in stack:
        q_model.add(layer)
    q_model.compile(loss='mse', optimizer=Adam())

    return q_model

# Two independently initialized networks with the same architecture:
# `acting_model` is the one fitted and used to pick actions, `target_model` is
# the one handed to `sares_to_input_targets` to compute the training targets.
acting_model = create_q_model()
target_model = create_q_model()

def copy_model(model):
    """Return a newly built Q-network carrying the same weights as ``model``.

    Weights are transferred with Keras' ``get_weights`` / ``set_weights``
    instead of evaluating each weight tensor, so the copy neither relies on a
    default TensorFlow session nor on hard-coded layer positions.

    Args:
        model: a network with the architecture produced by ``create_q_model``.

    Returns:
        A new compiled model from ``create_q_model`` with ``model``'s weights.

    Note:
        Every call builds a whole new network; to refresh an existing network
        prefer ``existing.set_weights(model.get_weights())``.
    """
    clone = create_q_model()
    clone.set_weights(model.get_weights())
    return clone

In [8]:
def sares_to_input_targets(model, sares, gamma = .99, reward_clip=5, only_n_misses=100):
    """Build the ``(inputs, targets)`` pair used to fit the Q-network.

    Args:
        model: network whose ``predict`` supplies the bootstrap values.
        sares: steps forwarded to ``to_experiences``.
        gamma: discount applied to the best predicted value of the next frame.
        reward_clip: upper bound applied to rewards (no lower bound).
        only_n_misses: forwarded to ``to_experiences``.

    Returns:
        ``(prev_frames, target_action_rewards)``: the input frames and, for each
        of them, ``model``'s own predictions with the entry of the action
        actually taken replaced by the Q-learning target.
    """
    prev_frames, next_frames, actions, rewards, is_ends = to_experiences(sares, only_n_misses)

    capped_rewards = np.clip(rewards, -np.inf, reward_clip)

    # Q-learning target: r + gamma * max_a' Q(s', a'), without the bootstrap
    # term on steps that ended an episode.
    not_terminal = 1 - is_ends
    best_next_values = model.predict(next_frames).max(axis=1)
    q_targets = capped_rewards + gamma * not_terminal * best_next_values

    # Only the taken action's output is overwritten; the other outputs keep the
    # model's own predictions as targets.
    target_action_rewards = model.predict(prev_frames)
    sample_rows = np.arange(len(actions))
    target_action_rewards[sample_rows, actions] = q_targets

    return prev_frames, target_action_rewards

## Training

In [9]:
import random

class EpsilonGreedyQAgent(object):
    """Agent picking a random action with probability ``epsilon``, otherwise
    the action with the highest value predicted by ``model``."""

    def __init__(self, model, epsilon=.1):
        """
        Args:
            model: object whose ``predict`` maps a batch of observations to one
                row of action values per observation.
            epsilon: probability of taking a random action.
        """
        self.model = model
        self.epsilon = epsilon

    def act(self, observation, reward, done):
        """Choose the action for ``observation``.

        ``reward`` and ``done`` are accepted for interface compatibility and
        are not used.
        """
        # random.random() lies in [0, 1), so with a strict comparison
        # epsilon=0 never explores and epsilon=1 always does.
        if random.random() < self.epsilon:
            return random.choice([NOOP, SHOOT, LEFT, RIGHT])

        # Add a leading batch axis, predict, and take the best action of the
        # single returned row.
        action_values = self.model.predict(observation[np.newaxis])[0]
        return action_values.argmax()


# Training hyper-parameters.
N_BATCHES = 100                    # number of target-network refreshes
N_BATCHED_EPISODES = 10            # episodes played before each fit
UPDATE_TARGET_EVERY_N_BATCHES = 2  # fits between two target-network refreshes
# Misspelled original name, kept as an alias for any cell still using it.
UPDATE_TARGET_EVERY_N_BACTHES = UPDATE_TARGET_EVERY_N_BATCHES
MINI_BATCH_SIZE = 32
REWARD_CLIP = 5
ONLY_N_MISSES = 200

env = create_env()
env = wrappers.Monitor(env, directory='tmp/q_learning', force=True, mode='training')

# try/finally guarantees the monitor is closed even if training is interrupted
# or fails.
try:
    exploring_agent = EpsilonGreedyQAgent(acting_model, epsilon=.1)

    for _ in range(N_BATCHES):
        for _ in range(UPDATE_TARGET_EVERY_N_BATCHES):
            sares = episode_sares(env, exploring_agent, N_BATCHED_EPISODES)
            prev_frames, target_action_rewards = sares_to_input_targets(
                target_model, sares, reward_clip=REWARD_CLIP, only_n_misses=ONLY_N_MISSES)
            acting_model.fit(x=prev_frames, y=target_action_rewards,
                             batch_size=MINI_BATCH_SIZE, nb_epoch=1, verbose=0)

        # Refresh the target network in place: building a new network for each
        # refresh keeps adding variables and ops to the TensorFlow graph.
        target_model.set_weights(acting_model.get_weights())

    # final greedy episodes
    sares = episode_sares(env, EpsilonGreedyQAgent(acting_model, epsilon=0), episode_count=1000)
finally:
    env.close()

# Never commit an API key: read it from the environment instead, e.g.
# gym.upload('tmp/q_learning', api_key=os.environ['OPENAI_GYM_API_KEY'])

[2017-03-04 17:54:06,668] Clearing 26 monitor files from previous run (because force=True was provided)
[2017-03-04 17:54:07,044] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.1.52235.video000000.mp4
[2017-03-04 17:54:13,501] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.1.52235.video000001.mp4
[2017-03-04 17:54:15,328] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.1.52235.video000008.mp4


average reward per episode = -80.0
Training on 13/207 positive/total out of 243 1-step experiences with actions distribution [  8 188   9   2]
average reward per episode = -322.4
Training on 1/201 positive/total out of 644 1-step experiences with actions distribution [ 28   3   5 165]


[2017-03-04 17:54:46,825] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.1.52235.video000027.mp4


average reward per episode = -319.4
Training on 1/201 positive/total out of 647 1-step experiences with actions distribution [ 12   4   3 182]
average reward per episode = -362.5
Training on 0/200 positive/total out of 699 1-step experiences with actions distribution [172   6   5  17]
average reward per episode = -335.9
Training on 1/201 positive/total out of 677 1-step experiences with actions distribution [  7   7   1 186]
average reward per episode = -357.0
Training on 0/200 positive/total out of 699 1-step experiences with actions distribution [ 17   4   9 170]


[2017-03-04 17:55:50,210] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.1.52235.video000064.mp4


average reward per episode = -358.5
Training on 0/200 positive/total out of 699 1-step experiences with actions distribution [163   3   8  26]
average reward per episode = -323.4
Training on 1/201 positive/total out of 658 1-step experiences with actions distribution [ 29   7   3 162]
average reward per episode = -354.0
Training on 0/200 positive/total out of 699 1-step experiences with actions distribution [102   2   8  88]
average reward per episode = -323.4
Training on 1/201 positive/total out of 650 1-step experiences with actions distribution [176   8   3  14]
average reward per episode = -341.4
Training on 0/200 positive/total out of 683 1-step experiences with actions distribution [  7   6   4 183]
average reward per episode = -358.5
Training on 0/200 positive/total out of 699 1-step experiences with actions distribution [ 10   4   7 179]


[2017-03-04 17:57:32,382] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.1.52235.video000125.mp4


average reward per episode = -314.3
Training on 2/202 positive/total out of 648 1-step experiences with actions distribution [176  10   9   7]
average reward per episode = -322.9
Training on 2/201 positive/total out of 654 1-step experiences with actions distribution [168   6   4  23]
average reward per episode = -326.9
Training on 1/201 positive/total out of 652 1-step experiences with actions distribution [  9  10   4 178]
average reward per episode = -357.5
Training on 0/200 positive/total out of 699 1-step experiences with actions distribution [ 35   3  32 130]
average reward per episode = -269.2
Training on 3/203 positive/total out of 589 1-step experiences with actions distribution [102   5  46  50]
average reward per episode = -331.3
Training on 3/202 positive/total out of 681 1-step experiences with actions distribution [48 11 76 67]
average reward per episode = -265.7
Training on 3/202 positive/total out of 570 1-step experiences with actions distribution [87 10 56 49]
average

[2017-03-04 18:00:06,719] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.1.52235.video000216.mp4


average reward per episode = -320.2
Training on 4/204 positive/total out of 606 1-step experiences with actions distribution [74 51 48 31]
average reward per episode = -342.7
Training on 3/203 positive/total out of 640 1-step experiences with actions distribution [18 43 57 85]
average reward per episode = -331.8
Training on 2/201 positive/total out of 682 1-step experiences with actions distribution [58  8 64 71]
average reward per episode = -91.8
Training on 14/208 positive/total out of 276 1-step experiences with actions distribution [ 10 180  15   3]
average reward per episode = -289.3
Training on 5/202 positive/total out of 597 1-step experiences with actions distribution [46 12 85 59]
average reward per episode = -183.8
Training on 6/205 positive/total out of 409 1-step experiences with actions distribution [92 40 23 50]
average reward per episode = -232.1
Training on 7/205 positive/total out of 443 1-step experiences with actions distribution [  8 147  18  32]
average reward per 

[2017-03-04 18:03:32,704] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.1.52235.video000343.mp4


average reward per episode = -139.5
Training on 11/206 positive/total out of 335 1-step experiences with actions distribution [ 3 99 56 48]
average reward per episode = -215.1
Training on 3/203 positive/total out of 478 1-step experiences with actions distribution [ 2 14 90 97]
average reward per episode = -258.4
Training on 4/203 positive/total out of 465 1-step experiences with actions distribution [70 94 27 12]
average reward per episode = -131.4
Training on 10/207 positive/total out of 346 1-step experiences with actions distribution [52 74 45 36]
average reward per episode = -347.4
Training on 1/201 positive/total out of 688 1-step experiences with actions distribution [  8  13  62 118]
average reward per episode = -301.8
Training on 3/201 positive/total out of 607 1-step experiences with actions distribution [141  21  28  11]
average reward per episode = -238.0
Training on 5/204 positive/total out of 503 1-step experiences with actions distribution [  5  37  31 131]
average rewar

[2017-03-04 18:07:28,185] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.1.52235.video000512.mp4


average reward per episode = -328.3
Training on 3/202 positive/total out of 593 1-step experiences with actions distribution [145  46   6   5]
average reward per episode = -312.9
Training on 2/201 positive/total out of 634 1-step experiences with actions distribution [  9   5 174  13]
average reward per episode = -316.4
Training on 0/200 positive/total out of 639 1-step experiences with actions distribution [  6   4   5 185]
average reward per episode = -382.5
Training on 0/200 positive/total out of 699 1-step experiences with actions distribution [  3  28 158  11]
average reward per episode = -226.6
Training on 4/204 positive/total out of 482 1-step experiences with actions distribution [137  35  15  17]
average reward per episode = -272.2
Training on 6/203 positive/total out of 509 1-step experiences with actions distribution [  5  53 112  33]
average reward per episode = -189.4
Training on 8/206 positive/total out of 424 1-step experiences with actions distribution [ 8 51 88 59]
ave

[2017-03-04 18:12:23,331] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.1.52235.video000729.mp4


average reward per episode = -262.6
Training on 5/203 positive/total out of 510 1-step experiences with actions distribution [  3  34 129  37]
average reward per episode = -36.3
Training on 12/209 positive/total out of 225 1-step experiences with actions distribution [ 8 81 55 65]
average reward per episode = -265.7
Training on 5/203 positive/total out of 509 1-step experiences with actions distribution [41 43 94 25]
average reward per episode = -204.6
Training on 5/204 positive/total out of 441 1-step experiences with actions distribution [29 26 92 57]
average reward per episode = -157.7
Training on 9/206 positive/total out of 365 1-step experiences with actions distribution [88 90 17 11]
average reward per episode = -215.3
Training on 6/205 positive/total out of 438 1-step experiences with actions distribution [ 7 75 82 41]
average reward per episode = -153.4
Training on 9/206 positive/total out of 397 1-step experiences with actions distribution [96 25 41 44]
average reward per epis

[2017-03-04 18:19:17,957] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.1.52235.video001000.mp4


average reward per episode = -205.1
Training on 5/204 positive/total out of 479 1-step experiences with actions distribution [173  12  18   1]
average reward per episode = -71.6
Training on 13/207 positive/total out of 259 1-step experiences with actions distribution [38 37 80 52]
average reward per episode = -138.5
Training on 8/205 positive/total out of 367 1-step experiences with actions distribution [123  14  46  22]
average reward per episode = -147.9
Training on 5/205 positive/total out of 372 1-step experiences with actions distribution [101  21  38  45]
average reward per episode = -99.7
Training on 11/206 positive/total out of 311 1-step experiences with actions distribution [ 18  18 159  11]
average reward per episode = -13.3
Training on 17/208 positive/total out of 175 1-step experiences with actions distribution [111  43  17  37]
average reward per episode = -96.2
Training on 10/206 positive/total out of 298 1-step experiences with actions distribution [ 22  16 150  18]
ave

[2017-03-04 18:43:50,765] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.1.52235.video002000.mp4
[2017-03-04 18:45:09,355] Finished writing results. You can upload them to the scoreboard via gym.upload('/Users/gui/Dev/rl-study/tmp/q_learning')


average reward per episode = 72.067


## Embedding visualization

The frame embeddings exported below can be explored in the TensorFlow Embedding Projector: http://projector.tensorflow.org/?config=https://raw.githubusercontent.com/pilipolio/rl-study/master/projectors/doom_v1_projector_config.json

In [None]:
import pandas as pd

# Use every experience once (no sampling of misses). The keyword is
# `only_n_misses`; `n_misses` is not a parameter of `sares_to_input_targets`.
prev_frames, target_action_rewards = sares_to_input_targets(target_model, sares, only_n_misses=None)

thumbnail_size = 150
n_frames = 1000
frames = prev_frames[:n_frames, :, :, :]

# Centre-crop axis 2 to `thumbnail_size` pixels (25 off each side when that
# axis is 200 long).
frame_width = frames.shape[2]
crop = (frame_width - thumbnail_size) // 2
thumbnails = frames[:, :, crop:frame_width - crop, :]

# Embedding = output of the network with its last layer removed.
frame_embeddings = Sequential(acting_model.layers[:-1]).predict(frames)


frame_action_rewards = acting_model.predict(frames)
# Names listed in action-value order: NOOP=0, SHOOT=1, RIGHT=2, LEFT=3.
action_names = np.array(['NOOP', 'SHOOT', 'RIGHT', 'LEFT'])
frame_metadata = pd.DataFrame.from_dict({
    'best_action': action_names[frame_action_rewards.argmax(1)],
    'value': frame_action_rewards.max(1)})\
    .assign(value_quantile=lambda df: np.digitize(df.value, bins=np.percentile(df.value, q=[25, 50, 75])))

In [None]:
import os
import scipy.misc

def images_to_sprite(data):
    """Creates the sprite image along with any necessary padding.

    From https://github.com/tensorflow/tensorflow/issues/6322

    Each image is rescaled independently to [0, 1] (a constant image becomes
    all zeros), the batch is padded with black images up to the next square
    count, and the images are tiled row by row into one square grid.

    Args:
      data: NxHxW[x3] tensor containing the images.

    Returns:
      data: Properly shaped HxWx3 uint8 image with any necessary padding.

    Raises:
      ValueError: if ``data`` contains no image.
    """
    data = np.asarray(data)
    if data.ndim == 3:
        data = np.tile(data[..., np.newaxis], (1, 1, 1, 3))
    if data.shape[0] == 0:
        raise ValueError('images_to_sprite needs at least one image')
    data = data.astype(np.float32)

    n_images = data.shape[0]
    # Shape that broadcasts one scalar per image over that image's pixels.
    per_image = (n_images,) + (1,) * (data.ndim - 1)

    lowest = data.reshape((n_images, -1)).min(axis=1).reshape(per_image)
    data = data - lowest
    highest = data.reshape((n_images, -1)).max(axis=1).reshape(per_image)
    # A constant image has a zero range: divide by 1 instead of producing NaNs.
    highest[highest == 0] = 1
    data = data / highest
    # Inverting the colors seems to look better for MNIST
    #data = 1 - data

    n = int(np.ceil(np.sqrt(n_images)))
    padding = ((0, n ** 2 - n_images), (0, 0),
            (0, 0)) + ((0, 0),) * (data.ndim - 3)
    data = np.pad(data, padding, mode='constant',
            constant_values=0)
    # Tile the individual thumbnails into an image.
    data = data.reshape((n, n) + data.shape[1:]).transpose((0, 2, 1, 3)
            + tuple(range(4, data.ndim + 1)))
    data = data.reshape((n * data.shape[1], n * data.shape[3]) + data.shape[4:])
    return (data * 255).astype(np.uint8)

def save_projector_config(frame_embeddings, frame_metadata, thumbnails=None,
                          gh_root='https://raw.githubusercontent.com/pilipolio/rl-study/master',
                          projector_dir='projectors', embedding_name='doom_v1'):
    """Write the files consumed by the TensorFlow Embedding Projector.

    Files written under ``projector_dir`` (created if missing):
      * ``<embedding_name>.tsv``: the embeddings, tab separated, no header;
      * ``<embedding_name>_metadata.tsv``: ``frame_metadata`` with its header;
      * ``<embedding_name>_sprite.png``: only when ``thumbnails`` is given;
      * ``<embedding_name>[_with_sprite]_projector_config.json``: the config,
        whose paths point at ``gh_root``/``projector_dir``.

    Args:
        frame_embeddings: 2-D array, one row per frame.
        frame_metadata: pandas DataFrame, one row per frame.
        thumbnails: optional NxHxWx3 array of images, one per frame.
        gh_root: base URL under which the written files will be served.
        projector_dir: local output directory, also used as URL path segment.
        embedding_name: prefix of every written file name.
    """
    # Local import: `json` is not imported anywhere else in the notebook.
    import json

    def remote(suffix):
        # URLs always use '/', whatever the local path separator is.
        return '/'.join([gh_root, projector_dir, embedding_name + suffix])

    def local(suffix):
        return os.path.join(projector_dir, embedding_name + suffix)

    if projector_dir:
        os.makedirs(projector_dir, exist_ok=True)

    embedding = {
        'metadataPath': remote('_metadata.tsv'),
        'tensorName': 'Frames',
        'tensorShape': [int(dim) for dim in frame_embeddings.shape],
        'tensorPath': remote('.tsv'),
    }

    if thumbnails is not None:
        embedding['sprite'] = {
            'imagePath': remote('_sprite.png'),
            # Size of ONE thumbnail as [width, height], not the batch shape.
            'singleImageDim': [int(thumbnails.shape[2]), int(thumbnails.shape[1])],
        }
        # NOTE(review): scipy.misc.imsave is absent from recent SciPy releases;
        # confirm the installed version still provides it.
        scipy.misc.imsave(local('_sprite.png'), images_to_sprite(thumbnails))

    pd.DataFrame(frame_embeddings).to_csv(local('.tsv'), sep='\t', index=None, header=None)
    frame_metadata.to_csv(local('_metadata.tsv'), sep='\t', index=None)

    config_suffix = ('_with_sprite' if thumbnails is not None else '') + '_projector_config.json'
    with open(local(config_suffix), 'w') as f:
        json.dump({'embeddings': [embedding]}, f)

# First export: embeddings and metadata only (config without a sprite).
save_projector_config(frame_embeddings, frame_metadata)

# Second export: same data plus the thumbnail sprite (the '_with_sprite' config).
save_projector_config(frame_embeddings, frame_metadata, thumbnails)