<br/>

$$ \huge{\underline{\textbf{ Playing Atari Games with Deep RL }}} $$

$$ \large{\textbf{MountainCar + DQN + Memory Replay}} $$

<br/>



In [None]:
def q_learning(start_step, env, frames, gamma, eps_decay_steps, eps_target,
               batch_size, model, mem, callback=None, trace=None, render=False, rng=None):
    """Train `model` with Q-learning and experience replay for `frames` steps.

    Acts epsilon-greedily in `env`, stores every transition in `mem` and,
    after each environment step, fits `model` on one batch sampled from `mem`
    using the target  R + gamma * max_a Q(S', a)  (just R for terminal steps).

    Params:
        start_step      - global step at which this call starts; places epsilon
                          on the linear decay schedule and offsets the step
                          number passed to `callback`
        env             - environment with reset(), step(action) -> (S_, R, done, info)
                          where info['full-reward'] holds the unclipped reward,
                          render() and action_space.sample()
        frames          - number of environment steps to run before returning
        gamma           - discount factor [0..1]
        eps_decay_steps - number of global steps over which epsilon goes
                          linearly from 1 to eps_target; None keeps epsilon
                          fixed at eps_target
        eps_target      - final (minimum) epsilon
        batch_size      - number of transitions per training batch
        model           - function approximator, already initialised, with methods:
                          eval(states) -> q-values, one row per state
                          train(states, actions, targets) -> None
        mem             - replay memory with append(S, A, R, S_, done),
                          get_batch(n) and len(); must already hold at least
                          batch_size transitions
        callback        - optional function called after every environment step
        trace           - passed through to callback unchanged
        render          - if True, call env.render() after reset and after each step
        rng             - object providing rand(); defaults to np.random.
                          Note: random actions come from env.action_space.sample(),
                          not from rng.
    """

    if rng is None:
        rng = np.random

    def policy(st, model, eps):
        """Epsilon-greedy action for a single state."""
        if rng.rand() > eps:
            stack = np.stack([st])  # convert lazyframe to nn input shape [1, 84, 84, 4]
            q_values = model.eval(stack)
            return np.argmax(q_values)
        else:
            return env.action_space.sample()

    if eps_decay_steps is not None:
        eps_delta = (1-eps_target) / eps_decay_steps
        eps = 1 - start_step*eps_delta
        eps = max(eps, eps_target)
    else:
        eps_delta = 0.0
        eps = eps_target

    assert len(mem) >= batch_size

    tts_ = 0                                 # total time step
    for e_ in itertools.count():             # count from 0 to infinity

        S = env.reset()
        episode_full_reward = 0
        if render: env.render()

        for t_ in itertools.count():         # count from 0 to infinity

            A = policy(S, model, eps)

            S_, R, done, info = env.step(A)
            episode_full_reward += info['full-reward']  # unclipped reward
            if render: env.render()

            mem.append(S, A, R, S_, done)

            if callback is not None:
                callback(tts_+start_step, t_, S, A, R, done, info, eps, episode_full_reward, model, mem, trace)

            states, actions, rewards, n_states, dones, _ = mem.get_batch(batch_size)
            targets = model.eval(n_states)
            targets = rewards + gamma * np.max(targets, axis=-1)
            targets[dones] = rewards[dones]                # return of next-to-terminal state is just R
            model.train(states, actions, targets)

            tts_ += 1

            # Decay epsilon once per environment step, terminal steps included,
            # so it stays in sync with the `1 - start_step*eps_delta` schedule
            # used when this function is re-entered.
            if eps > eps_target:
                eps = max(eps - eps_delta, eps_target)

            if tts_ >= frames:
                return

            if done:
                break

            S = S_
                

In [None]:
def evaluate(env, frames, episodes, eps, model, callback=None, trace=None, render=False, sleep=0, rng=None):
    """Run an epsilon-greedy policy without training and collect episode rewards.

    Params:
        env      - environment with reset(), step(action) -> (S_, R, done, info)
                   where info['full-reward'] holds the unclipped reward,
                   render() and action_space.sample()
        frames   - stop after this many environment steps in total
                   (None for no step limit)
        episodes - stop after this many completed episodes (None for no limit)
        eps      - probability of taking a random action
        model    - function approximator with eval(states) -> q-values
        callback - not supported; must be None
        trace    - unused, kept for interface compatibility
        render   - if True, call env.render() after reset and after each step
        sleep    - seconds to sleep after each render call
        rng      - object providing rand(); defaults to np.random

    Returns:
        List with the sum of unclipped rewards of every *completed* episode.
        An episode cut short by the `frames` limit is not included, so the
        list can be empty.

    Raises:
        NotImplementedError - if callback is not None
    """

    if callback is not None:
        # Callbacks were already disabled here (previously via a bare `raise`);
        # fail up front with an explicit error instead of after the first step.
        raise NotImplementedError('evaluate() does not support callbacks')

    if rng is None:
        rng = np.random

    def policy(st, model, eps):
        """Epsilon-greedy action for a single state."""
        if rng.rand() > eps:
            stack = np.stack([st])  # convert lazyframe to nn input shape [1, 84, 84, 4]
            q_values = model.eval(stack)
            return np.argmax(q_values)
        else:
            return env.action_space.sample()

    per_episode_full_rewards = []

    tts_ = 0                                 # total time step
    for e_ in itertools.count():             # count from 0 to infinity

        # Budget may have been used up exactly on the last step of an episode
        if frames is not None and tts_ >= frames:
            return per_episode_full_rewards

        S = env.reset()
        episode_full_reward = 0

        if render:
            env.render()
            time.sleep(sleep)

        while True:

            A = policy(S, model, eps)

            S_, R, done, info = env.step(A)
            tts_ += 1                        # count every step, terminal ones included
            episode_full_reward += info['full-reward']   # unclipped reward

            if render:
                env.render()
                time.sleep(sleep)

            if done:
                per_episode_full_rewards.append(episode_full_reward)
                break

            if frames is not None and tts_ >= frames:
                return per_episode_full_rewards

            S = S_

        if episodes is not None and e_ >= episodes-1:
            return per_episode_full_rewards

In [None]:
def prefill_memory(env, mem, steps=None, episodes=None, render=False, rng=None):
    """Fill the replay memory with transitions generated by a uniform random policy.

    Params:
        env      - environment with reset(), step(action) -> (S_, R, done, info),
                   render() and action_space.n
        mem      - replay memory with append(S, A, R, S_, done)
        steps    - stop after this many environment steps (None for no step limit)
        episodes - stop after this many episodes (None for no episode limit)
        render   - if True, call env.render() after reset and after each step
        rng      - object providing randint(low, high); defaults to np.random

    Raises:
        ValueError - if both steps and episodes are None (would never terminate)
    """

    if steps is None and episodes is None:
        raise ValueError('prefill_memory needs a stopping condition: pass steps and/or episodes')

    if rng is None:
        rng = np.random

    # Fill memory buffer using random policy
    tts_ = 0                                          # total time step
    for e_ in itertools.count():
        if episodes is not None and e_ >= episodes:
            return

        S = env.reset()
        if render: env.render()

        while True:

            A = rng.randint(0, env.action_space.n)    # random policy
            S_, R, done, _ = env.step(A)
            if render: env.render()

            mem.append(S, A, R, S_, done)

            tts_ += 1

            if steps is not None and tts_ >= steps:
                return

            if done:
                break

            S = S_

---

# Experiment Setup

Imports (source file: [tiles3.py](tiles3.py), [helpers_1001.py](helpers_1001.py))

In [None]:
import pdb
import psutil

In [None]:
import os
import time
import datetime
import numpy as np
import matplotlib.pyplot as plt
import tables
import itertools
import collections

import PIL
import gym
import tensorflow as tf

In [None]:
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
with tf.Session(config=config) as sess:
    devs = sess.list_devices()
    print('\n'.join([x.name for x in devs]))

Environment

In [None]:
import helpers
import importlib
importlib.reload(helpers)

In [None]:
import sys
sys.path.append('../Debug_NN')

In [None]:
import importlib
import tables_logger
importlib.reload(tables_logger)

In [None]:
# %load_ext line_profiler

We will need a callback to capture the q-value array for the whole state-action space at specified episodes.

In [None]:
class Trace():
    """Bookkeeping shared between the training loop and its callback.

    Attributes:
        tf_summary_writer - TensorFlow summary writer, or None to disable summaries
        eval_every        - evaluate average Q on test_states every this many
                            steps, or None to disable
        test_states       - fixed batch of states used for the average-Q metric
        total_step        - number of training steps seen so far
        ep_rewards        - maps the global step at which an episode ended to
                            that episode's full reward
        ep_start_time     - wall-clock time at which the current episode started
    """

    def __init__(self, tf_summary_writer=None, eval_every=None, test_states=None):
        # Both tf_summary_writer and eval_every may be None: the training
        # callback checks for None before using either of them.
        self.tf_summary_writer = tf_summary_writer

        self.eval_every = eval_every
        self.test_states = test_states

        self.total_step = 0
        self.ep_rewards = collections.defaultdict(float)

        self.ep_start_time = None

    def push_summary(self, tag, simple_value, flush=False):
        """Write one scalar summary at the current total_step.

        Does nothing when no summary writer is attached. `flush` is accepted
        for interface compatibility; the writer is flushed on every call
        regardless of its value.
        """
        if self.tf_summary_writer is None:
            return
        summary = tf.Summary()
        summary.value.add(tag=tag, simple_value=simple_value)
        self.tf_summary_writer.add_summary(summary, self.total_step)
        self.tf_summary_writer.flush()

In [None]:
def callback(total_time_step, tstep, st, act, rew_, done_, info, eps, ep_full_reward, model, memory, trace):
    """Per-step training hook: record episode rewards and write summaries.

    Called from q_learning after every environment step.

    Params:
        total_time_step [int]  - global step number; must equal trace.total_step
        tstep [int]            - timestep within the current episode (0 = first step)
        st, act, rew_          - state, action and reward of this step (unused here)
        done_ [bool]           - True if this step ended the episode
        info [dict]            - environment info dict (unused here)
        eps [float]            - epsilon currently used by the policy
        ep_full_reward [float] - sum of unclipped rewards of the episode so far
        model [obj]            - function approximator with eval(states)
        memory [obj]           - replay memory (unused here)
        trace [obj]            - Trace instance; its total_step is incremented by one
    """

    assert total_time_step == trace.total_step

    if tstep == 0:    # Episode just started
        trace.ep_start_time = time.time()

    if done_:
        trace.ep_rewards[total_time_step] = ep_full_reward

    #
    #   Summaries
    #
    if trace.tf_summary_writer is not None:

        # Epsilon
        summary = tf.Summary()
        summary.value.add(tag='Metrics/Epsilon', simple_value=eps)
        trace.tf_summary_writer.add_summary(summary, trace.total_step)

        # Average_Q
        if trace.eval_every is not None:
            if total_time_step % trace.eval_every == 0:
                q_test_values = model.eval(trace.test_states)
                q_test_average = np.mean(np.max(q_test_values, axis=-1))  # max over actions
                summary = tf.Summary()
                summary.value.add(tag='Metrics/Average_Q', simple_value=q_test_average)
                trace.tf_summary_writer.add_summary(summary, trace.total_step)

        # Ep_Reward
        if done_:
            episode_wall_time = time.time() - trace.ep_start_time
            summary = tf.Summary()
            summary.value.add(tag='Metrics/Ep_Reward', simple_value=ep_full_reward)
            # The clock may not have advanced for a very short episode;
            # skip the rate rather than divide by zero.
            if episode_wall_time > 0:
                summary.value.add(tag='Other/StepsPerSec', simple_value=tstep/episode_wall_time)
            trace.tf_summary_writer.add_summary(summary, trace.total_step)

    trace.total_step += 1

---

# Atari Helpers

In [None]:
for env in gym.envs.registry.all():
    if env.id.startswith('Q'):
        print(env.id)

In [None]:
# from skimage.transform import resize
# from skimage.color import rgb2gray

In [None]:
# raise  # Sentinel
# def preprocess(obs):
#     obs_rgb = rgb2gray(obs)
#     obs_110x84 = resize(obs_rgb, output_shape=(110, 84), mode='reflect', anti_aliasing=True)
#     obs_84x84 = obs_110x84[13:-13,:]
#     obs_uint8 = (obs_84x84*255).astype(np.uint8)
#     return obs_uint8

In [None]:
# raise  # Sentinel
# def preprocess(obs):
#     img = PIL.Image.fromarray(obs)
#     img = img.convert('L')
#     img = img.resize([84, 84], resample=PIL.Image.NEAREST, box=[0,34,160,160+34])
#     return np.array(img)

In [None]:
def preprocess(obs, offset=16):
    """Convert a raw Atari frame to an 84x84 grayscale image.

    The frame is converted to single-channel 'L' mode, a 160x160 window is
    cropped from it and that window is resized to 84x84 with bilinear
    interpolation.

    Params:
        obs    - raw frame as an array accepted by PIL.Image.fromarray
                 (the 210 and 160 below assume a 210x160 RGB frame)
        offset - number of rows between the bottom of the frame and the bottom
                 of the cropped window. The default 16 is the value used for
                 Pong, BeamRider and SpaceInvaders; 8 was noted for Breakout.

    Returns:
        np.ndarray with the 84x84 grayscale image
    """
    frame_height = 210
    window = 160
    img = PIL.Image.fromarray(obs)
    img = img.convert('L')
    # box is (left, upper, right, lower) in source-image pixels
    upper = frame_height - window - offset
    lower = frame_height - offset
    img = img.resize((84, 84), resample=PIL.Image.BILINEAR, box=(0, upper, window, lower))
    return np.array(img)

In [None]:
if False:
    env = gym.make('SpaceInvaders-v4')
    obs = env.reset()
    for i in range(50):
        obs, _, _, _ = env.step(0)
        
    plt.imshow(obs); plt.show()
    
    imgp2 = preprocess(obs)
    plt.imshow(imgp2, cmap='gray', vmin=0, vmax=255); plt.show()

In [None]:
def plot_frames(frames):
    """Show the four stacked 84x84 frames of one state side by side."""
    images = np.array(frames)  # materialise LazyFrames as np.ndarray
    assert images.shape == (84, 84, 4)
    nb_frames = images.shape[-1]
    fig, axes = plt.subplots(nrows=1, ncols=nb_frames, figsize=[16,4])
    for idx, ax in enumerate(axes):
        ax.imshow(images[:,:,idx], cmap='gray', vmin=0, vmax=255)
        ax.set_title('frame '+str(idx))
    plt.show()

In [None]:
class LazyFrames:
    """Holds a list of frames and stacks them into one array only on demand.

    Keeping references to the individual frames (instead of a stacked copy)
    lets consecutive states share the frames they have in common.
    """

    def __init__(self, frames):
        assert isinstance(frames, list)
        assert isinstance(frames[0], np.ndarray)
        self._frames = frames   # list of np.ndarray

    def __array__(self, dtype=None, copy=None):
        """NumPy conversion hook: stack the frames along a new last axis.

        `copy` is accepted because NumPy 2 passes it to __array__. Stacking
        always allocates a new array, so the result is a fresh copy whatever
        value is given.
        """
        merged = np.stack(self._frames, axis=-1)
        if dtype is not None:
            merged = merged.astype(dtype)
        return merged

    def __str__(self):
        return str(np.round(self.__array__(), decimals=4))

In [None]:
class WrapAtari:
    """Atari environment wrapper: preprocessing, frame stacking, reward clipping.

    Observations are the last four preprocessed frames wrapped in LazyFrames.
    Rewards are reduced to their sign; the original reward is reported in
    info['full-reward'].
    """

    def __init__(self, env):
        raw_space = gym.spaces.Box(low=0, high=255, shape=[210,160,3], dtype=np.uint8)
        assert env.observation_space == raw_space

        self._env = env
        self._frames = collections.deque(maxlen=4)

        self.action_space = env.action_space
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=[84, 84, 4], dtype=np.uint8)

    def reset(self):
        """Reset the wrapped env and return a stack made of the first frame repeated."""
        first_frame = preprocess(self._env.reset())   # 210x160 RGB -> 84x84 grayscale
        # Filling the bounded deque with maxlen copies replaces every old frame
        self._frames.extend([first_frame] * self._frames.maxlen)
        return LazyFrames(list(self._frames))

    def step(self, action):
        """Step the wrapped env; returns (stacked frames, sign of reward, done, info)."""
        assert self.action_space.contains(action)
        raw_obs, rew, done, info = self._env.step(action)
        self._frames.append(preprocess(raw_obs))      # 84x84 grayscale, oldest frame drops out
        assert 'full-reward' not in info
        info['full-reward'] = rew                     # keep the unclipped reward for reporting
        clipped_reward = np.sign(rew)
        stacked = LazyFrames(list(self._frames))
        return stacked, clipped_reward, done, info

    def seed(self, seed):
        self._env.seed(seed)

    def render(self, mode='human'):
        return self._env.render(mode=mode)

    def close(self):
        self._env.close()

# Experiment Setup

In [None]:
def setup_experiment(env_name, mem_size, mem_fill, tf_logdir=None, seed=None):
    """Create environment, model, replay memory and trace for one experiment.

    Closes the environment currently bound to the global `env` (if any),
    builds a fresh TensorFlow graph and session, and pre-fills the replay
    memory with random-policy transitions.

    Params:
        env_name  - gym environment id
        mem_size  - capacity of the replay memory
        mem_fill  - number of random-policy steps used to pre-fill the memory
        tf_logdir - directory for TensorFlow summaries; must not exist yet.
                    None disables summaries.
        seed      - if given, seeds the environment, TensorFlow, the memory
                    and the pre-fill policy

    Returns:
        (env, trace, model, mem)
    """
    global env
    if tf_logdir is not None:
        assert not os.path.exists(tf_logdir)
    # Best-effort cleanup of a previous run: `env` may not exist yet (NameError)
    # or may already be closed. Unlike a bare `except:`, this still lets
    # KeyboardInterrupt / SystemExit through.
    try:
        env.close()
    except Exception:
        pass

    env = gym.make(env_name)
    if env_name == 'MovingDot-v0': env.max_steps = 100
    env = WrapAtari(env)
    if seed is not None:
        env.seed(seed)


    tf.reset_default_graph()
    if seed is not None:
        tf.set_random_seed(seed)
    session = tf.Session()
    summary_writer = None
    if tf_logdir is not None:
        summary_writer = tf.summary.FileWriter(tf_logdir)

    neural_net = TFNeuralNet(tf_session=session, tf_summary_writer=summary_writer,
                             nb_out=env.action_space.n, lr=0.0002, extended_debug=False)

    model = TFFunctApprox(neural_net, st_low=0, st_high=255, rew_mean=0, rew_std=1, nb_actions=env.action_space.n)

    if seed is None:
        mem = Memory(max_len=mem_size, state_shape=(), state_dtype=object)
        prefill_memory(env, mem, steps=mem_fill, render=False)
    else:
        # Memory and pre-fill policy each get their own generator, both seeded with `seed`
        mem = Memory(max_len=mem_size, state_shape=(), state_dtype=object, rng=np.random.RandomState(seed))
        prefill_memory(env, mem, steps=mem_fill, render=False, rng=np.random.RandomState(seed))
    # Fixed batch of states on which the callback evaluates the average-Q metric
    test_states, _, _, _, _, _ = mem.get_batch(32)

    trace = Trace(tf_summary_writer=summary_writer,
                  eval_every=1000,
                  test_states=test_states)

    if summary_writer is not None:
        summary_writer.add_graph(session.graph)
        summary_writer.flush()
    session.run(tf.global_variables_initializer())

    # neural_net.setup_logdb('outarray.h5', 20)

    return env, trace, model, mem

In [None]:
def run_experiment(env, trace, model, mem, epoch_size, nb_total_steps, eps_decay_steps,
                   test_frames, test_episodes, stop_filename, render, rng=None):
    """Alternate training epochs and evaluation runs until done or stopped.

    Each iteration trains for `epoch_size` steps with q_learning, evaluates
    the model with evaluate(), pushes reward summaries through `trace` and
    prints a one-line report.

    Params:
        env, trace, model, mem - objects as returned by setup_experiment
        epoch_size      - training steps between two evaluations
        nb_total_steps  - stop once trace.total_step reaches this value
        eps_decay_steps - forwarded to q_learning
        test_frames     - step limit for each evaluation (None for no limit)
        test_episodes   - episode limit for each evaluation (None for no limit)
        stop_filename   - the loop ends after an epoch if this file exists
        render          - forwarded to q_learning and evaluate
        rng             - random generator forwarded to both; defaults to np.random
    """
    if rng is None:
        rng = np.random
    while trace.total_step < nb_total_steps:
        q_learning(trace.total_step, env, frames=epoch_size, gamma=.95, eps_decay_steps=eps_decay_steps, eps_target=0.1,
               batch_size=32, model=model, mem=mem, callback=callback, trace=trace, render=render, rng=rng)
        # model._model.save('./tf_models/Pong-v0_'+ str(trace.total_step) + '.ckpt')
        ep_rewards = evaluate(env, test_frames, test_episodes, eps=0.05, model=model, render=render, rng=rng)

        reward_sum = np.sum(ep_rewards)
        if len(ep_rewards) > 0:
            reward_avg = np.mean(ep_rewards)
            reward_max = np.max(ep_rewards)
            trace.push_summary(tag='Metrics/Reward_Avg', simple_value=reward_avg)
            trace.push_summary(tag='Metrics/Reward_Max', simple_value=reward_max)
        else:
            # No episode finished within the evaluation budget: mean/max are
            # undefined (np.max would raise on an empty list), so skip them.
            reward_avg = reward_max = float('nan')
        trace.push_summary(tag='Metrics/Test_Reward', simple_value=reward_sum)   # backward compatibility only

        print('Epoch:', trace.total_step // epoch_size,
              '\tTotal Step:', trace.total_step,
              '\tNum Episodes:', len(ep_rewards),
              '\tTotal Reward:', reward_sum,
              '\tAvg Reward:', reward_avg,
              '\tMax Reward:', reward_max)

        if os.path.exists(stop_filename):
            break

# Moving Dot

In [None]:
import moving_dot

In [None]:
env, trace, model, mem = setup_experiment(env_name='MovingDot-v0',
                                          mem_size=10000, mem_fill=1000,
                                          tf_logdir='tf_log_2/movingdot/44_bilin')

In [None]:
run_experiment(env, trace, model, mem, epoch_size=1000, nb_total_steps=10000, eps_decay_steps=10000,
               test_frames=None, test_episodes=10, stop_filename='STOP_MOVINGDOT', render=False)

In [None]:
%lprun -f q_learning -f model._model.backward \
    q_learning(trace.total_step, env, frames=1000, gamma=.95, eps_decay_steps=mem.max_len, eps_target=0.1, \
               batch_size=32, model=model, mem=mem, callback=callback, trace=trace, render=False)

In [None]:
ep_full_rewards = evaluate(env, frames=1050, episodes=None, eps=0.05, model=model, render=True)

In [None]:
env.close()


# Beam Rider

In [None]:
env, trace, model, mem = setup_experiment(env_name='BeamRiderDeterministic-v4',
                                          mem_size=200000, mem_fill=10000,
                                          tf_logdir='tf_log_2/beam_rider/test')

In [None]:
run_experiment(env, trace, model, mem, epoch_size=25000, nb_total_steps=200000, eps_decay_steps=50000,
               test_frames=10000, test_episodes=None, stop_filename='STOP_BEAM', render=True)

# Pong

In [None]:
env, trace, model, mem = setup_experiment(env_name='PongDeterministic-v4',
                                          mem_size=200000, mem_fill=10000,
                                          tf_logdir='tf_log_2/pong/test')

In [None]:
run_experiment(env, trace, model, mem, epoch_size=25000, nb_total_steps=200000, eps_decay_steps=50000,
               test_frames=10000, test_episodes=None, stop_filename='STOP_PONG', render=True)

# Pong

NOT TESTED:

In [None]:
def experiment_pong(tf_logdir=None):
    """Build model, replay memory and trace for a Pong run.

    Uses the module-level `env` (it is not a parameter) to pre-fill the
    memory, so the environment must be created before calling this.
    The number of actions is hard-coded to 6.

    Params:
        tf_logdir - directory for TensorFlow summaries; must not exist yet.
                    None disables summaries.

    Returns:
        (trace, model, mem)
    """

    # Start from an empty graph so repeated calls do not accumulate ops
    tf.reset_default_graph()
    session = tf.Session()
    summary_writer = None
    if tf_logdir is not None:
        assert not os.path.exists(tf_logdir)
        summary_writer = tf.summary.FileWriter(tf_logdir)

    neural_net = TFNeuralNet(tf_session=session, tf_summary_writer=summary_writer,
                             nb_out=6, lr=0.00025)
    
    model = TFFunctApprox(neural_net, st_low=0, st_high=255, rew_mean=0, rew_std=1, nb_actions=6)
    
    mem = Memory(max_len=200000, state_shape=(), state_dtype=object)
    prefill_memory(env, mem, steps=10000, render=False)
    # Fixed batch of states handed to Trace for the periodic average-Q metric
    test_states, _, _, _, _, _ = mem.get_batch(32)
    
    trace = Trace(tf_summary_writer=summary_writer,
                  eval_every=1000,
                  test_states=test_states)
    
    if summary_writer is not None:
        summary_writer.add_graph(session.graph)
        summary_writer.flush()
    # Variables are initialised only after the network has been built above
    session.run(tf.global_variables_initializer())
    
    # neural_net.setup_logdb('outarray.h5', 20)
    
    return trace, model, mem

In [None]:
# Setup Experiment
try: env.close()
except: pass
env = gym.make('PongDeterministic-v4')
env = WrapAtari(env)
trace, model, mem = experiment_pong(tf_logdir='tf_log_2/pong/10')

In [None]:
# Run Experiment
while trace.total_step < 200000:
    q_learning(trace.total_step, env, frames=25000, gamma=.95, eps_decay_steps=50000, eps_target=0.1,
           batch_size=32, model=model, mem=mem, callback=callback, trace=trace)
    # model._model.save('./tf_models/Pong-v0_'+ str(trace.total_step) + '.ckpt')
    # evaluate() returns a list of per-episode rewards, but a scalar summary
    # needs a single number: log the total, as run_experiment does for this tag.
    ep_rewards = evaluate(env, 10000, None, eps=0.05, model=model, render=True)
    test_reward = float(np.sum(ep_rewards))
    trace.push_summary(tag='Metrics/Test_Reward', simple_value=test_reward)

    # Manual stop: create this file to end the run after the current epoch
    if os.path.exists('STOP_PONG_10'):
        break

    # Stop once more than half of the swap space is in use
    if psutil.swap_memory().percent > 50:
        break

TESTED:

In [None]:
try: env.close()
except: pass
env = gym.make('PongDeterministic-v4')
env = WrapAtari(env)

In [None]:
# NOTE(review): this call does not match the TFNeuralNet constructor defined
# later in this notebook, which requires tf_session, tf_summary_writer, nb_out
# and lr and has no `logdir` parameter — presumably written against an older
# version of the class; confirm before running.
cnn = TFNeuralNet(nb_out=6, logdir='tf_log_2/pong/5')
# cnn.setup_logdb('outarray.h5', 5)
model = TFFunctApprox(cnn, st_low=0, st_high=255, rew_mean=0, rew_std=1, nb_actions=6)

In [None]:
mem = Memory(max_len=200000, state_shape=(), state_dtype=object)

In [None]:
# %lprun -f preprocess prefill_memory(env, mem, steps=10000)
prefill_memory(env, mem, steps=10000, render=False)

In [None]:
trace = Trace()
rewards = []

In [None]:
# tr = evaluate(env, 10000, None, eps=0.05, model=model, render=True)
tr = evaluate(env, None, episodes=3, eps=0.0, model=model, render=True)
print('tr', tr)
rewards.append(tr)

In [None]:
while trace.total_step < 300000:
    q_learning(trace.total_step, env, frames=50000, gamma=.95, eps_decay_steps=50000, eps_target=0.1,
           batch_size=32, model=model, mem=mem, callback=callback, trace=trace)
    # tr = evaluate(env, 10000, None, eps=0.05, model=model, render=True)
    tr = evaluate(env, None, episodes=3, eps=0.0, model=model, render=True)
    # cnn.save('./tf_models/PongDeterministic-v4_'+ str(trace.total_step) + '.ckpt')
    print('iter', trace.total_step, 'tr', tr)
    rewards.append(tr)
    plt.plot(rewards)
    plt.show()

In [None]:
tr = evaluate(env, 10000, None, eps=0.0, model=model, render=True)

# Breakout - test test

In [None]:
try: env.close()
except: pass
env = gym.make('BreakoutDeterministic-v4')
env = WrapAtari(env)

In [None]:
# NOTE(review): this call does not match the TFNeuralNet constructor defined
# later in this notebook, which requires tf_session, tf_summary_writer, nb_out
# and lr and has no `logdir` parameter — presumably written against an older
# version of the class; confirm before running.
cnn = TFNeuralNet(nb_out=env.action_space.n, logdir='tf_log_2/breakout/1')
# cnn.setup_logdb('outarray.h5', 5)
model = TFFunctApprox(cnn, st_low=0, st_high=255, rew_mean=0, rew_std=1, nb_actions=env.action_space.n)

In [None]:
mem = Memory(max_len=500000, state_shape=(), state_dtype=object)

In [None]:
prefill_memory(env, mem, steps=50000)

In [None]:
for i in range(100):
    idx = np.random.randint(len(mem))
    plot_frames(mem._hist_St[idx])

In [None]:
trace = Trace()
rewards = []

In [None]:
tr = evaluate(env, 10000, None, eps=0.05, model=model, render=True)
cnn.save('./tf_models/BreakoutDeterministic-v4_'+ str(trace.total_step) + '.ckpt')
print('iter', trace.total_step, 'tr', tr)
rewards.append(tr)

In [None]:
while trace.total_step < 2000000:
    q_learning(trace.total_step, env, frames=50000, gamma=.95, eps_decay_steps=500000, eps_target=0.1,
           batch_size=32, model=model, mem=mem, callback=callback, trace=trace)
    tr = evaluate(env, 10000, None, eps=0.05, model=model, render=True)
    cnn.save('./tf_models/BreakoutDeterministic-v4_'+ str(trace.total_step) + '.ckpt')
    print('iter', trace.total_step, 'tr', tr)
    rewards.append(tr)
    plt.plot(rewards)
    plt.show()

In [None]:
env.close()

# Space Invaders

In [None]:
env, trace, model, mem = setup_experiment(env_name='SpaceInvadersDeterministic-v4',
                                          mem_size=1000000, mem_fill=50000,
                                          tf_logdir='tf_log_2/space_invaders/test')

In [None]:
# NOTE(review): the stop file is 'STOP_PONG', the same name the Pong run uses —
# presumably a copy-paste leftover; confirm whether a Space Invaders specific
# name was intended.
run_experiment(env, trace, model, mem, epoch_size=50000, nb_total_steps=10000000, eps_decay_steps=1000000,
               test_frames=10000, test_episodes=None, stop_filename='STOP_PONG', render=True)

NOT TESTED:

In [None]:
env, trace, model, mem = setup_experiment(env_name='SpaceInvadersDeterministic-v4',
                                          mem_fill=50000, mem_size=500000,
                                          tf_logdir='tf_log_2/space_invaders/10')

In [None]:
# NOTE(review): these arguments do not match run_experiment as defined in this
# notebook: test_every and test_steps are not parameters, while epoch_size,
# eps_decay_steps, test_frames, test_episodes and render are required and
# missing. As written this call raises TypeError.
run_experiment(env, trace, model, mem,
               nb_total_steps=2000000, test_every=25000, test_steps=10000, stop_filename='STOP_INVADERS_10')

In [None]:
for i in range(10):
    idx = np.random.randint(len(mem))
    plot_frames(mem._hist_St[idx])

In [None]:
def experiment_space_invaders(tf_logdir=None):
    """Create environment, model, replay memory and trace for Space Invaders.

    Closes the environment currently bound to the global `env` (if any)
    before creating a new one, and rebinds the global to the new environment.

    Params:
        tf_logdir - directory for TensorFlow summaries; must not exist yet.
                    None disables summaries.

    Returns:
        (env, trace, model, mem)
    """
    # Without this declaration `env` is local to the function, so the cleanup
    # below always hit an UnboundLocalError (silently swallowed) and the
    # previous environment was never closed.
    global env

    # Best-effort cleanup: `env` may not exist yet or may already be closed
    try:
        env.close()
    except Exception:
        pass
    env = gym.make('SpaceInvadersDeterministic-v4')
    env = WrapAtari(env)

    tf.reset_default_graph()
    session = tf.Session()
    summary_writer = None
    if tf_logdir is not None:
        assert not os.path.exists(tf_logdir)
        summary_writer = tf.summary.FileWriter(tf_logdir)

    neural_net = TFNeuralNet(tf_session=session, tf_summary_writer=summary_writer,
                             nb_out=env.action_space.n, lr=0.00025)

    model = TFFunctApprox(neural_net, st_low=0, st_high=255, rew_mean=0, rew_std=1, nb_actions=env.action_space.n)

    mem = Memory(max_len=500000, state_shape=(), state_dtype=object)
    prefill_memory(env, mem, steps=50000, render=False)
    # Fixed batch of states handed to Trace for the periodic average-Q metric
    test_states, _, _, _, _, _ = mem.get_batch(32)

    trace = Trace(tf_summary_writer=summary_writer,
                  eval_every=1000,
                  test_states=test_states)

    if summary_writer is not None:
        summary_writer.add_graph(session.graph)
        summary_writer.flush()
    session.run(tf.global_variables_initializer())

    # neural_net.setup_logdb('outarray.h5', 20)

    return env, trace, model, mem

In [None]:
# Setup Experiment
env, trace, model, mem = experiment_space_invaders(tf_logdir='tf_log_2/space_invaders/1')

In [None]:
env.close()

In [None]:
# Run Experiment
while trace.total_step < 2000000:
    q_learning(trace.total_step, env, frames=25000, gamma=.95, eps_decay_steps=500000, eps_target=0.1,
           batch_size=32, model=model, mem=mem, callback=callback, trace=trace)
    # model._model.save('./tf_models/Pong-v0_'+ str(trace.total_step) + '.ckpt')
    # evaluate() returns a list of per-episode rewards, but a scalar summary
    # needs a single number: log the total, as run_experiment does for this tag.
    ep_rewards = evaluate(env, 10000, None, eps=0.05, model=model, render=True)
    test_reward = float(np.sum(ep_rewards))
    trace.push_summary(tag='Metrics/Test_Reward', simple_value=test_reward)

    # Manual stop: create this file to end the run after the current epoch
    if os.path.exists('STOP_INVADERS_1'):
        break

    # Stop once more than half of the swap space is in use
    if psutil.swap_memory().percent > 50:
        break

# Function Approximators and Memory - Faster

In [None]:
class TFNeuralNet():
    """Convolutional Q-network built with the TensorFlow 1.x graph API.

    Layers: conv (16 filters, 8x8, stride 4) -> conv (32 filters, 4x4, stride 2)
    -> dense (256 units) -> linear output with `nb_out` values. Inputs are fed
    as uint8 and scaled to [0, 1] inside the graph. The loss is computed only
    on the output that corresponds to the action passed to `backward`.

    Variables are created but NOT initialised here; the caller must run an
    initializer (e.g. tf.global_variables_initializer()) afterwards.
    """

    def __init__(self, tf_session, tf_summary_writer, nb_out, lr, extended_debug=False):
        """Build the network in the current default graph.

        Params:
            tf_session        - tf.Session used for training and inference
            tf_summary_writer - tf.summary.FileWriter, or None to disable logging
            nb_out            - number of network outputs (one per action)
            lr                - learning rate passed to the optimizer
            extended_debug    - False: RMSProp, only the loss summary is written
                                True:  Adam, all merged summaries are written and
                                       setup_logdb() may be used
        """
        self._sess = tf_session
        self._summary_writer = tf_summary_writer
        self._extended_debug = extended_debug

        self.nb_out = nb_out

        self._log_filename = None     # set by setup_logdb()
        self._dict_layers = {}        # tensors dumped on every backward() in debug mode
        self._timestep = 0            # number of backward() calls, used as summary step

        graph = tf.get_default_graph()
        with tf.variable_scope('NeuralNet'):

            with tf.variable_scope('ZZ_Inputs'):
                self._x = tf.placeholder(name='xx', shape=[None, 84, 84, 4], dtype=tf.uint8)
                self._y = tf.placeholder(name='yy', shape=[None], dtype=tf.float32)   # target per sample
                self._a = tf.placeholder(name='aa', shape=[None], dtype=tf.int32)     # action per sample
                self._x_scaled = tf.cast(self._x, tf.float32) / 255.0
                tf.summary.histogram('DataIn', self._x)
                tf.summary.histogram('Targets', self._y)

            with tf.variable_scope('Conv_1'):
                model = tf.layers.conv2d(self._x_scaled,
                                         filters=16,
                                         kernel_size=[8, 8],
                                         strides=[4, 4],
                                         padding='valid',
                                         activation=tf.nn.relu,
                                         kernel_initializer=tf.random_normal_initializer(stddev=.01),
                                         bias_initializer=tf.constant_initializer(value=.1))

                tf.summary.histogram('Weights', graph.get_tensor_by_name('NeuralNet/Conv_1/conv2d/kernel:0'))
                tf.summary.histogram('Biases', graph.get_tensor_by_name('NeuralNet/Conv_1/conv2d/bias:0'))
                tf.summary.histogram('PreActivations', graph.get_tensor_by_name('NeuralNet/Conv_1/conv2d/BiasAdd:0'))

                # Norms Ratio: ||W - W_backup|| / ||W_backup||. The backup is
                # refreshed before each update only in extended_debug mode
                # (see control_dependencies below).
                WC1, _ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='NeuralNet/Conv_1/conv2d')
                WC1_bu = tf.get_variable('WC1_bu', trainable=False, initializer=WC1.initialized_value())
                tf.summary.scalar( 'Update_Norm_Ratio', tf.norm(WC1 - WC1_bu) / tf.norm(WC1_bu) )

            with tf.variable_scope('Conv_2'):
                model = tf.layers.conv2d(model,
                                         filters=32,
                                         kernel_size=[4, 4],
                                         strides=[2, 2],
                                         padding='valid',
                                         activation=tf.nn.relu,
                                         kernel_initializer=tf.random_normal_initializer(stddev=.01),
                                         bias_initializer=tf.constant_initializer(value=.1))

                tf.summary.histogram('Weights', graph.get_tensor_by_name('NeuralNet/Conv_2/conv2d/kernel:0'))
                tf.summary.histogram('Biases', graph.get_tensor_by_name('NeuralNet/Conv_2/conv2d/bias:0'))
                tf.summary.histogram('PreActivations', graph.get_tensor_by_name('NeuralNet/Conv_2/conv2d/BiasAdd:0'))

                # Norms Ratio
                WC2, _ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='NeuralNet/Conv_2/conv2d')
                WC2_bu = tf.get_variable('WC2_bu', trainable=False, initializer=WC2.initialized_value())
                tf.summary.scalar( 'Update_Norm_Ratio', tf.norm(WC2 - WC2_bu) / tf.norm(WC2_bu) )

            model = tf.layers.flatten(model)

            with tf.variable_scope('Dense'):
                model = tf.layers.dense(model,
                                        units=256,
                                        activation=tf.nn.relu,
                                        kernel_initializer=tf.random_normal_initializer(stddev=.01),
                                        bias_initializer=tf.constant_initializer(value=.1))

                tf.summary.histogram('Weights', graph.get_tensor_by_name('NeuralNet/Dense/dense/kernel:0'))
                tf.summary.histogram('Biases', graph.get_tensor_by_name('NeuralNet/Dense/dense/bias:0'))
                tf.summary.histogram('PreActivations', graph.get_tensor_by_name('NeuralNet/Dense/dense/BiasAdd:0'))

                # Norms Ratio
                WD, _ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='NeuralNet/Dense/dense')
                WD_bu = tf.get_variable('WD_bu', trainable=False, initializer=WD.initialized_value())
                tf.summary.scalar( 'Update_Norm_Ratio', tf.norm(WD - WD_bu) / tf.norm(WD_bu) )

            with tf.variable_scope('Output'):
                self._y_hat = tf.layers.dense(model,
                                              units=nb_out,
                                              activation=None,
                                              kernel_initializer=tf.random_normal_initializer(stddev=.01),
                                              bias_initializer=tf.constant_initializer(value=.1))

                tf.summary.histogram('Weights', graph.get_tensor_by_name('NeuralNet/Output/dense/kernel:0'))
                tf.summary.histogram('Biases', graph.get_tensor_by_name('NeuralNet/Output/dense/bias:0'))
                tf.summary.histogram('PreActivations', graph.get_tensor_by_name('NeuralNet/Output/dense/BiasAdd:0'))

                # Norms Ratio
                WO, _ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='NeuralNet/Output/dense')
                WO_bu = tf.get_variable('WO_bu', trainable=False, initializer=WO.initialized_value())
                tf.summary.scalar( 'Update_Norm_Ratio', tf.norm(WO - WO_bu) / tf.norm(WO_bu) )

            # Keep only the output of the action taken: partition 1 of the
            # one-hot mask holds exactly one entry per sample, in sample order.
            self._one_hot = tf.one_hot(self._a, nb_out, dtype=tf.int32, name='onehot')
            self._y_hat_actions = tf.dynamic_partition(self._y_hat, self._one_hot, 2)[1]
            self._loss = .5 * tf.losses.mean_squared_error(self._y, self._y_hat_actions)

            tf.summary.scalar('ZZ_Loss', self._loss)
            with tf.name_scope('Metrics/'):
                self._loss_summary = tf.summary.scalar('Loss', self._loss)

            if not extended_debug:
                #self._optimizer = tf.train.AdamOptimizer(learning_rate=lr)
                # `lr` is used as given. A hard-coded `assert lr == .0002` used to
                # sit here and rejected the lr=0.00025 passed by the experiment.
                # positional args: learning_rate, decay=0.99, momentum=0.0, epsilon=1e-6
                self._optimizer = tf.train.RMSPropOptimizer(lr, 0.99, 0.0, 1e-6)
                self._train_op = self._optimizer.minimize(self._loss)

            else:
                # back up the weights before the update so that Update_Norm_Ratio
                # measures the size of the most recent step
                with tf.control_dependencies([tf.assign(WC1_bu, WC1), tf.assign(WC2_bu, WC2),
                                              tf.assign(WD_bu, WD), tf.assign(WO_bu, WO)]):

                    # Option 1: no gradient clipping
                    self._optimizer = tf.train.AdamOptimizer(learning_rate=lr)
                    self._grads_and_vars = self._optimizer.compute_gradients(self._loss)
                    self._train_op = self._optimizer.apply_gradients(self._grads_and_vars)

                    # Option 2: Global gradient clipping
                    #self._optimizer = tf.train.RMSPropOptimizer(learning_rate=0.00025, decay=0.0, momentum=0.95, epsilon=0.01)
                    #gradients, variables = zip(*self._optimizer.compute_gradients(self._loss))
                    #gradients, _ = tf.clip_by_global_norm(gradients, 1)
                    #self._train_op = self._optimizer.apply_gradients(zip(gradients, variables))

                    # Option 3: Per matrix
                    #self._optimizer = tf.train.RMSPropOptimizer(learning_rate=0.00025, decay=0.0, momentum=0.95, epsilon=0.01)
                    #gradients, variables = zip(*self._optimizer.compute_gradients(self._loss))
                    #gradients = [ None if gradient is None else tf.clip_by_norm(gradient, 1.0) for gradient in gradients ]
                    #self._train_op = self._optimizer.apply_gradients(zip(gradients, variables))

                # gradient tensors are looked up by the names compute_gradients gave them
                with tf.variable_scope('NeuralNet/Conv_1/'):
                    tf.summary.scalar('GradNorm', tf.norm(graph.get_tensor_by_name(
                        'NeuralNet/gradients/NeuralNet/Conv_1/conv2d/Conv2D_grad/tuple/control_dependency_1:0')))
                with tf.variable_scope('NeuralNet/Conv_2/'):
                    tf.summary.scalar('GradNorm', tf.norm(graph.get_tensor_by_name(
                        'NeuralNet/gradients/NeuralNet/Conv_2/conv2d/Conv2D_grad/tuple/control_dependency_1:0')))
                with tf.variable_scope('NeuralNet/Dense/'):
                    tf.summary.scalar('GradNorm', tf.norm(graph.get_tensor_by_name(
                        'NeuralNet/gradients/NeuralNet/Dense/dense/MatMul_grad/tuple/control_dependency_1:0')))
                with tf.variable_scope('NeuralNet/Output/'):
                    tf.summary.scalar('GradNorm', tf.norm(graph.get_tensor_by_name(
                        'NeuralNet/gradients/NeuralNet/Output/dense/MatMul_grad/tuple/control_dependency_1:0')))

        self._merged_summaries = tf.summary.merge_all()

    def backward(self, x, y, a):
        """Run one training step and write summaries (if a writer was given).

        Params:
            x - np.ndarray [batch, 84, 84, 4], fed to the uint8 input placeholder
            y - np.ndarray [batch], target value for the action taken
            a - np.ndarray [batch], index of the action taken
        """
        assert x.ndim == 4
        assert y.ndim == 1
        assert a.ndim == 1
        assert x.shape[1:] == (84, 84, 4)          # any batch size is accepted
        assert len(x) == len(y) == len(a)

        feed = {self._x: x, self._y: y, self._a: a}

        if not self._extended_debug:
            loss_summary, _ = self._sess.run([self._loss_summary, self._train_op],
                                             feed_dict=feed)
            if self._summary_writer is not None:
                self._summary_writer.add_summary(loss_summary, self._timestep)

        else:
            dict_layers, merged_summaries, loss_summary, _ = \
                self._sess.run([self._dict_layers, self._merged_summaries, self._loss_summary, self._train_op],
                               feed_dict=feed)

            if self._summary_writer is not None:
                self._summary_writer.add_summary(merged_summaries, self._timestep)
                self._summary_writer.add_summary(loss_summary, self._timestep)

            if self._log_filename is not None:
                tables_logger.append_log(self._log_filename, dict_layers)

        self._timestep += 1

    def forward(self, x):
        """Return network outputs, shape [batch, nb_out], for input batch x."""
        return self._sess.run(self._y_hat, feed_dict={self._x: x})

    def save(self, filepath):
        """Save session variables to `filepath` using tf.train.Saver."""
        saver = tf.train.Saver()
        saver.save(self._sess, filepath)

    def load(self, filepath):
        """Restore session variables from `filepath` using tf.train.Saver."""
        saver = tf.train.Saver()
        saver.restore(self._sess, filepath)

    def setup_logdb(self, filename, batch_save):
        """Enable dumping of weights, gradients and pre-activations on every backward().

        Params:
            filename   - log file handed to tables_logger
            batch_save - number of leading batch samples to store for per-sample tensors

        Raises:
            ValueError - if the network was built with extended_debug=False
        """
        if not self._extended_debug:
            raise ValueError('Please enable extended_debug=True in constructor.')

        self._log_filename = filename

        graph = tf.get_default_graph()

        dict_inout = {
            #'batch_x' : self._x[0:batch_save,:,:,:],
            # targets are 1-D here, so only the batch axis is sliced
            'batch_y' : self._y[0:batch_save],
        }

        dict_conv_1 = {
            'W': graph.get_tensor_by_name('NeuralNet/Conv_1/conv2d/kernel:0'),
            'b': graph.get_tensor_by_name('NeuralNet/Conv_1/conv2d/bias:0'),
            'dW': graph.get_tensor_by_name('NeuralNet/gradients/NeuralNet/Conv_1/conv2d/Conv2D_grad/tuple/control_dependency_1:0'),
            'db': graph.get_tensor_by_name('NeuralNet/gradients/NeuralNet/Conv_1/conv2d/BiasAdd_grad/tuple/control_dependency_1:0'),
            'z': graph.get_tensor_by_name('NeuralNet/Conv_1/conv2d/BiasAdd:0')[0:batch_save,:,:,:],
        }

        dict_conv_2 = {
            'W': graph.get_tensor_by_name('NeuralNet/Conv_2/conv2d/kernel:0'),
            'b': graph.get_tensor_by_name('NeuralNet/Conv_2/conv2d/bias:0'),
            'dW': graph.get_tensor_by_name('NeuralNet/gradients/NeuralNet/Conv_2/conv2d/Conv2D_grad/tuple/control_dependency_1:0'),
            'db': graph.get_tensor_by_name('NeuralNet/gradients/NeuralNet/Conv_2/conv2d/BiasAdd_grad/tuple/control_dependency_1:0'),
            'z': graph.get_tensor_by_name('NeuralNet/Conv_2/conv2d/BiasAdd:0')[0:batch_save,:,:,:],
        }

        dict_dense = {
            # only a [:100, :50] corner of the dense kernel and its gradient is logged
            'W': graph.get_tensor_by_name('NeuralNet/Dense/dense/kernel:0')[:100,:50],
            'b': graph.get_tensor_by_name('NeuralNet/Dense/dense/bias:0'),
            'dW': graph.get_tensor_by_name('NeuralNet/gradients/NeuralNet/Dense/dense/MatMul_grad/tuple/control_dependency_1:0')[:100,:50],
            'db': graph.get_tensor_by_name('NeuralNet/gradients/NeuralNet/Dense/dense/BiasAdd_grad/tuple/control_dependency_1:0'),
            'z': graph.get_tensor_by_name('NeuralNet/Dense/dense/BiasAdd:0')[0:batch_save,:],
        }

        dict_output = {
            'W': graph.get_tensor_by_name('NeuralNet/Output/dense/kernel:0'),
            'b': graph.get_tensor_by_name('NeuralNet/Output/dense/bias:0'),
            'dW': graph.get_tensor_by_name('NeuralNet/gradients/NeuralNet/Output/dense/MatMul_grad/tuple/control_dependency_1:0'),
            'db': graph.get_tensor_by_name('NeuralNet/gradients/NeuralNet/Output/dense/BiasAdd_grad/tuple/control_dependency_1:0'),
            'z': graph.get_tensor_by_name('NeuralNet/Output/dense/BiasAdd:0')[0:batch_save,:],
        }

        dict_metrics = {
            'loss': self._loss,
        }

        self._dict_layers = {
            'inout': dict_inout,
            'conv_1': dict_conv_1,
            'conv_2': dict_conv_2,
            'dense': dict_dense,
            'output': dict_output,
            'metrics': dict_metrics,
        }

        tables_logger.create_log(filename, self._dict_layers, batch_save)

In [None]:
class TFFunctApprox():
    """Q-function approximator wrapping a neural-net object.

    The wrapped model must provide `nb_out`, `forward(states)` and
    `backward(states, targets, actions)`. States are passed through unchanged;
    targets and predictions are scaled with rew_mean / rew_std.
    """

    def __init__(self, model, st_low, st_high, rew_mean, rew_std, nb_actions):
        """
        Args:
            model: network object (see class docstring for required interface)
            st_low, st_high: scalar bounds of state values
            rew_mean, rew_std: targets are normalised as (t - rew_mean) / rew_std
            nb_actions: number of actions, must equal model.nb_out

        Raises:
            ValueError: if nb_actions differs from model.nb_out
        """
        self._model = model

        assert np.isscalar(st_low) and np.isscalar(st_high)

        if nb_actions != model.nb_out:
            raise ValueError('Output shape does not match action_space shape')

        # Input normalisation constants. Kept as attributes, but not applied in
        # eval()/train(): states are handed to the model as-is.
        self._offsets = st_low + (st_high - st_low) / 2
        self._scales = 1 / ((st_high - st_low) / 2)

        self._rew_mean = rew_mean
        self._rew_std = rew_std

    def eval(self, states):
        """Return de-normalised outputs for a batch of states.

        Args:
            states: np.ndarray of shape [batch, 84, 84, 4], any batch size

        Returns:
            model.forward(states) * rew_std + rew_mean
        """
        assert isinstance(states, np.ndarray)
        assert states.ndim == 4
        assert states.shape[1:] == (84, 84, 4)

        y_hat = self._model.forward(states)

        return y_hat*self._rew_std + self._rew_mean

    def train(self, states, actions, targets):
        """Run one training step towards `targets` for the given actions.

        Args:
            states: np.ndarray, 4-D batch of states
            actions: np.ndarray, 1-D, action index per sample
            targets: np.ndarray, 1-D, un-normalised target per sample

        Returns:
            whatever model.backward returns
        """
        assert isinstance(states, np.ndarray)
        assert isinstance(actions, np.ndarray)
        assert isinstance(targets, np.ndarray)
        assert states.ndim == 4
        assert actions.ndim == 1
        assert targets.ndim == 1
        assert len(states) == len(actions) == len(targets)

        targets = (targets-self._rew_mean) / self._rew_std    # decreases range (std>1) to approx -1..1

        return self._model.backward(states, targets, actions)

# Function Approximators and Memory

In [None]:
# A bare `raise` with no active exception raises RuntimeError, so executing this
# cell stops here and the class below is not (re)defined.
# NOTE(review): presumably guards an older variant kept for reference - confirm.
raise # sentinel
class TFNeuralNet():
    """Convolutional Q-network built with the TensorFlow 1.x graph API.

    Layers: conv (16 filters, 8x8, stride 4) -> conv (32 filters, 4x4, stride 2)
    -> dense (256 units) -> linear output with `nb_out` values. Inputs are
    float32 and targets are given for ALL outputs ([batch, nb_out]); the loss
    is the mean squared error over the full output matrix.

    Variables are created but NOT initialised here; the caller must run an
    initializer (e.g. tf.global_variables_initializer()) afterwards.
    """

    def __init__(self, tf_session, tf_summary_writer, nb_out, lr, extended_debug=False):
        """Build the network in the current default graph.

        Params:
            tf_session        - tf.Session used for training and inference
            tf_summary_writer - tf.summary.FileWriter, or None to disable logging
            nb_out            - number of network outputs (one per action)
            lr                - learning rate passed to the Adam optimizer
            extended_debug    - if True, weight backups are refreshed before each
                                update, all merged summaries are written and
                                setup_logdb() may be used
        """
        self._sess = tf_session
        self._summary_writer = tf_summary_writer
        self._extended_debug = extended_debug

        self.nb_out = nb_out

        self._log_filename = None     # set by setup_logdb()
        self._dict_layers = {}        # tensors dumped on every backward() in debug mode
        self._timestep = 0            # number of backward() calls, used as summary step

        graph = tf.get_default_graph()
        with tf.variable_scope('NeuralNet'):

            with tf.variable_scope('ZZ_Inputs'):
                self._x = tf.placeholder(name='xx', shape=[None, 84, 84, 4], dtype=tf.float32)
                self._y = tf.placeholder(name='yy', shape=[None, nb_out], dtype=tf.float32)
                tf.summary.histogram('DataIn', self._x)
                tf.summary.histogram('Targets', self._y)

            with tf.variable_scope('Conv_1'):
                model = tf.layers.conv2d(self._x, filters=16, kernel_size=[8, 8], strides=[4, 4],
                                         padding='valid', activation=tf.nn.relu)
                tf.summary.histogram('Weights', graph.get_tensor_by_name('NeuralNet/Conv_1/conv2d/kernel:0'))
                tf.summary.histogram('Biases', graph.get_tensor_by_name('NeuralNet/Conv_1/conv2d/bias:0'))
                tf.summary.histogram('PreActivations', graph.get_tensor_by_name('NeuralNet/Conv_1/conv2d/BiasAdd:0'))

                # Norms Ratio: ||W - W_backup|| / ||W_backup||. The backup is
                # refreshed before each update only in extended_debug mode.
                WC1, _ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='NeuralNet/Conv_1/conv2d')
                WC1_bu = tf.get_variable('WC1_bu', trainable=False, initializer=WC1.initialized_value())
                tf.summary.scalar( 'Update_Norm_Ratio', tf.norm(WC1 - WC1_bu) / tf.norm(WC1_bu) )

            with tf.variable_scope('Conv_2'):
                model = tf.layers.conv2d(model, filters=32, kernel_size=[4, 4], strides=[2, 2],
                                         padding='valid', activation=tf.nn.relu)
                tf.summary.histogram('Weights', graph.get_tensor_by_name('NeuralNet/Conv_2/conv2d/kernel:0'))
                tf.summary.histogram('Biases', graph.get_tensor_by_name('NeuralNet/Conv_2/conv2d/bias:0'))
                tf.summary.histogram('PreActivations', graph.get_tensor_by_name('NeuralNet/Conv_2/conv2d/BiasAdd:0'))

                # Norms Ratio
                WC2, _ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='NeuralNet/Conv_2/conv2d')
                WC2_bu = tf.get_variable('WC2_bu', trainable=False, initializer=WC2.initialized_value())
                tf.summary.scalar( 'Update_Norm_Ratio', tf.norm(WC2 - WC2_bu) / tf.norm(WC2_bu) )

            model = tf.layers.flatten(model)

            with tf.variable_scope('Dense'):
                model = tf.layers.dense(model, 256, activation=tf.nn.relu)
                tf.summary.histogram('Weights', graph.get_tensor_by_name('NeuralNet/Dense/dense/kernel:0'))
                tf.summary.histogram('Biases', graph.get_tensor_by_name('NeuralNet/Dense/dense/bias:0'))
                tf.summary.histogram('PreActivations', graph.get_tensor_by_name('NeuralNet/Dense/dense/BiasAdd:0'))

                # Norms Ratio
                WD, _ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='NeuralNet/Dense/dense')
                WD_bu = tf.get_variable('WD_bu', trainable=False, initializer=WD.initialized_value())
                tf.summary.scalar( 'Update_Norm_Ratio', tf.norm(WD - WD_bu) / tf.norm(WD_bu) )

            with tf.variable_scope('Output'):
                self._y_hat = tf.layers.dense(model, nb_out, activation=None)
                tf.summary.histogram('Weights', graph.get_tensor_by_name('NeuralNet/Output/dense/kernel:0'))
                tf.summary.histogram('Biases', graph.get_tensor_by_name('NeuralNet/Output/dense/bias:0'))
                tf.summary.histogram('PreActivations', graph.get_tensor_by_name('NeuralNet/Output/dense/BiasAdd:0'))

                # Norms Ratio
                WO, _ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='NeuralNet/Output/dense')
                WO_bu = tf.get_variable('WO_bu', trainable=False, initializer=WO.initialized_value())
                tf.summary.scalar( 'Update_Norm_Ratio', tf.norm(WO - WO_bu) / tf.norm(WO_bu) )

            self._loss = tf.losses.mean_squared_error(self._y, self._y_hat)

            tf.summary.scalar('ZZ_Loss', self._loss)
            with tf.name_scope('Metrics/'):
                self._loss_summary = tf.summary.scalar('Loss', self._loss)

            # No gradient clipping
            self._optimizer = tf.train.AdamOptimizer(learning_rate=lr)
            # self._optimizer = tf.train.RMSPropOptimizer(learning_rate=0.00025, decay=0.0, momentum=0.95, epsilon=0.01)
            self._grads_and_vars = self._optimizer.compute_gradients(self._loss)

            if extended_debug:
                # back up the weights before applying the update so that
                # Update_Norm_Ratio measures the size of the most recent step
                with tf.control_dependencies([tf.assign(WC1_bu, WC1), tf.assign(WC2_bu, WC2),
                                              tf.assign(WD_bu, WD), tf.assign(WO_bu, WO)]):
                    self._train_op = self._optimizer.apply_gradients(self._grads_and_vars)
            else:
                self._train_op = self._optimizer.apply_gradients(self._grads_and_vars)

            # Global gradient clipping
            # self._optimizer = tf.train.RMSPropOptimizer(learning_rate=0.00025, decay=0.0, momentum=0.95, epsilon=0.01)
            # gradients, variables = zip(*self._optimizer.compute_gradients(self._loss))
            # gradients, _ = tf.clip_by_global_norm(gradients, 1)
            # self._train_op = self._optimizer.apply_gradients(zip(gradients, variables))

            # Per matrix
            # self._optimizer = tf.train.RMSPropOptimizer(learning_rate=0.00025, decay=0.0, momentum=0.95, epsilon=0.01)
            # gradients, variables = zip(*self._optimizer.compute_gradients(self._loss))
            # gradients = [ None if gradient is None else tf.clip_by_norm(gradient, 1.0) for gradient in gradients ]
            # self._train_op = self._optimizer.apply_gradients(zip(gradients, variables))

            # gradient tensors are looked up by the names compute_gradients gave them
            with tf.variable_scope('NeuralNet/Conv_1/'):
                tf.summary.scalar('GradNorm', tf.norm(graph.get_tensor_by_name(
                    'NeuralNet/gradients/NeuralNet/Conv_1/conv2d/Conv2D_grad/tuple/control_dependency_1:0')))
            with tf.variable_scope('NeuralNet/Conv_2/'):
                tf.summary.scalar('GradNorm', tf.norm(graph.get_tensor_by_name(
                    'NeuralNet/gradients/NeuralNet/Conv_2/conv2d/Conv2D_grad/tuple/control_dependency_1:0')))
            with tf.variable_scope('NeuralNet/Dense/'):
                tf.summary.scalar('GradNorm', tf.norm(graph.get_tensor_by_name(
                    'NeuralNet/gradients/NeuralNet/Dense/dense/MatMul_grad/tuple/control_dependency_1:0')))
            with tf.variable_scope('NeuralNet/Output/'):
                tf.summary.scalar('GradNorm', tf.norm(graph.get_tensor_by_name(
                    'NeuralNet/gradients/NeuralNet/Output/dense/MatMul_grad/tuple/control_dependency_1:0')))

        self._merged_summaries = tf.summary.merge_all()

    def backward(self, x, y):
        """Run one training step and write summaries (if a writer was given).

        Params:
            x - np.ndarray [batch, 84, 84, 4], network inputs
            y - np.ndarray [batch, nb_out], targets for every output
        """
        assert x.ndim == 4
        assert y.ndim == 2
        assert x.shape[1:] == (84, 84, 4)          # any batch size is accepted
        assert len(x) == len(y)

        feed = {self._x: x, self._y: y}

        if not self._extended_debug:
            loss_summary, _ = self._sess.run([self._loss_summary, self._train_op],
                                             feed_dict=feed)
            if self._summary_writer is not None:
                self._summary_writer.add_summary(loss_summary, self._timestep)

        else:
            dict_layers, merged_summaries, loss_summary, _ = \
                self._sess.run([self._dict_layers, self._merged_summaries, self._loss_summary, self._train_op],
                               feed_dict=feed)

            if self._summary_writer is not None:
                self._summary_writer.add_summary(merged_summaries, self._timestep)
                self._summary_writer.add_summary(loss_summary, self._timestep)

            if self._log_filename is not None:
                tables_logger.append_log(self._log_filename, dict_layers)

        self._timestep += 1

    def forward(self, x):
        """Return network outputs, shape [batch, nb_out], for input batch x."""
        return self._sess.run(self._y_hat, feed_dict={self._x: x})

    def save(self, filepath):
        """Save session variables to `filepath` using tf.train.Saver."""
        saver = tf.train.Saver()
        saver.save(self._sess, filepath)

    def load(self, filepath):
        """Restore session variables from `filepath` using tf.train.Saver."""
        saver = tf.train.Saver()
        saver.restore(self._sess, filepath)

    def setup_logdb(self, filename, batch_save):
        """Enable dumping of weights, gradients and pre-activations on every backward().

        Params:
            filename   - log file handed to tables_logger
            batch_save - number of leading batch samples to store for per-sample tensors

        Raises:
            ValueError - if the network was built with extended_debug=False
        """
        if not self._extended_debug:
            raise ValueError('Please enable extended_debug=True in constructor.')

        self._log_filename = filename

        graph = tf.get_default_graph()

        dict_inout = {
            #'batch_x' : self._x[0:batch_save,:,:,:],
            'batch_y' : self._y[0:batch_save,:],
        }

        dict_conv_1 = {
            'W': graph.get_tensor_by_name('NeuralNet/Conv_1/conv2d/kernel:0'),
            'b': graph.get_tensor_by_name('NeuralNet/Conv_1/conv2d/bias:0'),
            'dW': graph.get_tensor_by_name('NeuralNet/gradients/NeuralNet/Conv_1/conv2d/Conv2D_grad/tuple/control_dependency_1:0'),
            'db': graph.get_tensor_by_name('NeuralNet/gradients/NeuralNet/Conv_1/conv2d/BiasAdd_grad/tuple/control_dependency_1:0'),
            'z': graph.get_tensor_by_name('NeuralNet/Conv_1/conv2d/BiasAdd:0')[0:batch_save,:,:,:],
        }

        dict_conv_2 = {
            'W': graph.get_tensor_by_name('NeuralNet/Conv_2/conv2d/kernel:0'),
            'b': graph.get_tensor_by_name('NeuralNet/Conv_2/conv2d/bias:0'),
            'dW': graph.get_tensor_by_name('NeuralNet/gradients/NeuralNet/Conv_2/conv2d/Conv2D_grad/tuple/control_dependency_1:0'),
            'db': graph.get_tensor_by_name('NeuralNet/gradients/NeuralNet/Conv_2/conv2d/BiasAdd_grad/tuple/control_dependency_1:0'),
            'z': graph.get_tensor_by_name('NeuralNet/Conv_2/conv2d/BiasAdd:0')[0:batch_save,:,:,:],
        }

        dict_dense = {
            # only a [:100, :50] corner of the dense kernel and its gradient is logged
            'W': graph.get_tensor_by_name('NeuralNet/Dense/dense/kernel:0')[:100,:50],
            'b': graph.get_tensor_by_name('NeuralNet/Dense/dense/bias:0'),
            'dW': graph.get_tensor_by_name('NeuralNet/gradients/NeuralNet/Dense/dense/MatMul_grad/tuple/control_dependency_1:0')[:100,:50],
            'db': graph.get_tensor_by_name('NeuralNet/gradients/NeuralNet/Dense/dense/BiasAdd_grad/tuple/control_dependency_1:0'),
            'z': graph.get_tensor_by_name('NeuralNet/Dense/dense/BiasAdd:0')[0:batch_save,:],
        }

        dict_output = {
            'W': graph.get_tensor_by_name('NeuralNet/Output/dense/kernel:0'),
            'b': graph.get_tensor_by_name('NeuralNet/Output/dense/bias:0'),
            'dW': graph.get_tensor_by_name('NeuralNet/gradients/NeuralNet/Output/dense/MatMul_grad/tuple/control_dependency_1:0'),
            'db': graph.get_tensor_by_name('NeuralNet/gradients/NeuralNet/Output/dense/BiasAdd_grad/tuple/control_dependency_1:0'),
            'z': graph.get_tensor_by_name('NeuralNet/Output/dense/BiasAdd:0')[0:batch_save,:],
        }

        dict_metrics = {
            'loss': self._loss,
        }

        self._dict_layers = {
            'inout': dict_inout,
            'conv_1': dict_conv_1,
            'conv_2': dict_conv_2,
            'dense': dict_dense,
            'output': dict_output,
            'metrics': dict_metrics,
        }

        tables_logger.create_log(filename, self._dict_layers, batch_save)

In [None]:
# A bare `raise` with no active exception raises RuntimeError, so executing this
# cell stops here and the class below is not (re)defined.
# NOTE(review): presumably guards an older variant kept for reference - confirm.
raise # sentinel
class TFFunctApprox():
    """Q-function approximator wrapping a neural-net object.

    The wrapped model must provide `nb_out`, `forward(inputs)` and
    `backward(inputs, all_targets)`. States are rescaled from
    [st_low, st_high] to [-1, 1] before being passed to the model; targets and
    predictions are scaled with rew_mean / rew_std.
    """

    def __init__(self, model, st_low, st_high, rew_mean, rew_std, nb_actions):
        """
        Args:
            model: network object (see class docstring for required interface)
            st_low, st_high: scalar bounds of state values
            rew_mean, rew_std: targets are normalised as (t - rew_mean) / rew_std
            nb_actions: number of actions, must equal model.nb_out

        Raises:
            ValueError: if nb_actions differs from model.nb_out
        """
        self._model = model

        assert np.isscalar(st_low) and np.isscalar(st_high)

        if nb_actions != model.nb_out:
            raise ValueError('Output shape does not match action_space shape')

        # normalise inputs: subtract the mid-point, divide by half the range
        self._offsets = st_low + (st_high - st_low) / 2
        self._scales = 1 / ((st_high - st_low) / 2)

        self._rew_mean = rew_mean
        self._rew_std = rew_std

    def eval(self, states):
        """Return de-normalised outputs for a batch of states.

        Args:
            states: np.ndarray of shape [batch, 84, 84, 4], any batch size

        Returns:
            model.forward(normalised states) * rew_std + rew_mean
        """
        assert isinstance(states, np.ndarray)
        assert states.ndim == 4
        assert states.shape[1:] == (84, 84, 4)

        inputs = (states - self._offsets) * self._scales

        y_hat = self._model.forward(inputs)

        return y_hat*self._rew_std + self._rew_mean

    def train(self, states, actions, targets):
        """Run one training step towards `targets` for the given actions.

        The model is trained on all outputs at once, so the current predictions
        are used as targets for the actions that were NOT taken.

        Args:
            states: np.ndarray, 4-D batch of states
            actions: np.ndarray, 1-D, action index per sample
            targets: np.ndarray, 1-D, un-normalised target per sample

        Returns:
            whatever model.backward returns
        """
        assert isinstance(states, np.ndarray)
        assert isinstance(actions, np.ndarray)
        assert isinstance(targets, np.ndarray)
        assert states.ndim == 4
        assert actions.ndim == 1
        assert targets.ndim == 1
        assert len(states) == len(actions) == len(targets)

        targets = (targets-self._rew_mean) / self._rew_std    # decreases range (std>1) to approx -1..1

        inputs = (states - self._offsets) * self._scales
        all_targets = self._model.forward(inputs)             # this range should be small already
        # overwrite only the entry of the action taken in each row
        all_targets[np.arange(len(all_targets)), actions] = targets
        return self._model.backward(inputs, all_targets)


In [None]:
class Memory:
    """Fixed-capacity ring buffer of transitions for DQN experience replay."""

    def __init__(self, max_len, state_shape, state_dtype, rng=None):
        """
        Args:
            max_len: maximum number of transitions kept
            state_shape: shape of a single state (use () to store one object per slot)
            state_dtype: numpy dtype of the state arrays
            rng: object providing randint(); numpy's global generator if None
        """
        assert isinstance(max_len, int)
        assert max_len > 0

        self._random = np.random if rng is None else rng

        self.max_len = max_len
        self._curr_insert_ptr = 0      # slot that the next append() writes to
        self._curr_len = 0             # number of slots currently filled

        states_shape = (max_len, *state_shape)

        self._hist_St = np.zeros(states_shape, dtype=state_dtype)
        self._hist_At = np.zeros(max_len, dtype=int)
        self._hist_Rt_1 = np.zeros(max_len, dtype=float)
        self._hist_St_1 = np.zeros(states_shape, dtype=state_dtype)
        self._hist_done_1 = np.zeros(max_len, dtype=bool)

    def append(self, St, At, Rt_1, St_1, done_1):
        """Store one transition, overwriting the oldest one once the buffer is full.

        Args:
            St [np.ndarray]   - state
            At [int]          - action
            Rt_1 [float]      - reward
            St_1 [np.ndarray] - next state
            done_1 [bool]     - is the next state terminal?
        """
        slot = self._curr_insert_ptr

        self._hist_St[slot] = St
        self._hist_At[slot] = At
        self._hist_Rt_1[slot] = Rt_1
        self._hist_St_1[slot] = St_1
        self._hist_done_1[slot] = done_1

        self._curr_len = min(self._curr_len + 1, self.max_len)   # saturates at capacity
        self._curr_insert_ptr = (slot + 1) % self.max_len        # wrap around at the end

    def __len__(self):
        """Number of stored transitions, 0 <= length <= max_len"""
        return self._curr_len

    def _gather(self, indices):
        """Collect the transitions stored at `indices` from every history array."""
        columns = (self._hist_St, self._hist_At, self._hist_Rt_1,
                   self._hist_St_1, self._hist_done_1)
        states, actions, rewards_1, states_1, dones_1 = (
            np.take(column, indices, axis=0) for column in columns)

        # object-typed buffers holding LazyFrames: stack the per-sample frames
        # into one dense ndarray with the batch on axis 0
        if states.dtype == object and isinstance(self._hist_St[0], LazyFrames):
            states = np.stack(states)
            states_1 = np.stack(states_1)

        return states, actions, rewards_1, states_1, dones_1, indices

    def get_batch(self, batch_len):
        """Draw a uniformly random batch; the same slot may be picked more than once.

        Args:
            batch_len: number of samples to draw

        Returns:
            states, actions, rewards, next_states, next_done, indices
            Each returned element is np.ndarray with length == batch_len
        """
        assert self._curr_len > 0
        assert batch_len > 0

        picks = self._random.randint(
            low=0, high=self._curr_len, size=batch_len, dtype=int)

        return self._gather(picks)

    def pick_last(self, nb):
        """Return the nb most recently appended transitions, oldest first.

        Returns:
            states, actions, rewards, next_states, done_1, indices
            Each returned element is np.ndarray with length == nb
        """
        assert nb <= self._curr_len

        stop = self._curr_insert_ptr                        # exclusive upper bound
        indices = np.arange(stop - nb, stop, dtype=int)     # may start below zero
        indices[indices < 0] += self._curr_len              # wrap negatives to the buffer tail

        return self._gather(indices)
    

---
Everything below this line is just testing.

<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>

# Regression Test

To run the regression test:
* restart the kernel (optional?)
* run all cells up to and including Experiment Setup
  * but __DO NOT__ run the cell with gpu_options.allow_growth = True
* run try_freeze_random_seeds
* run the cells below
* compare the TensorFlow graph with tf_log_2/movingdot/regression_seedXXXX_correct

In [None]:
import random
import gym.spaces
def try_freeze_random_seeds(seed, reproducible):
    """Best-effort attempt to make execution reproducible

    Params:
        seed (int): seed handed to random, numpy.random, tensorflow
            and gym.spaces; if None only gym.spaces is seeded (with None)
        reproducible (bool): if True, then additionally:
            Disable GPU by setting env. var. CUDA_VISIBLE_DEVICES to '-1'
            Set env. var. PYTHONHASHSEED to '0'
            Create a tensorflow session configured with single-threaded
            intra-op and inter-op execution
    """
    # --- environment variables ---
    if reproducible:
        # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up;
        # confirm that setting it from inside the running kernel has any effect
        os.environ.update({
            'CUDA_VISIBLE_DEVICES': '-1',   # hide all GPUs
            'PYTHONHASHSEED': '0',          # request reproducible hashing
        })

    # --- random seeds ---
    print('Using random seed:', seed)
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        tf.set_random_seed(seed)
    # always call this, if not called explicitly, defaults to seed==0
    gym.spaces.seed(seed)

    # --- tensorflow session ---
    if reproducible:
        single_thread_cfg = tf.ConfigProto(intra_op_parallelism_threads=1,
                                           inter_op_parallelism_threads=1)
        # NOTE(review): the session is neither returned nor stored; presumably
        # creating it is enough for the thread settings to take effect - confirm
        sess = tf.Session(config=single_thread_cfg)

try_freeze_random_seeds(1234, True)

In [None]:
import moving_dot

In [None]:
try_freeze_random_seeds(1234, True)

In [None]:
env, trace, model, mem = setup_experiment(env_name='MovingDot-v0',
                                          mem_size=10000, mem_fill=1000,
                                          tf_logdir='tf_log_2/movingdot/regression_seed1234_actmask_2',
                                          seed=1234)

In [None]:
model._model.load('./gpuscale')

In [None]:
# Test tensorflow init (seed=1234)
# Reference values that WC1[:,:,:,0][0] is compared against below
arr = np.array([[-0.06707259,  0.01230972, -0.01680173,  0.06786124],
       [ 0.04711869, -0.01441539,  0.04921476,  0.02538211],
       [ 0.04901032, -0.03892146,  0.03339036,  0.01314713],
       [ 0.01419152, -0.05306549, -0.03446849,  0.0440792 ],
       [ 0.03037152, -0.06045384, -0.0203958 ,  0.042386  ],
       [ 0.0487171 ,  0.04372656,  0.0037141 ,  0.01883662],
       [-0.06191136,  0.01071654, -0.05378162, -0.03057779],
       [-0.00868624, -0.04274424, -0.03951517,  0.04651371]],
      dtype=np.float32)
# The scope is expected to hold exactly two global variables; only the first is used
# (presumably kernel and bias of the first conv layer - TODO confirm)
WC1_tensor, _ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='NeuralNet/Conv_1/conv2d')
WC1 = model._model._sess.run(WC1_tensor)   # fetch current variable values from the model's session
np.allclose(WC1[:,:,:,0][0], arr)          # cell output is True when the slice matches the reference

In [None]:
WC1[:,:,:,0][0]

In [None]:
rng = np.random.RandomState(1234)
run_experiment(env, trace, model, mem, epoch_size=200, nb_total_steps=2000, eps_decay_steps=2000,
               test_frames=None, test_episodes=10, stop_filename='STOP_MOVINGDOT', render=False, rng=rng)

# Pong NN Test

In [None]:
env = gym.make('PongDeterministic-v4')
env = WrapAtari(env)

In [None]:
mem = Memory(max_len=10000, state_shape=(), state_dtype=object)
prefill_memory(env, mem, one_episode=False)
print(len(mem))

In [None]:
states, actions, rewards, n_states, dones, _ = mem.pick_last(len(mem))

In [None]:
np.count_nonzero(rewards==-1)

In [None]:
np.count_nonzero(rewards==1)

In [None]:
np.count_nonzero(rewards==0)

In [None]:
del states
del n_states

In [None]:
cnn = TFNeuralNet(nb_out=6)
#cnn.setup_logdb('outarray.h5', 5)

In [None]:
model = TFFunctApprox(cnn, st_low=0, st_high=255, rew_mean=0, rew_std=1, nb_actions=6)

In [None]:
gamma = 0.95

In [None]:
batch_size = 32

THIS SHOULD CONVERGE

In [None]:
# Train on batches drawn from the pre-filled memory (no environment interaction)
# and record the loss of every training step
losses = []
for i in range(50000):
    states, actions, rewards, n_states, dones, _ = mem.get_batch(batch_size)
    targets = model.eval(n_states)                          # model output for next states
    targets = rewards + gamma * np.max(targets, axis=-1)    # reward + discounted max over last axis
    # NOTE(review): assumes dones is a boolean mask - confirm dtype of the memory's done buffer
    targets[dones] = rewards[dones]                # return of next-to-terminal state is just R
    loss = model.train(states, actions, targets)
    
    losses.append(loss)
    if i % 25 == 0:                                # print progress every 25th step
        print(i, loss)

# Test CNN Forward

In [None]:
env = gym.make('PongDeterministic-v4')
env = WrapAtari(env)

In [None]:
mem = Memory(max_len=1000, state_shape=(), state_dtype=object)

In [None]:
prefill_memory(env, mem)

In [None]:
states, actions, rewards_1, states_1, dones_1, indices = mem.get_batch(10)

In [None]:
for i in range(10):
    print('----')
    print(rewards_1[i])
    plot_frames(states[i])

In [None]:
cnn = TFNeuralNet(nb_out=6)

In [None]:
states, actions, rewards_1, states_1, dones_1, indices = mem.get_batch(10)

In [None]:
states_nn = states / 255

In [None]:
cnn.forward(states_nn)

In [None]:
tf.trainable_variables()

In [None]:
writer = tf.summary.FileWriter(logdir='tf_log', graph=cnn._sess.graph)
writer.flush()

# Test CNN Logging

In [None]:
cnn = TFNeuralNet(nb_out=6)

In [None]:
filename = 'outarray.h5'
cnn.setup_logdb(filename, batch_save=10)

In [None]:
tables_logger.print_log(filename)

# Test Lazy Frame

In [None]:
# Three small distinct arrays standing in for frames
A = np.array([1, 1, 1])
B = np.array([2, 2, 2])
C = np.array([3, 3, 3])

# Two lazy frame stacks that both reference the same array B
# NOTE(review): other code in this notebook refers to the class as `LazyFrames`;
# confirm that a `LazyFrame` name is actually defined
lf1 = LazyFrame([A, B])
lf2 = LazyFrame([B, C])

In [None]:
mem = np.zeros(shape=[10], dtype=object)

In [None]:
mem[0] = lf1
mem[1] = lf2

In [None]:
lf1._frames[0][0] = 4

In [None]:
lf1._frames

In [None]:
np.stack(mem[[0,1]])

In [None]:
np.array(mem[0])

# Test Evaluate

In [None]:
env.close()
env = gym.make('PongDeterministic-v4')
env = WrapAtari(env)

In [None]:
cnn = TFNeuralNet(nb_out=6)
model = TFFunctApprox(cnn, st_low=0, st_high=255, rew_mean=0, rew_std=1, nb_actions=6)

In [None]:
def callback_disp(total_step, episode, tstep, st, act, rew_, done_, eps, model, memory, trace):
    """Print the total step counter whenever an episode ends

    All other arguments are accepted only to match the callback signature
    and are ignored.
    """
    if not done_:
        return
    print(total_step)

In [None]:
ts = time.time()
tr = evaluate(env, 1000, None, eps=0.05, model=model, callback=callback_disp, render=True)
print(time.time() - ts)

In [None]:
env.close()

In [None]:
print(tr)

# Test fill

In [None]:
env.close()
env = gym.make('PongDeterministic-v4')
env = WrapAtari(env)

In [None]:
mem = Memory(10000, (), object)

In [None]:
prefill_memory(env, mem, steps=2000)

In [None]:
env.close()

In [None]:
len(mem)

In [None]:
states, actions, rewards_1, states_1, dones_1, indices = mem.pick_last(len(mem))

In [None]:
print(np.count_nonzero(dones_1))

In [None]:
print('rew  1', np.count_nonzero(rewards_1==1))
print('rew  0', np.count_nonzero(rewards_1==0))
print('rew -1', np.count_nonzero(rewards_1==-1))

# Test Mem Object

In [None]:
env = gym.make('PongDeterministic-v4')
env = WrapAtari(env)

In [None]:
env = gym.make('MovingDot3-v0')
env = WrapAtari(env)

In [None]:
lframes = env.reset()

In [None]:
lframes_, rew_, done_, _ = env.step(0)

In [None]:
mem = Memory(10, (), object)

In [None]:
mem.append(lframes, 0, rew_, lframes_, done_)

In [None]:
lframes = lframes_
lframes_, rew_, done_, _ = env.step(0)
mem.append(lframes, 0, rew_, lframes_, done_)

In [None]:
print(mem._hist_St)
print(mem._hist_At)
print(mem._hist_Rt_1)
print(mem._hist_St_1)
print(mem._hist_done_1)

In [None]:
arr = np.take(mem._hist_St, np.array([0, 1]), axis=0)

In [None]:
arr

In [None]:
np.stack(arr).shape

In [None]:
plot_frames(np.stack(arr)[0])

In [None]:
plot_frames(np.stack(arr)[1])

In [None]:
states, actions, rewards_1, states_1, dones_1, indices = mem.get_batch(2)

In [None]:
print(states.shape)
print(actions.shape)
print(rewards_1.shape)
print(states_1.shape)
print(dones_1.shape)

# Test render

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from matplotlib import animation, rc
from IPython.display import HTML

In [None]:
def frames_render(env, frames, episodes, eps, model, callback=None, trace=None, render=True, sleep=0):
    """Run epsilon-greedy policy in env and collect rendered frames

    Params:
        env      - environment providing reset(), step(action),
                   render(mode='rgb_array') and action_space.sample()
        frames   - return once the total time step counter reaches this value,
                   checked after each non-terminal step; None disables the limit
        episodes - return after this many finished episodes; None disables the limit
        eps      - epsilon-greedy param, a random action is taken unless
                   np.random.rand() > eps
        model    - function approximator with method eval(states) -> q-values
        callback - optional, called after every step as
                   callback(total_step, episode, tstep, S, A, R, done, eps, model, None, trace)
        trace    - passed to callback unchanged
        render   - if False nothing is captured and returned list stays empty
        sleep    - seconds to sleep after each captured frame

    Returns:
        list with results of env.render(mode='rgb_array'), one after each
        reset and one after each step, in order of capture

    Note:
        If both frames and episodes are None this function never returns.
    """

    rendered_frames = []

    def capture_frame():
        """Append current env frame to rendered_frames (no-op if render is off)"""
        if render:
            rendered_frames.append(env.render(mode='rgb_array'))
            time.sleep(sleep)

    def policy(st, model, eps):
        if np.random.rand() > eps:
            stack = np.stack([st])  # add leading batch axis of size 1 for the model
            q_values = model.eval(stack)
            return np.argmax(q_values)
        else:
            return env.action_space.sample()

    tts_ = 0                                 # total time step, terminal steps are not counted
    for e_ in itertools.count():             # count from 0 to infinity

        S = env.reset()
        capture_frame()

        for t_ in itertools.count():         # count from 0 to infinity

            A = policy(S, model, eps)

            S_, R, done, _ = env.step(A)
            capture_frame()

            if callback is not None:
                callback(tts_, e_, t_, S, A, R, done, eps, model, None, trace)

            if done:
                break

            if frames is not None and tts_ >= frames:
                return rendered_frames

            S = S_

            tts_ += 1

        if episodes is not None and e_ >= episodes-1:
            return rendered_frames

In [None]:
rendered_frames = frames_render(env, frames=None, episodes=1, eps=0.0, model=model, render=True)

In [None]:
# Turn interactive mode off
plt.ioff()

# Figure size in inches is frame size in pixels divided by 72, so with dpi=72
# the figure has the same pixel dimensions as a single rendered frame
fig = plt.figure(figsize=(rendered_frames[0].shape[1] / 72.0, rendered_frames[0].shape[0] / 72.0), dpi = 72)
ax = fig.add_subplot(111);

# Show the first frame and keep the returned image handle in `patch`
patch = ax.imshow(rendered_frames[0])
# plt.axis('off');

In [None]:
def animate(i):
    """Show the i-th rendered frame in the existing image handle `patch`"""
    frame = rendered_frames[i]
    patch.set_data(frame)

In [None]:
anim = animation.FuncAnimation(fig, animate, frames=len(rendered_frames), interval=20, repeat=False)


In [None]:
HTML(anim.to_html5_video())