<br/>

$$ \huge{\underline{\textbf{ Playing Atari Games with Deep RL }}} $$

$$ \large{\textbf{MountainCar + DQN + Memory Replay}} $$

<br/>



In [None]:
def q_learning(start_step, env, frames, gamma, eps_decay_steps, eps_target,
               batch_size, model, mem, callback=None, trace=None):
    """Train `model` with Q-learning, sampling mini-batches from replay memory.

    Runs episodes back to back until `frames` environment steps have been
    taken in this call, then returns. The environment is rendered on every
    step.

    Params:
        start_step      - global step count at which this call starts; used to
                          resume the epsilon schedule and passed (offset) to callback
        env             - environment with reset(), step(action), render()
                          and action_space.sample()
        frames          - number of environment steps to run in this call
        gamma           - discount factor [0..1]
        eps_decay_steps - number of global steps over which epsilon decays
                          linearly from 1 to eps_target; None disables decay
        eps_target      - final (or constant, if no decay) epsilon
        batch_size      - number of transitions sampled per training step
        model           - function approximator, already initialised, with methods:
                          eval(states) -> array of q-values, one row per state
                          train(states, actions, targets) -> None
        mem             - replay memory with append(S, A, R, S_, done),
                          get_batch(n) and __len__; must already hold at
                          least batch_size transitions
        callback        - optional, called after every step as
                          callback(global_step, episode, tstep, S, A, R, done,
                                   eps, model, mem, trace)
        trace           - opaque object forwarded to callback
    """
    def policy(st, model, eps):
        if np.random.rand() > eps:
            stack = np.stack([st])  # convert lazyframe to nn input shape [1, 84, 84, 4]
            q_values = model.eval(stack)
            return np.argmax(q_values)
        else:
            return env.action_space.sample()

    if eps_decay_steps is not None:
        eps_delta = (1-eps_target) / eps_decay_steps

    def epsilon_at(global_step):
        # Epsilon is a pure function of the global step, so the schedule is
        # the same whether training runs in one call or is resumed through
        # start_step, and terminal steps no longer skip a decay.
        if eps_decay_steps is None:
            return eps_target
        return max(1 - global_step*eps_delta, eps_target)

    assert len(mem) >= batch_size

    tts_ = 0                                 # total time step (within this call)
    for e_ in itertools.count():             # count from 0 to infinity

        S = env.reset()
        env.render()

        for t_ in itertools.count():         # count from 0 to infinity

            eps = epsilon_at(start_step + tts_)
            A = policy(S, model, eps)

            S_, R, done, _ = env.step(A)
            env.render()

            mem.append(S, A, R, S_, done)

            if callback is not None:
                callback(tts_+start_step, e_, t_, S, A, R, done, eps, model, mem, trace)

            # One gradient step on a random mini-batch from replay memory
            states, actions, rewards, n_states, dones, _ = mem.get_batch(batch_size)
            targets = model.eval(n_states)
            targets = rewards + gamma * np.max(targets, axis=-1)
            targets[dones] = rewards[dones]                # return of next-to-terminal state is just R
            model.train(states, actions, targets)

            tts_ += 1
            if tts_ >= frames:
                return

            if done:
                break

            S = S_
                

In [None]:
def evaluate(env, frames, episodes, eps, model, callback=None, trace=None, render=True, sleep=0):
    """Run an epsilon-greedy policy without training and return the summed reward.

    Params:
        env      - environment with reset(), step(action), render()
                   and action_space.sample()
        frames   - stop after this many environment steps (None = no step limit)
        episodes - stop after this many completed episodes (None = no episode
                   limit); at least one episode is always started
        eps      - probability of taking a random action
        model    - function approximator with eval(states) -> q-values;
                   not used when eps >= 1
        callback - optional, called after every step as
                   callback(step, episode, tstep, S, A, R, done, eps, model, None, trace)
        trace    - opaque object forwarded to callback
        render   - if True, render the environment after reset and after every step
        sleep    - seconds to sleep after each render

    Returns:
        Sum of rewards over all steps taken.

    Raises:
        ValueError: if both frames and episodes are None (would never stop).
    """
    if frames is None and episodes is None:
        raise ValueError('evaluate() needs a stop condition: set frames or episodes')

    def policy(st, model, eps):
        if np.random.rand() > eps:
            stack = np.stack([st])  # convert lazyframe to nn input shape [1, 84, 84, 4]
            q_values = model.eval(stack)
            return np.argmax(q_values)
        else:
            return env.action_space.sample()

    total_reward = 0

    tts_ = 0                                 # total time step
    for e_ in itertools.count():             # count from 0 to infinity

        S = env.reset()

        if render:
            env.render()
            time.sleep(sleep)

        for t_ in itertools.count():         # count from 0 to infinity

            A = policy(S, model, eps)

            S_, R, done, _ = env.step(A)

            total_reward += R

            if render:
                env.render()
                time.sleep(sleep)

            if callback is not None:
                callback(tts_, e_, t_, S, A, R, done, eps, model, None, trace)

            # Count every step, including terminal ones, so that step numbers
            # are unique and the frames budget is honoured exactly.
            tts_ += 1

            if frames is not None and tts_ >= frames:
                return total_reward

            if done:
                break

            S = S_

        if episodes is not None and e_ >= episodes-1:
            return total_reward

In [None]:
def mem_fill(env, mem, steps=None, episodes=None, render=False):
    """Fill replay memory with transitions generated by a uniformly random policy.

    Params:
        env      - environment with reset(), step(action), render()
                   and action_space.sample()
        mem      - memory object with append(S, A, R, S_, done)
        steps    - stop after this many transitions were stored (None = no step limit)
        episodes - stop after this many complete episodes (None = no episode limit)
        render   - if True, render the environment after reset and after every step

    Raises:
        ValueError: if both steps and episodes are None (would never stop).
    """
    if steps is None and episodes is None:
        raise ValueError('mem_fill() needs a stop condition: set steps or episodes')

    tts_ = 0                                 # total number of stored transitions
    for e_ in itertools.count():
        if episodes is not None and e_ >= episodes:
            return

        S = env.reset()
        if render:
            env.render()

        for t_ in itertools.count():

            A = env.action_space.sample()    # random policy
            S_, R, done, _ = env.step(A)
            if render:
                env.render()

            mem.append(S, A, R, S_, done)

            tts_ += 1

            if steps is not None and tts_ >= steps:
                return

            if done:
                break

            S = S_

---

# Experiment Setup

Imports (source file: [tiles3.py](tiles3.py), [helpers_1001.py](helpers_1001.py))

In [None]:
import pdb

In [None]:
import time
import datetime
import numpy as np
import matplotlib.pyplot as plt
import tables
import itertools
import collections

import PIL
import gym
import tensorflow as tf

In [None]:
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
with tf.Session(config=config) as sess:
    devs = sess.list_devices()
    print('\n'.join([x.name for x in devs]))

Environment

In [None]:
import helpers
import importlib
importlib.reload(helpers)

In [None]:
import sys
sys.path.append('../Debug_NN')

In [None]:
import importlib
import tables_logger
importlib.reload(tables_logger)

In [None]:
%load_ext line_profiler

We will need a callback, plus a `Trace` object for it to write into, to record per-step training statistics (epsilon, episode rewards, episode end indices).

In [None]:
class Trace:
    """Mutable container for statistics collected by `callback` during a run."""

    def __init__(self):
        # Per-episode bookkeeping
        self.ep_rewards = collections.defaultdict(float)   # episode -> summed reward
        self.ep_end_idx = collections.OrderedDict()        # episode -> total_step at episode end

        # Per-step bookkeeping
        self.epsilons = []                                 # epsilon used at each step
        self.total_step = 0                                # number of steps recorded so far

In [None]:
def callback(total_step, episode, tstep, st, act, rew_, done_, eps, model, memory, trace):
    """Record one environment step into `trace`.

    Params:
        total_step [int] - global step number reported by the caller; must
                           equal trace.total_step
        episode [int]    - episode number
        tstep [int]      - timestep within episode (unused)
        st, act          - state and action of this step (unused)
        rew_             - reward received, added to trace.ep_rewards[episode]
        done_ [bool]     - if true, trace.ep_end_idx[episode] is set
        eps              - epsilon used for this step, appended to trace.epsilons
        model, memory    - unused
        trace [obj]      - object with total_step, epsilons, ep_end_idx, ep_rewards
    """
    step_idx = trace.total_step

    if done_:
        trace.ep_end_idx[episode] = step_idx    # remember where the episode ended

    trace.epsilons.append(eps)
    trace.ep_rewards[episode] += rew_

    # Caller and trace must agree on the step count
    assert total_step == trace.total_step

    trace.total_step = step_idx + 1

---

# Pong Helpers

In [None]:
for env in gym.envs.registry.all():
    if env.id.startswith('Pong'):
        print(env.id)

In [None]:
from skimage.transform import resize
from skimage.color import rgb2gray

In [None]:
def preprocess(obs):
    """Convert a raw RGB frame to a cropped uint8 grayscale image (scikit-image variant).

    The frame is converted to grayscale, resized to 110x84, 13 rows are cut
    from the top and from the bottom (leaving 84 rows), and the result is
    scaled by 255 and cast to uint8.
    """
    gray = rgb2gray(obs)
    resized = resize(gray, output_shape=(110, 84), mode='reflect', anti_aliasing=True)
    cropped = resized[13:-13,:]
    return (cropped*255).astype(np.uint8)

In [None]:
def preprocess(obs):
    """Convert a raw frame to an 84x84 grayscale array (PIL, bilinear resampling).

    Only the region box=[0, 34, 160, 194] of the source image is used.
    """
    gray = PIL.Image.fromarray(obs).convert('L')
    small = gray.resize([84, 84], resample=PIL.Image.BILINEAR, box=[0,34,160,160+34])
    return np.array(small)

In [None]:
def preprocess(obs):
    """Convert a raw frame to an 84x84 grayscale array (PIL, nearest-neighbour resampling).

    Only the region box=[0, 34, 160, 194] of the source image is used.
    """
    gray = PIL.Image.fromarray(obs).convert('L')
    small = gray.resize([84, 84], resample=PIL.Image.NEAREST, box=[0,34,160,160+34])
    return np.array(small)

In [None]:
def plot_frames(frames):
    """Show the 4 stacked frames of one state side by side in grayscale.

    `frames` must convert (via np.array) to an array of shape (84, 84, 4).
    """
    stack = np.array(frames)  # convert LazyFrame to np.ndarray
    assert stack.shape == (84, 84, 4)
    nb_frames = stack.shape[-1]
    fig, axes = plt.subplots(nrows=1, ncols=nb_frames, figsize=[16,4])
    for idx, ax in enumerate(axes):
        ax.imshow(stack[:,:,idx], cmap='gray', vmin=0, vmax=255)
        ax.set_title('frame '+str(idx))
    plt.show()

In [None]:
class LazyFrames:
    """Holds a list of frames and stacks them into one array only on demand.

    Keeping the individual frame arrays (rather than a stacked copy) lets
    consecutive states share the same underlying frame objects.
    """

    def __init__(self, frames):
        """
        Args:
            frames: non-empty list of np.ndarray, all of the same shape
        """
        assert isinstance(frames, list)
        assert isinstance(frames[0], np.ndarray)
        self._frames = frames   # list of np.ndarray

    def __array__(self, dtype=None, copy=None):
        """NumPy conversion hook: stack frames along a new last axis.

        Args:
            dtype: optional dtype to convert the result to
            copy: accepted for compatibility with the NumPy 2 `__array__`
                protocol and otherwise ignored; the result is always a
                freshly allocated array
        """
        merged = np.stack(self._frames, axis=-1)
        if dtype is not None:
            merged = merged.astype(dtype, copy=False)   # `merged` is already a private copy
        return merged

    def __str__(self):
        return str(np.round(np.stack(self._frames, axis=-1), decimals=4))

In [None]:
class WrapAtari:
    """Environment wrapper producing stacks of the 4 most recent preprocessed frames.

    Observations are returned as LazyFrames; rewards are reduced to their sign.
    """

    def __init__(self, env):
        # Only raw 210x160 RGB uint8 observations are supported
        assert env.observation_space == gym.spaces.Box(low=0, high=255, shape=[210,160,3], dtype=np.uint8)

        self._env = env
        self._frames = collections.deque(maxlen=4)      # rolling window of preprocessed frames

        self.action_space = env.action_space
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=[84, 84, 4], dtype=np.uint8)

    def reset(self):
        """Reset the wrapped env; the frame window is filled with the first frame."""
        first = preprocess(self._env.reset())           # raw observation -> preprocessed frame
        self._frames.extend([first] * self._frames.maxlen)   # replace all
        return LazyFrames(list(self._frames))

    def step(self, action):
        """Step the wrapped env; returns (LazyFrames, sign(reward), done, info)."""
        assert self.action_space.contains(action)
        raw_obs, rew, done, info = self._env.step(action)
        self._frames.append(preprocess(raw_obs))        # oldest frame drops out of the deque
        stacked = LazyFrames(list(self._frames))
        return stacked, np.sign(rew), done, info

    def render(self, mode='human'):
        return self._env.render(mode=mode)

    def close(self):
        self._env.close()

# Moving Dot

In [None]:
import moving_dot

In [None]:
import importlib
importlib.reload(moving_dot)

In [None]:
# Best-effort close of an environment left over from a previous cell;
# `env` may not exist yet (NameError) or may already be closed.
try:
    env.close()
except Exception:
    pass
env = gym.make('MovingDot-v0')
env.max_steps = 100
env = WrapAtari(env)

In [None]:
cnn = TFNeuralNet(nb_out=5, logdir='tf_log_2/movingdot/test')
# cnn.setup_logdb('outarray.h5', 5)
model = TFFunctApprox(cnn, st_low=0, st_high=255, rew_mean=0, rew_std=1, nb_actions=5)

In [None]:
mem = Memory(max_len=10000, state_shape=(), state_dtype=object)
mem_fill(env, mem, steps=10000)

In [None]:
trace = Trace()
rewards = []

In [None]:
q_learning(trace.total_step, env, frames=10000, gamma=.95, eps_decay_steps=50000, eps_target=0.1,
           batch_size=32, model=model, mem=mem, callback=callback, trace=trace)
tr = evaluate(env, None, episodes=3, eps=0.05, model=model, render=True)
print('tr', tr)
rewards.append(tr)
plt.plot(rewards)

In [None]:
while trace.total_step < 25000:
    q_learning(trace.total_step, env, frames=5000, gamma=.95, eps_decay_steps=10000, eps_target=0.1,
           batch_size=32, model=model, mem=mem, callback=callback, trace=trace)
    # tr = evaluate(env, 10000, None, eps=0.05, model=model, render=True)
    tr = evaluate(env, None, episodes=10, eps=0.05, model=model, render=True)
    # cnn.save('./tf_models/PongDeterministic-v4_'+ str(trace.total_step) + '.ckpt')
    print('iter', trace.total_step, 'tr', tr)
    rewards.append(tr)
    plt.plot(rewards)
    plt.show()

In [None]:
plt.plot(trace.epsilons)

In [None]:
plt.plot(trace.ep_rewards.values())

# Pong - test test

This seems to work: 300k iterations, replay memory pre-filled with 50k transitions, epsilon decayed 1.0 -> 0.1 over 50k steps.

In [None]:
# Best-effort close of an environment left over from a previous cell;
# `env` may not exist yet (NameError) or may already be closed.
try:
    env.close()
except Exception:
    pass
env = gym.make('PongDeterministic-v4')
env = WrapAtari(env)

In [None]:
cnn = TFNeuralNet(nb_out=6, logdir='tf_log_2/pong/5')
#cnn.setup_logdb('outarray.h5', 5)
model = TFFunctApprox(cnn, st_low=0, st_high=255, rew_mean=0, rew_std=1, nb_actions=6)

In [None]:
mem = Memory(max_len=200000, state_shape=(), state_dtype=object)

In [None]:
# %lprun -f preprocess mem_fill(env, mem, steps=10000)
mem_fill(env, mem, steps=10000, render=False)

In [None]:
trace = Trace()
rewards = []

In [None]:
# tr = evaluate(env, 10000, None, eps=0.05, model=model, render=True)
tr = evaluate(env, None, episodes=3, eps=0.0, model=model, render=True)
print('tr', tr)
rewards.append(tr)

In [None]:
while trace.total_step < 300000:
    q_learning(trace.total_step, env, frames=50000, gamma=.95, eps_decay_steps=50000, eps_target=0.1,
           batch_size=32, model=model, mem=mem, callback=callback, trace=trace)
    # tr = evaluate(env, 10000, None, eps=0.05, model=model, render=True)
    tr = evaluate(env, None, episodes=3, eps=0.0, model=model, render=True)
    # cnn.save('./tf_models/PongDeterministic-v4_'+ str(trace.total_step) + '.ckpt')
    print('iter', trace.total_step, 'tr', tr)
    rewards.append(tr)
    plt.plot(rewards)
    plt.show()

In [None]:
tr = evaluate(env, 10000, None, eps=0.0, model=model, render=True)

# Breakout - test test

In [None]:
env.close()
env = gym.make('BreakoutDeterministic-v4')
env = WrapAtari(env)

In [None]:
cnn = TFNeuralNet(nb_out=env.action_space.n)
#cnn.setup_logdb('outarray.h5', 5)
model = TFFunctApprox(cnn, st_low=0, st_high=255, rew_mean=0, rew_std=1, nb_actions=env.action_space.n)

In [None]:
mem = Memory(max_len=1000000, state_shape=(), state_dtype=object)

In [None]:
mem_fill(env, mem, steps=50000, render=True)

In [None]:
trace = Trace()
rewards = []

In [None]:
tr = evaluate(env, 10000, None, eps=0.05, model=model, render=True)
cnn.save('./tf_models/BreakoutDeterministic-v4_'+ str(trace.total_step) + '.ckpt')
print('iter', trace.total_step, 'tr', tr)
rewards.append(tr)
plt.plot(rewards)

In [None]:
while trace.total_step < 10000000:
    q_learning(trace.total_step, env, frames=50000, gamma=.95, eps_decay_steps=1000000, eps_target=0.1,
           batch_size=32, model=model, mem=mem, callback=callback, trace=trace)
    tr = evaluate(env, 10000, None, eps=0.05, model=model, render=True)
    cnn.save('./tf_models/BreakoutDeterministic-v4_'+ str(trace.total_step) + '.ckpt')
    print('iter', trace.total_step, 'tr', tr)
    rewards.append(tr)
    plt.plot(rewards)
    plt.show()

In [None]:
env.close()

# Pong

TODO

In [None]:
def experiment_pong(frames):
    """Build network, approximator, memory and trace for Pong; optionally train.

    Uses the notebook-level `env`, which must already be wrapped so that it
    produces LazyFrames observations.

    Params:
        frames - number of training steps to run; 0 only creates the objects
                 and pre-fills the memory

    Returns:
        (trace, model, mem)
    """
    cnn = TFNeuralNet(nb_out=6, logdir='tf_log_2/pong/experiment')
    #cnn.setup_logdb('outarray.h5', 5)

    model = TFFunctApprox(cnn, st_low=0, st_high=255, rew_mean=0, rew_std=1, nb_actions=6)

    mem = Memory(max_len=200000, state_shape=(), state_dtype=object)

    # Fresh trace per experiment, so the returned object does not depend on
    # whatever notebook-level `trace` happens to exist
    trace = Trace()

    mem_fill(env, mem, steps=10000)

    if frames != 0:

        q_learning(0, env, frames=frames, gamma=.95, eps_decay_steps=50000, eps_target=0.1,
                   batch_size=32, model=model, mem=mem, callback=callback, trace=trace)

    return trace, model, mem

In [None]:
# Create, don't train
env = gym.make('PongDeterministic-v4')
env = WrapAtari(env)
trace_rl, model, mem = experiment_pong(frames=0)

In [None]:
# Train from scratch
env = gym.make('PongDeterministic-v4')
env = WrapAtari(env)                       # agent expects stacked 84x84x4 observations
trace_rl, model, mem = experiment_pong(frames=200000)

In [None]:
# Save weights
model._model.save('./tf_models/PongNoFrameskip-v4.ckpt')

In [None]:
# Load weights
model._model.load('./tf_models/PongNoFrameskip-v4.ckpt')

In [None]:
def callback_disp(total_step, episode, tstep, st, act, rew_, done_, eps, model, memory, trace):
    """Debug callback: print reward on episode end, then drop into pdb on every step."""
    episode_finished = bool(done_)
    if episode_finished:
        print(rew_, done_)
    pdb.set_trace()     # interactive breakpoint after each environment step

In [None]:
# Enjoy agent
try:
    enjoy_env(env, frames=float('inf'), eps=1.0, model=model, callback=callback_disp)
except KeyboardInterrupt:
    pass
finally:
    env.close()

---

# Enjoy Random Pong

THIS WORKS

In [None]:
# Create, don't train
env = gym.make('PongDeterministic-v4')
env = WrapAtari(env)

In [None]:
def callback_disp(total_step, episode, tstep, st, act, rew_, done_, eps, model, memory, trace):
    """Display callback: print non-zero rewards and episode terminations."""
    got_reward = rew_ != 0
    if got_reward:
        print('rew:', rew_)
    if done_:
        print('done:', done_)
    # plot_frames(st)

In [None]:
# Enjoy agent
try:
    enjoy_env(env, frames=float('inf'), episodes=1, eps=1.0, model=None, callback=callback_disp)
except KeyboardInterrupt:
    pass
finally:
    env.close()

# Function Approximators and Memory

In [None]:
class TFNeuralNet():
    """Convolutional Q-network built with the TensorFlow 1.x graph API.

    Architecture: input [batch, 84, 84, 4] -> Conv 16 filters 8x8 stride 4 (ReLU)
    -> Conv 32 filters 4x4 stride 2 (ReLU) -> Dense 256 (ReLU) -> Dense nb_out
    (linear). Trained with Adam on mean squared error.

    Creating an instance resets the default TF graph.
    """

    def __init__(self, nb_out, logdir=None):
        """
        Args:
            nb_out: number of network outputs
            logdir: directory for TensorBoard summaries; if None, no
                summaries are written
        """
        self.nb_out = nb_out
        self._time_step = 0

        # Best-effort: close a notebook-level session called `sess` if there
        # is one. NameError (no such global) and close errors are ignored.
        try:
            sess.close()
        except Exception:
            pass
        tf.reset_default_graph()

        self._log_filename = None
        self._dict_layers = {}

        self._x = tf.placeholder(name='x', shape=[None, 84, 84, 4], dtype=tf.float32)
        self._y = tf.placeholder(name='y', shape=[None, nb_out], dtype=tf.float32)

        model = tf.layers.conv2d(self._x, filters=16, kernel_size=[8, 8], strides=[4, 4],
                                 padding='valid', activation=tf.nn.relu, name='Conv_1')

        model = tf.layers.conv2d(model, filters=32, kernel_size=[4, 4], strides=[2, 2],
                                 padding='valid', activation=tf.nn.relu, name='Conv_2')

        model = tf.layers.flatten(model)
        model = tf.layers.dense(model, 256, activation=tf.nn.relu, name='Dense')
        self._y_hat = tf.layers.dense(model, nb_out, activation=None, name='Output')

        self._loss = tf.losses.mean_squared_error(self._y, self._y_hat)

        # No gradient clipping.
        # Alternatives previously kept here as commented-out code:
        #   tf.train.RMSPropOptimizer(learning_rate=0.00025, decay=0.0, momentum=0.95, epsilon=0.01)
        #   combined with tf.clip_by_global_norm(gradients, 1) or per-tensor tf.clip_by_norm(gradient, 1.0)
        self._optimizer = tf.train.AdamOptimizer(learning_rate=0.00025)
        self._grads_and_vars = self._optimizer.compute_gradients(self._loss)
        self._train_op = self._optimizer.apply_gradients(self._grads_and_vars)

        tf.summary.scalar('loss', self._loss)

        self._sess = tf.Session()
        self._sess.run(tf.global_variables_initializer())

        self._merged_summaries = tf.summary.merge_all()
        if logdir is not None:
            self._writer = tf.summary.FileWriter(logdir=logdir, graph=self._sess.graph)
            self._writer.flush()
        else:
            self._writer = None

    def backward(self, x, y):
        """Run one training step and return the loss.

        Args:
            x: inputs, shape [batch, 84, 84, 4]
            y: targets, shape [batch, nb_out]
        """
        assert x.ndim == 4
        assert y.ndim == 2
        assert x.shape[1:] == (84, 84, 4)      # any batch size is accepted
        assert len(x) == len(y)

        fetches = [self._dict_layers, self._train_op, self._loss]
        if self._writer is not None:
            fetches.append(self._merged_summaries)

        results = self._sess.run(fetches, feed_dict={self._x: x, self._y: y})
        dict_layers, _, loss = results[:3]

        if self._writer is not None:
            self._writer.add_summary(results[3], self._time_step)
        self._time_step += 1

        if self._log_filename is not None:
            tables_logger.append_log(self._log_filename, dict_layers)

        return loss

    def forward(self, x):
        """Return network outputs for inputs x of shape [batch, 84, 84, 4]."""
        return self._sess.run(self._y_hat, feed_dict={self._x: x})

    def save(self, filepath):
        """Save all variables of the session to a checkpoint at filepath."""
        saver = tf.train.Saver()
        saver.save(self._sess, filepath)

    def load(self, filepath):
        """Restore all variables of the session from a checkpoint at filepath."""
        saver = tf.train.Saver()
        saver.restore(self._sess, filepath)

    def setup_logdb(self, filename, batch_save):
        """Enable logging of weights, gradients and activations via tables_logger.

        After this call every backward() also fetches the tensors collected
        here and appends them to the log file.

        Args:
            filename: log file passed to tables_logger
            batch_save: number of leading batch items kept for per-sample tensors
        """
        graph = tf.get_default_graph()

        dict_inout = {
            #'batch_x' : self._x[0:batch_save,:,:,:],
            'batch_y' : self._y[0:batch_save,:],
        }

        dict_conv_1 = {
            'W': graph.get_tensor_by_name('Conv_1/kernel:0'),
            'b': graph.get_tensor_by_name('Conv_1/bias:0'),
            'dW': graph.get_tensor_by_name('gradients/Conv_1/Conv2D_grad/tuple/control_dependency_1:0'),
            'db': graph.get_tensor_by_name('gradients/Conv_1/BiasAdd_grad/tuple/control_dependency_1:0'),
            'z': graph.get_tensor_by_name('Conv_1/BiasAdd:0')[0:batch_save,:,:,:],
        }

        dict_conv_2 = {
            'W': graph.get_tensor_by_name('Conv_2/kernel:0'),
            'b': graph.get_tensor_by_name('Conv_2/bias:0'),
            'dW': graph.get_tensor_by_name('gradients/Conv_2/Conv2D_grad/tuple/control_dependency_1:0'),
            'db': graph.get_tensor_by_name('gradients/Conv_2/BiasAdd_grad/tuple/control_dependency_1:0'),
            'z': graph.get_tensor_by_name('Conv_2/BiasAdd:0')[0:batch_save,:,:,:],
        }

        dict_dense = {
            'W': graph.get_tensor_by_name('Dense/kernel:0')[:100,:50],
            'b': graph.get_tensor_by_name('Dense/bias:0'),
            'dW': graph.get_tensor_by_name('gradients/Dense/MatMul_grad/tuple/control_dependency_1:0')[:100,:50],
            'db': graph.get_tensor_by_name('gradients/Dense/BiasAdd_grad/tuple/control_dependency_1:0'),
            'z': graph.get_tensor_by_name('Dense/BiasAdd:0')[0:batch_save,:],
        }

        dict_output = {
            'W': graph.get_tensor_by_name('Output/kernel:0'),
            'b': graph.get_tensor_by_name('Output/bias:0'),
            'dW': graph.get_tensor_by_name('gradients/Output/MatMul_grad/tuple/control_dependency_1:0'),
            'db': graph.get_tensor_by_name('gradients/Output/BiasAdd_grad/tuple/control_dependency_1:0'),
            'z': graph.get_tensor_by_name('Output/BiasAdd:0')[0:batch_save,:],
        }

        dict_metrics = {
            'loss': self._loss,
        }

        self._log_filename = filename
        self._dict_layers = {
            'inout': dict_inout,
            'conv_1': dict_conv_1,
            'conv_2': dict_conv_2,
            'dense': dict_dense,
            'output': dict_output,
            'metrics': dict_metrics,
        }

        tables_logger.create_log(filename, self._dict_layers, batch_save)

In [None]:
class TFFunctApprox():
    """Q-function approximator: normalises inputs/targets around a network model."""

    def __init__(self, model, st_low, st_high, rew_mean, rew_std, nb_actions):
        """
        Args:
            model: network object with attribute nb_out and methods
                forward(inputs) -> outputs and backward(inputs, targets) -> loss
            st_low: scalar lower bound of raw state values
            st_high: scalar upper bound of raw state values
            rew_mean: offset used to normalise targets
            rew_std: scale used to normalise targets
            nb_actions: number of actions; must equal model.nb_out

        Raises:
            ValueError: if nb_actions differs from model.nb_out
        """
        self._model = model

        assert np.isscalar(st_low) and np.isscalar(st_high)

        if nb_actions != model.nb_out:
            raise ValueError('Output shape does not match action_space shape')

        # normalise inputs: maps [st_low, st_high] onto [-1, 1]
        self._offsets = st_low + (st_high - st_low) / 2
        self._scales = 1 / ((st_high - st_low) / 2)

        self._rew_mean = rew_mean
        self._rew_std = rew_std

    def _normalise(self, states):
        """Scale raw states into the network input range."""
        return (states - self._offsets) * self._scales

    def eval(self, states):
        """Return q-values, shape [batch, nb_actions], for a batch of states.

        Args:
            states: np.ndarray of shape [batch, 84, 84, 4]; any batch size
        """
        assert isinstance(states, np.ndarray)
        assert states.ndim == 4
        assert states.shape[1:] == (84, 84, 4)

        y_hat = self._model.forward(self._normalise(states))

        return y_hat*self._rew_std + self._rew_mean

    def train(self, states, actions, targets):
        """Do one training step towards `targets` for the taken `actions`.

        Outputs for actions that were not taken are trained towards the
        network's own current predictions, so only the taken action
        contributes to the error.

        Args:
            states: np.ndarray, 4 dimensions, first is batch
            actions: np.ndarray of action indices, shape [batch]
            targets: np.ndarray of target q-values, shape [batch]

        Returns:
            Whatever model.backward returns (the loss).
        """
        assert isinstance(states, np.ndarray)
        assert isinstance(actions, np.ndarray)
        assert isinstance(targets, np.ndarray)
        assert states.ndim == 4
        assert actions.ndim == 1
        assert targets.ndim == 1
        assert len(states) == len(actions) == len(targets)

        targets = (targets-self._rew_mean) / self._rew_std    # decreases range (std>1) to approx -1..1

        inputs = self._normalise(states)
        all_targets = self._model.forward(inputs)             # this range should be small already
        all_targets[np.arange(len(all_targets)), actions] = targets
        return self._model.backward(inputs, all_targets)


In [None]:
class Memory:
    """Circular buffer for DQN replay memory. Fairly fast."""

    def __init__(self, max_len, state_shape, state_dtype):
        """
        Args:
            max_len: maximum capacity
            state_shape: shape of a single state; use () together with
                state_dtype=object to store arbitrary objects
            state_dtype: numpy dtype of the state storage arrays
        """
        assert isinstance(max_len, int)
        assert max_len > 0

        self.max_len = max_len                             # maximum length
        self._curr_insert_ptr = 0                          # index to insert next data sample
        self._curr_len = 0                                 # number of currently stored elements

        state_arr_shape = [max_len] + list(state_shape)

        self._hist_St = np.zeros(state_arr_shape, dtype=state_dtype)
        self._hist_At = np.zeros(max_len, dtype=int)
        self._hist_Rt_1 = np.zeros(max_len, dtype=float)
        self._hist_St_1 = np.zeros(state_arr_shape, dtype=state_dtype)
        self._hist_done_1 = np.zeros(max_len, dtype=bool)

    def append(self, St, At, Rt_1, St_1, done_1):
        """Add one sample to memory, override oldest if max_len reached.

        Args:
            St [np.ndarray]   - state
            At [int]          - action
            Rt_1 [float]      - reward
            St_1 [np.ndarray] - next state
            done_1 [bool]     - next state terminal?
        """
        self._hist_St[self._curr_insert_ptr] = St
        self._hist_At[self._curr_insert_ptr] = At
        self._hist_Rt_1[self._curr_insert_ptr] = Rt_1
        self._hist_St_1[self._curr_insert_ptr] = St_1
        self._hist_done_1[self._curr_insert_ptr] = done_1

        if self._curr_len < self.max_len:                  # keep track of current length
            self._curr_len += 1

        self._curr_insert_ptr += 1                         # increment insertion pointer
        if self._curr_insert_ptr >= self.max_len:          # roll to zero if needed
            self._curr_insert_ptr = 0

    def __len__(self):
        """Number of samples in memory, 0 <= length <= max_len"""
        return self._curr_len

    def _gather(self, indices):
        """Collect the transitions stored at `indices`.

        Returns:
            states, actions, rewards, next_states, next_done, indices
        """
        states = np.take(self._hist_St, indices, axis=0)
        actions = np.take(self._hist_At, indices, axis=0)
        rewards_1 = np.take(self._hist_Rt_1, indices, axis=0)
        states_1 = np.take(self._hist_St_1, indices, axis=0)
        dones_1 = np.take(self._hist_done_1, indices, axis=0)

        # Object storage holding LazyFrames: materialise into one dense array.
        # LazyFrames stack their frames along the last axis, so the result has
        # shape [batch, *frame_shape, nb_frames].
        if states.dtype == object and isinstance(self._hist_St[0], LazyFrames):
            states = np.stack(states)
            states_1 = np.stack(states_1)

        return states, actions, rewards_1, states_1, dones_1, indices

    def get_batch(self, batch_len):
        """Sample batch of data, with repetition

        Args:
            batch_len: nb of samples to pick

        Returns:
            states, actions, rewards, next_states, next_done, indices
            Each returned element is np.ndarray with length == batch_len
        """
        assert self._curr_len > 0
        assert batch_len > 0

        indices = np.random.randint(                   # randint much faster than np.random.sample
            low=0, high=self._curr_len, size=batch_len, dtype=int)

        return self._gather(indices)

    def pick_last(self, nb):
        """Pick last nb elements from memory, oldest first

        Returns:
            states, actions, rewards, next_states, done_1, indices
            Each returned element is np.ndarray with length == nb
        """
        assert nb <= self._curr_len

        start = self._curr_insert_ptr - nb                # inclusive
        end = self._curr_insert_ptr                       # not inclusive
        indices = np.arange(start, end, dtype=int)        # indices to pick, can be negative
        indices[indices < 0] += self._curr_len            # loop negative to positive

        return self._gather(indices)
    

---
below is just testing

<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>

# Pong NN Test

In [None]:
env = gym.make('PongDeterministic-v4')
env = WrapAtari(env)

In [None]:
mem = Memory(max_len=10000, state_shape=(), state_dtype=object)
# mem_fill has no `one_episode` parameter; fill the whole buffer instead
mem_fill(env, mem, steps=mem.max_len)
print(len(mem))

In [None]:
states, actions, rewards, n_states, dones, _ = mem.pick_last(len(mem))

In [None]:
np.count_nonzero(rewards==-1)

In [None]:
np.count_nonzero(rewards==1)

In [None]:
np.count_nonzero(rewards==0)

In [None]:
del states
del n_states

In [None]:
cnn = TFNeuralNet(nb_out=6)
#cnn.setup_logdb('outarray.h5', 5)

In [None]:
model = TFFunctApprox(cnn, st_low=0, st_high=255, rew_mean=0, rew_std=1, nb_actions=6)

In [None]:
gamma = 0.95

In [None]:
batch_size = 32

Sanity check: the training loss printed by the loop below should converge.

In [None]:
# Train the model on batches drawn from the already-filled memory only
# (no environment interaction here), recording the loss of every update.
losses = []
for i in range(50000):
    # Draw one batch of transitions; the returned indices are not needed.
    states, actions, rewards, n_states, dones, _ = mem.get_batch(batch_size)
    # Model output for the next states, reduced by max over the last axis.
    targets = model.eval(n_states)
    # Target = reward + discounted max value of the next state.
    targets = rewards + gamma * np.max(targets, axis=-1)
    # NOTE(review): assumes dones can be used as a boolean mask - confirm dtype.
    targets[dones] = rewards[dones]                # return of next-to-terminal state is just R
    loss = model.train(states, actions, targets)
    
    losses.append(loss)
    # Print progress every 25th iteration.
    if i % 25 == 0:
        print(i, loss)

# Test CNN Forward

In [None]:
env = gym.make('PongDeterministic-v4')
env = WrapAtari(env)

In [None]:
mem = Memory(max_len=1000, state_shape=(), state_dtype=object)

In [None]:
mem_fill(env, mem)

In [None]:
states, actions, rewards_1, states_1, dones_1, indices = mem.get_batch(10)

In [None]:
for i in range(10):
    print('----')
    print(rewards_1[i])
    plot_frames(states[i])

In [None]:
cnn = TFNeuralNet(nb_out=6)

In [None]:
states, actions, rewards_1, states_1, dones_1, indices = mem.get_batch(10)

In [None]:
states_nn = states / 255

In [None]:
cnn.forward(states_nn)

In [None]:
tf.trainable_variables()

In [None]:
writer = tf.summary.FileWriter(logdir='tf_log', graph=cnn._sess.graph)
writer.flush()

# Test CNN Logging

In [None]:
cnn = TFNeuralNet(nb_out=6)

In [None]:
filename = 'outarray.h5'
cnn.setup_logdb(filename, batch_save=10)

In [None]:
tables_logger.print_log(filename)

# Test Lazy Frame

In [None]:
# Three small distinguishable arrays standing in for frames.
A = np.array([1, 1, 1])
B = np.array([2, 2, 2])
C = np.array([3, 3, 3])

# Two frame stacks that both contain B.
# NOTE(review): this uses `LazyFrame`, while Memory.pick_last checks against
# `LazyFrames` - confirm which name is actually defined/imported.
lf1 = LazyFrame([A, B])
lf2 = LazyFrame([B, C])

In [None]:
mem = np.zeros(shape=[10], dtype=object)

In [None]:
mem[0] = lf1
mem[1] = lf2

In [None]:
lf1._frames[0][0] = 4

In [None]:
lf1._frames

In [None]:
np.stack(mem[[0,1]])

In [None]:
np.array(mem[0])

# Test Evaluate

In [None]:
env.close()
env = gym.make('PongDeterministic-v4')
env = WrapAtari(env)

In [None]:
cnn = TFNeuralNet(nb_out=6)
model = TFFunctApprox(cnn, st_low=0, st_high=255, rew_mean=0, rew_std=1, nb_actions=6)

In [None]:
def callback_disp(total_step, episode, tstep, st, act, rew_, done_, eps, model, memory, trace):
    """Progress callback: print total_step whenever done_ is truthy.

    All other arguments are accepted only to match the callback signature
    and are ignored.
    """
    if not done_:
        return
    print(total_step)

In [None]:
# Time a single call to evaluate() (defined elsewhere in this notebook) and
# keep its return value in `tr` for inspection in a later cell.
# NOTE(review): the positional 1000 / None are presumably frame and episode
# limits - confirm against evaluate()'s signature.
ts = time.time()
tr = evaluate(env, 1000, None, eps=0.05, model=model, callback=callback_disp, render=True)
# Elapsed wall-clock time in seconds.
print(time.time() - ts)

In [None]:
env.close()

In [None]:
print(tr)

# Test fill

In [None]:
env.close()
env = gym.make('PongDeterministic-v4')
env = WrapAtari(env)

In [None]:
mem = Memory(10000, (), object)

In [None]:
mem_fill(env, mem, steps=2000)

In [None]:
env.close()

In [None]:
len(mem)

In [None]:
states, actions, rewards_1, states_1, dones_1, indices = mem.pick_last(len(mem))

In [None]:
print(np.count_nonzero(dones_1))

In [None]:
print('rew  1', np.count_nonzero(rewards_1==1))
print('rew  0', np.count_nonzero(rewards_1==0))
print('rew -1', np.count_nonzero(rewards_1==-1))

# Test Mem Object

In [None]:
env = gym.make('PongDeterministic-v4')
env = WrapAtari(env)

In [None]:
env = gym.make('MovingDot3-v0')
env = WrapAtari(env)

In [None]:
lframes = env.reset()

In [None]:
lframes_, rew_, done_, _ = env.step(0)

In [None]:
mem = Memory(10, (), object)

In [None]:
mem.append(lframes, 0, rew_, lframes_, done_)

In [None]:
lframes = lframes_
lframes_, rew_, done_, _ = env.step(0)
mem.append(lframes, 0, rew_, lframes_, done_)

In [None]:
print(mem._hist_St)
print(mem._hist_At)
print(mem._hist_Rt_1)
print(mem._hist_St_1)
print(mem._hist_done_1)

In [None]:
arr = np.take(mem._hist_St, np.array([0, 1]), axis=0)

In [None]:
arr

In [None]:
np.stack(arr).shape

In [None]:
plot_frames(np.stack(arr)[0])

In [None]:
plot_frames(np.stack(arr)[1])

In [None]:
states, actions, rewards_1, states_1, dones_1, indices = mem.get_batch(2)

In [None]:
print(states.shape)
print(actions.shape)
print(rewards_1.shape)
print(states_1.shape)
print(dones_1.shape)

# Test render

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from matplotlib import animation, rc
from IPython.display import HTML

In [None]:
def frames_render(env, frames, episodes, eps, model, callback=None, trace=None, render=True, sleep=0):
    """Run an epsilon-greedy policy in env and collect rendered frames.

    Params:
        env      - environment with reset(), step(action), render(mode=...)
                   and action_space.sample()
        frames   - step budget or None; the step counter only advances on
                   non-terminal steps and is compared (>=) before being
                   incremented, so a few more env steps than `frames` may run
        episodes - number of episodes to run or None
        eps      - probability-like threshold for picking a random action
        model    - function approximator with eval(states) -> q-values
        callback - optional, called after every step as
                   callback(total_step, episode, step, S, A, R, done, eps,
                            model, None, trace)
        trace    - passed through to callback untouched
        render   - if False nothing is rendered and the result is empty
        sleep    - seconds to sleep after each rendered frame

    Returns:
        list with the results of env.render(mode='rgb_array'), one after each
        reset and one after each step (empty list if render is False)

    Note:
        If both frames and episodes are None this function never returns.
    """
    rendered_frames = []

    def policy(st, model, eps):
        if np.random.rand() > eps:
            stack = np.stack([st])  # add leading batch axis of size 1 for the model
            q_values = model.eval(stack)
            return np.argmax(q_values)
        else:
            return env.action_space.sample()

    def capture():
        # Single place for "grab a frame, then optionally slow down".
        if render:
            rendered_frames.append(env.render(mode='rgb_array'))
            time.sleep(sleep)

    tts_ = 0                                 # total time step
    for e_ in itertools.count():             # count from 0 to infinity

        S = env.reset()
        capture()

        for t_ in itertools.count():         # count from 0 to infinity

            A = policy(S, model, eps)

            S_, R, done, _ = env.step(A)
            capture()

            if callback is not None:
                callback(tts_, e_, t_, S, A, R, done, eps, model, None, trace)

            if done:
                break                        # note: tts_ is not advanced on terminal step

            if frames is not None and tts_ >= frames:
                return rendered_frames

            S = S_

            tts_ += 1

        if episodes is not None and e_ >= episodes-1:
            return rendered_frames

In [None]:
rendered_frames = frames_render(env, frames=None, episodes=1, eps=0.0, model=model, render=True)

In [None]:
# Turn matplotlib interactive mode off while the figure is being built.
plt.ioff()

# Size the figure as (frame dimension / 72) inches at 72 dpi, so the figure's
# pixel size matches the first frame's shape[1] x shape[0]
# (assumes frames are arrays laid out as height x width [x channels] - TODO confirm).
fig = plt.figure(figsize=(rendered_frames[0].shape[1] / 72.0, rendered_frames[0].shape[0] / 72.0), dpi = 72)
ax = fig.add_subplot(111);

# Show the first frame; `patch` is the image artist that animate() updates.
patch = ax.imshow(rendered_frames[0])
# plt.axis('off');

In [None]:
def animate(i):
    """Animation step: display the i-th entry of rendered_frames in the image artist."""
    frame = rendered_frames[i]
    patch.set_data(frame)

In [None]:
# One animation step per captured frame, 20 ms between frames, played once.
anim = animation.FuncAnimation(fig, animate, frames=len(rendered_frames), interval=20, repeat=False)


In [None]:
HTML(anim.to_html5_video())