In [None]:
import gym
import random
import numpy as np
import tensorflow as tf
from keras import layers
from skimage.color import rgb2gray
from skimage.transform import resize
from keras.models import Model

from collections import deque
from keras.optimizers import RMSprop
from keras import backend as K
from datetime import datetime
import os.path
import time
from keras.models import load_model
from keras.models import clone_model
from keras.callbacks import TensorBoard

# Command-line configuration. Every hyper-parameter of the training loop is
# exposed as a TensorFlow flag and read through ``FLAGS.<name>``.
FLAGS = tf.app.flags.FLAGS

tf.app.flags.DEFINE_string('train_dir', 'tf_train_breakout',
                           """Directory where to write event logs and checkpoint. """)
tf.app.flags.DEFINE_string('restore_file_path',
                           '/home/paperspace/fastai/courses/tim/breakout_model_ep_sec_version.h5',
                           """Path of the restore file """)
tf.app.flags.DEFINE_integer('num_episode', 100000,
                            """number of episodes to play.""")
# tf.app.flags.DEFINE_integer('observe_step_num', 5000,
tf.app.flags.DEFINE_integer('observe_step_num', 50000,
                            """Timesteps to observe before training.""")
# tf.app.flags.DEFINE_integer('epsilon_step_num', 50000,
tf.app.flags.DEFINE_integer('epsilon_step_num', 1000000,
                            """frames over which to anneal epsilon.""")
# The target Q model is refreshed every `refresh_target_model_num` steps.
tf.app.flags.DEFINE_integer('refresh_target_model_num', 10000,
                            """number of steps between updates of the target Q model.""")
tf.app.flags.DEFINE_integer('replay_memory', 400000,  # takes up to 20 GB to store this amount of history data
                            """number of previous transitions to remember.""")
tf.app.flags.DEFINE_integer('no_op_steps', 30,
                            """Number of the steps that runs before script begin.""")
tf.app.flags.DEFINE_float('regularizer_scale', 0.01,
                          """L1 regularizer scale.""")
tf.app.flags.DEFINE_integer('batch_size', 32,
                            """Size of minibatch to train.""")
tf.app.flags.DEFINE_float('learning_rate', 0.00025,
                          """learning rate of the RMSprop optimizer.""")
tf.app.flags.DEFINE_float('init_epsilon', 1.0,
                          """starting value of epsilon.""")
tf.app.flags.DEFINE_float('final_epsilon', 0.1,
                          """final value of epsilon.""")
tf.app.flags.DEFINE_float('gamma', 0.99,
                          """decay rate of past observations.""")
tf.app.flags.DEFINE_boolean('resume', False,
                            """Whether to resume from previous checkpoint.""")
tf.app.flags.DEFINE_boolean('render', False,
                            """Whether to display the game.""")

# Shape of the network input: four stacked 84x84 grayscale frames.
ATARI_SHAPE = (84, 84, 4)  # input image size to model
# Number of actions the agent chooses between; the play loops add 1 to the
# chosen index before passing it to the environment.
ACTION_SIZE = 3


# 210*160*3 (color) --> 84*84 (mono)
# float --> integer (to reduce the size of replay memory)
def pre_processing(observe):
    """Turn a raw colour frame into an 84x84 ``uint8`` grayscale image.

    The frame is converted to grayscale, resized to 84x84, multiplied by 255
    and cast to ``np.uint8``.
    """
    gray_frame = rgb2gray(observe)
    small_frame = resize(gray_frame, (84, 84), mode='constant')
    return np.uint8(small_frame * 255)


def huber_loss(y, q_value):
    """Mean Huber loss between ``y`` and ``q_value``, with a threshold of 1.

    Absolute errors up to 1 are penalised quadratically (``0.5 * e**2``);
    whatever exceeds 1 is penalised linearly.
    """
    abs_error = K.abs(y - q_value)
    # Part of the error that lies inside [0, 1].
    within_threshold = K.clip(abs_error, 0.0, 1.0)
    # Remainder of the error above the threshold (zero for small errors).
    beyond_threshold = abs_error - within_threshold
    return K.mean(0.5 * K.square(within_threshold) + beyond_threshold)


def atari_model():
    """Build, summarise and compile the Q-network.

    The model takes two inputs: a stack of frames shaped ``ATARI_SHAPE`` and
    an action mask of length ``ACTION_SIZE``. Its output is the per-action
    Q-value multiplied element-wise by the mask.

    Returns:
        The Keras model, compiled with RMSprop and ``huber_loss``.
    """
    # The functional API requires explicit input tensors.
    frames_input = layers.Input(ATARI_SHAPE, name='frames')
    actions_input = layers.Input((ACTION_SIZE,), name='action_mask')

    # Frames are assumed to be encoded in 0..255; rescale them to [0, 1].
    net = layers.Lambda(lambda x: x / 255.0, name='normalization')(frames_input)

    # First hidden layer: 16 filters of 8x8, stride 4, ReLU.
    net = layers.convolutional.Conv2D(
        16, (8, 8), strides=(4, 4), activation='relu')(net)
    # Second hidden layer: 32 filters of 4x4, stride 2, ReLU.
    net = layers.convolutional.Conv2D(
        32, (4, 4), strides=(2, 2), activation='relu')(net)
    net = layers.core.Flatten()(net)
    # Final hidden layer: fully connected, 256 ReLU units.
    net = layers.Dense(256, activation='relu')(net)
    # Linear output layer with one unit per action.
    q_values = layers.Dense(ACTION_SIZE)(net)

    # Multiplying by the mask zeroes the outputs of unselected actions.
    masked_q_values = layers.Multiply(name='QValue')([q_values, actions_input])

    model = Model(inputs=[frames_input, actions_input], outputs=masked_q_values)
    model.summary()

    # Huber loss is used instead of plain MSE: it is quadratic for small
    # errors and linear (like MAE) for large ones, so weights change more
    # slowly when the error is large.
    model.compile(
        RMSprop(lr=FLAGS.learning_rate, rho=0.95, epsilon=0.01),
        loss=huber_loss)
    return model


# choose an action with an epsilon-greedy policy
def get_action(history, epsilon, step, model):
    """Return an action index in ``range(ACTION_SIZE)``.

    A random action is returned when a uniform draw is at most ``epsilon`` or
    while ``step`` is still within the first ``FLAGS.observe_step_num`` steps.
    Otherwise the action with the largest predicted Q-value is returned.
    """
    # The random draw is evaluated first, exactly as in an `or` chain.
    act_randomly = np.random.rand() <= epsilon or step <= FLAGS.observe_step_num
    if act_randomly:
        return random.randrange(ACTION_SIZE)

    # An all-ones mask asks the network for the Q-value of every action.
    all_actions_mask = np.ones(ACTION_SIZE).reshape(1, ACTION_SIZE)
    q_value = model.predict([history, all_actions_mask])
    return np.argmax(q_value[0])


# save sample <s,a,r,s'> (plus the dead flag) to the replay memory
def store_memory(memory, history, action, reward, next_history, dead):
    """Append one transition tuple to ``memory``."""
    transition = (history, action, reward, next_history, dead)
    memory.append(transition)


def get_one_hot(targets, nb_classes):
    """Return one one-hot row of length ``nb_classes`` per entry of ``targets``.

    ``targets`` is flattened first, so the result is always two-dimensional.
    """
    class_indices = np.asarray(targets).ravel()
    identity = np.eye(nb_classes)
    # Row i of the identity matrix is the one-hot encoding of class i.
    return identity[class_indices]


# train the online model on a random mini-batch drawn from the replay memory
def train_memory_batch(memory, model, log_dir, model_target=None):
    """Fit ``model`` for one step on a random mini-batch of transitions.

    Args:
        memory: replay memory holding
            ``(history, action, reward, next_history, dead)`` tuples.
        model: the online Q-network that is fitted.
        log_dir: unused; kept so existing callers keep working.
        model_target: network used to evaluate the Q-values of the next
            state when building the targets. Defaults to ``model``, which
            reproduces the behaviour of callers that pass three arguments.

    Returns:
        The loss Keras reports for the batch.
    """
    if model_target is None:
        model_target = model

    mini_batch = random.sample(memory, FLAGS.batch_size)
    batch_shape = (FLAGS.batch_size,) + tuple(ATARI_SHAPE)
    history = np.zeros(batch_shape)
    next_history = np.zeros(batch_shape)
    action, reward, dead = [], [], []

    for idx, (hist, act, rew, next_hist, is_dead) in enumerate(mini_batch):
        history[idx] = hist
        next_history[idx] = next_hist
        action.append(act)
        reward.append(rew)
        dead.append(is_dead)

    # An all-ones mask requests the Q-value of every action.
    actions_mask = np.ones((FLAGS.batch_size, ACTION_SIZE))
    # Like Q-learning, take the maximum Q-value at s', but evaluated by the
    # target model rather than the model being fitted.
    next_q_values = model_target.predict([next_history, actions_mask])

    bootstrap = (np.asarray(reward, dtype=float)
                 + FLAGS.gamma * np.amax(next_q_values, axis=1))
    # Transitions in which a life was lost get a fixed target of -1.
    target = np.where(np.asarray(dead, dtype=bool), -1.0, bootstrap)

    # Only the output of the action actually taken contributes to the loss:
    # both the prediction (via the mask input) and the target are zero for
    # every other action.
    action_one_hot = get_one_hot(action, ACTION_SIZE)
    target_one_hot = action_one_hot * target[:, None]

    h = model.fit(
        [history, action_one_hot], target_one_hot, epochs=1,
        batch_size=FLAGS.batch_size, verbose=0)

    return h.history['loss'][0]


def train():
    """Train the agent on ``BreakoutDeterministic-v4``.

    Plays ``FLAGS.num_episode`` episodes. For the first
    ``FLAGS.observe_step_num`` steps actions are random and nothing is
    fitted; afterwards the online model is fitted on one mini-batch per step
    while epsilon is annealed towards ``FLAGS.final_epsilon``. The target
    model is synchronised with the online model every
    ``FLAGS.refresh_target_model_num`` steps. Per-episode loss and score are
    written as TensorBoard summaries, and the online model is saved to
    ``FLAGS.restore_file_path`` every 100 episodes and after the last one.
    """
    env = gym.make('BreakoutDeterministic-v4')

    # deque: once a bounded-length deque is full, appending new items
    # discards the same number of items from the opposite end.
    memory = deque(maxlen=FLAGS.replay_memory)
    episode_number = 0
    epsilon = FLAGS.init_epsilon
    epsilon_decay = (FLAGS.init_epsilon - FLAGS.final_epsilon) / FLAGS.epsilon_step_num
    global_step = 0

    if FLAGS.resume:
        # The custom loss has to be supplied for the saved model to load.
        model = load_model(FLAGS.restore_file_path,
                           custom_objects={'huber_loss': huber_loss})
        # Assume that a restored model has already annealed epsilon to its
        # final value.
        epsilon = FLAGS.final_epsilon
    else:
        model = atari_model()

    now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
    log_dir = "{}/run-{}-log".format(FLAGS.train_dir, now)
    file_writer = tf.summary.FileWriter(log_dir, tf.get_default_graph())

    try:
        # clone_model copies only the architecture, so copy the weights too.
        model_target = clone_model(model)
        model_target.set_weights(model.get_weights())

        while episode_number < FLAGS.num_episode:

            done = False
            dead = False
            # 1 episode = 5 lives
            step, score, start_life = 0, 0, 5
            loss = 0.0
            observe = env.reset()

            # One of DeepMind's ideas: take a random number of fixed steps at
            # the start of each episode so episodes do not all start alike.
            for _ in range(random.randint(1, FLAGS.no_op_steps)):
                observe = env.step(1)[0]
            # At the start of an episode there is no preceding frame, so the
            # initial frame is repeated four times to build the history.
            state = pre_processing(observe)
            history = np.stack((state, state, state, state), axis=2)
            history = np.reshape([history], (1, 84, 84, 4))

            while not done:
                if FLAGS.render:
                    # env.render()
                    time.sleep(0.01)

                # Act epsilon-greedily with the online network; the target
                # network is only used to build training targets.
                action = get_action(history, epsilon, global_step, model)
                # change action to real_action
                real_action = action + 1

                # Epsilon only starts to decrease after the observe steps.
                if epsilon > FLAGS.final_epsilon and global_step > FLAGS.observe_step_num:
                    epsilon -= epsilon_decay

                observe, reward, done, info = env.step(real_action)
                # pre-process the observation --> history
                next_state = pre_processing(observe)
                next_state = np.reshape([next_state], (1, 84, 84, 1))
                next_history = np.append(next_state, history[:, :, :, :3], axis=3)

                # If the agent missed the ball it is dead, but the episode is
                # not necessarily over.
                if start_life > info['ale.lives']:
                    dead = True
                    start_life = info['ale.lives']

                # TODO: maybe we should give a negative reward on a miss (dead)
                # reward = np.clip(reward, -1., 1.)  # clip here is not correct

                # Each stored transition holds two (84*84*4)-byte histories:
                # 2 * 28224 B = 56448 B = 55.125 KB.
                store_memory(memory, history, action, reward, next_history, dead)

                # Start fitting once the observation phase is over.
                if global_step > FLAGS.observe_step_num:
                    loss = loss + train_memory_batch(
                        memory, model, log_dir, model_target=model_target)
                    if global_step % FLAGS.refresh_target_model_num == 0:
                        # update the target model
                        model_target.set_weights(model.get_weights())

                score += reward

                # After a lost life, reset the flag but keep the history
                # unchanged, to avoid seeing the ball up in the sky.
                if dead:
                    dead = False
                else:
                    history = next_history

                global_step += 1
                step += 1

                if done:
                    if global_step <= FLAGS.observe_step_num:
                        phase = "observe"
                    elif global_step <= FLAGS.observe_step_num + FLAGS.epsilon_step_num:
                        phase = "explore"
                    else:
                        phase = "train"
                    avg_loss = loss / float(step)
                    print('state: {}, episode: {}, score: {}, global_step: {}, avg loss: {}, step: {}, memory length: {}'
                          .format(phase, episode_number, score, global_step, avg_loss, step, len(memory)))

                    if episode_number % 100 == 0 or (episode_number + 1) == FLAGS.num_episode:
                        model.save(FLAGS.restore_file_path)

                    # Add user custom data to TensorBoard
                    loss_summary = tf.Summary(
                        value=[tf.Summary.Value(tag="loss", simple_value=avg_loss)])
                    file_writer.add_summary(loss_summary, global_step=episode_number)

                    score_summary = tf.Summary(
                        value=[tf.Summary.Value(tag="score", simple_value=score)])
                    file_writer.add_summary(score_summary, global_step=episode_number)

                    episode_number += 1
    finally:
        # Release the summary writer and the emulator even if training is
        # interrupted or fails.
        file_writer.close()
        env.close()


def test():
    """Play episodes with the saved model and print each episode's score.

    The model is loaded from ``FLAGS.restore_file_path`` and played
    near-greedily (epsilon 0.001) for ``FLAGS.num_episode`` episodes. The
    printed score is the sum of the raw environment rewards, matching what
    ``train()`` reports.
    """
    env = gym.make('BreakoutDeterministic-v4')

    episode_number = 0
    epsilon = 0.001
    # Start beyond the observation phase; otherwise get_action() would always
    # return random actions.
    global_step = FLAGS.observe_step_num + 1
    # The custom loss has to be supplied for the saved model to load.
    model = load_model(FLAGS.restore_file_path,
                       custom_objects={'huber_loss': huber_loss})

    try:
        while episode_number < FLAGS.num_episode:

            done = False
            # 1 episode = 5 lives
            score, start_life = 0, 5
            env.reset()

            observe = env.step(1)[0]
            # At the start of an episode there is no preceding frame, so the
            # initial frame is repeated four times to build the history.
            state = pre_processing(observe)
            history = np.stack((state, state, state, state), axis=2)
            history = np.reshape([history], (1, 84, 84, 4))

            while not done:
                if FLAGS.render:
                    # env.render()
                    time.sleep(0.01)

                # get action for the current history and go one step in
                # the environment
                action = get_action(history, epsilon, global_step, model)
                # change action to real_action
                real_action = action + 1

                observe, reward, done, info = env.step(real_action)
                # pre-process the observation --> history
                next_state = pre_processing(observe)
                next_state = np.reshape([next_state], (1, 84, 84, 1))
                next_history = np.append(next_state, history[:, :, :, :3], axis=3)

                # The reward is accumulated unclipped so the printed score is
                # comparable with the one logged by train().
                score += reward

                if start_life > info['ale.lives']:
                    # A life was lost (the episode is not necessarily over):
                    # keep the history unchanged, to avoid seeing the ball up
                    # in the sky.
                    start_life = info['ale.lives']
                else:
                    history = next_history

                global_step += 1

                if done:
                    episode_number += 1
                    print('episode: {}, score: {}'.format(episode_number, score))
    finally:
        # Release the emulator even if evaluation is interrupted.
        env.close()


def main(argv=None):
    """Program entry point: run training. ``argv`` is accepted but unused."""
    # To evaluate a saved model instead, call test() here.
    train()


# When executed as a script, hand control to TensorFlow's app runner, which
# calls main().
if __name__ == "__main__":
    tf.app.run(main=main)

Using TensorFlow backend.


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
frames (InputLayer)             (None, 84, 84, 4)    0                                            
__________________________________________________________________________________________________
normalization (Lambda)          (None, 84, 84, 4)    0           frames[0][0]                     
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 20, 20, 16)   4112        normalization[0][0]              
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 9, 9, 32)     8224        conv2d_1[0][0]                   
__________________________________________________________________________________________________
flatten_1 

  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


state: observe, episode: 0, score: 0.0, global_step: 111, avg loss: 0.0, step: 111, memory length: 111
state: observe, episode: 1, score: 2.0, global_step: 305, avg loss: 0.0, step: 194, memory length: 305
state: observe, episode: 2, score: 1.0, global_step: 434, avg loss: 0.0, step: 129, memory length: 434
state: observe, episode: 3, score: 5.0, global_step: 767, avg loss: 0.0, step: 333, memory length: 767
state: observe, episode: 4, score: 0.0, global_step: 859, avg loss: 0.0, step: 92, memory length: 859
state: observe, episode: 5, score: 1.0, global_step: 1000, avg loss: 0.0, step: 141, memory length: 1000
state: observe, episode: 6, score: 0.0, global_step: 1112, avg loss: 0.0, step: 112, memory length: 1112
state: observe, episode: 7, score: 2.0, global_step: 1308, avg loss: 0.0, step: 196, memory length: 1308
state: observe, episode: 8, score: 1.0, global_step: 1478, avg loss: 0.0, step: 170, memory length: 1478
state: observe, episode: 9, score: 0.0, global_step: 1580, avg los

state: observe, episode: 78, score: 1.0, global_step: 12275, avg loss: 0.0, step: 138, memory length: 12275
state: observe, episode: 79, score: 1.0, global_step: 12434, avg loss: 0.0, step: 159, memory length: 12434
state: observe, episode: 80, score: 0.0, global_step: 12562, avg loss: 0.0, step: 128, memory length: 12562
state: observe, episode: 81, score: 1.0, global_step: 12736, avg loss: 0.0, step: 174, memory length: 12736
state: observe, episode: 82, score: 0.0, global_step: 12864, avg loss: 0.0, step: 128, memory length: 12864
state: observe, episode: 83, score: 2.0, global_step: 13074, avg loss: 0.0, step: 210, memory length: 13074
state: observe, episode: 84, score: 4.0, global_step: 13353, avg loss: 0.0, step: 279, memory length: 13353
state: observe, episode: 85, score: 0.0, global_step: 13461, avg loss: 0.0, step: 108, memory length: 13461
state: observe, episode: 86, score: 3.0, global_step: 13696, avg loss: 0.0, step: 235, memory length: 13696
state: observe, episode: 87,

state: observe, episode: 154, score: 2.0, global_step: 23986, avg loss: 0.0, step: 206, memory length: 23986
state: observe, episode: 155, score: 2.0, global_step: 24200, avg loss: 0.0, step: 214, memory length: 24200
state: observe, episode: 156, score: 3.0, global_step: 24417, avg loss: 0.0, step: 217, memory length: 24417
state: observe, episode: 157, score: 0.0, global_step: 24526, avg loss: 0.0, step: 109, memory length: 24526
state: observe, episode: 158, score: 0.0, global_step: 24649, avg loss: 0.0, step: 123, memory length: 24649
state: observe, episode: 159, score: 3.0, global_step: 24876, avg loss: 0.0, step: 227, memory length: 24876
state: observe, episode: 160, score: 1.0, global_step: 25027, avg loss: 0.0, step: 151, memory length: 25027
state: observe, episode: 161, score: 2.0, global_step: 25211, avg loss: 0.0, step: 184, memory length: 25211
state: observe, episode: 162, score: 0.0, global_step: 25329, avg loss: 0.0, step: 118, memory length: 25329
state: observe, epi

state: observe, episode: 230, score: 0.0, global_step: 35651, avg loss: 0.0, step: 97, memory length: 35651
state: observe, episode: 231, score: 0.0, global_step: 35786, avg loss: 0.0, step: 135, memory length: 35786
state: observe, episode: 232, score: 0.0, global_step: 35908, avg loss: 0.0, step: 122, memory length: 35908
state: observe, episode: 233, score: 2.0, global_step: 36092, avg loss: 0.0, step: 184, memory length: 36092
state: observe, episode: 234, score: 1.0, global_step: 36243, avg loss: 0.0, step: 151, memory length: 36243
state: observe, episode: 235, score: 0.0, global_step: 36346, avg loss: 0.0, step: 103, memory length: 36346
state: observe, episode: 236, score: 0.0, global_step: 36462, avg loss: 0.0, step: 116, memory length: 36462
state: observe, episode: 237, score: 2.0, global_step: 36659, avg loss: 0.0, step: 197, memory length: 36659
state: observe, episode: 238, score: 1.0, global_step: 36824, avg loss: 0.0, step: 165, memory length: 36824
state: observe, epis

state: observe, episode: 306, score: 2.0, global_step: 47194, avg loss: 0.0, step: 198, memory length: 47194
state: observe, episode: 307, score: 0.0, global_step: 47306, avg loss: 0.0, step: 112, memory length: 47306
state: observe, episode: 308, score: 1.0, global_step: 47469, avg loss: 0.0, step: 163, memory length: 47469
state: observe, episode: 309, score: 0.0, global_step: 47567, avg loss: 0.0, step: 98, memory length: 47567
state: observe, episode: 310, score: 2.0, global_step: 47745, avg loss: 0.0, step: 178, memory length: 47745
state: observe, episode: 311, score: 3.0, global_step: 48005, avg loss: 0.0, step: 260, memory length: 48005
state: observe, episode: 312, score: 1.0, global_step: 48164, avg loss: 0.0, step: 159, memory length: 48164
state: observe, episode: 313, score: 0.0, global_step: 48275, avg loss: 0.0, step: 111, memory length: 48275
state: observe, episode: 314, score: 0.0, global_step: 48391, avg loss: 0.0, step: 116, memory length: 48391
state: observe, epis

state: explore, episode: 374, score: 2.0, global_step: 58301, avg loss: 0.0031366883444248965, step: 173, memory length: 58301
state: explore, episode: 375, score: 2.0, global_step: 58483, avg loss: 0.003158634509268418, step: 182, memory length: 58483
state: explore, episode: 376, score: 1.0, global_step: 58651, avg loss: 0.0030218358532465572, step: 168, memory length: 58651
state: explore, episode: 377, score: 3.0, global_step: 58921, avg loss: 0.0030075906712395575, step: 270, memory length: 58921
state: explore, episode: 378, score: 1.0, global_step: 59100, avg loss: 0.003186081762527587, step: 179, memory length: 59100
state: explore, episode: 379, score: 2.0, global_step: 59293, avg loss: 0.003370330276371589, step: 193, memory length: 59293
state: explore, episode: 380, score: 2.0, global_step: 59494, avg loss: 0.0029898526355911182, step: 201, memory length: 59494
state: explore, episode: 381, score: 0.0, global_step: 59611, avg loss: 0.003200892486992781, step: 117, memory le

state: explore, episode: 439, score: 0.0, global_step: 68761, avg loss: 0.004099270781816813, step: 100, memory length: 68761
state: explore, episode: 440, score: 0.0, global_step: 68879, avg loss: 0.003173912680152063, step: 118, memory length: 68879
state: explore, episode: 441, score: 1.0, global_step: 69025, avg loss: 0.002594163476939298, step: 146, memory length: 69025
state: explore, episode: 442, score: 0.0, global_step: 69123, avg loss: 0.002287865954824564, step: 98, memory length: 69123
state: explore, episode: 443, score: 0.0, global_step: 69233, avg loss: 0.0028242905540206598, step: 110, memory length: 69233
state: explore, episode: 444, score: 0.0, global_step: 69336, avg loss: 0.002978150968718765, step: 103, memory length: 69336
state: explore, episode: 445, score: 1.0, global_step: 69491, avg loss: 0.0032136062137250255, step: 155, memory length: 69491
state: explore, episode: 446, score: 1.0, global_step: 69664, avg loss: 0.002988521348399413, step: 173, memory lengt

state: explore, episode: 504, score: 0.0, global_step: 79275, avg loss: 0.0026769886102398313, step: 115, memory length: 79275
state: explore, episode: 505, score: 2.0, global_step: 79477, avg loss: 0.002730910476886925, step: 202, memory length: 79477
state: explore, episode: 506, score: 1.0, global_step: 79623, avg loss: 0.0030749126706922816, step: 146, memory length: 79623
state: explore, episode: 507, score: 0.0, global_step: 79749, avg loss: 0.002648773193767637, step: 126, memory length: 79749
state: explore, episode: 508, score: 1.0, global_step: 79885, avg loss: 0.0033270589996297582, step: 136, memory length: 79885
state: explore, episode: 509, score: 0.0, global_step: 79996, avg loss: 0.0028548371375756425, step: 111, memory length: 79996
state: explore, episode: 510, score: 1.0, global_step: 80125, avg loss: 0.0028802305188678863, step: 129, memory length: 80125
state: explore, episode: 511, score: 1.0, global_step: 80275, avg loss: 0.0028167054495012657, step: 150, memory 

state: explore, episode: 569, score: 2.0, global_step: 89515, avg loss: 0.0024865124519214163, step: 193, memory length: 89515
state: explore, episode: 570, score: 4.0, global_step: 89782, avg loss: 0.0029610653797639596, step: 267, memory length: 89782
state: explore, episode: 571, score: 0.0, global_step: 89887, avg loss: 0.002920398152093098, step: 105, memory length: 89887
state: explore, episode: 572, score: 1.0, global_step: 90054, avg loss: 0.003109466302147868, step: 167, memory length: 90054
state: explore, episode: 573, score: 1.0, global_step: 90198, avg loss: 0.002882579857700623, step: 144, memory length: 90198
state: explore, episode: 574, score: 0.0, global_step: 90306, avg loss: 0.0028124584951506725, step: 108, memory length: 90306
state: explore, episode: 575, score: 1.0, global_step: 90447, avg loss: 0.0030583321230880484, step: 141, memory length: 90447
state: explore, episode: 576, score: 1.0, global_step: 90601, avg loss: 0.002815170316323504, step: 154, memory le

state: explore, episode: 634, score: 0.0, global_step: 100620, avg loss: 0.0036244556703280044, step: 121, memory length: 100620
state: explore, episode: 635, score: 0.0, global_step: 100750, avg loss: 0.0021945965211391293, step: 130, memory length: 100750
state: explore, episode: 636, score: 1.0, global_step: 100902, avg loss: 0.003105074933153829, step: 152, memory length: 100902
state: explore, episode: 637, score: 4.0, global_step: 101167, avg loss: 0.0033605211452320976, step: 265, memory length: 101167
state: explore, episode: 638, score: 1.0, global_step: 101333, avg loss: 0.002875323485475374, step: 166, memory length: 101333
state: explore, episode: 639, score: 2.0, global_step: 101536, avg loss: 0.0028597897645103254, step: 203, memory length: 101536
state: explore, episode: 640, score: 0.0, global_step: 101635, avg loss: 0.003143010012067724, step: 99, memory length: 101635
state: explore, episode: 641, score: 4.0, global_step: 101932, avg loss: 0.002727104662198288, step: 

state: explore, episode: 698, score: 0.0, global_step: 111378, avg loss: 0.00278447061672767, step: 123, memory length: 111378
state: explore, episode: 699, score: 0.0, global_step: 111475, avg loss: 0.0031717382076449307, step: 97, memory length: 111475
state: explore, episode: 700, score: 2.0, global_step: 111683, avg loss: 0.002807189127775718, step: 208, memory length: 111683
state: explore, episode: 701, score: 1.0, global_step: 111835, avg loss: 0.002920085907135217, step: 152, memory length: 111835
state: explore, episode: 702, score: 4.0, global_step: 112108, avg loss: 0.0033675255900884903, step: 273, memory length: 112108
state: explore, episode: 703, score: 3.0, global_step: 112355, avg loss: 0.003266942954908181, step: 247, memory length: 112355
state: explore, episode: 704, score: 1.0, global_step: 112515, avg loss: 0.003063230034007347, step: 160, memory length: 112515
state: explore, episode: 705, score: 0.0, global_step: 112632, avg loss: 0.003307467621081555, step: 117

state: explore, episode: 762, score: 1.0, global_step: 122268, avg loss: 0.003273153013206017, step: 149, memory length: 122268
state: explore, episode: 763, score: 3.0, global_step: 122539, avg loss: 0.003705094342564577, step: 271, memory length: 122539
state: explore, episode: 764, score: 0.0, global_step: 122662, avg loss: 0.003248033402508931, step: 123, memory length: 122662
state: explore, episode: 765, score: 1.0, global_step: 122834, avg loss: 0.003046265881932921, step: 172, memory length: 122834
state: explore, episode: 766, score: 0.0, global_step: 122941, avg loss: 0.00330096015083918, step: 107, memory length: 122941
state: explore, episode: 767, score: 2.0, global_step: 123142, avg loss: 0.002805460471722993, step: 201, memory length: 123142
state: explore, episode: 768, score: 1.0, global_step: 123276, avg loss: 0.003398808863989667, step: 134, memory length: 123276
state: explore, episode: 769, score: 2.0, global_step: 123460, avg loss: 0.0028341826647716594, step: 184

state: explore, episode: 826, score: 0.0, global_step: 132153, avg loss: 0.002447493613081778, step: 110, memory length: 132153
state: explore, episode: 827, score: 0.0, global_step: 132274, avg loss: 0.0025376436121052003, step: 121, memory length: 132274
state: explore, episode: 828, score: 0.0, global_step: 132394, avg loss: 0.0027743970279516363, step: 120, memory length: 132394
state: explore, episode: 829, score: 0.0, global_step: 132489, avg loss: 0.0029955834553747032, step: 95, memory length: 132489
state: explore, episode: 830, score: 0.0, global_step: 132589, avg loss: 0.002718086924924137, step: 100, memory length: 132589
state: explore, episode: 831, score: 0.0, global_step: 132718, avg loss: 0.0031165311600122457, step: 129, memory length: 132718
state: explore, episode: 832, score: 2.0, global_step: 132911, avg loss: 0.002808444342361873, step: 193, memory length: 132911
state: explore, episode: 833, score: 0.0, global_step: 133022, avg loss: 0.0025141054296057576, step:

state: explore, episode: 890, score: 1.0, global_step: 141885, avg loss: 0.0032670210769434474, step: 149, memory length: 141885
state: explore, episode: 891, score: 2.0, global_step: 142099, avg loss: 0.0031727055053107785, step: 214, memory length: 142099
state: explore, episode: 892, score: 2.0, global_step: 142289, avg loss: 0.0027083121851654525, step: 190, memory length: 142289
state: explore, episode: 893, score: 1.0, global_step: 142442, avg loss: 0.002460869045202332, step: 153, memory length: 142442
state: explore, episode: 894, score: 3.0, global_step: 142678, avg loss: 0.0032752312331005237, step: 236, memory length: 142678
state: explore, episode: 895, score: 6.0, global_step: 143003, avg loss: 0.0028484443267062978, step: 325, memory length: 143003
state: explore, episode: 896, score: 1.0, global_step: 143160, avg loss: 0.003041810903202421, step: 157, memory length: 143160
state: explore, episode: 897, score: 2.0, global_step: 143338, avg loss: 0.0024094015132177854, ste

state: explore, episode: 954, score: 1.0, global_step: 152368, avg loss: 0.003355998025064174, step: 148, memory length: 152368
state: explore, episode: 955, score: 0.0, global_step: 152465, avg loss: 0.002609030323798937, step: 97, memory length: 152465
state: explore, episode: 956, score: 2.0, global_step: 152633, avg loss: 0.00321337020500323, step: 168, memory length: 152633
state: explore, episode: 957, score: 1.0, global_step: 152788, avg loss: 0.0030354671844942194, step: 155, memory length: 152788
state: explore, episode: 958, score: 2.0, global_step: 152986, avg loss: 0.0031656100869358024, step: 198, memory length: 152986
state: explore, episode: 959, score: 1.0, global_step: 153148, avg loss: 0.002272584863173203, step: 162, memory length: 153148
state: explore, episode: 960, score: 2.0, global_step: 153348, avg loss: 0.003207908153685821, step: 200, memory length: 153348
state: explore, episode: 961, score: 1.0, global_step: 153475, avg loss: 0.0033590911659453337, step: 12

In [2]:
!pwd

/home/paperspace/fastai/courses/tim
