In [1]:
# credit: Karpathy. https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5
# normal imports
import gym
import time
import keras
import numpy as np
from keras.layers import Input, Dense, Activation
from keras.models import Model, Sequential
from keras.models import load_model
from keras.utils import plot_model
from keras import regularizers
from keras import optimizers
import os.path
import keras.backend as K
import tensorflow as tf
import math
from easy_tf_log import tflog
from datetime import datetime
import shutil
import os

Using TensorFlow backend.


In [2]:
# model initialization
def initialization(nb_hidden_layer_neurons, input_dimensionality, learning_rate, decay_rate):
    """Build and compile the two-layer policy network.

    The network is a fully connected ReLU hidden layer followed by a single
    sigmoid unit, compiled with binary cross-entropy and an accuracy metric.

    Args:
        nb_hidden_layer_neurons: number of units in the hidden layer.
        input_dimensionality: length of the flattened input vector.
        learning_rate: accepted for interface compatibility; currently unused
            because the Adam optimizer below is built with its defaults.
        decay_rate: accepted for interface compatibility; currently unused
            (it belonged to the RMSprop optimizer that is no longer used).

    Returns:
        The compiled keras ``Sequential`` model.
    """
    # Hidden layer: Glorot-uniform initialised, no weight regularisation.
    hidden_layer = Dense(
        units=nb_hidden_layer_neurons,
        input_dim=input_dimensionality,
        activation='relu',
        kernel_initializer='glorot_uniform',
    )
    # Output layer: one sigmoid unit, weights drawn from keras' default RandomNormal.
    output_layer = Dense(units=1, activation='sigmoid', kernel_initializer='RandomNormal')

    policy_network = Sequential([hidden_layer, output_layer])
    policy_network.compile(
        loss='binary_crossentropy',
        optimizer=keras.optimizers.Adam(),
        metrics=['accuracy'],
    )
    return policy_network

In [3]:
# preprocessing used by Karpathy (cf. https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5)
def prepro(I):
  """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector

  Rows 35..194 are kept (crop), every second row and column of the first
  colour channel is sampled, and each sampled pixel is mapped to 0.0 when its
  value is 0, 144 or 109 (the two background colours) and to 1.0 otherwise.

  The input frame is left untouched: the mask is computed into a new array
  instead of being written through a view of the caller's observation.

  Args:
    I: frame array indexed as [row, column, channel].

  Returns:
    1D float64 array holding the flattened binary image.
  """
  # crop + downsample by a factor of 2 in a single slicing step (still a view)
  playfield = I[35:195:2, ::2, 0]
  # paddles and ball: everything that is neither black nor a background colour
  foreground = (playfield != 0) & (playfield != 144) & (playfield != 109)
  # np.float was removed from NumPy; np.float64 is the type it aliased
  return foreground.astype(np.float64).ravel()

In [4]:
# reward discount used by Karpathy (cf. https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5)
def discount_rewards(r, discount_factor=None):
  """ take 1D float array of rewards and compute discounted reward

  Walks the rewards from last to first, accumulating
  ``running_add = running_add * discount_factor + r[t]`` and resetting the
  accumulator whenever a non-zero reward is met (in Pong a non-zero reward
  marks the end of a point).

  Args:
    r: array of rewards, indexed by time step along its first axis.
    discount_factor: discount applied per step; when None the module-level
      ``gamma`` hyperparameter is used, as before.

  Returns:
    Array with the same shape as ``r`` holding the discounted rewards. The
    result is floating point even for integer rewards, so discounted values
    are not truncated.
  """
  if discount_factor is None:
    discount_factor = gamma  # global hyperparameter defined in the notebook
  r = np.asarray(r)
  # keep a floating input dtype as is, promote anything else to float64
  out_dtype = r.dtype if np.issubdtype(r.dtype, np.floating) else np.float64
  discounted_r = np.zeros(r.shape, dtype=out_dtype)
  running_add = 0.0
  # we go from last reward to first one so we don't have to do exponentiations
  for t in reversed(range(r.size)):
    if r[t] != 0: running_add = 0.0 # if the game ended (in Pong), reset the reward sum
    running_add = running_add * discount_factor + r[t] # Horner's method
    discounted_r[t] = running_add
  return discounted_r

In [5]:
# our custom loss
def custom_loss(discounted_rewards):
    """Return a loss function closed over ``discounted_rewards``.

    The returned callable takes ``(y_true, y_pred)`` and yields the negation
    of ``custom_loss_sum(y_true, y_pred, discounted_rewards)``.
    """
    # NOTE(review): custom_loss_sum is not defined in the visible part of this
    # notebook; calling the returned loss fails unless it is defined elsewhere.
    def custom_loss_aux(y_true, y_pred):
        weighted_sum = custom_loss_sum(y_true, y_pred, discounted_rewards)
        return -weighted_sum

    return custom_loss_aux

In [6]:
# hyperparameters, global variables (read by the functions and cells of this notebook)
nb_hidden_layer_neurons = 200  # number of hidden layer neurons
batch_size = 1 # number of episodes collected between two parameter updates (model.fit calls)
gamma = 0.99 # discount factor for reward; read as a global by discount_rewards
# NOTE(review): initialization() compiles the model with a default Adam() optimizer,
# so decay_rate and learning_rate below are passed to it but never applied.
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
input_dimensionality = 80 * 80 # input dimensionality: 80x80 grid
learning_rate = 1e-4
resume = True # resume from previous checkpoint ('my_model_weights.h5') if that file exists
render = False # render the game
nb_episodes = 10000 # the training loop stops after this many finished episodes

# action codes passed to env.step; only these two actions are ever sampled
UP_ACTION = 2
DOWN_ACTION = 3

In [None]:
# Training cell: play Pong with the current policy and, every `batch_size`
# episodes, fit the network on the sampled actions weighted by the normalized
# discounted rewards.

# cleaning
if (os.path.exists('./logs')):
    shutil.rmtree('./logs')
os.mkdir('./logs')

now = datetime.now()

# log directory
# NOTE(review): this points at './log<timestamp>/', not inside the './logs'
# directory recreated above -- confirm which location is intended.
log_dir = './log' + now.strftime("%Y%m%d-%H%M%S") + "/"

# model initialization
# Keyword arguments are used on purpose: the previous positional call passed
# decay_rate and learning_rate in the opposite order of the signature.
model = initialization(nb_hidden_layer_neurons, input_dimensionality,
                       learning_rate=learning_rate, decay_rate=decay_rate)

# gym initialization
env = gym.make("Pong-v0")
observation = env.reset()
prev_input = None # what we'll use for the difference between two frames
x_train, y_train, rewards = [],[],[] # per-step buffers of the running episode
running_reward = None # exponentially weighted average of the rewards (per episode)
reward_sum = 0
episode_nb = 0

# buffers accumulated over the episodes of the current batch
X_train, Y_train, discounted_rewards = np.array([]),np.array([]), np.array([])

# load pre-trained model if exist
# NOTE(review): checkpoints below are saved under timestamped names, so this
# only finds weights that were copied/renamed to 'my_model_weights.h5'.
if (resume and os.path.isfile('my_model_weights.h5')):
    print("loading previous weights")
    model.load_weights('my_model_weights.h5')

# add a callback tensorboard object to visualize learning
tbCallBack = keras.callbacks.TensorBoard(log_dir='./Graph' + now.strftime("%Y%m%d-%H%M%S") + "/", histogram_freq=0,
          write_graph=True, write_images=True)

# main loop; the environment is closed even if the cell is interrupted
try:
    while (episode_nb < nb_episodes):
        if render:
            time.sleep(0.001)
            env.render()

        # preprocess the observation, set input as difference between images
        cur_input = prepro(observation)
        x = cur_input - prev_input if prev_input is not None else np.zeros(input_dimensionality)
        prev_input = cur_input

        # forward the policy network and sample action according to the proba distribution
        # (predict returns a (1, 1) array: extract the scalar probability of going up)
        proba = float(model.predict(x.reshape(1, -1))[0, 0])
        action = UP_ACTION if np.random.uniform() < proba else DOWN_ACTION
        y = 1 if action == UP_ACTION else 0 # the sampled action is the "label" used for training later

        # log the input and label to train later
        x_train.append(x)
        y_train.append(y)

        # do one step in our environment
        observation, reward, done, info = env.step(action)
        reward_sum += reward
        rewards.append(reward)

        if done: # end of an episode
            print("episode ", episode_nb + 1, "/", nb_episodes)
            episode_nb += 1

            # stack all the x_train, y_train and rewards from the current episode
            x_train_ep = np.vstack(x_train)
            y_train_ep = np.vstack(y_train)
            rewards_ep = np.vstack(rewards)

            x_train, y_train, rewards = [],[],[] # reset our variables because the episode ended

            # compute the discounted rewards and normalize them to control variance
            discounted_rewards_ep = discount_rewards(rewards_ep)
            discounted_rewards_ep -= np.mean(discounted_rewards_ep)
            reward_std = np.std(discounted_rewards_ep)
            if reward_std > 0: # a zero std would turn every sample weight into NaN
                discounted_rewards_ep /= reward_std

            # (For later) Karpathy computes the gradient directly here, so he can go on and do
            # backprop. Going through model.fit instead means the forward pass is done twice.
            Y_train = np.append(Y_train, y_train_ep)
            X_train = x_train_ep if (X_train.size == 0) else np.vstack((X_train, x_train_ep))
            discounted_rewards = np.append(discounted_rewards, discounted_rewards_ep)

            if episode_nb % batch_size == 0:
                # the normalized discounted rewards weight each sample's cross-entropy term
                model.fit(x=X_train, y=Y_train, epochs=1, verbose=1, callbacks=[tbCallBack], sample_weight=discounted_rewards)
                X_train, Y_train, discounted_rewards = np.array([]),np.array([]), np.array([])
            if episode_nb % 50 == 0:
                model.save_weights('my_model_weights' + datetime.now().strftime("%Y%m%d-%H%M%S") + '.h5')

            # Log the reward
            running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
            print("running reward: ", running_reward, " reward this episode: ", reward_sum)
            print("logging into:", log_dir)
            tflog('running_reward', running_reward, custom_dir=log_dir)
            tflog('reward_sum', reward_sum, custom_dir=log_dir)

            # Reinitialization
            observation = env.reset() # resetting our env
            reward_sum = 0
            prev_input = None # the new episode must not depend on the last frame of the previous one
finally:
    env.close()
        


loading previous weights
episode  1 / 10000
Epoch 1/1
running reward:  8.0  reward this episode:  8.0
logging into: ./log20181120-142143/
episode  2 / 10000
Epoch 1/1
running reward:  7.9  reward this episode:  -2.0
logging into: ./log20181120-142143/
episode  3 / 10000
Epoch 1/1
running reward:  7.791  reward this episode:  -3.0
logging into: ./log20181120-142143/
episode  4 / 10000
Epoch 1/1
running reward:  7.66309  reward this episode:  -5.0
logging into: ./log20181120-142143/
episode  5 / 10000
Epoch 1/1
running reward:  7.6664591  reward this episode:  8.0
logging into: ./log20181120-142143/
episode  6 / 10000
Epoch 1/1
running reward:  7.569794509  reward this episode:  -2.0
logging into: ./log20181120-142143/
episode  7 / 10000
Epoch 1/1
running reward:  7.434096563910001  reward this episode:  -6.0
logging into: ./log20181120-142143/
episode  8 / 10000
Epoch 1/1
running reward:  7.249755598270901  reward this episode:  -11.0
logging into: ./log20181120-142143/
episode  9 / 100

episode  39 / 10000
Epoch 1/1
running reward:  3.9420616422090196  reward this episode:  -5.0
logging into: ./log20181120-142143/
episode  40 / 10000
Epoch 1/1
running reward:  3.9226410257869295  reward this episode:  2.0
logging into: ./log20181120-142143/
episode  41 / 10000
Epoch 1/1
running reward:  3.93341461552906  reward this episode:  5.0
logging into: ./log20181120-142143/
episode  42 / 10000
Epoch 1/1
running reward:  3.8440804693737696  reward this episode:  -5.0
logging into: ./log20181120-142143/
episode  43 / 10000
Epoch 1/1
running reward:  3.775639664680032  reward this episode:  -3.0
logging into: ./log20181120-142143/
episode  44 / 10000
Epoch 1/1
running reward:  3.647883268033232  reward this episode:  -9.0
logging into: ./log20181120-142143/
episode  45 / 10000
Epoch 1/1
running reward:  3.5514044353528993  reward this episode:  -6.0
logging into: ./log20181120-142143/
episode  46 / 10000
Epoch 1/1
running reward:  3.4658903909993706  reward this episode:  -5.0
lo

running reward:  1.4958111694375045  reward this episode:  -2.0
logging into: ./log20181120-142143/
episode  77 / 10000
Epoch 1/1
running reward:  1.3608530577431295  reward this episode:  -12.0
logging into: ./log20181120-142143/
episode  78 / 10000
Epoch 1/1
running reward:  1.237244527165698  reward this episode:  -11.0
logging into: ./log20181120-142143/
episode  79 / 10000
Epoch 1/1
running reward:  1.194872081894041  reward this episode:  -3.0
logging into: ./log20181120-142143/
episode  80 / 10000
Epoch 1/1
running reward:  1.1929233610751007  reward this episode:  1.0
logging into: ./log20181120-142143/
episode  81 / 10000
Epoch 1/1
running reward:  1.1709941274643496  reward this episode:  -1.0
logging into: ./log20181120-142143/
episode  82 / 10000
Epoch 1/1
running reward:  1.029284186189706  reward this episode:  -13.0
logging into: ./log20181120-142143/
episode  83 / 10000
Epoch 1/1
running reward:  0.9089913443278089  reward this episode:  -11.0
logging into: ./log2018112

running reward:  -0.8370024468002836  reward this episode:  -13.0
logging into: ./log20181120-142143/
episode  114 / 10000
Epoch 1/1
running reward:  -0.8786324223322808  reward this episode:  -5.0
logging into: ./log20181120-142143/
episode  115 / 10000
Epoch 1/1
running reward:  -0.909846098108958  reward this episode:  -4.0
logging into: ./log20181120-142143/
episode  116 / 10000
Epoch 1/1
running reward:  -0.9707476371278685  reward this episode:  -7.0
logging into: ./log20181120-142143/
episode  117 / 10000
Epoch 1/1
running reward:  -1.0810401607565898  reward this episode:  -12.0
logging into: ./log20181120-142143/
episode  118 / 10000
Epoch 1/1
running reward:  -1.120229759149024  reward this episode:  -5.0
logging into: ./log20181120-142143/
episode  119 / 10000
Epoch 1/1
running reward:  -1.0190274615575337  reward this episode:  9.0
logging into: ./log20181120-142143/
episode  120 / 10000
Epoch 1/1
running reward:  -1.0288371869419584  reward this episode:  -2.0
logging into

running reward:  -2.121728099183994  reward this episode:  -8.0
logging into: ./log20181120-142143/
episode  151 / 10000
Epoch 1/1
running reward:  -2.230510818192154  reward this episode:  -13.0
logging into: ./log20181120-142143/
episode  152 / 10000
Epoch 1/1
running reward:  -2.3382057100102323  reward this episode:  -13.0
logging into: ./log20181120-142143/
episode  153 / 10000
Epoch 1/1
running reward:  -2.26482365291013  reward this episode:  5.0
logging into: ./log20181120-142143/
episode  154 / 10000
Epoch 1/1
running reward:  -2.222175416381029  reward this episode:  2.0
logging into: ./log20181120-142143/
episode  155 / 10000
Epoch 1/1
running reward:  -2.339953662217219  reward this episode:  -14.0
logging into: ./log20181120-142143/
episode  156 / 10000
Epoch 1/1
running reward:  -2.386554125595046  reward this episode:  -7.0
logging into: ./log20181120-142143/
episode  157 / 10000
Epoch 1/1
running reward:  -2.3926885843390955  reward this episode:  -3.0
logging into: ./l

running reward:  -3.0191232945337823  reward this episode:  -13.0
logging into: ./log20181120-142143/
episode  188 / 10000
Epoch 1/1
running reward:  -3.018932061588444  reward this episode:  -3.0
logging into: ./log20181120-142143/
episode  189 / 10000
Epoch 1/1
running reward:  -3.1187427409725594  reward this episode:  -13.0
logging into: ./log20181120-142143/
episode  190 / 10000
Epoch 1/1
running reward:  -3.217555313562834  reward this episode:  -13.0
logging into: ./log20181120-142143/
episode  191 / 10000
Epoch 1/1
running reward:  -3.2553797604272052  reward this episode:  -7.0
logging into: ./log20181120-142143/
episode  192 / 10000
Epoch 1/1
running reward:  -3.232825962822933  reward this episode:  -1.0
logging into: ./log20181120-142143/
episode  193 / 10000
Epoch 1/1
running reward:  -3.2904977031947036  reward this episode:  -9.0
logging into: ./log20181120-142143/
episode  194 / 10000
Epoch 1/1
running reward:  -3.3475927261627563  reward this episode:  -9.0
logging int

In [None]:
# Evaluate the gradients of the network output with respect to every
# trainable weight, on the samples currently stored in X_train.
outputTensor = model.output #Or model.layers[index].output

# Variables the gradient is taken with respect to.
listOfVariableTensors = model.trainable_weights
#or variableTensors = model.trainable_weights[0]

gradients = K.gradients(outputTensor, listOfVariableTensors)

trainingExample = X_train
if trainingExample.size == 0:
    # X_train is emptied after every model.fit call in the training cell
    raise ValueError("X_train is empty: there are no samples to evaluate the gradients on")

# Reuse the session Keras already holds. Opening a new session and running the
# variable initializer would evaluate the gradients on freshly re-initialized
# weights instead of the weights of the trained/loaded model.
sess = K.get_session()
evaluated_gradients = sess.run(gradients, feed_dict={model.input: trainingExample})

In [None]:
# Shape of the evaluated gradient for the first entry of model.trainable_weights.
evaluated_gradients[0].shape

In [None]:
# Mean magnitude of the gradient for the first trainable weight tensor.
np.mean(np.abs(evaluated_gradients[0]))

In [None]:
# Print every stored label on its own line.
print('\n'.join(str(label) for label in Y_train))

In [None]:
# Fraction of stored labels equal to 1 (the label is 1 when UP was sampled).
np.mean(Y_train)

In [None]:
# Largest value in one arbitrary stored input row (index 42).
np.max(X_train[42])

In [None]:
# Display the stored inputs. NOTE(review): the training cell resets X_train to an
# empty array after each model.fit call, so this may be empty.
X_train

In [None]:
# Display the stored labels. NOTE(review): the training cell resets Y_train to an
# empty array after each model.fit call, so this may be empty.
Y_train