In [10]:
# credit: Karpathy. https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5
# normal imports
import gym
import time
import keras
import numpy as np
from keras.layers import Dense
from keras.models import Sequential
#from keras.models import load_model
#from keras.utils import plot_model
#from keras import regularizers
#from keras import optimizers
import os.path
import keras.backend as K
import tensorflow as tf
import math
from easy_tf_log import tflog
from datetime import datetime
import shutil
import os

In [11]:
# model initialization
def initialization(nb_hidden_layer_neurons, input_dimensionality):
    """Build and compile the two-layer policy network.

    Args:
        nb_hidden_layer_neurons: number of units in the single hidden layer.
        input_dimensionality: length of the flat input vector fed to the network.

    Returns:
        A compiled Keras ``Sequential`` model: one ReLU hidden layer followed
        by a single sigmoid output unit, trained with binary cross-entropy
        and the Adam optimizer.
    """
    hidden_layer = Dense(
        units=nb_hidden_layer_neurons,
        input_dim=input_dimensionality,
        activation='relu',
        kernel_initializer='glorot_uniform',
    )
    # Single sigmoid unit: the main loop compares its output with a uniform
    # random draw to choose between the two actions.
    output_layer = Dense(
        units=1,
        activation='sigmoid',
        kernel_initializer='RandomNormal',
    )

    network = Sequential([hidden_layer, output_layer])
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

In [12]:
# Build a first policy network: 200 hidden units over a flattened 80x80 input.
model = initialization(
    nb_hidden_layer_neurons=200,
    input_dimensionality=80 * 80,
)

In [13]:
# preprocessing used by Karpathy (cf. https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5)
def prepro(I):
  """Preprocess a 210x160x3 uint8 frame into a 6400 (80x80) 1D float vector.

  Args:
    I: frame as an array indexable as [row, column, channel]; rows 35..194
       and channel 0 are the only parts that are read.

  Returns:
    A flat float64 array of length 6400 (for a 210x160x3 input) holding 0.0
    for background pixels and 1.0 for everything else (paddles, ball).

  The input frame is left untouched.
  """
  # Crop rows 35..194 and keep every second pixel of channel 0 in one slice.
  # The explicit copy matters: basic slicing returns a view, so without it
  # the masking below would overwrite the caller's frame in place.
  I = I[35:195:2, ::2, 0].copy()
  I[I == 144] = 0 # erase background (background type 1)
  I[I == 109] = 0 # erase background (background type 2)
  I[I != 0] = 1 # everything else (paddles, ball) just set to 1
  # `np.float` was only an alias of the builtin `float` and was removed in
  # NumPy 1.24; the builtin gives the same float64 result on every version.
  return I.astype(float).ravel()

In [34]:
# reward discount used by Karpathy (cf. https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5)
def discount_rewards(r, discount=None):
  """Compute normalized discounted returns from a sequence of rewards.

  Args:
    r: 1D sequence (list or array) of per-step rewards.
    discount: discount factor applied per step. When omitted, the
      module-level ``gamma`` hyperparameter is used.

  Returns:
    A float64 array with the same shape as ``r`` holding the discounted
    return of every step, shifted to zero mean and, when the returns are not
    all identical, scaled to unit standard deviation. An empty input yields
    an empty array.
  """
  if discount is None:
    discount = gamma  # hyperparameter defined at module level

  # Force a float dtype: with integer rewards `zeros_like` would otherwise
  # create an integer array and the discounted values would be truncated.
  r = np.asarray(r, dtype=np.float64)
  discounted_r = np.zeros_like(r)
  if r.size == 0:
    return discounted_r

  running_add = 0.0
  # we go from last reward to first one so we don't have to do exponentiations
  for t in reversed(range(r.size)):
    if r[t] != 0:
      running_add = 0.0 # a point was scored (in Pong): reset the reward sum
    # Horner's method: one multiply-add per step instead of powers of the discount
    running_add = running_add * discount + r[t]
    discounted_r[t] = running_add

  # Normalize once, after every return has been computed. Doing this inside
  # the loop would rescale the partially filled array at every step.
  discounted_r -= np.mean(discounted_r)
  spread = np.std(discounted_r)
  if spread > 0: # all-equal returns: keep the zeros, avoid dividing by zero
    discounted_r /= spread
  return discounted_r

In [15]:
# Global settings read by the training cells.

# -- network shape --
input_dimensionality = 80 * 80  # length of the flattened 80x80 input grid
nb_hidden_layer_neurons = 200   # units in the hidden layer

# -- training schedule --
nb_episodes = 10000  # total number of episodes to play
batch_size = 1       # episodes played before a parameter update
gamma = 0.99         # reward discount factor

# -- run switches --
resume, render = True, False  # reload a previous checkpoint? / draw the game?

# -- action codes sent to the environment --
UP_ACTION, DOWN_ACTION = 2, 3

In [35]:
# Training script: plays Pong episodes with the policy network and, at the end
# of every episode, fits the network on the (frame difference, sampled action)
# pairs of that episode, weighted by the normalized discounted rewards.

# cleaning
if (os.path.exists('./logs')):
    shutil.rmtree('./logs')
os.mkdir('./logs')

now = datetime.now()

# log directory
# NOTE(review): the directory cleaned above is './logs', but rewards are logged
# into './log<timestamp>/' (no 's', no separator). Left unchanged so existing
# log locations stay valid -- confirm which of the two was intended.
log_dir = './log' + now.strftime("%Y%m%d-%H%M%S") + "/"

# checkpointing: same file that `resume` reloads below
checkpoint_path = 'my_model_weights.h5'
checkpoint_every = 10 # save the weights every N finished episodes

# model initialization
model = initialization(nb_hidden_layer_neurons, input_dimensionality)
# gym initialization
env = gym.make("Pong-v0")
observation = env.reset()
prev_input = None # what we'll use for the difference between two frames
x_train, y_train, rewards = [],[],[] # per-episode buffers
running_reward = None # exponentially weighted average of the rewards (per episode)
reward_sum = 0
episode_nb = 0

X_train, Y_train, discounted_rewards = np.array([]),np.array([]), np.array([])

# load pre-trained model if exist
if (resume and os.path.isfile(checkpoint_path)):
    print("loading previous weights")
    model.load_weights(checkpoint_path)

# add a callback tensorboard object to visualize learning
# (created here but not passed to `model.fit` below)
tbCallBack = keras.callbacks.TensorBoard(log_dir='./Graph' + now.strftime("%Y%m%d-%H%M%S") + "/", histogram_freq=0,
          write_graph=True, write_images=True)

# main loop
try:
    while (episode_nb < nb_episodes):
        if render:
            time.sleep(0.001)
            env.render()

        # preprocess the observation, set input as difference between images
        cur_input = prepro(observation)
        x = cur_input - prev_input if prev_input is not None else np.zeros(input_dimensionality)
        prev_input = cur_input

        # forward the policy network and sample action according to the proba distribution
        # (the model expects a batch, so feed a single-row 2D array and unwrap the scalar)
        proba = float(model.predict(x.reshape(1, -1))[0, 0])
        action = UP_ACTION if np.random.uniform() < proba else DOWN_ACTION # only two actions, up and down, encoded by 2 and 3 in the gym env
        y = 1 if action == UP_ACTION else 0 # we will use our sampled action as a "label" for training later

        # log the input and label to train later
        x_train.append(x)
        y_train.append(y)

        # do one step in our environment
        observation, reward, done, info = env.step(action)
        reward_sum += reward
        rewards.append(reward)

        if done: # end of an episode
            print("episode ", episode_nb + 1, "/", nb_episodes)
            episode_nb += 1

            # Weight every (input, label) pair by its normalized discounted reward.
            discounted_rewards = discount_rewards(rewards)
            model.fit(x=np.vstack(x_train), y=np.vstack(y_train), verbose=1, sample_weight=discounted_rewards)

            # Persist the weights so that `resume` has something to reload.
            if episode_nb % checkpoint_every == 0:
                model.save_weights(checkpoint_path)

            # Log the reward
            running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
            print("running reward: ", running_reward, " reward this episode: ", reward_sum)
            print("logging into:", log_dir)
            tflog('running_reward', running_reward, custom_dir=log_dir)
            tflog('reward_sum', reward_sum, custom_dir=log_dir)

            # Reinitialization
            # Drop the finished episode's samples: they were generated by the
            # pre-update policy, and keeping them would retrain on every past
            # episode while the buffers grow without bound.
            x_train, y_train, rewards = [], [], []
            observation = env.reset() # resetting our env
            reward_sum = 0
            prev_input = None # The new episode must not depend on the previous frame (from last episode)
finally:
    # Release the emulator even when training is interrupted (e.g. Ctrl-C).
    env.close()
        


loading previous weights
episode  1 / 10000
Epoch 1/1
running reward:  -5.0  reward this episode:  -5.0
logging into: ./log20181125-153004/
episode  2 / 10000
Epoch 1/1

KeyboardInterrupt: 

In [21]:
y_train

[]

In [25]:
discounted_rewards

array([[-0.53005844],
       [-0.54073955],
       [-0.55152856],
       ...,
       [-1.83547801],
       [-1.85934518],
       [-1.88345343]])