In [10]:
# import necessary modules from keras
from keras.layers import Dense
from keras.models import Sequential

# creates a generic neural network architecture
model = Sequential()

# hidden layer takes a pre-processed frame as input, and has 200 units
model.add(Dense(units=200,input_dim=80*80, activation='relu', kernel_initializer='glorot_uniform'))

# output layer
model.add(Dense(units=1, activation='sigmoid', kernel_initializer='RandomNormal'))

# compile the model using traditional Machine Learning losses and optimizers
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [11]:
# preprocessing used by Karpathy (cf. https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5)
def prepro(I):
  """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """
  I = I[35:195] # crop
  I = I[::2,::2,0] # downsample by factor of 2
  I[I == 144] = 0 # erase background (background type 1)
  I[I == 109] = 0 # erase background (background type 2)
  I[I != 0] = 1 # everything else (paddles, ball) just set to 1
  return I.astype(np.float).ravel()

In [12]:
# reward discount used by Karpathy (cf. https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5)
def discount_rewards(r):
  """ take 1D float array of rewards and compute discounted reward """
  discounted_r = np.zeros_like(r)
  running_add = 0
  # we go from last reward to first one so we don't have to do exponentiations
  for t in reversed(range(0, r.size)):
    if r[t] != 0: running_add = 0 # if the game ended (in Pong), reset the reward sum
    running_add = running_add * gamma + r[t] # the point here is to use Horner's method to compute those rewards efficiently
    discounted_r[t] = running_add
  return discounted_r

In [32]:
# gym initialization
import gym, numpy as np
env = gym.make("Pong-v0")
observation = env.reset()

prev_input = None # what we'll use for the difference between two frames
x_train, y_train, rewards = np.array([]),np.array([]),np.array([]) # initialize arrays
reward_sum = 0
UP_ACTION = 2
DOWN_ACTION = 3
episode_nb = 0

# main loop
while (True):
    
    # preprocess the observation, set input as difference between images
    cur_input = prepro(observation)
    x = cur_input - prev_input if prev_input is not None else np.zeros(80 * 80)
    prev_input = cur_input
    
    # forward the policy network and sample action according to the proba distribution
    proba = model.predict(np.expand_dims(x, axis=1).T)
    action = UP_ACTION if np.random.uniform() < proba else DOWN_ACTION
    y = 1 if action == 2 else 0 # 0 and 1 are our labels

    # log the input and label to train later
    np.append(x_train,x)
    np.append(y_train, y)
    
    # do one step in our environment
    observation, reward, done, info = env.step(action)
    np.append(rewards, reward)
    reward_sum += reward
    
    # end of an episode
    if done:
        #increment episode number
        episode_nb += 1
        
        # compute the discounted rewards and normalize it to control variance
        discounted_rewards_ep = discount_rewards(rewards)
        discounted_rewards_ep -= np.mean(discounted_rewards_ep)
        discounted_rewards_ep /= np.std(discounted_rewards_ep)

        #Y_train = np.append(Y_train, y_train_ep)
        #X_train = x_train_ep if (X_train.size == 0) else np.vstack((X_train, x_train_ep))
        #discounted_rewards = np.append(discounted_rewards, discounted_rewards_ep)

        #if episode_nb % batch_size == 0:
        model.fit(x=x_train, y=y_train, epochs=1, verbose=1, sample_weight=discounted_rewards_ep)
        #X_train, Y_train, discounted_rewards = np.array([]),np.array([]), np.array([])
        
        #printing
        print('At the end of episode', episode_nb, 'the total reward was :', reward_sum)
                                                     
        # Reinitialization
        x_train, y_train, rewards = np.array([]),np.array([]),np.array([])
        observation = env.reset()
        reward_sum = 0
        # The new episode must not depend of the previous frame (from last episode)                                             
        prev_input = None 

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)


ValueError: Error when checking input: expected dense_3_input to have shape (6400,) but got array with shape (1,)

In [27]:
np.appen

[]