Trains an agent with (stochastic) Policy Gradients on Pong. Uses OpenAI Gym. 

In [1]:
import numpy as np
import _pickle as pickle
import gym

ImportError: No module named 'gym'

### RL
* RL is a branch of machine learning concerned with taking sequences of actions
* Usually described in terms of an agent interacting with a previously unknown environment, trying to maximize cumulative reward

### RL combined with other techniques is powerful
Policy Gradients > DQN

We'll build a 2-layer fully connected neural network that receives image pixels and outputs the probability of moving UP (the policy is stochastic: the action is sampled from this probability)

###  Policy gradients have 3 key differences from supervised learning
* We don't have the correct labels so as a fake label we substitute the action we happened to sample from the policy 
* We modulate the loss for each example multiplicatively based on the eventual outcome, since we want to increase the log probability for actions that worked and decrease it for those that didn't.
* It runs on a continuously changing dataset (the episodes), scaled by the advantage, and we only want to do one (or very few)
updates based on each sampled dataset.

In [2]:
# hyperparameters
H = 200 # number of hidden-layer neurons (rows of the first weight matrix)
batch_size = 10 # number of episodes between RMSProp parameter updates
learning_rate = 1e-4 # step size used in the RMSProp parameter update
gamma = 0.99 # discount factor for reward (i.e. later rewards are exponentially less important)
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
resume = False # resume from previous checkpoint ('save.p')?

In [4]:
# model initialization
D = 80*80 # input dimensionality: one flattened 80x80 preprocessed frame
if resume:
    # NOTE: unpickling can execute arbitrary code. Only resume from a
    # 'save.p' checkpoint that this script wrote itself.
    with open('save.p', 'rb') as checkpoint_file:
        model = pickle.load(checkpoint_file) # load from pickled checkpoint
else:
    # Random weights scaled by 1/sqrt(fan-in). The keys must be 'W1'/'W2':
    # policy_forward and policy_backward look the weights up under exactly
    # those names, and the gradient dict they return uses the same keys.
    model = {
        'W1': np.random.randn(H, D) / np.sqrt(D), # input -> hidden, shape (H, D)
        'W2': np.random.randn(H) / np.sqrt(H), # hidden -> output, shape (H,)
    }
# buffers that add up gradients over a batch, one entry per weight array
grad_buffer = {k: np.zeros_like(v) for k, v in model.items()}
# RMSProp running average of squared gradients, one entry per weight array
rmsprop_cache = {k: np.zeros_like(v) for k, v in model.items()}

In [5]:
def sigmoid(x):
    """Logistic function 1 / (1 + e^-x), applied element-wise for arrays."""
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator

In [6]:
#takes a single game frame as input. Preprocesses before feeding into model
def prepro(I):
    """Preprocess a 210x160x3 uint8 frame into a 6400 (80x80) 1D float vector.

    The frame is cropped to rows 35..194, downsampled by a factor of 2 in
    both directions, and reduced to its first colour channel. Pixels equal
    to either background value (144 or 109) or to 0 become 0.0; every other
    pixel (paddles, ball) becomes 1.0.

    The input frame is left untouched: the mask is built as a new array
    instead of being written through a view of ``I``.

    Args:
        I: image array indexed as [row, column, channel].

    Returns:
        Flattened 1D ``float64`` array of zeros and ones.
    """
    # crop + downsample + pick channel 0 in one slicing step (this is a view)
    view = I[35:195:2, ::2, 0]
    # foreground = anything that is neither background colour nor already 0
    foreground = (view != 0) & (view != 144) & (view != 109)
    # np.float was removed from NumPy; float64 is the type it used to alias
    return foreground.astype(np.float64).ravel()

In [7]:
def discount_rewards(r, discount=None):
    """Take an array of rewards and compute the discounted reward.

    Walks backwards through time, accumulating
    ``running = running * discount + r[t]``. Whenever ``r[t]`` is nonzero
    the running sum is reset first, because a nonzero reward marks a game
    boundary (Pong specific!).

    Args:
        r: NumPy array of per-step rewards.
        discount: discount factor; when omitted, the module-level ``gamma``
            is used.

    Returns:
        Array shaped like ``r`` holding the discounted reward at each step.
        Floating-point inputs keep their dtype; any other dtype is promoted
        to float64 so the fractional discounted values are not truncated.
    """
    if discount is None:
        discount = gamma
    out_dtype = r.dtype if np.issubdtype(r.dtype, np.floating) else np.float64
    discounted_r = np.zeros_like(r, dtype=out_dtype)
    running_add = 0
    for t in reversed(range(r.size)):
        # nonzero reward: reset the sum, since this was a game boundary
        if r[t] != 0:
            running_add = 0
        # https://github.com/hunkim/ReinforcementZeroToAll/issues/1
        running_add = running_add * discount + r[t]
        discounted_r[t] = running_add
    return discounted_r

In [8]:
def policy_forward(x):
    """Forward pass of the two-layer policy network for one input vector.

    Weights are read from the module-level ``model`` dict under the keys
    'W1' and 'W2'.

    Returns:
        (p, h): the sigmoid output of the network and the hidden-layer
        activations after the ReLU. The training loop samples action 2
        with probability p.
    """
    # first layer: project the input onto the hidden units; these can
    # detect various game scenarios (e.g. the ball is in the top, and our
    # paddle is in the middle)
    hidden = np.dot(model['W1'], x)
    # ReLU nonlinearity, f(x) = max(0, x): clamp negative activations to 0
    negative = hidden < 0
    hidden[negative] = 0
    # second layer: one score deciding between UP and DOWN
    logit = np.dot(model['W2'], hidden)
    # sigmoid squashes the score into a probability
    return sigmoid(logit), hidden

In [9]:
def policy_backward(eph, epdlogp):
    """Backward pass: gradients of both weight arrays for one episode.

    Args:
        eph: array of intermediate hidden states.
        epdlogp: gradient already modulated with the advantage.

    Besides its arguments this reads two module-level names: ``model`` (for
    'W2') and ``epx``, the stacked episode inputs set by the training loop.

    Returns:
        dict with keys 'W1' and 'W2' holding the respective gradients.
    """
    grad_w2 = np.dot(eph.T, epdlogp).ravel()
    # gradient w.r.t. the hidden layer: outer product of the modulated
    # gradient with the second weight array
    grad_hidden = np.outer(epdlogp, model['W2'])
    # backprop through the ReLU: no gradient where the unit was inactive
    inactive = eph <= 0
    grad_hidden[inactive] = 0
    # gradient w.r.t. the first weight matrix uses the input observations
    grad_w1 = np.dot(grad_hidden.T, epx)
    return {'W1': grad_w1, 'W2': grad_w2}

In [10]:
# environment
env = gym.make("Pong-v0")
# Each timestep the agent chooses an action and the environment returns an
# observation and a reward. reset() starts the process and returns the
# initial observation.
observation = env.reset()

# previous preprocessed frame, used in computing the difference frame
prev_x = None

# per-episode histories
xs = []      # observations
hs = []      # hidden states
dlogps = []  # action gradients
drs = []     # rewards

# book-keeping
running_reward = None  # running mean of episode rewards (None until the first episode ends)
reward_sum = 0         # sum of rewards in the current episode
episode_number = 0     # where are we?

NameError: name 'gym' is not defined

In [None]:
# begin training
while True:
    # We want the policy network to detect motion, so it is fed the
    # difference between the current and the previous preprocessed frame
    # (all zeros on the first frame of an episode).
    cur_x = prepro(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(D)
    prev_x = cur_x

    # forward the policy network and sample an action from the returned probability
    aprob, h = policy_forward(x)
    # This is the stochastic part. Sampling happens outside the model, so
    # the model itself stays easily differentiable; if sampling were part of
    # the model we would need a reparametrization trick (a la variational
    # autoencoders).
    action = 2 if np.random.uniform() < aprob else 3

    xs.append(x) # observation
    hs.append(h) # hidden state
    y = 1 if action == 2 else 0 # a "fake label"
    dlogps.append(y - aprob) # grad that encourages the action that was taken to be taken

    # step the environment and get new measurements
    env.render()
    observation, reward, done, info = env.step(action)
    reward_sum += reward

    drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

    if done: # an episode finished
        episode_number += 1

        # stack together all inputs, hidden states, action gradients, and rewards for this episode
        # (each episode is a few dozen games)
        epx = np.vstack(xs) # observations; also read by policy_backward as a global
        eph = np.vstack(hs) # hidden states
        epdlogp = np.vstack(dlogps) # action gradients
        epr = np.vstack(drs) # rewards
        xs, hs, dlogps, drs = [], [], [], [] # reset array memory

        # The strength with which we encourage a sampled action is the
        # weighted sum of all rewards afterwards, with later rewards being
        # exponentially less important.
        # compute the discounted reward backwards through time
        discounted_epr = discount_rewards(epr)
        # standardize the rewards to be unit normal (helps control the gradient estimator variance)
        discounted_epr -= np.mean(discounted_epr)
        discounted_epr /= np.std(discounted_epr)

        # advantage: how good the action is compared to the average of all actions
        epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
        grad = policy_backward(eph, epdlogp)
        for k in model:
            grad_buffer[k] += grad[k] # accumulate grad over batch

        # perform rmsprop parameter update every batch_size episodes
        # http://68.media.tumblr.com/2d50e380d8e943afdfd66554d70a84a1/tumblr_inline_o4gfjnL2xK1toi3ym_500.png
        if episode_number % batch_size == 0:
            # dict.items() replaces the Python 2-only iteritems()
            for k, v in model.items():
                g = grad_buffer[k] # gradient
                rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
                # "+=" because this is gradient ascent on the expected reward
                model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
                grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

        # boring book-keeping
        running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
        print('resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward))
        if episode_number % 100 == 0:
            # context manager so the checkpoint is flushed and closed right away
            with open('save.p', 'wb') as checkpoint_file:
                pickle.dump(model, checkpoint_file)
        reward_sum = 0
        observation = env.reset() # reset env
        prev_x = None

    if reward != 0: # Pong has either +1 or -1 reward exactly when game ends.
        # Build the whole message before printing: print() returns None, so
        # appending the suffix to its result raises TypeError on Python 3.
        print(('ep %d: game finished, reward: %f' % (episode_number, reward)) + ('' if reward == -1 else ' !!!!!!!!'))