In [None]:
# gym import and environment setting
import gym
env = gym.make('BreakoutDeterministic-v4')

from IPython import display
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

# Random-agent demo: play one episode with uniformly sampled actions and
# redraw the rendered frame in place after every step.
env.reset()
img = plt.imshow(env.render(mode='rgb_array'))  # create the image artist once, then only update its data
done = False

while True:
    # Push the current frame into the existing artist and refresh the cell output.
    img.set_data(env.render(mode='rgb_array'))
    display.display(plt.gcf())
    display.clear_output(wait=True)

    # Take a random action; only `done` is consulted afterwards.
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    if done:
        break

In [None]:
# initializing, hyperparameters
#TOTAL INITIALIZING

#architecture and functions framework based on work on Pong, see: https://medium.com/@dhruvp/how-to-write-a-neural-network-to-play-pong-from-scratch-956b57d4f6e0

# --- Hyperparameters -------------------------------------------------------
batch_size = 10                  # episodes (games) accumulated before each weight update
gamma = 0.99                     # discount factor applied to delayed rewards
decay_rate = 0.99                # decay of the RMSProp running average of squared gradients
num_hidden_layer_neurons = 200   # width of the single hidden layer

input_dimensions = 80 * 80       # length of the flattened, preprocessed frame fed to the network
learning_rate = 1e-4             # step size of the weight update

# --- Network parameters ----------------------------------------------------
# The key names the layer. Each matrix is drawn from a standard normal and
# scaled by 1/sqrt(n), with n the value passed to np.sqrt below.
weights = {
    '1': np.random.randn(num_hidden_layer_neurons, input_dimensions) / np.sqrt(input_dimensions),
    '2': np.random.randn(num_hidden_layer_neurons) / np.sqrt(num_hidden_layer_neurons),
}

# --- RMSProp state ---------------------------------------------------------
# See http://sebastianruder.com/optimizing-gradient-descent/index.html#rmsprop
# expectation_g_squared: running average of squared gradients, one array per layer.
# g_dict: gradient accumulated over the current batch, one array per layer.
expectation_g_squared = {name: np.zeros_like(layer) for name, layer in weights.items()}
g_dict = {name: np.zeros_like(layer) for name, layer in weights.items()}

# --- Per-episode buffers ---------------------------------------------------
# These lists are filled frame by frame while an episode is being played.
episode_hidden_layer_values = []
episode_observations = []
episode_gradient_log_ps = []
episode_rewards = []

def downsample(image):
    """Halve the resolution of the first two axes.

    Keeps every second row and every second column; all entries along the
    third axis are kept unchanged.
    """
    every_other = slice(None, None, 2)
    return image[every_other, every_other, :]

def remove_color(image):
    """Reduce a 3-D image to 2-D by keeping only index 0 of the last axis.

    Note: this is not a luminance/grey conversion; the remaining channels
    are simply dropped.
    """
    kept_channel = 0
    return image[:, :, kept_channel]

def preprocess_observations(input_observation, prev_processed_observation, input_dimensions):
    """Turn a raw frame into the difference vector fed to the network.

    The frame is cropped to rows 35..194, reduced to every second row and
    column, stripped to a single channel and flattened to a float64 vector.
    The network input is the difference between this vector and the one
    from the previous frame, so only changes in the game are seen.

    Parameters
    ----------
    input_observation : array
        Raw frame (the original comment describes it as 210x160x3 uint8).
    prev_processed_observation : array or None
        Flattened vector returned for the previous frame, or None on the
        first frame of an episode.
    input_dimensions : int
        Length of the zero vector returned when there is no previous frame.

    Returns
    -------
    (difference, processed_observation)
        `difference` is the network input; `processed_observation` must be
        passed back as `prev_processed_observation` on the next call.
    """
    cropped = input_observation[35:195]  # crop
    processed_observation = remove_color(downsample(cropped))

    # Flatten the matrix. `np.float` was removed in NumPy 1.24; it was an
    # alias of the builtin float, i.e. float64.
    processed_observation = processed_observation.astype(np.float64).ravel()

    # Frame-by-frame subtraction so that only changes in the game are processed.
    if prev_processed_observation is not None:
        difference = processed_observation - prev_processed_observation
    else:
        difference = np.zeros(input_dimensions)

    # The current processed frame is handed back so the caller can subtract
    # it from the next one.
    return difference, processed_observation

# Neurons activation functions / action potentials
def sigmoid(x):
    """Logistic function of a scalar, written to avoid overflow.

    Only the exponential of a non-positive number is ever taken:
    exp(-x) for x >= 0 and exp(x) otherwise.
    """
    decayed = np.exp(-abs(x))
    return 1.0 / (1.0 + decayed) if x >= 0 else decayed / (1.0 + decayed)

def relu(vector):
    """Rectified linear unit, applied in place.

    Every negative entry of `vector` is overwritten with 0 and the very same
    array object is returned (the argument is modified).
    """
    np.putmask(vector, vector < 0, 0)
    return vector

def apply_neural_nets(observation_matrix, weights):
    """Forward pass of the two-layer network.

    The observation is multiplied by weights['1'] and rectified to give the
    hidden activations; their dot product with weights['2'] is squashed by
    the sigmoid to give the output value.

    Returns
    -------
    (hidden_layer_values, output_layer_values)
    """
    hidden_layer_values = relu(np.dot(weights['1'], observation_matrix))
    output_layer_values = sigmoid(np.dot(hidden_layer_values, weights['2']))
    return hidden_layer_values, output_layer_values

def choose_action(probability):
    """Sample an action: 2 with the given probability, otherwise 3.

    In the OpenAI Gym environment used here, 2 means right and 3 means left.
    """
    draw = np.random.uniform()
    return 2 if draw <= probability else 3
    
def compute_gradient(gradient_log_p, hidden_layer_values, observation_values, weights):
    """Backpropagate the output-layer error of one episode through the network.

    See http://neuralnetworksanddeeplearning.com/chap2.html

    Parameters
    ----------
    gradient_log_p : array
        Per-frame error signal at the output, one row per frame.
    hidden_layer_values : array
        Hidden activations recorded during the forward pass, one row per frame.
    observation_values : array
        Network inputs recorded during the forward pass, one row per frame.
    weights : dict
        Layer weights; only weights['2'] is read.

    Returns
    -------
    dict
        Gradients keyed like `weights`: '1' for the first layer, '2' for the
        second.
    """
    delta_L = gradient_log_p
    dC_dw2 = np.dot(hidden_layer_values.T, delta_L).ravel()

    # Error propagated back to the hidden layer, one row per frame.
    delta_l2 = np.outer(delta_L, weights['2'])

    # Derivative of the ReLU: the error passes only through units that were
    # active in the forward pass. The mask therefore comes from the recorded
    # activations, not from the sign of the error itself (rectifying the
    # error would throw away every negative gradient component).
    delta_l2[hidden_layer_values <= 0] = 0

    dC_dw1 = np.dot(delta_l2.T, observation_values)
    return {
        '1': dC_dw1,
        '2': dC_dw2
    }

def update_weights(weights, expectation_g_squared, g_dict, decay_rate, learning_rate):
    """Apply one RMSProp step to every layer and clear the batch gradient.

    See http://sebastianruder.com/optimizing-gradient-descent/index.html#rmsprop

    All three dictionaries are updated in place: the running average of the
    squared gradient is refreshed, the scaled gradient is added to the
    weights (the arrays themselves are modified), and the accumulated
    gradient of the layer is replaced by zeros.
    """
    epsilon = 1e-5  # keeps the denominator away from zero
    for name in list(weights):
        grad = g_dict[name]
        mean_square = decay_rate * expectation_g_squared[name] + (1 - decay_rate) * grad**2
        expectation_g_squared[name] = mean_square
        weights[name] += (learning_rate * grad) / np.sqrt(mean_square + epsilon)
        # start the next batch from an empty gradient buffer
        g_dict[name] = np.zeros_like(weights[name])

def discount_rewards(rewards, gamma):
    """Compute the discounted return for every time step of an episode.

    Walking backwards through the episode, each entry becomes its own reward
    plus `gamma` times the discounted return of the following step, so
    actions taken long before a reward receive less credit for it.

    Parameters
    ----------
    rewards : array-like
        Per-step rewards, indexed by time step along the first axis.
    gamma : float
        Discount factor.

    Returns
    -------
    numpy.ndarray
        Array with the shape of `rewards`. Floating-point inputs keep their
        dtype; any other input is promoted to float64 so that the fractional
        returns are not truncated to integers.
    """
    rewards = np.asarray(rewards)
    if np.issubdtype(rewards.dtype, np.inexact):
        result_dtype = rewards.dtype
    else:
        # e.g. integer rewards: an integer buffer would silently drop the fractions
        result_dtype = np.float64
    discounted_rewards = np.zeros_like(rewards, dtype=result_dtype)
    running_add = 0
    for t in reversed(range(0, rewards.size)):
        running_add = running_add * gamma + rewards[t]
        discounted_rewards[t] = running_add
    return discounted_rewards

def discount_with_rewards(gradient_log_p, episode_rewards, gamma):
    """Weight the per-frame gradient by the standardized discounted rewards.

    The rewards are discounted with `discount_rewards`, shifted to zero mean
    and divided by their standard deviation; this standardization helps to
    balance the gradient. When the standard deviation is exactly zero the
    centred values are divided by the number of entries instead.
    """
    returns = discount_rewards(episode_rewards, gamma)
    returns -= np.mean(returns)
    spread = np.std(returns)
    if spread == 0:
        returns /= returns.size
    else:
        returns /= spread
    return gradient_log_p * returns


In [None]:
# Training phase 1 - teaching to hit ------------------------------------------------------- 1
# Reward shaping: every positive game reward is clipped to 1; nothing is
# given for surviving and nothing is taken away for losing a life.

observation = env.reset()  # the returned frame is the first network input
img = plt.imshow(env.render(mode='rgb_array'))

episode_number = 0
reward_sum = 0
running_reward = None
prev_processed_observations = None
done = False
first = True   # when True the next action is forced to 1 (FIRE in Breakout's action set)
lives = 5
learning_record = []   # mean episode reward of every finished batch

# Start the phase with an empty batch gradient, so that a partially filled
# buffer left behind by an earlier (possibly interrupted) run is not applied.
for layer_name in g_dict:
    g_dict[layer_name] = np.zeros_like(weights[layer_name])

###############################################
# Per-episode buffers holding what the weight update needs
episode_hidden_layer_values, episode_observations, episode_gradient_log_ps, episode_rewards = [], [], [], []

# Exactly 1000 episodes. With `<=` a 1001st episode was played whose gradient
# was never applied in this phase and stayed in g_dict for the next one.
while episode_number < 1000:

    processed_observations, prev_processed_observations = preprocess_observations(observation, prev_processed_observations, input_dimensions)
    hidden_layer_values, move_probability = apply_neural_nets(processed_observations, weights)

    episode_observations.append(processed_observations)
    episode_hidden_layer_values.append(hidden_layer_values)

    action = choose_action(move_probability)
    if first:
        # forced action at the start of the game and after every lost life
        action = 1
        first = False
    observation, reward, done, info = env.step(action)  # carry out the chosen action

    if reward > 0:
        reward = 1
    if info['ale.lives'] < lives:
        lives = info['ale.lives']
        first = True

    reward_sum += reward
    episode_rewards.append(reward)

    # Treat the action actually taken as if it were the correct label.
    # NOTE(review): a forced action 1 is recorded with label 0, exactly like a sampled 3.
    fake_label = 1 if action == 2 else 0
    loss_function_gradient = fake_label - move_probability
    episode_gradient_log_ps.append(loss_function_gradient)

    if done:  # an episode, i.e. a game of 5 lives, has finished
        episode_number += 1

        # Stack the per-frame values of this episode, one row per frame
        episode_hidden_layer_values = np.vstack(episode_hidden_layer_values)
        episode_observations = np.vstack(episode_observations)
        episode_gradient_log_ps = np.vstack(episode_gradient_log_ps)
        episode_rewards = np.vstack(episode_rewards)

        # Weight the gradient by the discounted, standardized rewards
        episode_gradient_log_ps_discounted = discount_with_rewards(episode_gradient_log_ps, episode_rewards, gamma)

        gradient = compute_gradient(
            episode_gradient_log_ps_discounted,
            episode_hidden_layer_values,
            episode_observations,
            weights
        )

        # Accumulate the gradient until the batch is complete
        for layer_name in gradient:
            g_dict[layer_name] += gradient[layer_name]

        # Sum of the episode rewards within the current batch
        running_reward = reward_sum if running_reward is None else (running_reward+reward_sum)

        if (episode_number % batch_size) == 0:
            update_weights(weights, expectation_g_squared, g_dict, decay_rate, learning_rate)

            print('resetting env. episode reward total was %f. batch mean: %f' % (reward_sum, running_reward/batch_size))
            print('episode number %f' %episode_number)
            learning_record.append(running_reward/batch_size)
            running_reward = None

        # Reset all per-episode state for the next game
        episode_hidden_layer_values, episode_observations, episode_gradient_log_ps, episode_rewards = [], [], [], []
        observation = env.reset()  # reset env
        reward_sum = 0
        prev_processed_observations = None
        first = True
        lives = 5
        

In [None]:
# Training phase 2 - teaching not to miss -------------------------------------------------- 2
# Reward shaping: game points are ignored, every step earns 0.01 and the
# step on which a life is lost is rewarded with -1.

observation = env.reset()  # the returned frame is the first network input
img = plt.imshow(env.render(mode='rgb_array'))  # only call this once

episode_number = 0
reward_sum = 0
running_reward = None
prev_processed_observations = None
done = False
first = True   # when True the next action is forced to 1 (FIRE in Breakout's action set)
lives = 5

# Start the phase with an empty batch gradient, so that a gradient left in
# the buffer by a previous phase (computed under a different reward scheme)
# or by an interrupted run is not mixed into the first update.
for layer_name in g_dict:
    g_dict[layer_name] = np.zeros_like(weights[layer_name])

# Per-episode buffers holding what the weight update needs
episode_hidden_layer_values, episode_observations, episode_gradient_log_ps, episode_rewards = [], [], [], []

# Exactly 1000 episodes. With `<=` a 1001st episode was played whose gradient
# was never applied in this phase and stayed in g_dict afterwards.
while episode_number < 1000:

    processed_observations, prev_processed_observations = preprocess_observations(observation, prev_processed_observations, input_dimensions)
    hidden_layer_values, move_probability = apply_neural_nets(processed_observations, weights)

    episode_observations.append(processed_observations)
    episode_hidden_layer_values.append(hidden_layer_values)

    action = choose_action(move_probability)
    if first:
        # forced action at the start of the game and after every lost life
        action = 1
        first = False
    observation, reward, done, info = env.step(action)

    if reward > 0:
        reward = 0          # points for hitting are not rewarded in this phase
    reward += 0.01          # small reward for every step survived
    if info['ale.lives'] < lives:
        lives = info['ale.lives']
        reward = -1         # punishment for missing the ball
        first = True

    reward_sum += reward
    episode_rewards.append(reward)

    # Treat the action actually taken as if it were the correct label.
    # NOTE(review): a forced action 1 is recorded with label 0, exactly like a sampled 3.
    fake_label = 1 if action == 2 else 0
    loss_function_gradient = fake_label - move_probability
    episode_gradient_log_ps.append(loss_function_gradient)

    if done:  # the game has finished
        episode_number += 1

        # Stack the per-frame values of this episode, one row per frame
        episode_hidden_layer_values = np.vstack(episode_hidden_layer_values)
        episode_observations = np.vstack(episode_observations)
        episode_gradient_log_ps = np.vstack(episode_gradient_log_ps)
        episode_rewards = np.vstack(episode_rewards)

        # Weight the gradient by the discounted, standardized rewards
        episode_gradient_log_ps_discounted = discount_with_rewards(episode_gradient_log_ps, episode_rewards, gamma)

        gradient = compute_gradient(
            episode_gradient_log_ps_discounted,
            episode_hidden_layer_values,
            episode_observations,
            weights
        )

        # Accumulate the gradient until the batch is complete
        for layer_name in gradient:
            g_dict[layer_name] += gradient[layer_name]

        # Sum of the episode rewards within the current batch
        running_reward = reward_sum if running_reward is None else (running_reward+reward_sum)

        if (episode_number % batch_size) == 0:
            update_weights(weights, expectation_g_squared, g_dict, decay_rate, learning_rate)
            print('resetting env. episode reward total was %f. batch mean: %f' % (reward_sum, running_reward/batch_size))
            print('episode number %f' %episode_number)
            # learning_record is created by the phase-1 cell and extended here
            learning_record.append(running_reward/batch_size)
            running_reward = None

        # Reset all per-episode state for the next game
        episode_hidden_layer_values, episode_observations, episode_gradient_log_ps, episode_rewards = [], [], [], []
        observation = env.reset()
        reward_sum = 0
        prev_processed_observations = None
        first = True
        lives = 5
     

In [None]:
# Training phase 3 - compound tuning and refinement ---------------------------------------- 3
# Reward shaping: the first scoring step of a life earns 2, each further
# scoring step earns (combo + 1) * 2 with a growing combo counter; losing a
# life costs -1, or -2 if points had been scored during that life.

observation = env.reset()  # the returned frame is the first network input

batch_size = 10
learning_rate = 1e-5   # smaller step than in the earlier phases
episode_number = 0
reward_sum = 0
running_reward = None
prev_processed_observations = None
done = False
first = True   # when True the next action is forced to 1 (FIRE in Breakout's action set)
lives = 5
combo = 0              # scoring steps after the first one within the current life
max_combo = 0          # longest combo recorded in the current batch
hit = False            # True once points have been scored in the current life
episode_streaks = []   # combos recorded at each lost life of the current episode

# Start the phase with an empty batch gradient, so that a gradient left in
# the buffer by a previous phase (computed under a different reward scheme)
# or by an interrupted run is not mixed into the first update.
for layer_name in g_dict:
    g_dict[layer_name] = np.zeros_like(weights[layer_name])

# Per-episode buffers holding what the weight update needs
episode_hidden_layer_values, episode_observations, episode_gradient_log_ps, episode_rewards = [], [], [], []

# Exactly 200 episodes. With `<=` a 201st episode was played whose gradient
# was never applied to the weights.
while episode_number < 200:
    processed_observations, prev_processed_observations = preprocess_observations(observation, prev_processed_observations, input_dimensions)
    hidden_layer_values, move_probability = apply_neural_nets(processed_observations, weights)

    episode_observations.append(processed_observations)
    episode_hidden_layer_values.append(hidden_layer_values)

    action = choose_action(move_probability)
    if first:
        # forced action at the start of the game and after every lost life
        action = 1
        first = False
    observation, reward, done, info = env.step(action)

    if reward > 0 and not hit:
        reward = 2
        hit = True
    elif reward > 0 and hit:
        combo += 1
        reward = (combo+1)*2
    if info['ale.lives'] < lives:
        lives = info['ale.lives']
        reward = -1
        if hit:
            # losing the ball after having scored is punished harder
            reward = -2
            hit = False
            episode_streaks.append(combo)
            combo = 0
        first = True

    reward_sum += reward
    episode_rewards.append(reward)

    # Treat the action actually taken as if it were the correct label.
    # NOTE(review): a forced action 1 is recorded with label 0, exactly like a sampled 3.
    fake_label = 1 if action == 2 else 0
    loss_function_gradient = fake_label - move_probability
    episode_gradient_log_ps.append(loss_function_gradient)

    if done:  # the game has finished
        episode_number += 1

        # Stack the per-frame values of this episode, one row per frame
        episode_hidden_layer_values = np.vstack(episode_hidden_layer_values)
        episode_observations = np.vstack(episode_observations)
        episode_gradient_log_ps = np.vstack(episode_gradient_log_ps)
        episode_rewards = np.vstack(episode_rewards)

        # Weight the gradient by the discounted, standardized rewards
        episode_gradient_log_ps_discounted = discount_with_rewards(episode_gradient_log_ps, episode_rewards, gamma)

        gradient = compute_gradient(
            episode_gradient_log_ps_discounted,
            episode_hidden_layer_values,
            episode_observations,
            weights
        )

        # Accumulate the gradient until the batch is complete
        for layer_name in gradient:
            g_dict[layer_name] += gradient[layer_name]

        # Sum of the episode rewards within the current batch
        running_reward = reward_sum if running_reward is None else (running_reward+reward_sum)

        if episode_streaks and max_combo < max(episode_streaks):
            max_combo = max(episode_streaks)

        if (episode_number % batch_size) == 0:
            update_weights(weights, expectation_g_squared, g_dict, decay_rate, learning_rate)
            print('resetting env. episode reward total was %f. batch mean: %f' % (reward_sum, running_reward/batch_size))
            print('episode number %f' %episode_number)
            print('longest combo of current batch %f' %max_combo)
            # learning_record is created by the phase-1 cell and extended here
            learning_record.append(running_reward/batch_size)
            max_combo = 0
            running_reward = None

        # Reset all per-episode state for the next game
        episode_streaks = []
        episode_hidden_layer_values, episode_observations, episode_gradient_log_ps, episode_rewards = [], [], [], []
        observation = env.reset()
        reward_sum = 0
        prev_processed_observations = None
        first = True
        lives = 5
        hit = False
        combo = 0  # a combo must not carry over into the next game


In [None]:
# play: watch the current policy play one rendered game (no learning takes place)

observation = env.reset()  # the returned frame is the first network input
img = plt.imshow(env.render(mode='rgb_array'))  # only call this once

prev_processed_observations = None
done = False
first = True   # when True the next action is forced to 1 (FIRE in Breakout's action set)
lives = 5

while not done:
    # redraw the current frame in place
    img.set_data(env.render(mode='rgb_array'))
    display.display(plt.gcf())
    display.clear_output(wait=True)

    processed_observations, prev_processed_observations = preprocess_observations(observation, prev_processed_observations, input_dimensions)
    hidden_layer_values, move_probability = apply_neural_nets(processed_observations, weights)

    # Nothing is trained here, so the frames are not stored: appending them
    # to the training buffers only made those lists grow without bound.

    action = choose_action(move_probability)
    if first:
        # forced action at the start of the game and after every lost life
        action = 1
        first = False
    observation, reward, done, info = env.step(action)
    if info['ale.lives'] < lives:
        lives = info['ale.lives']
        first = True
        
    

In [None]:
# statistics: play 150 games with the current policy and summarize the raw game scores
tot_reward = []
for _ in range(150):
    observation = env.reset()  # the returned frame is the first network input
    prev_processed_observations = None
    done = False
    first = True   # when True the next action is forced to 1 (FIRE in Breakout's action set)
    lives = 5
    game_reward = 0
    while not done:
        processed_observations, prev_processed_observations = preprocess_observations(observation, prev_processed_observations, input_dimensions)
        hidden_layer_values, move_probability = apply_neural_nets(processed_observations, weights)

        # Nothing is trained here, so the frames are not stored: appending
        # every frame of 150 games to the training buffers only consumed
        # memory that was never released.

        action = choose_action(move_probability)
        if first:
            # forced action at the start of the game and after every lost life
            action = 1
            first = False
        observation, reward, done, info = env.step(action)
        if info['ale.lives'] < lives:
            lives = info['ale.lives']
            first = True
        game_reward += reward  # unshaped reward as returned by the environment
    tot_reward.append(game_reward)

print(np.mean(tot_reward))
print(np.median(tot_reward))
print(np.std(tot_reward))
    