In [2]:
# Uncomment the statements below to install the dependencies
# !pip install mitdeeplearning
# !apt-get install -y xvfb python-opengl x11-utils > /dev/null 2>&1
# !pip install gym pyvirtualdisplay scikit-video > /dev/null 2>&1

# Import Tensorflow 2.0
import tensorflow as tf

import mitdeeplearning as mdl

import numpy as np
import base64, io, time, gym
from PIL import Image

import IPython, functools
import matplotlib.pyplot as plt
from tqdm import tqdm
from IPython import display as ipythondisplay
from IPython.display import HTML



Before we dive in, let's take a step back and outline our approach, which is applicable to reinforcement learning problems in general:

1. **Initialize our environment and our agent**.
2. **Define our agent's memory**.
3. **Define the reward and loss functions**.
4. **Define the learning algorithm**.

# Initialize our environment and our agent

In Cartpole, a pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The pole starts upright, and the goal is to prevent it from falling over. The system is controlled by applying a force of +1 or -1 to the cart. A reward of +1 is provided for every timestep that the pole remains upright. The episode ends when the pole is more than 15 degrees from vertical, or the cart moves more than 2.4 units from the center of the track. A visual summary of the cartpole environment is depicted below:

<img width="400px" src="https://danielpiedrahita.files.wordpress.com/2017/02/cart-pole.png"></img>

Given this setup for the environment and the objective of the game, we can think about: 1) what observations help define the environment's state; 2) what actions the agent can take. 

First, let's consider the observation space. In this Cartpole environment our observations are:

1. Cart position
2. Cart velocity
3. Pole angle
4. Pole rotation rate


In [3]:
# Create the CartPole-v1 environment through gym's registry
env = gym.make("CartPole-v1")

In [4]:
# NOTE: despite its name, `n_observations` holds the observation *space* object
# itself; only its shape is read here, for display.
n_observations = env.observation_space
# Number of discrete actions; the agent's output layer is sized from this value.
n_actions = env.action_space.n

print("Environment has observation space =", n_observations.shape)
print("Number of possible actions that the agent can choose from =", n_actions)

Environment has observation space = (4,)
Number of possible actions that the agent can choose from = 2


In [5]:
# Create and initialise the agent: a small feed-forward network whose outputs
# are sampled as action logits (a policy network rather than a DQN).
def create_agent_model():
    """Build the agent's network.

    The network is a hidden ReLU layer of 32 units followed by a linear
    output layer with one unit per action (`n_actions`, read from module
    scope). No activation is applied to the output, so the model returns
    raw scores (logits).

    Returns:
        A `tf.keras.models.Sequential` model.
    """
    hidden_layer = tf.keras.layers.Dense(units=32, activation='relu')
    output_layer = tf.keras.layers.Dense(units=n_actions, activation=None)
    return tf.keras.models.Sequential([hidden_layer, output_layer])


cartpole_model = create_agent_model()

In [6]:
def choose_action(model, observation, single=True):
    """Sample an action (or a batch of actions) from the model's outputs.

    The model's outputs are treated as un-normalised log-probabilities
    (logits) over the actions and one action is *sampled* per observation;
    this is not a greedy arg-max choice.

    Args:
        model: Keras model exposing `predict`, mapping a batch of
            observations to one row of per-action logits each.
        observation: a single observation when `single` is True, otherwise
            a batch of observations that already has a leading batch axis.
        single: whether `observation` is a lone observation (default) or a
            batch.

    Returns:
        The sampled action index when `single` is True, otherwise a 1-D
        array holding one sampled action index per observation.
    """
    # The model works on batches: add the batch axis only for a lone observation.
    batch = np.expand_dims(observation, axis=0) if single else observation
    # One row of logits per observation in the batch.
    logits = model.predict(batch)
    # Draw one action per row at random according to softmax(logits).
    sampled = tf.random.categorical(logits, num_samples=1).numpy().flatten()
    return sampled[0] if single else sampled

# Define our agent's memory

In [7]:
class Memory:
    """Episode buffer holding index-aligned observations, actions and rewards."""

    def __init__(self):
        # A new buffer starts out empty.
        self.clear()

    def clear(self):
        """Forget everything recorded so far by rebinding each list to a fresh one."""
        self.observations, self.actions, self.rewards = [], [], []

    def add_to_memory(self, new_observation, new_action, new_reward):
        """Record one timestep: the observation seen, the action taken, the reward received."""
        pairs = (
            (self.observations, new_observation),
            (self.actions, new_action),
            (self.rewards, new_reward),
        )
        for store, item in pairs:
            store.append(item)


memory = Memory()

# Define the reward and loss functions

In [8]:
def normalize(z):
    """Standardise `z` to zero mean and unit standard deviation.

    Computes ``x = (z - mean(z)) / std(z)``.

    Args:
        z: array-like of numbers. It is never modified.

    Returns:
        A new float32 `np.ndarray` with the same shape as `z`. When the
        standard deviation is zero (e.g. a single value or a constant
        sequence) the centred values, i.e. zeros, are returned instead of
        dividing by zero. Empty input yields an empty array.
    """
    # Work on a float copy so the caller's data stays untouched and integer
    # input cannot fail (or truncate) on the in-place subtraction.
    values = np.array(z, dtype=np.float64)
    if values.size == 0:
        return values.astype(np.float32)
    centered = values - np.mean(values)
    spread = np.std(values)
    if spread > 0:
        centered = centered / spread
    return centered.astype(np.float32)


def discount_rewards(rewards, gamma=0.95):
    """Compute the normalised discounted return for every timestep.

    The return at step t follows the recursion
    ``G_t = rewards[t] + gamma * G_{t+1}``, evaluated backwards from the
    last step, and the resulting sequence is standardised with `normalize`.

    Args:
        rewards: sequence of per-timestep rewards for one episode.
        gamma: discount factor applied once per timestep.

    Returns:
        float32 `np.ndarray` of normalised discounted returns, one per
        timestep.
    """
    # Force a float buffer: integer rewards would otherwise truncate the returns.
    rewards = np.asarray(rewards, dtype=np.float64)
    discounted_rewards = np.zeros_like(rewards)
    running_return = 0.0
    for t in reversed(range(len(rewards))):
        # Discount the accumulated future return by exactly one step.
        running_return = running_return * gamma + rewards[t]
        discounted_rewards[t] = running_return

    return normalize(discounted_rewards)

In [9]:
def compute_loss(logits, actions, rewards):
    """Reward-weighted negative log-likelihood of the actions that were taken.

    Args:
        logits: the model's raw per-action outputs, one row per timestep.
        actions: index of the action taken at each timestep.
        rewards: weight applied to each timestep's term (the training loop
            passes the normalised discounted returns).

    Returns:
        Scalar tensor: the mean over timesteps of
        ``neg_logprob * rewards``.
    """
    # tf.nn.sparse_softmax_cross_entropy_with_logits measures the probability
    # error in discrete classification tasks in which the classes are mutually
    # exclusive (each entry is in exactly one class). With the taken action as
    # the label, this is the negative log-probability of that action.
    per_step_neg_logprob = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=actions, logits=logits)
    weighted_terms = per_step_neg_logprob * rewards
    return tf.reduce_mean(weighted_terms)

# Define the learning algorithm 

In [10]:
# Training hyper-parameters
NUM_EPISODES = 250
learning_rate = 1e-2

# Optimizer used to update the agent's weights
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Running record of per-episode total rewards
# (presumably smoothed by `smoothing_factor` -- confirm against mitdeeplearning)
reward_history = mdl.util.LossHistory(smoothing_factor=0.9)

# Live plot of the total reward of each episode
# (`sec` presumably sets the refresh period in seconds -- confirm against mitdeeplearning)
plotter = mdl.util.PeriodicPlotter(sec=2, xlabel='Episodes', ylabel='Rewards')

In [11]:
def train_step(model, optimizer, observations, actions, discounted_rewards):
    """Run one gradient update of `model` on the data of a finished episode.

    Args:
        model: the agent's network; called on the stacked observations.
        optimizer: optimizer whose `apply_gradients` performs the update.
        observations: batch of observations, one row per timestep.
        actions: the action taken at each timestep.
        discounted_rewards: per-timestep weights passed to `compute_loss`.
    """
    with tf.GradientTape() as tape:
        # Forward pass and loss are recorded on the tape for differentiation.
        logits = model(observations)
        loss = compute_loss(logits, actions, discounted_rewards)

    # Read the variables only after the forward pass, as the original does.
    variables = model.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

# Training process

In [None]:
# Train for NUM_EPISODES episodes, performing one policy update per episode.
# The whole run is wrapped in try/finally so the environment (and its render
# window) is closed even if training is interrupted or raises.
try:
    for i_episode in range(NUM_EPISODES):
        # Refresh the live reward plot
        plotter.plot(reward_history.get())
        # Start a new episode with an empty memory
        observation = env.reset()
        memory.clear()
        while True:
            # Choose an action for the current observation
            action = choose_action(cartpole_model, observation)
            # Perform the action and get the reward and the next state
            next_observation, reward, done, info = env.step(action)
            # Store the observation the action was chosen from, not the next one
            memory.add_to_memory(observation, action, reward)
            # Just to visualize what is happening
            env.render()
            # When the episode ends, train the network on its data
            if done:
                # Total (undiscounted) reward of the current episode
                total_reward = sum(memory.rewards)
                # Add it to the plotted history
                reward_history.append(total_reward)
                # Perform a training step over the data of the current episode
                train_step(cartpole_model, optimizer,
                           observations=np.vstack(memory.observations),
                           actions=np.array(memory.actions),
                           discounted_rewards=discount_rewards(memory.rewards))
                # Clear memory for the next episode
                memory.clear()
                break
            # Carry the next observation over to the next iteration
            observation = next_observation
finally:
    # Always close the environment, including on error or interrupt
    env.close()

# Testing

In [15]:
# Roll out a single episode with the trained agent. try/finally guarantees the
# environment (and its render window) is closed even if the rollout is
# interrupted or raises.
try:
    observation = env.reset()
    while True:
        # Actions are still sampled from the policy, exactly as during training
        action = choose_action(cartpole_model, observation)
        observation, reward, done, info = env.step(action)
        env.render()
        if done:
            break
finally:
    env.close()

## Conclusion 

That's it! Congratulations on training an RL agent and putting it to the test! I encourage you to consider the following:

*   How does the agent perform?
*   Could you train it for a shorter amount of time and still have it perform well?
*   What are some things you could change about the agent or the learning process to potentially improve performance?

That's it. Good luck!