In [54]:
%pwd

'/home/acr/Documents/Jupyter Notebook'

In [55]:
%cd /home/acr/Documents/Jupyter Notebook

/home/acr/Documents/Jupyter Notebook


# Install Dependencies

In [33]:
!pip install tensorflow==2.3.0
!pip install gym
!pip install keras
!pip install keras-rl2

[31mERROR: Could not find a version that satisfies the requirement tensorflow==2.3.0 (from versions: 2.5.0rc0, 2.5.0rc1, 2.5.0rc2, 2.5.0rc3, 2.5.0, 2.5.1, 2.5.2, 2.6.0rc0, 2.6.0rc1, 2.6.0rc2, 2.6.0, 2.6.1, 2.6.2, 2.7.0rc0, 2.7.0rc1, 2.7.0)[0m
[31mERROR: No matching distribution found for tensorflow==2.3.0[0m


# Creating a random environment

#### The goal is to keep the pole balanced upright by moving the cart to the left or to the right. Every step the pole stays up earns one point, up to a maximum of 200 points per episode.

In [13]:
import gym
import random
import time

In [3]:
# Create the CartPole environment that the rest of the notebook interacts with.
env = gym.make('CartPole-v0')
# Length of the first axis of the observation space; later passed to build_model as `states`.
states = env.observation_space.shape[0]
# Number of discrete actions; later passed to build_model / build_agent as `actions`.
actions = env.action_space.n

In [4]:
states, actions # Observation vector length and number of discrete actions: (4, 2) for CartPole-v0

(4, 2)

#### As a baseline, we first run a few episodes with purely random actions; nothing is learned from one episode to the next

In [39]:
# Baseline run: play a few episodes choosing actions at random (no learning involved).
episodes = 10
try:
  for episode in range(1, episodes + 1):
    state = env.reset()  # start a fresh episode
    done = False
    score = 0

    while not done:
      env.render()  # draw the current frame on screen
      time.sleep(0.03)  # pause 0.03 seconds per frame so the animation is slow enough to watch
      action = random.choice([0, 1])  # 0 and 1 are the two discrete cart moves (left / right)
      # The next observation (n_state) is not used by a random player.
      n_state, reward, done, info = env.step(action)
      score += reward
    print('Episode:{} Score: {} Info: {}'.format(episode, score, info))
finally:
  # Always close the render window, even if the loop is interrupted or raises.
  env.close()

Episode:1 Score: 12.0 Info: {}
Episode:2 Score: 24.0 Info: {}
Episode:3 Score: 16.0 Info: {}
Episode:4 Score: 14.0 Info: {}
Episode:5 Score: 17.0 Info: {}
Episode:6 Score: 10.0 Info: {}
Episode:7 Score: 21.0 Info: {}
Episode:8 Score: 36.0 Info: {}
Episode:9 Score: 40.0 Info: {}
Episode:10 Score: 43.0 Info: {}


In [37]:
help(env.step)

Help on method step in module gym.wrappers.time_limit:

step(action) method of gym.wrappers.time_limit.TimeLimit instance
    Run one timestep of the environment's dynamics. When end of
    episode is reached, you are responsible for calling `reset()`
    to reset this environment's state.
    
    Accepts an action and returns a tuple (observation, reward, done, info).
    
    Args:
        action (object): an action provided by the agent
    
    Returns:
        observation (object): agent's observation of the current environment
        reward (float) : amount of reward returned after previous action
        done (bool): whether the episode has ended, in which case further step() calls will return undefined results
        info (dict): contains auxiliary diagnostic information (helpful for debugging, and sometimes learning)



# Create a Deep Learning Model with Keras

#### Ideally we want every episode to reach the maximum score of 200. The deep RL agent learns which action to take in each state of this environment in order to maximize the score

In [58]:
import numpy as np
import tensorflow  # gives access to the Keras API through `tensorflow.keras` (e.g. `tensorflow.keras.Sequential`)
# The model is created through the top-level `tensorflow` package instead of importing `Sequential` from `tensorflow.keras.models`.
# NOTE: the two imports below are commented out, so `Dense`, `Flatten` and `Adam` are NOT brought into scope by this cell.
# from tensorflow.keras.layers import Dense, Flatten # Dense and Flatten layers
# from tensorflow.keras.optimizers import Adam # Adam optimizer used to train the deep learning model

In [59]:
def build_model(states, actions):
  """Build the network that is fed the states and outputs one value per action.

  The agent trains this network in order to maximize the reward.

  Args:
    states: Number of components in a single observation (input width).
    actions: Number of discrete actions (output width).

  Returns:
    An uncompiled `tensorflow.keras.Sequential` model.
  """
  # Reach the layer classes through the imported `tensorflow` package: the bare
  # names `Flatten` and `Dense` are never imported in this notebook, so using
  # them directly raises NameError on a fresh kernel.
  layers = tensorflow.keras.layers

  model = tensorflow.keras.Sequential()  # start from an empty sequential model
  # Input has shape (1, states) and is flattened to a vector of length `states`.
  # The leading 1 presumably matches the memory's window_length=1 -- confirm if that changes.
  model.add(layers.Flatten(input_shape=(1, states)))
  model.add(layers.Dense(24, activation='relu'))
  model.add(layers.Dense(24, activation='relu'))
  # Linear output layer with one unit per action: states go in at the top, actions come out at the end.
  model.add(layers.Dense(actions, activation='linear'))
  return model

In [60]:
model_a = build_model(states,actions)

In [61]:
model_a.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_1 (Flatten)         (None, 4)                 0         
                                                                 
 dense_3 (Dense)             (None, 24)                120       
                                                                 
 dense_4 (Dense)             (None, 24)                600       
                                                                 
 dense_5 (Dense)             (None, 2)                 50        
                                                                 
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________


# Build Agent with Keras-RL

In [62]:
from rl.agents import DQNAgent # Deep Q-Network agent from keras-rl (the library provides several agents)
from rl.policy import BoltzmannQPolicy # Action-selection policy used by the agent. Note that DQN itself is a value-based method: it learns Q-values, and this policy decides which action to pick from them
from rl.memory import SequentialMemory # Memory needed by the DQNAgent; it is created with a size limit and a window length when the agent is built

In [63]:
def build_agent(model, actions):
  """Create a keras-rl DQN agent around the given network.

  Args:
    model: Keras model that the agent should train and use to choose actions.
    actions: Number of discrete actions available to the agent.

  Returns:
    An uncompiled `DQNAgent`; call `.compile(...)` on it before `fit` / `test`.
  """
  policy = BoltzmannQPolicy()
  memory = SequentialMemory(limit=50000, window_length=1)
  # Use the `model` argument. The previous version silently ignored it and
  # always wrapped the notebook-global `model_a`, so the agent only worked when
  # that global happened to exist and callers could not supply another network.
  dqn = DQNAgent(model=model, memory=memory, policy=policy,
                 nb_actions=actions, nb_steps_warmup=1000, target_model_update=1e-2)
  return dqn

In [52]:
help(DQNAgent)

Help on class DQNAgent in module rl.agents.dqn:

class DQNAgent(AbstractDQNAgent)
 |  DQNAgent(model, policy=None, test_policy=None, enable_double_dqn=False, enable_dueling_network=False, dueling_type='avg', *args, **kwargs)
 |  
 |  # Arguments
 |      model__: A Keras model.
 |      policy__: A Keras-rl policy that are defined in [policy](https://github.com/keras-rl/keras-rl/blob/master/rl/policy.py).
 |      test_policy__: A Keras-rl policy.
 |      enable_double_dqn__: A boolean which enable target network as a second network proposed by van Hasselt et al. to decrease overfitting.
 |      enable_dueling_dqn__: A boolean which enable dueling architecture proposed by Mnih et al.
 |      dueling_type__: If `enable_dueling_dqn` is set to `True`, a type of dueling architecture must be chosen which calculate Q(s,a) from V(s) and A(s,a) differently. Note that `avg` is recommanded in the [paper](https://arxiv.org/abs/1511.06581).
 |          `avg`: Q(s,a;theta) = V(s;theta) + (A(s,a;theta)

In [73]:
# Create the agent around the network; the environment is only handed over in fit() below.
dqn = build_agent(model_a, actions)
# `Adam` is never imported in this notebook, so reference the optimizer through
# the imported `tensorflow` package to keep the cell runnable on a fresh kernel.
dqn.compile(tensorflow.keras.optimizers.Adam(learning_rate=1e-3), metrics=['mae'])
# Train on the environment for 30000 steps without rendering.
dqn.fit(env, nb_steps=30000, visualize=False, verbose=1)

Training for 30000 steps ...
Interval 1 (0 steps performed)
53 episodes - episode_reward: 188.642 [151.000, 200.000] - loss: 2.555 - mae: 39.302 - mean_q: 79.218

Interval 2 (10000 steps performed)
52 episodes - episode_reward: 190.135 [34.000, 200.000] - loss: 6.931 - mae: 41.677 - mean_q: 83.709

Interval 3 (20000 steps performed)
done, took 254.463 seconds


<keras.callbacks.History at 0x7faeb850f2e0>

In [74]:
# Evaluate the trained agent for 100 episodes without rendering.
scores = dqn.test(env,nb_episodes=100, visualize=False)
# Mean of the per-episode rewards collected during the evaluation run.
print(np.mean(scores.history['episode_reward']))

Testing for 100 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200
Episode 6: reward: 200.000, steps: 200
Episode 7: reward: 200.000, steps: 200
Episode 8: reward: 200.000, steps: 200
Episode 9: reward: 200.000, steps: 200
Episode 10: reward: 200.000, steps: 200
Episode 11: reward: 200.000, steps: 200
Episode 12: reward: 200.000, steps: 200
Episode 13: reward: 200.000, steps: 200
Episode 14: reward: 200.000, steps: 200
Episode 15: reward: 200.000, steps: 200
Episode 16: reward: 200.000, steps: 200
Episode 17: reward: 200.000, steps: 200
Episode 18: reward: 200.000, steps: 200
Episode 19: reward: 200.000, steps: 200
Episode 20: reward: 200.000, steps: 200
Episode 21: reward: 200.000, steps: 200
Episode 22: reward: 200.000, steps: 200
Episode 23: reward: 200.000, steps: 200
Episode 24: reward: 200.000, steps: 200
Episode 25: reward: 

In [94]:
import rl.callbacks
class EpisodeLogger(rl.callbacks.Callback):
    """Callback that records observations, rewards and actions for every episode.

    Each of `observations`, `rewards` and `actions` is a dict keyed by the
    episode identifier; the value is a list with one entry per step of that
    episode, in the order the steps were reported.
    """

    def __init__(self):
        # Run the base-class initialiser as well, so whatever state the parent
        # Callback sets up exists on this instance too.
        super().__init__()
        self.observations = {}
        self.rewards = {}
        self.actions = {}

    def on_episode_begin(self, episode, logs):
        """Start empty per-step lists for the episode that is about to run."""
        self.observations[episode] = []
        self.rewards[episode] = []
        self.actions[episode] = []

    def on_step_end(self, step, logs):
        """Append this step's observation, reward and action to its episode.

        `logs` must contain the keys 'episode', 'observation', 'reward' and
        'action'; the episode must already have been opened by
        `on_episode_begin`, otherwise a KeyError is raised.
        """
        episode = logs['episode']
        self.observations[episode].append(logs['observation'])
        self.rewards[episode].append(logs['reward'])
        self.actions[episode].append(logs['action'])

# NOTE: cb_ep is only used by the commented-out call below; the active dqn.test call does not pass it.
cb_ep = EpisodeLogger()
# _ = dqn.test(env,nb_episodes=15, callbacks=[cb_ep], visualize=True)

# Watch the trained agent play 20 rendered episodes.
dqn.test(env, nb_episodes=20, visualize=True)

## TODO: find out how to slow down the rendering. time.sleep does not work here
env.close()

Testing for 20 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200
Episode 6: reward: 200.000, steps: 200
Episode 7: reward: 200.000, steps: 200
Episode 8: reward: 200.000, steps: 200
Episode 9: reward: 200.000, steps: 200
Episode 10: reward: 200.000, steps: 200
Episode 11: reward: 200.000, steps: 200
Episode 12: reward: 200.000, steps: 200
Episode 13: reward: 200.000, steps: 200
Episode 14: reward: 200.000, steps: 200
Episode 15: reward: 200.000, steps: 200
Episode 16: reward: 200.000, steps: 200
Episode 17: reward: 200.000, steps: 200
Episode 18: reward: 200.000, steps: 200
Episode 19: reward: 200.000, steps: 200
Episode 20: reward: 200.000, steps: 200


In [69]:
help(dqn.test)

Help on method test in module rl.core:

test(env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=True, nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None, verbose=1) method of rl.agents.dqn.DQNAgent instance
    Callback that is called before training begins.
    
    # Arguments
        env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
        nb_episodes (integer): Number of episodes to perform.
        action_repetition (integer): Number of times the agent repeats the same action without
            observing the environment again. Setting this to a value > 1 can be useful
            if a single action only has a very small effect on the environment.
        callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
            List of callbacks to apply during training. See [callbacks](/callbacks) for details.
        verbose (integer): 0 for no logging, 1 for interval loggin

In [95]:
dqn.save_weights('CartPole_DQN_Weights_Good',overwrite=True)

In [96]:
del model_a
del dqn
del env

In [97]:
# Rebuild the environment, the network and the agent from scratch so that the
# saved weights can be loaded into a fresh agent.
env=gym.make('CartPole-v0')
actions = env.action_space.n
states = env.observation_space.shape[0]
model_a = build_model(states,actions)
dqn = build_agent(model_a,actions)
# `Adam` is never imported in this notebook, so reference it through the imported
# `tensorflow` package. `learning_rate` replaces the deprecated `lr` keyword and
# matches the compile call used for training.
dqn.compile(tensorflow.keras.optimizers.Adam(learning_rate=1e-3),metrics=['mae'])

In [98]:
dqn.load_weights('CartPole_DQN_Weights_Good')

In [99]:
dqn.test(env, nb_episodes=15, visualize=True)

Testing for 15 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200
Episode 6: reward: 200.000, steps: 200
Episode 7: reward: 200.000, steps: 200
Episode 8: reward: 200.000, steps: 200
Episode 9: reward: 200.000, steps: 200
Episode 10: reward: 200.000, steps: 200
Episode 11: reward: 200.000, steps: 200
Episode 12: reward: 200.000, steps: 200
Episode 13: reward: 200.000, steps: 200
Episode 14: reward: 200.000, steps: 200
Episode 15: reward: 200.000, steps: 200


<keras.callbacks.History at 0x7fae997d2ac0>

In [100]:
env.close()