# Benchmark fully off-policy DQN on CartPole

Is stable-baselines **Behavior Cloning** the right choice to make an off-policy dataset?
* The documentation says: _"for a given observation, the action taken by the policy must be the one taken by the expert"_
* In my opinion, that is not the same as filling a replay buffer.

In [2]:
from stable_baselines import DQN

### Collecting off-policy data to train DQN

The dataset is saved as expert_cartpole.npz in this folder.

In [4]:
from stable_baselines.gail import generate_expert_traj

# Build the DQN agent that will act as the "expert" on CartPole-v1.
model = DQN(policy='MlpPolicy', env='CartPole-v1', verbose=0)
# Train the agent for 1e6 timesteps, then record 1e4 episodes.
# The data is saved in a numpy archive named `expert_cartpole.npz`.
generate_expert_traj(
    model,
    save_path='expert_cartpole',
    n_timesteps=10 ** 6,
    n_episodes=10 ** 4,
)

actions (878987, 1)
obs (878987, 4)
rewards (878987,)
episode_returns (10000,)
episode_starts (878987,)


{'actions': array([[0],
        [0],
        [0],
        ...,
        [0],
        [1],
        [1]]),
 'obs': array([[-0.03360261,  0.04453236, -0.03984775,  0.01214404],
        [-0.03271196, -0.14999613, -0.03960487,  0.29199302],
        [-0.03571189, -0.34453166, -0.03376501,  0.5719267 ],
        ...,
        [ 1.8527644 ,  2.7364528 ,  0.20202377, -0.023985  ],
        [ 1.9074935 ,  2.5390933 ,  0.20154408,  0.32502544],
        [ 1.9582753 ,  2.730861  ,  0.20804459,  0.10205026]],
       dtype=float32),
 'rewards': array([1., 1., 1., ..., 1., 1., 1.]),
 'episode_returns': array([91., 85., 85., ..., 88., 82., 92.]),
 'episode_starts': array([ True, False, False, ..., False, False, False])}

### Training DQN with fully off-policy data

* I'm not sure whether `model.pretrain()` is the same as training from a `replay_buffer`.
* The documentation says: _"supervised learning given an expert dataset"_

In [7]:
from stable_baselines.gail import ExpertDataset
# `traj_limitation=-1` uses the whole dataset stored in the archive.
dataset = ExpertDataset(
    expert_path='expert_cartpole.npz',
    traj_limitation=-1,
    batch_size=128,
)

# Fresh DQN that is only fitted to the recorded expert actions (behavior cloning).
model = DQN(policy='MlpPolicy', env='CartPole-v1', verbose=1)
model.pretrain(dataset, n_epochs=100)

# Run the pre-trained model for 1000 steps and print the reward sum of every finished episode.
env = model.get_env()
obs = env.reset()
reward_sum = 0.0

for step in range(1000):
    action, _states = model.predict(obs)
    obs, reward, done, _info = env.step(action)
    reward_sum += reward
    # env.render()
    if not done:
        continue
    # Episode finished: report it and start over.
    print(reward_sum)
    reward_sum = 0.0
    obs = env.reset()

env.close()

actions (878987, 1)
obs (878987, 4)
rewards (878987,)
episode_returns (10000,)
episode_starts (878987,)
Total trajectories: -1
Total transitions: 878987
Average returns: 87.8987
Std for returns: 5.305189752497077
Creating environment from the given name, wrapped in a DummyVecEnv.
Pretraining with Behavior Cloning...
==== Training progress 10.00% ====
Epoch 10
Training loss: 0.073304, Validation loss: 0.071991

==== Training progress 20.00% ====
Epoch 20
Training loss: 0.057887, Validation loss: 0.056496

==== Training progress 30.00% ====
Epoch 30
Training loss: 0.051736, Validation loss: 0.049762

==== Training progress 40.00% ====
Epoch 40
Training loss: 0.048213, Validation loss: 0.047909

==== Training progress 50.00% ====
Epoch 50
Training loss: 0.046434, Validation loss: 0.044619

==== Training progress 60.00% ====
Epoch 60
Training loss: 0.045257, Validation loss: 0.042969

==== Training progress 70.00% ====
Epoch 70
Training loss: 0.044263, Validation loss: 0.042591

==== Train

In [8]:
def learn_from_buffer(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN",
                      reset_num_timesteps=True, replay_wrapper=None):
    """
    Train the Q-network purely from transitions already stored in ``self.replay_buffer``.

    No environment step is taken: every iteration samples one batch from the buffer and runs
    one ``self._train_step`` on it. The function takes ``self`` as first argument, so it is
    presumably meant to be attached to a DQN model. Fill ``self.replay_buffer`` with
    transitions *before* calling; a buffer that is already present is kept as is.

    :param total_timesteps: (int) number of training iterations (one sampled batch each)
    :param callback: forwarded to ``self._init_callback``; only the training start/end hooks are fired
    :param log_interval: (int) accepted for signature compatibility, currently unused
    :param tb_log_name: (str) run name handed to ``TensorboardWriter``
    :param reset_num_timesteps: (bool) forwarded to ``self._init_num_timesteps``
    :param replay_wrapper: accepted for signature compatibility, currently unused
    :return: self
    :raises ValueError: if prioritized replay is enabled, or if the replay buffer cannot
        provide ``self.batch_size`` transitions
    """
    # The beta schedule needed by prioritized sampling is never built here, so refuse up
    # front instead of failing on an internal assertion in the middle of training.
    if self.prioritized_replay:
        raise ValueError("learn_from_buffer only supports uniform replay; "
                         "create the model with prioritized_replay=False")

    new_tb_log = self._init_num_timesteps(reset_num_timesteps)
    callback = self._init_callback(callback)

    with SetVerbosity(self.verbose), \
            TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) as writer:
        self._setup_learn()

        # Keep a buffer the caller filled beforehand; replacing it unconditionally with an
        # empty one would discard the offline data and leave nothing to sample.
        if getattr(self, "replay_buffer", None) is None:
            self.replay_buffer = ReplayBuffer(self.buffer_size)
        self.beta_schedule = None

        # The buffer does not grow during offline training, so checking once is enough.
        if not self.replay_buffer.can_sample(self.batch_size):
            raise ValueError("The replay buffer cannot provide a batch of {} transitions; "
                             "fill model.replay_buffer before calling "
                             "learn_from_buffer".format(self.batch_size))

        callback.on_training_start(locals(), globals())

        for _ in range(total_timesteps):
            self.num_timesteps += 1

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                self.batch_size, env=self._vec_normalize_env)
            # Uniform replay: every sampled transition gets the same weight.
            weights = np.ones_like(rewards)

            if writer is not None:
                # run loss backprop with summary, but once every 100 steps save the metadata
                # (memory, compute time, ...)
                if (1 + self.num_timesteps) % 100 == 0:
                    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    summary, _ = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                  dones, weights, sess=self.sess, options=run_options,
                                                  run_metadata=run_metadata)
                    writer.add_run_metadata(run_metadata, 'step%d' % self.num_timesteps)
                else:
                    summary, _ = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                  dones, weights, sess=self.sess)
                writer.add_summary(summary, self.num_timesteps)
            else:
                self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights,
                                 sess=self.sess)

            # Update target network periodically; without this the bootstrap target would
            # stay at its initial weights for the whole run.
            if self.num_timesteps % self.target_network_update_freq == 0:
                self.update_target(sess=self.sess)

    callback.on_training_end()
    return self