[![Binder](https://mybinder.org/badge_logo.svg)](https://lab.mlpack.org/v2/gh/mlpack/examples/master?urlpath=lab%2Ftree%2Fq_learning%2Fpendulum_sac.ipynb)

You can easily run this notebook at https://lab.mlpack.org/

Here, we train a [Soft Actor-Critic](https://arxiv.org/abs/1801.01290) agent to achieve high scores in the [Pendulum](https://gym.openai.com/envs/Pendulum-v0/) environment.

We train and test the agent using the OpenAI Gym toolkit's GUI interface, which is provided through a distributed infrastructure (TCP API). More details can be found [here](https://github.com/zoq/gym_tcp_api).

A video of the trained agent can be seen at the end.

## Including necessary libraries and namespaces

In [1]:
#include <mlpack/core.hpp>

In [2]:
#include <mlpack/methods/ann/ffn.hpp>
#include <mlpack/methods/reinforcement_learning/sac.hpp>
#include <mlpack/methods/ann/loss_functions/empty_loss.hpp>
#include <mlpack/methods/ann/init_rules/gaussian_init.hpp>
#include <mlpack/methods/reinforcement_learning/environment/env_type.hpp>
#include <mlpack/methods/reinforcement_learning/training_config.hpp>

In [3]:
// Used to run the agent on gym's environment (provided externally) for testing.
#include <gym/environment.hpp>

In [4]:
// Used to generate and display a video of the trained agent.
#include "xwidgets/ximage.hpp"
#include "xwidgets/xvideo.hpp"
#include "xwidgets/xaudio.hpp"

In [5]:
using namespace mlpack;

In [6]:
using namespace mlpack::ann;

In [7]:
using namespace ens;

In [8]:
using namespace mlpack::rl;

## Initializing the agent

In [9]:
// Set up the state and action space.
// These static sizes are read further down when the input and output layers
// of the policy and Q networks are dimensioned, so they must be assigned
// before the networks are built.
// NOTE(review): 3 and 1 are presumably the observation and action sizes of the
// "Pendulum-v0" gym environment used later — confirm against the gym docs.
ContinuousActionEnv::State::dimension = 3;
ContinuousActionEnv::Action::size = 1;

In [10]:
// Set up the actor and critic networks.
// Both networks use EmptyLoss and are initialized with
// GaussianInitialization(0, 0.1).

// Actor (policy) network: state -> 32 hidden units (ReLU) -> action, with a
// tanh layer on the output. The training and testing loops multiply the
// selected action by 2 before sending it to the environment.
FFN<EmptyLoss<>, GaussianInitialization>
    policyNetwork(EmptyLoss<>(), GaussianInitialization(0, 0.1));
policyNetwork.Add(new Linear<>(ContinuousActionEnv::State::dimension, 32));
policyNetwork.Add(new ReLULayer<>());
policyNetwork.Add(new Linear<>(32, ContinuousActionEnv::Action::size));
policyNetwork.Add(new TanHLayer<>());

// Critic (Q) network: takes state and action together as input
// (State::dimension + Action::size values) -> 32 hidden units (ReLU) -> a
// single scalar output.
FFN<EmptyLoss<>, GaussianInitialization>
    qNetwork(EmptyLoss<>(), GaussianInitialization(0, 0.1));
qNetwork.Add(new Linear<>(ContinuousActionEnv::State::dimension +
                          ContinuousActionEnv::Action::size, 32));
qNetwork.Add(new ReLULayer<>());
qNetwork.Add(new Linear<>(32, 1));

In [11]:
// Set up the replay method (the experience buffer the agent samples from).
// train() stores every environment transition in this object.
// NOTE(review): the arguments are presumably (batch size, capacity) — confirm
// against the mlpack RandomReplay documentation.
RandomReplay<ContinuousActionEnv> replayMethod(32, 10000);

In [12]:
// Set up training configurations.
// The same config object is handed to the SAC agent and is also read directly
// by train().
TrainingConfig config;
// NOTE(review): presumably the number of steps between target network
// synchronizations — confirm against the mlpack TrainingConfig documentation.
config.TargetNetworkSyncInterval() = 1;
// train() uses this as the number of agent.Update() calls per environment step.
config.UpdateInterval() = 1;

In [13]:
// Set up Soft actor-critic agent.
// The agent is constructed from the training config, the Q network, the policy
// network and the replay method defined in the preceding cells; the network
// types are taken from those objects via decltype, and AdamUpdate is supplied
// as the updater template argument.
SAC<ContinuousActionEnv, decltype(qNetwork), decltype(policyNetwork), AdamUpdate>
    agent(config, qNetwork, policyNetwork, replayMethod);

## Preparation for training the agent

In [14]:
// Set up the gym training environment.
// NOTE(review): the arguments are presumably the gym server host, its port and
// the name of the environment to load — confirm against gym_tcp_api.
gym::Environment env("gym.kurg.org", "4040", "Pendulum-v0");

// Initializing training variables.
// Returns of the most recent episodes; train() appends to it and trims it to
// at most `consecutiveEpisodes` entries.
std::vector<double> returnList;
// Number of episodes completed so far; incremented by train().
size_t episodes = 0;
// NOTE(review): not referenced by any cell visible in this notebook.
bool converged = true;

// The number of episode returns to keep track of.
// This is the window size of the running average printed by train().
size_t consecutiveEpisodes = 25;

In [15]:
// Train the agent on the Pendulum gym environment.
//
// Whole episodes are run until the agent's cumulative step counter
// (agent.TotalSteps()) reaches `numSteps`. The counter is not reset here, so
// calling train() again with a larger value continues training the same agent.
// An episode that is in progress is always finished, so the final step count
// may exceed `numSteps`.
//
// Every second episode, the average return over the last (at most)
// `consecutiveEpisodes` episodes is printed.
//
// @param numSteps Cumulative number of environment steps to train up to.
void train(const size_t numSteps)
{
  // Switch off deterministic action selection while training.
  agent.Deterministic() = false;
  std::cout << "Training for " << numSteps << " steps." << std::endl;
  while (agent.TotalSteps() < numSteps)
  {
    double episodeReturn = 0;
    env.reset();
    do
    {
      // Hand the current observation to the agent and let it pick an action.
      agent.State().Data() = env.observation;
      agent.SelectAction();
      // The selected action is scaled by 2 before it is sent to the
      // environment.
      arma::mat action = {double(agent.Action().action[0] * 2)};

      env.step(action);
      ContinuousActionEnv::State nextState;
      nextState.Data() = env.observation;

      // Store the transition. The discount comes from the shared training
      // config so that the replay buffer and the agent cannot drift apart.
      replayMethod.Store(agent.State(), agent.Action(), env.reward, nextState,
          env.done, config.Discount());
      episodeReturn += env.reward;
      agent.TotalSteps()++;

      // Only update the networks when training and once the configured
      // number of exploration steps has been collected.
      if (!agent.Deterministic() &&
          agent.TotalSteps() >= config.ExplorationSteps())
      {
        for (size_t i = 0; i < config.UpdateInterval(); i++)
          agent.Update();
      }
    } while (!env.done);
    returnList.push_back(episodeReturn);
    episodes += 1;

    // Keep only the most recent `consecutiveEpisodes` returns.
    if (returnList.size() > consecutiveEpisodes)
      returnList.erase(returnList.begin());

    // Report progress every second episode; the average is only needed then.
    if (episodes % 2 == 0)
    {
      const double averageReturn = std::accumulate(returnList.begin(),
                                                   returnList.end(), 0.0) /
                                   returnList.size();
      std::cout << "Avg return in last " << returnList.size()
          << " episodes: " << averageReturn
          << "\t Episode return: " << episodeReturn
          << "\t Total steps: " << agent.TotalSteps() << std::endl;
    }
  }
}

## Let the training begin

In [16]:
// Training the agent for a total of at least 5000 steps.
// train() stops at the end of the first episode in which the agent's
// cumulative step count reaches this value.
train(5000)

Training for 5000 steps.
Avg return in last 2 episodes: -1492.3	 Episode return: -1406.77	 Total steps: 400
Avg return in last 4 episodes: -1193.68	 Episode return: -972.225	 Total steps: 800
Avg return in last 6 episodes: -1193.38	 Episode return: -1112.53	 Total steps: 1200
Avg return in last 8 episodes: -1258.5	 Episode return: -1598.63	 Total steps: 1600
Avg return in last 10 episodes: -1219.42	 Episode return: -835.847	 Total steps: 2000
Avg return in last 12 episodes: -1199.54	 Episode return: -1108.56	 Total steps: 2400
Avg return in last 14 episodes: -1133.13	 Episode return: -800.636	 Total steps: 2800
Avg return in last 16 episodes: -1118.45	 Episode return: -1345.97	 Total steps: 3200
Avg return in last 18 episodes: -1150.39	 Episode return: -1285.81	 Total steps: 3600
Avg return in last 20 episodes: -1131.89	 Episode return: -1027.84	 Total steps: 4000
Avg return in last 22 episodes: -1145.8	 Episode return: -1433.4	 Total steps: 4400
Avg return in last 24 episodes: -1151.7

## Testing the trained agent

In [17]:
agent.Deterministic() = true;

// Creating and setting up the gym environment for testing.
gym::Environment envTest("gym.kurg.org", "4040", "Pendulum-v0");
envTest.monitor.start("./dummy/", true, true);

// Resets the environment.
envTest.reset();
envTest.render();

double totalReward = 0;
size_t totalSteps = 0;

// Testing the agent on gym's environment.
while (1)
{
  // State from the environment is passed to the agent's internal representation.
  agent.State().Data() = envTest.observation;

  // With the given state, the agent selects an action according to its defined policy.
  agent.SelectAction();

  // Action to take, decided by the policy.
  arma::mat action = {double(agent.Action().action[0] * 2)};

  envTest.step(action);
  totalReward += envTest.reward;
  totalSteps += 1;

  if (envTest.done)
  {
    std::cout << " Total steps: " << totalSteps << "\t Total reward: "
        << totalReward << std::endl;
    break;
  }

  // Uncomment the following lines to see the reward and action in each step.
  // std::cout << " Current step: " << totalSteps << "\t current reward: "
  //   << totalReward << "\t Action taken: " << action;
}

envTest.close();
std::string url = envTest.url();
std::cout << url;
auto video = xw::video_from_url(url).finalize();
video

 Total steps: 200	 Total reward: -1206.9
https://gym.kurg.org/92f59075e7084/output.webm

A Jupyter widget

## A little more training...

In [18]:
// Training the same agent for a total of at least 20000 steps.
// The agent's step counter carries over from the earlier train() call, so this
// continues from the steps already taken rather than starting from zero.
train(20000)

Training for 20000 steps.
Avg return in last 25 episodes: -1147.37	 Episode return: -1161.32	 Total steps: 5200
Avg return in last 25 episodes: -1124.22	 Episode return: -667.611	 Total steps: 5600
Avg return in last 25 episodes: -1098.34	 Episode return: -635.474	 Total steps: 6000
Avg return in last 25 episodes: -1090.19	 Episode return: -770.662	 Total steps: 6400
Avg return in last 25 episodes: -1083.9	 Episode return: -1527.91	 Total steps: 6800
Avg return in last 25 episodes: -1086.25	 Episode return: -653.5	 Total steps: 7200
Avg return in last 25 episodes: -1129.77	 Episode return: -1495.39	 Total steps: 7600
Avg return in last 25 episodes: -1193.97	 Episode return: -1510.9	 Total steps: 8000
Avg return in last 25 episodes: -1177.44	 Episode return: -1240.38	 Total steps: 8400
Avg return in last 25 episodes: -1196.74	 Episode return: -1256.28	 Total steps: 8800
Avg return in last 25 episodes: -1200.86	 Episode return: -1083.71	 Total steps: 9200
Avg return in last 25 episodes: 

# Final agent testing!

In [21]:
agent.Deterministic() = true;

// Creating and setting up the gym environment for testing.
gym::Environment envTest("gym.kurg.org", "4040", "Pendulum-v0");
envTest.monitor.start("./dummy/", true, true);

// Resets the environment.
envTest.reset();
envTest.render();

double totalReward = 0;
size_t totalSteps = 0;

// Testing the agent on gym's environment.
while (1)
{
  // State from the environment is passed to the agent's internal representation.
  agent.State().Data() = envTest.observation;

  // With the given state, the agent selects an action according to its defined policy.
  agent.SelectAction();

  // Action to take, decided by the policy.
  arma::mat action = {double(agent.Action().action[0] * 2)};

  envTest.step(action);
  totalReward += envTest.reward;
  totalSteps += 1;

  if (envTest.done)
  {
    std::cout << " Total steps: " << totalSteps << "\t Total reward: "
        << totalReward << std::endl;
    break;
  }

  // Uncomment the following lines to see the reward and action in each step.
  // std::cout << " Current step: " << totalSteps << "\t current reward: "
  //   << totalReward << "\t Action taken: " << action;
}

envTest.close();
std::string url = envTest.url();
std::cout << url;
auto video = xw::video_from_url(url).finalize();
video

 Total steps: 200	 Total reward: -288.541
https://gym.kurg.org/6e74c868e2284/output.webm

A Jupyter widget