In [None]:
!pip install importlib-metadata==4.13.0
!pip install git+https://github.com/osigaud/bbrl
!pip install git+https://github.com/osigaud/bbrl_examples.git
!pip install omegaconf

In [3]:
from bbrl.workspace import Workspace
from bbrl import get_class, get_arguments, instantiate_class

#import bbrl_gym
import gym

from bbrl.agents.agent import Agent
from bbrl.agents import Agents, TemporalAgent, PrintAgent
from bbrl.agents.gymb import NoAutoResetGymAgent

from bbrl.utils.replay_buffer import ReplayBuffer
from bbrl.utils.chrono import Chrono

from bbrl.visu.visu_policies import plot_policy
from bbrl.visu.visu_critics import plot_critic
from bbrl.visu.common import final_show

from bbrl_examples.models.loggers import RewardLoader
from bbrl_examples.models.loggers import RewardLogger
from bbrl_examples.models.loggers import Logger
from bbrl_examples.models.plotters import Plotter
from bbrl_examples.models.shared_models import build_mlp, build_alt_mlp
from bbrl_examples.models.critics import DiscreteQAgent
from bbrl_examples.models.exploration_agents import EGreedyActionSelector

from omegaconf import OmegaConf
from omegaconf import DictConfig

import torch
import torch.nn as nn
import torch.nn.functional as F

import random
import numpy as np
import time
import matplotlib
import os
import functools
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import copy

In [165]:
class CartWrapper(gym.Wrapper):
    def __init__(self, env, goal_position, goal_angle, threshold=0.1):
        super().__init__(env)

        self.goal_position = goal_position
        self.goal_angle = goal_angle
        self.threshold = threshold
    
    def compute_distance(self,state):
        position,angle= state[0],state[2]
        return np.sqrt((position-self.goal_position)**2+(angle-self.goal_angle)**2)

    def step(self, action):
        state, reward, done, info = self.env.step(action)
        distance = self.compute_distance(state)

        if distance < self.threshold:
            reward = 1.0
            done = True
        else:
            reward = 0.0

        return state, reward, done, info


In [166]:
class GoalAgent(Agent):
    def __init__(self, env):
        super().__init__()
        self.env = env

    def forward(self, t, achieved_goal, **kwargs):
        # Modification du but
        self.set(("env/desired_goal", t), achieved_goal)
   

    def set_goal(self, t, achieved_goal):
        goal_position, goal_angle = achieved_goal
        self.env.goal_position = goal_position
        self.env.goal_angle = goal_angle



In [167]:
class HERAgent(Agent):
    def __init__(self,strategy= 'final'):
        super().__init__()
        self.strategy = strategy

    def forward(self, t, transitions):
        her_transitions = Workspace()
        if self.strategy == 'final':
            achieved_goal = transitions["env/env_obs"][-1]
        elif self.strategy == 'future':
            # Select a random future transition
            index = random.randint(0, len(transitions["env/env_obs"]) - 1)
            achieved_goal = transitions["env/env_obs"][index]
        else:
            raise ValueError("Invalid HER strategy")

        # Replace the desired_goal with the achieved_goal
      
        her_transitions.set(("env/achieved_goal", t), achieved_goal)
        return her_transitions




In [176]:
class GoalRelabellingAgent(Agent):
    def __init__(self):
        super().__init__()

    def forward(self, t, achieved_goal, **kwargs):
        # modification du but
        self.set(("env/desired_goal", t), achieved_goal)

In [177]:
class RewardAgent(Agent):
    def __init__(self,reward_threshold = 0.1):
        super().__init__()
        self.reward_threshold = reward_threshold

    def forward(self, t, **kwargs):
        if t != 0:
            desired_goal = self.get(("env/desired_goal", t))
            achieved_goal = self.get(("env/achieved_goal", t))
        
            distance = np.linalg.norm(desired_goal - achieved_goal)

            if distance < self.reward_threshold:
                reward = 1
                done = True
            else:
                reward = 0
                done = False
            self.set(("env/reward", t), reward)
            self.set(("env/done", t), done)

In [178]:
class GCDQNAgent(Agent):
    def __init__(self, state_dim, hidden_layers, action_dim):
        super().__init__()
        self.is_q_function = True
        self.model = build_alt_mlp(
            [state_dim +4] + list(hidden_layers) + [action_dim], activation=nn.ReLU()  )

    def forward(self, t, choose_action=True, **kwargs):
        obs = self.get(("env/env_obs", t))
        goal = self.get(("env/achieved_goal", t))
        agent_input = torch.cat([obs, goal], dim=1)

        q_values = self.model(agent_input).squeeze(-1)
        self.set(("q_values", t), q_values)

        if choose_action:
            action = q_values.argmax(-1)
            self.set(("action", t), action)

    def predict_action(self, obs, goal, stochastic):
        agent_input = torch.cat([obs, goal], dim=1)
        q_values = self.model(obs).squeeze(-1)
        if stochastic:
            probs = torch.softmax(q_values, dim=-1)
            action = torch.distributions.Categorical(probs).sample()
        else:
            action = q_values.argmax(-1)
        return action

    def predict_value(self, obs, goal, action):
        agent_input = torch.cat([obs, goal], dim=1)
        q_values = self.model(agent_input).squeeze(-1)
        return q_values[action[0].int()]

In [179]:
def make_gym_env(env_name):
    #env = CartWrapper(gym.make(env_name))
    #return env
    return gym.make(env_name)


def get_env_agents(cfg):
    train_env_agent = NoAutoResetGymAgent(
        get_class(cfg.gym_env),
        get_arguments(cfg.gym_env),
        cfg.algorithm.n_envs,
        cfg.algorithm.seed,
    )
    # print_agent = PrintAgent()
    eval_env_agent = NoAutoResetGymAgent(
        get_class(cfg.gym_env),
        get_arguments(cfg.gym_env),
        cfg.algorithm.nb_evals,
        cfg.algorithm.seed,
    )
    return train_env_agent, eval_env_agent


def create_dqn_agent(cfg, train_env_agent, eval_env_agent):
    obs_size, act_size = train_env_agent.get_obs_and_actions_sizes()

    critic = DiscreteQAgent(obs_size, cfg.algorithm.architecture.hidden_size, act_size)
    explorer = EGreedyActionSelector(cfg.algorithm.epsilon)
    target_critic = copy.deepcopy(critic)

    q_agent = TemporalAgent(critic)
    target_q_agent = TemporalAgent(target_critic)

    #her_agent = HERAgent()

    #tr_agent = Agents(train_env_agent, critic, explorer, her_agent)
    #ev_agent = Agents(eval_env_agent, critic, her_agent)
    tr_agent = Agents(train_env_agent, critic, explorer)
    ev_agent = Agents(eval_env_agent, critic)
    
    train_agent = TemporalAgent(tr_agent)
    eval_agent = TemporalAgent(ev_agent)

    goal_label_agent = TemporalAgent(GoalRelabellingAgent())
    reward_agent = TemporalAgent(RewardAgent())

    train_agent.seed(cfg.algorithm.seed)
    #return train_agent, eval_agent, q_agent, target_q_agent, goal_label_agent, reward_agent, her_agent
    return train_agent, eval_agent, q_agent, target_q_agent, goal_label_agent, reward_agent


# Configure the optimizer
def setup_optimizers(cfg, q_agent):
    optimizer_args = get_arguments(cfg.optimizer)
    parameters = q_agent.parameters()
    optimizer = get_class(cfg.optimizer)(parameters, **optimizer_args)
    return optimizer


def compute_critic_loss(cfg, reward, must_bootstrap, q_values, target_q_values, action):
    # Compute temporal difference
    max_q = target_q_values[1].max(-1)[0].detach()

    target = (
        reward[:-1]
        + cfg.algorithm.discount_factor * max_q * must_bootstrap.int()
    )

    vals = q_values.squeeze()
    qvals = torch.gather(vals, dim=1, index=action)
    qvals = qvals[:-1]

    mse = nn.MSELoss()
    critic_loss = mse(target, qvals)
    return critic_loss

In [184]:
def run_dqn(cfg, reward_logger):
    # 1)  Build the  logger
    logger = Logger(cfg)
    best_reward = -10e9

    # 2) Create the environment agent
    train_env_agent, eval_env_agent = get_env_agents(cfg)

    # 3) Create the DQN-like Agent
    #train_agent, eval_agent, q_agent, target_q_agent, goal_label_agent, reward_agent,her_agent = create_dqn_agent(cfg, train_env_agent, eval_env_agent)
    train_agent, eval_agent, q_agent, target_q_agent, goal_label_agent, reward_agent = create_dqn_agent(cfg, train_env_agent, eval_env_agent)

    # 4) Create the Replay Buffer Agent
    rb = ReplayBuffer(max_size=cfg.algorithm.buffer_size)

    # 6) Configure the optimizer
    optimizer = setup_optimizers(cfg, q_agent)
    nb_steps = 0
    tmp_steps = 0
    tmp_steps2 = 0

    # 7) Training
    # Train des épisodes
    for _ in range(cfg.algorithm.n_episodes):
        train_workspace = Workspace()
        train_agent(train_workspace, t=0, stop_variable="env/done", stochastic=True)

        transition_workspace = train_workspace.get_transitions()

        
        # comptage du nb de step de l'épisode
        action = transition_workspace["action"]
        nb_steps += action[0].shape[0]

        # ajout des transitions au RB
        rb.put(transition_workspace)

        # 7.1) Loop Replay Buffer
        for _ in range(cfg.algorithm.n_updates):
            # tirage aléatoire d'un minibatch dans un Workspace
            rb_workspace = rb.get_shuffled(cfg.algorithm.batch_size)

            achieved_goal = rb_workspace["env/env_obs"][-1]
            #goal_label_agent(rb_workspace, t=0, achieved_goal=achieved_goal, n_steps=2, choose_action=False)
            #reward_agent(rb_workspace, t=0, n_steps=2)

            # The q agent needs to be executed on the rb_workspace workspace (gradients are removed in workspace).
            q_agent(rb_workspace, t=0, n_steps=2, choose_action=False)

            q_values, done, truncated, reward, action = rb_workspace[
                "q_values", "env/done", "env/truncated", "env/reward", "action"
            ]

            with torch.no_grad():
                target_q_agent(rb_workspace, t=0, n_steps=2, stochastic=True)

            target_q_values = rb_workspace["q_values"]
            # assert torch.equal(q_values, target_q_values), "values differ"

            # Determines whether values of the critic should be propagated
            # True if the episode reached a time limit or if the task was not done
            # See https://colab.research.google.com/drive/1erLbRKvdkdDy0Zn1X_JhC01s1QAt4BBj?usp=sharing
            must_bootstrap = torch.logical_or(~done[1], truncated[1])

            if rb.size() > cfg.algorithm.learning_starts:
                # Compute critic loss
                critic_loss = compute_critic_loss(
                    cfg, reward, must_bootstrap, q_values[0], target_q_values[1], action
                )

                # Store the loss for tensorboard display
                logger.add_log("critic_loss", critic_loss, nb_steps)

                optimizer.zero_grad()
                critic_loss.backward()
                torch.nn.utils.clip_grad_norm_(
                    q_agent.parameters(), cfg.algorithm.max_grad_norm
                )
                optimizer.step()

        # 7.2) Maj du Q_target (sous conditions)
        if nb_steps - tmp_steps2 > cfg.algorithm.target_critic_update:
            tmp_steps2 = nb_steps
            target_q_agent.agent = copy.deepcopy(q_agent.agent)
        
        # 7.3) Évaluation régulère
        if nb_steps - tmp_steps > cfg.algorithm.eval_interval:
            tmp_steps = nb_steps
            eval_workspace = Workspace()  # Used for evaluation
            eval_agent(
                eval_workspace, t=0, stop_variable="env/done", choose_action=True
            )
            rewards = eval_workspace["env/cumulated_reward"][-1]
            mean = rewards.mean()
            logger.add_log("reward", mean, nb_steps)
            print(f"reward: {mean}")
            reward_logger.add(nb_steps, mean)
            
            if cfg.save_best and mean > best_reward:
                best_reward = mean
                directory = "./dqn_critic/"
                
                if not os.path.exists(directory):
                    os.makedirs(directory)
                
                filename = directory + "dqn_" + str(mean.item()) + ".agt"
                eval_agent.save_model(filename)
                
                if cfg.plot_agents:
                    policy = eval_agent.agent.agents[1]
                    plot_policy(
                        policy,
                        eval_env_agent,
                        "./dqn_plots/",
                        cfg.gym_env.env_name,
                        best_reward,
                        stochastic=False,
                    )
                    plot_critic(
                        policy,
                        eval_env_agent,
                        "./dqn_plots/",
                        cfg.gym_env.env_name,
                        best_reward,
                    )

In [185]:
params={
  "save_best": False,
  "plot_agents": True,
  
  "logger":{
    "classname": "bbrl.utils.logger.TFLogger",
    "log_dir": "./dqn_logs/",
    "cache_size": 10000,
    "every_n_seconds": 1,
    "verbose": False,    
  },

  "algorithm":{
    "seed": 3,                      # modifié par la main loop
    "nb_seeds": 1,                  # nb de seed testées (de 0 à valeur proposée)

    "epsilon": 0.05,                 # valeur pour epsilon-greedy
    "discount_factor": 0.99,        # delta
    "gae": 0.8,                     # ???

    "n_steps": 64,                  # nb max de step par épisode ?
    "n_envs": 10,                    # nb d'environnement en simultané
    "n_episodes": 20,                # nb d'épisodes
    "nb_evals": 5,                 # nb d'évaluation après train
    "eval_interval": 10,            # intervalle (steps) entre évaluations ?
    "target_critic_update": 10,    # intervalle (steps) entre chaque maj de Q_target ?

    "learning_starts": 1,           # ???

    "n_updates": 20,                # nb d'update par le Replay Buffer
    "buffer_size": 1e6,             # taille max du Replay Buffer
    "batch_size": 50,              # taille du batch Replay Buffer

    "max_grad_norm": 0.5,           # ???
    "architecture":{"hidden_size": [128, 128]},
  },

  "gym_env":{
    "classname": "__main__.make_gym_env",
    "env_name": "CartPole-v1"
  },

  "optimizer":{
    "classname": "torch.optim.Adam",
    "lr": 2.3e-3,
  }
}

config = OmegaConf.create(params)

In [186]:
def main_loop(cfg):
    chrono = Chrono()
    logdir = "./plot/"

    if not os.path.exists(logdir):
        os.makedirs(logdir)

    reward_logger = RewardLogger(
        logdir + "dqn.steps", logdir + "dqn.rwd"
    )

    for seed in range(cfg.algorithm.nb_seeds):
        #cfg.algorithm.seed = seed
        torch.manual_seed(cfg.algorithm.seed)
        run_dqn(cfg, reward_logger)

        if seed < cfg.algorithm.nb_seeds - 1:
            reward_logger.new_episode()

    reward_logger.save()
    chrono.stop()
    plotter = Plotter(logdir + "dqn.steps", logdir + "dqn.rwd")
    plotter.plot_reward("dqn", cfg.gym_env.env_name)

In [187]:
main_loop(config)

reward: 9.399999618530273
reward: 9.399999618530273
reward: 9.199999809265137
reward: 9.199999809265137
reward: 9.399999618530273
reward: 9.800000190734863
reward: 9.0
reward: 8.800000190734863
reward: 10.0
reward: 10.199999809265137
reward: 9.199999809265137
reward: 9.800000190734863
reward: 9.600000381469727
reward: 9.600000381469727
reward: 9.399999618530273
reward: 9.399999618530273
reward: 10.0
reward: 9.0
reward: 9.0
reward: 9.399999618530273
Time : 5s 801ms


: 

: 