# Exploring BEAR Algorithm

## Imports
* __TODO__ Check why cuda is not available

In [8]:
import gym
import numpy as np
import argparse
import os
import pickle
import gzip

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.distributions as td

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Cuda is available")
else:
    device = torch.device("cpu")
    print("Cuda is NOT available")

# Intern Imports
import BEAR.algos as algos
from BEAR.logger import logger, setup_logger
from BEAR.logger import create_stats_ordered_dict

Cuda is NOT available


## Utils
* __TODO__ Explore step by step

In [12]:
class ReplayBuffer(object):
    def __init__(self, state_dim=10, action_dim=4, size=1000000):
        self.storage = dict()
        self.storage['observations'] = np.zeros((size, state_dim), np.float32)
        self.storage['next_observations'] = np.zeros((size, state_dim), np.float32)
        self.storage['actions'] = np.zeros((size, action_dim), np.float32)
        self.storage['rewards'] = np.zeros((size, 1), np.float32)
        self.storage['terminals'] = np.zeros((size, 1), np.float32)
        self.storage['bootstrap_mask'] = np.zeros((size, 4), np.float32)
        self.buffer_size = size
        self.ctr = 0

    # Expects tuples of (state, next_state, action, reward, done)
    def add(self, data):
        self.storage['observations'][self.ctr] = data[0]
        self.storage['next_observations'][self.ctr] = data[1]
        self.storage['actions'][self.ctr] = data[2]
        self.storage['rewards'][self.ctr] = data[3]
        self.storage['terminals'][self.ctr] = data[4]
        self.ctr += 1
        self.ctr = self.ctr % self.buffer_size

    def sample(self, batch_size, with_data_policy=False):
        ind = np.random.randint(0, self.storage['observations'].shape[0], size=batch_size)
        state, next_state, action, reward, done = [], [], [], [], []
        # import ipdb; ipdb.set_trace()

        s = self.storage['observations'][ind]
        a = self.storage['actions'][ind]
        r = self.storage['rewards'][ind]
        s2 = self.storage['next_observations'][ind]
        d = self.storage['terminals'][ind]
        mask = self.storage['bootstrap_mask'][ind]

        if with_data_policy:
                data_mean = self.storage['data_policy_mean'][ind]
                data_cov = self.storage['data_policy_logvar'][ind]

                return (np.array(s), 
                        np.array(s2), 
                        np.array(a), 
                        np.array(r).reshape(-1, 1), 
                        np.array(d).reshape(-1, 1),
                        np.array(mask),
                        np.array(data_mean),
                        np.array(data_cov))

        return (np.array(s), 
                np.array(s2), 
                np.array(a), 
                np.array(r).reshape(-1, 1), 
                np.array(d).reshape(-1, 1),
                np.array(mask))

    def save(self, path):
        np.save(path+".npy", self.storage)

    def load(self, filename, bootstrap_dim=None):
#         with gzip.open(filename, 'rb') as f:
#                 self.storage = pickle.load(f)
                
#         with open(filename, 'rb') as f:
#                self.storage = pickle.load(f)
        keys = ('observations', 'next_observations', 'actions', 'rewards', 'terminals', 'bootstrap_mask')
        storage = np.load(filename, allow_pickle=True).item()
        self.storage = dict()
        for key in storage.keys():
              self.storage[key] = storage[key]
                
        sum_returns = self.storage['rewards'].sum()
        num_traj = self.storage['terminals'].sum()
        if num_traj == 0:
                num_traj = 1000
        average_per_traj_return = sum_returns/num_traj
        print ("Average Return: ", average_per_traj_return)
        # import ipdb; ipdb.set_trace()
        
        num_samples = self.storage['observations'].shape[0]
        if bootstrap_dim is not None:
                self.bootstrap_dim = bootstrap_dim
                bootstrap_mask = np.random.binomial(n=1, size=(1, num_samples, bootstrap_dim,), p=0.8)
                bootstrap_mask = np.squeeze(bootstrap_mask, axis=0)
                self.storage['bootstrap_mask'] = bootstrap_mask[:num_samples]

## BEAR
* __Q__ Was macht der Actor?
* __Q__ Was macht der Critic? Warum berechnet er 2 bzw. 4 Qs? Warum hat der Critic eine `with_var`? 
* __Q__ Warum wird ein VAE benötigt? Für __BC__? Warum wird ist der Decoder input die latente Represnetation und die Observation?
* __Q__ Was passiert wenn `mode='auto'`?


In [3]:
class BEAR(object):
    def __init__(self, num_qs, state_dim, action_dim, max_action, delta_conf=0.1, use_bootstrap=True, version=0, lambda_=0.4,
                 threshold=0.05, mode='auto', num_samples_match=10, mmd_sigma=10.0,
                 lagrange_thresh=10.0, use_kl=False, use_ensemble=True, kernel_type='laplacian'):
        
        
        """ 
        Setup Actor
        # (l1): ReLu - Linear(state_dim, 400)
        # (l2): ReLu - Linear(400, 300)
        # (mean / log_std): Linear(300, action_dim) / Linear(300, action_dim)
        # (z) mean + std * randomTensor --> Sampalning from learnd Distribution (reparameterization trick)
        # (return) max_action * torch.tanh(z) --> ??? 
        """
        self.actor = algos.RegularActor(state_dim, action_dim, max_action).to(device)
        self.actor_target = algos.RegularActor(state_dim, action_dim, max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())

        
        """ 
        Setup Critic
        # num_qs is never used
        # For all Qs: 
        #    (l1): ReLu - Linear(state_dim + action_dim, 400)
        #    (l2): ReLu - Linear(400, 300)
        #    (l3): ReLu - Linear(300, 1)
        # return 2 or 4 q-values
        """ 
        self.critic = algos.EnsembleCritic(num_qs, state_dim, action_dim).to(device)
        self.critic_target = algos.EnsembleCritic(num_qs, state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
        
        
        """ 
        Setup VAE (Vanilla Variational Auto-Encoder)
        # Encoder
        #   (e1): Linear(state_dim + action_dim, 750)
        #   (e2): Linear(750, 750)
        # Sampel z (reparameterization trick)
        #   (mean): Linear(in_features=750, latent_dim)
        #   (log_std): Linear(in_features=750, latent_dim)
        #   (z) mean + std * randomTensor
        # Decoder
        #   (d1): Linear(state_dim + latent_dim, 750) --> Why is latent + state(obs)
        #   (d2): Linear(750, 750)
        #   (d3): Linear(750, action_dim)
        """ 
        latent_dim = action_dim * 2
        self.vae = algos.VAE(state_dim, action_dim, latent_dim, max_action).to(device)
        self.vae_optimizer = torch.optim.Adam(self.vae.parameters()) 

        """ 
        Save args in self
        """ 
        self.max_action = max_action
        self.action_dim = action_dim
        self.delta_conf = delta_conf
        self.use_bootstrap = use_bootstrap
        self.version = version
        self._lambda = lambda_
        self.threshold = threshold
        self.mode = mode
        self.num_qs = num_qs
        self.num_samples_match = num_samples_match
        self.mmd_sigma = mmd_sigma
        self.lagrange_thresh = lagrange_thresh
        self.use_kl = use_kl
        self.use_ensemble = use_ensemble
        self.kernel_type = kernel_type
        
    ### Karam Fragen was hier passiert ### 
        if self.mode == 'auto':
            # Use lagrange multipliers on the constraint if set to auto mode 
            # for the purpose of maintaing support matching at all times
            self.log_lagrange2 = torch.randn((), requires_grad=True, device=device)
            self.lagrange2_opt = torch.optim.Adam([self.log_lagrange2,], lr=1e-3)

        self.epoch = 0

    #MMD distance between actions as a measure of support divergence.
    def mmd_loss_laplacian(self, samples1, samples2, sigma=0.2):
        """MMD constraint with Laplacian kernel for support matching"""
        # sigma is set to 10.0 for hopper, cheetah and 20 for walker/ant
        diff_x_x = samples1.unsqueeze(2) - samples1.unsqueeze(1)  # B x N x N x d
        diff_x_x = torch.mean((-(diff_x_x.abs()).sum(-1)/(2.0 * sigma)).exp(), dim=(1,2))

        diff_x_y = samples1.unsqueeze(2) - samples2.unsqueeze(1)
        diff_x_y = torch.mean((-(diff_x_y.abs()).sum(-1)/(2.0 * sigma)).exp(), dim=(1, 2))

        diff_y_y = samples2.unsqueeze(2) - samples2.unsqueeze(1)  # B x N x N x d
        diff_y_y = torch.mean((-(diff_y_y.abs()).sum(-1)/(2.0 * sigma)).exp(), dim=(1,2))

        overall_loss = (diff_x_x + diff_y_y - 2.0 * diff_x_y + 1e-6).sqrt()
        return overall_loss
    
    def mmd_loss_gaussian(self, samples1, samples2, sigma=0.2):
        """MMD constraint with Gaussian Kernel support matching"""
        # sigma is set to 10.0 for hopper, cheetah and 20 for walker/ant
        diff_x_x = samples1.unsqueeze(2) - samples1.unsqueeze(1)  # B x N x N x d
        diff_x_x = torch.mean((-(diff_x_x.pow(2)).sum(-1)/(2.0 * sigma)).exp(), dim=(1,2))

        diff_x_y = samples1.unsqueeze(2) - samples2.unsqueeze(1)
        diff_x_y = torch.mean((-(diff_x_y.pow(2)).sum(-1)/(2.0 * sigma)).exp(), dim=(1, 2))

        diff_y_y = samples2.unsqueeze(2) - samples2.unsqueeze(1)  # B x N x N x d
        diff_y_y = torch.mean((-(diff_y_y.pow(2)).sum(-1)/(2.0 * sigma)).exp(), dim=(1,2))

        overall_loss = (diff_x_x + diff_y_y - 2.0 * diff_x_y + 1e-6).sqrt()
        return overall_loss

    def kl_loss(self, samples1, state, sigma=0.2):
        """We just do likelihood, we make sure that the policy is close to the
           data in terms of the KL."""
        state_rep = state.unsqueeze(1).repeat(1, samples1.size(1), 1).view(-1, state.size(-1))
        samples1_reshape = samples1.view(-1, samples1.size(-1))
        samples1_log_pis = self.actor.log_pis(state=state_rep, raw_action=samples1_reshape)
        samples1_log_prob = samples1_log_pis.view(state.size(0), samples1.size(1))
        return (-samples1_log_prob).mean(1)
    
    def entropy_loss(self, samples1, state, sigma=0.2):
        state_rep = state.unsqueeze(1).repeat(1, samples1.size(1), 1).view(-1, state.size(-1))
        samples1_reshape = samples1.view(-1, samples1.size(-1))
        samples1_log_pis = self.actor.log_pis(state=state_rep, raw_action=samples1_reshape)
        samples1_log_prob = samples1_log_pis.view(state.size(0), samples1.size(1))
        # print (samples1_log_prob.min(), samples1_log_prob.max())
        samples1_prob = samples1_log_prob.clamp(min=-5, max=4).exp()
        return (samples1_prob).mean(1)
    
    def select_action(self, state):      
        """When running the actor, we just select action based on the max of the Q-function computed over
            samples from the policy -- which biases things to support."""
        with torch.no_grad():
            state = torch.FloatTensor(state.reshape(1, -1)).repeat(10, 1).to(device)
            action = self.actor(state)                       # sampels 10 actions
            q1 = self.critic.q1(state, action)               # returns a Q-Value for evry sampeld action 
            ind = q1.max(0)[1]                               # use action withe highest Q-Velue
        return action[ind].cpu().data.numpy().flatten()
    
    def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005):
        for it in range(iterations):
            
            
            # sample a batch of sasr and transform them from numpy to tensor
            state_np, next_state_np, action, reward, done, mask = replay_buffer.sample(batch_size)
            state           = torch.FloatTensor(state_np).to(device)
            action          = torch.FloatTensor(action).to(device)
            next_state      = torch.FloatTensor(next_state_np).to(device)
            reward          = torch.FloatTensor(reward).to(device)
            done            = torch.FloatTensor(1 - done).to(device)
            mask            = torch.FloatTensor(mask).to(device)
            
            
            # Train the Behaviour cloning policy to be able to take more than 1 sample for MMD
            recon, mean, std = self.vae(state, action)
            recon_loss = F.mse_loss(recon, action)
            KL_loss = -0.5 * (1 + torch.log(std.pow(2)) - mean.pow(2) - std.pow(2)).mean()
            vae_loss = recon_loss + 0.5 * KL_loss

            self.vae_optimizer.zero_grad()
            vae_loss.backward()
            self.vae_optimizer.step()

            # Critic Training: In this step, we explicitly compute the actions 
            with torch.no_grad():
                # Duplicate state 10 times (10 is a hyperparameter chosen by BCQ)
                state_rep = torch.FloatTensor(np.repeat(next_state_np, 10, axis=0)).to(device)
                
                # Compute value of perturbed actions sampled from the VAE
                target_Qs = self.critic_target(state_rep, self.actor_target(state_rep))

                # Soft Clipped Double Q-learning 
                target_Q = 0.75 * target_Qs.min(0)[0] + 0.25 * target_Qs.max(0)[0]
                target_Q = target_Q.view(batch_size, -1).max(1)[0].view(-1, 1)
                target_Q = reward + done * discount * target_Q

            current_Qs = self.critic(state, action, with_var=False)
            if self.use_bootstrap: 
                critic_loss = (F.mse_loss(current_Qs[0], target_Q, reduction='none') * mask[:, 0:1]).mean() +\
                            (F.mse_loss(current_Qs[1], target_Q, reduction='none') * mask[:, 1:2]).mean() 
                            # (F.mse_loss(current_Qs[2], target_Q, reduction='none') * mask[:, 2:3]).mean() +\
                            # (F.mse_loss(current_Qs[3], target_Q, reduction='none') * mask[:, 3:4]).mean()
            else:
                critic_loss = F.mse_loss(current_Qs[0], target_Q) + F.mse_loss(current_Qs[1], target_Q) #+ F.mse_loss(current_Qs[2], target_Q) + F.mse_loss(current_Qs[3], target_Q)

            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Action Training
            # If you take less samples (but not too less, else it becomes statistically inefficient), it is closer to a uniform support set matching
            num_samples = self.num_samples_match
            sampled_actions, raw_sampled_actions = self.vae.decode_multiple(state, num_decode=num_samples)  # B x N x d
            actor_actions, raw_actor_actions = self.actor.sample_multiple(state, num_samples)#  num)

            # MMD done on raw actions (before tanh), to prevent gradient dying out due to saturation
            if self.use_kl:
                mmd_loss = self.kl_loss(raw_sampled_actions, state)
            else:
                if self.kernel_type == 'gaussian':
                    mmd_loss = self.mmd_loss_gaussian(raw_sampled_actions, raw_actor_actions, sigma=self.mmd_sigma)
                else:
                    mmd_loss = self.mmd_loss_laplacian(raw_sampled_actions, raw_actor_actions, sigma=self.mmd_sigma)

            action_divergence = ((sampled_actions - actor_actions)**2).sum(-1)
            raw_action_divergence = ((raw_sampled_actions - raw_actor_actions)**2).sum(-1)

            # Update through TD3 style
            critic_qs, std_q = self.critic.q_all(state, actor_actions[:, 0, :], with_var=True)
            critic_qs = self.critic.q_all(state.unsqueeze(0).repeat(num_samples, 1, 1).view(num_samples*state.size(0), state.size(1)), actor_actions.permute(1, 0, 2).contiguous().view(num_samples*actor_actions.size(0), actor_actions.size(2)))
            critic_qs = critic_qs.view(self.num_qs, num_samples, actor_actions.size(0), 1)
            critic_qs = critic_qs.mean(1)
            std_q = torch.std(critic_qs, dim=0, keepdim=False, unbiased=False)

            if not self.use_ensemble:
                std_q = torch.zeros_like(std_q).to(device)
                
            if self.version == '0':
                critic_qs = critic_qs.min(0)[0]
            elif self.version == '1':
                critic_qs = critic_qs.max(0)[0]
            elif self.version == '2':
                critic_qs = critic_qs.mean(0)

            # We do support matching with a warmstart which happens to be reasonable around epoch 20 during training
            if self.epoch >= 20: 
                if self.mode == 'auto':
                    actor_loss = (-critic_qs +\
                        self._lambda * (np.sqrt((1 - self.delta_conf)/self.delta_conf)) * std_q +\
                        self.log_lagrange2.exp() * mmd_loss).mean()
                else:
                    actor_loss = (-critic_qs +\
                        self._lambda * (np.sqrt((1 - self.delta_conf)/self.delta_conf)) * std_q +\
                        100.0*mmd_loss).mean()      # This coefficient is hardcoded, and is different for different tasks. I would suggest using auto, as that is the one used in the paper and works better.
            else:
                if self.mode == 'auto':
                    actor_loss = (self.log_lagrange2.exp() * mmd_loss).mean()
                else:
                    actor_loss = 100.0*mmd_loss.mean()

            std_loss = self._lambda*(np.sqrt((1 - self.delta_conf)/self.delta_conf)) * std_q.detach() 

            self.actor_optimizer.zero_grad()
            if self.mode =='auto':
                actor_loss.backward(retain_graph=True)
            else:
                actor_loss.backward()
            # torch.nn.utils.clip_grad_norm(self.actor.parameters(), 10.0)
            self.actor_optimizer.step()

            # Threshold for the lagrange multiplier
            thresh = 0.05
            if self.use_kl:
                thresh = -2.0

            if self.mode == 'auto':
                lagrange_loss = (-critic_qs +\
                        self._lambda * (np.sqrt((1 - self.delta_conf)/self.delta_conf)) * (std_q) +\
                        self.log_lagrange2.exp() * (mmd_loss - thresh)).mean()

                self.lagrange2_opt.zero_grad()
                (-lagrange_loss).backward()
                # self.lagrange1_opt.step()
                self.lagrange2_opt.step() 
                self.log_lagrange2.data.clamp_(min=-5.0, max=self.lagrange_thresh)   
            
            # Update Target Networks 
            for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

        # Do all logging here
        logger.record_dict(create_stats_ordered_dict(
            'Q_target',
            target_Q.cpu().data.numpy(),
        ))
        if self.mode == 'auto':
            # logger.record_tabular('Lagrange1', self.log_lagrange1.exp().cpu().data.numpy())
            logger.record_tabular('Lagrange2', self.log_lagrange2.exp().cpu().data.numpy())

        logger.record_tabular('Actor Loss', actor_loss.cpu().data.numpy())
        logger.record_tabular('Critic Loss', critic_loss.cpu().data.numpy())
        logger.record_tabular('Std Loss', std_loss.cpu().data.numpy().mean())
        logger.record_dict(create_stats_ordered_dict(
            'MMD Loss',
            mmd_loss.cpu().data.numpy()
        ))
        logger.record_dict(create_stats_ordered_dict(
            'Sampled Actions',
            sampled_actions.cpu().data.numpy()
        ))
        logger.record_dict(create_stats_ordered_dict(
            'Actor Actions',
            actor_actions.cpu().data.numpy()
        ))
        logger.record_dict(create_stats_ordered_dict(
            'Current_Q',
            current_Qs.cpu().data.numpy()
        ))
        logger.record_dict(create_stats_ordered_dict(
            'Action_Divergence',
            action_divergence.cpu().data.numpy()
        ))
        logger.record_dict(create_stats_ordered_dict(
            'Raw Action_Divergence',
            raw_action_divergence.cpu().data.numpy()
        ))
        self.epoch = self.epoch + 1

## Evaluation (main)

In [4]:
# Runs policy for X episodes and returns average reward
def evaluate_policy(policy, eval_episodes=10, discounted=False, gamma=0.99):
    avg_reward = 0.
    all_rewards = []
    for _ in range(eval_episodes):
        obs = env.reset()
        done = False
        cntr = 0
        gamma_t = 1 # discounted
        while ((not done)):
            action = policy.select_action(np.array(obs))
            obs, reward, done, _ = env.step(action)
            if discounted:
                avg_reward += (gamma_t * reward) # discounted
                gamma_t = gamma * gamma_t # discounted
            else:
                avg_reward += reward
            cntr += 1
        all_rewards.append(avg_reward)
    avg_reward /= eval_episodes
    for j in range(eval_episodes-1, 1, -1):
        all_rewards[j] = all_rewards[j] - all_rewards[j-1]

    all_rewards = np.array(all_rewards)
    std_rewards = np.std(all_rewards)
    median_reward = np.median(all_rewards)
    print ("---------------------------------------")
    print ("Evaluation over %d episodes: %f" % (eval_episodes, avg_reward))
    print ("---------------------------------------")
    return avg_reward, std_rewards, median_reward

In [6]:
def bear_main(env_name='Pendulum-v0', seed=0, buffer_type="Robust", eval_freq=5e3, 
              max_timesteps=1e6, buffer_name=None, version='0', lamda=0.5, threshold=0.05, use_bootstrap=False,
              algo_name="OursBCQ", mode='hardcoded', num_samples_match=10, mmd_sigma=10.0, kernel_type='laplacian',
              lagrange_thresh=10.0, distance_type="MMD", log_dir='./data_hopper/', use_ensemble_variance='True',
              use_behaviour_policy='False', cloning="False", num_random=10, margin_threshold=10):
    
    # Generate file name
    file_name = algo_name + "_%s_%s_%s_%s_%s_%s_%s_%s_%s_%s_%s_%s_%s_%s_0.1" % (env_name, str(seed), str(version), str(lamda), str(threshold), str(use_bootstrap), str(mode),\
         str(kernel_type), str(num_samples_match), str(mmd_sigma), str(lagrange_thresh), str(distance_type), str(use_behaviour_policy), str(num_random))
    
    print ("---------------------------------------")
    print ("Settings: " + file_name)
    print ("---------------------------------------")
    
    if not os.path.exists("./results"):
        os.makedirs("./results")
    
    # Setup enve
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0] 
    max_action = float(env.action_space.high[0])
    print (state_dim, action_dim)
    print ('Max action: ', max_action)
    
    # Set seeds
    seed = np.random.randint(10, 1000)
    env.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    
    # Setup logger
    variant = dict(
        algorithm=algo_name,
        version=version,
        env_name=env_name,
        seed=seed,
        lamda=lamda,
        threshold=threshold,
        use_bootstrap=str(use_bootstrap),
        bootstrap_dim=4,
        delta_conf=0.1,
        mode=mode,
        kernel_type=kernel_type,
        num_samples_match=num_samples_match,
        mmd_sigma=mmd_sigma,
        lagrange_thresh=lagrange_thresh,
        distance_type=distance_type,
        use_ensemble_variance=use_ensemble_variance,
        use_data_policy=use_behaviour_policy,
        num_random=num_random,
        margin_threshold=margin_threshold,
    )
    setup_logger(file_name, variant=variant, log_dir=log_dir + file_name)
    
    # Setup policy (only BEAR available)
    policy = BEAR(2, state_dim, action_dim, max_action, delta_conf=0.1, use_bootstrap=False,
                  version=version,
                  lambda_=float(lamda),
                  threshold=float(threshold),
                  mode=mode,
                  num_samples_match=num_samples_match,
                  mmd_sigma=mmd_sigma,
                  lagrange_thresh=lagrange_thresh,
                  use_kl=(True if distance_type == "KL" else False),
                  use_ensemble=(False if use_ensemble_variance == "False" else True),
                  kernel_type=kernel_type)
    
    replay_buffer = ReplayBuffer()
    replay_buffer.load(buffer_name, bootstrap_dim=4)
    
    # Evaluat Algorithem
    evaluations = []
    episode_num = 0
    done = True 
    training_iters = 0
    while training_iters < max_timesteps: 
        pol_vals = policy.train(replay_buffer, iterations=int(eval_freq))
        ret_eval, var_ret, median_ret = evaluate_policy(policy)
        evaluations.append(ret_eval)
        np.save("./results/" + file_name, evaluations)
        training_iters += eval_freq
        print ("Training iterations: " + str(training_iters))
        logger.record_tabular('Training Epochs', int(training_iters // int(eval_freq)))
        logger.record_tabular('AverageReturn', ret_eval)
        logger.record_tabular('VarianceReturn', var_ret)
        logger.record_tabular('MedianReturn', median_ret)
        logger.dump_tabular()

In [11]:
torch.autograd.set_detect_anomaly(True)
bear_main(env_name='Pendulum-v0', buffer_name="./data/buffers/Pendulum-v0_expert_1E06.npy", mmd_sigma=20.0, kernel_type="gaussian", num_samples_match=5, version=0, lagrange_thresh=10.0, mode="auto")

---------------------------------------
Settings: OursBCQ_Pendulum-v0_0_0_0.5_0.05_False_auto_gaussian_5_20.0_10.0_MMD_False_10_0.1
---------------------------------------
3 1
Max action:  2.0
2020-05-07 14:49:36.356053 CEST | [OursBCQ_Pendulum-v0_0_0_0.5_0.05_False_auto_gaussian_5_20.0_10.0_MMD_False_10_0.1] [OursBCQ_Pendulum-v0_0_0_0.5_0.05_False_auto_gaussian_5_20.0_10.0_MMD_False_10_0.1] Variant:
2020-05-07 14:49:36.356567 CEST | [OursBCQ_Pendulum-v0_0_0_0.5_0.05_False_auto_gaussian_5_20.0_10.0_MMD_False_10_0.1] [OursBCQ_Pendulum-v0_0_0_0.5_0.05_False_auto_gaussian_5_20.0_10.0_MMD_False_10_0.1] {
  "algorithm": "OursBCQ",
  "version": 0,
  "env_name": "Pendulum-v0",
  "seed": 670,
  "lamda": 0.5,
  "threshold": 0.05,
  "use_bootstrap": "False",
  "bootstrap_dim": 4,
  "delta_conf": 0.1,
  "mode": "auto",
  "kernel_type": "gaussian",
  "num_samples_match": 5,
  "mmd_sigma": 20.0,
  "lagrange_thresh": 10.0,
  "distance_type": "MMD",
  "use_ensemble_variance": "True",
  "use_data_poli

RuntimeError: size mismatch, m1: [100 x 14], m2: [4 x 750] at /pytorch/aten/src/TH/generic/THTensorMath.cpp:41

### Archiv

In [None]:
# Use this if you want to eval other algos

#     # Setup policy
#     if algo_name == 'BCQ':
#         policy = algos.BCQ(state_dim, action_dim, max_action)
#     elif algo_name == 'BC':
#         policy = algos.BCQ(state_dim, action_dim, max_action, cloning=True)
#     elif algo_name == 'DQfD':
#         policy = algos.DQfD(state_dim, action_dim, max_action, lambda_=args.lamda, margin_threshold=float(args.margin_threshold))
#     elif algo_name == 'KLControl':
#         policy = algos.KLControl(2, state_dim, action_dim, max_action)
#     elif algo_name == 'BEAR':
#         policy = algos.BEAR(2, state_dim, action_dim, max_action, delta_conf=0.1, use_bootstrap=False,
#             version=args.version,
#             lambda_=float(args.lamda),
#             threshold=float(args.threshold),
#             mode=args.mode,
#             num_samples_match=args.num_samples_match,
#             mmd_sigma=args.mmd_sigma,
#             lagrange_thresh=args.lagrange_thresh,
#             use_kl=(True if args.distance_type == "KL" else False),
#             use_ensemble=(False if args.use_ensemble_variance == "False" else True),
#             kernel_type=args.kernel_type)
#     elif algo_name == 'BEAR_IS':
#         policy = algos.BEAR_IS(2, state_dim, action_dim, max_action, delta_conf=0.1, use_bootstrap=False,
#             version=args.version,
#             lambda_=float(args.lamda),
#             threshold=float(args.threshold),
#             mode=args.mode,
#             num_samples_match=args.num_samples_match,
#             mmd_sigma=args.mmd_sigma,
#             lagrange_thresh=args.lagrange_thresh,
#             use_kl=(True if args.distance_type == "KL" else False),
#             use_ensemble=(False if args.use_ensemble_variance == "False" else True),
#             kernel_type=args.kernel_type)

In [16]:
algos.RegularActor(10, 2, 1.0)

RegularActor(
  (l1): Linear(in_features=10, out_features=400, bias=True)
  (l2): Linear(in_features=400, out_features=300, bias=True)
  (mean): Linear(in_features=300, out_features=2, bias=True)
  (log_std): Linear(in_features=300, out_features=2, bias=True)
)

In [20]:
algos.EnsembleCritic(4, 10, 2)

EnsembleCritic(
  (l1): Linear(in_features=12, out_features=400, bias=True)
  (l2): Linear(in_features=400, out_features=300, bias=True)
  (l3): Linear(in_features=300, out_features=1, bias=True)
  (l4): Linear(in_features=12, out_features=400, bias=True)
  (l5): Linear(in_features=400, out_features=300, bias=True)
  (l6): Linear(in_features=300, out_features=1, bias=True)
)

In [21]:
algos.VAE(10, 2, 5, 1.0)

VAE(
  (e1): Linear(in_features=12, out_features=750, bias=True)
  (e2): Linear(in_features=750, out_features=750, bias=True)
  (mean): Linear(in_features=750, out_features=5, bias=True)
  (log_std): Linear(in_features=750, out_features=5, bias=True)
  (d1): Linear(in_features=15, out_features=750, bias=True)
  (d2): Linear(in_features=750, out_features=750, bias=True)
  (d3): Linear(in_features=750, out_features=2, bias=True)
)