In [21]:
import joblib
import numpy as np
import tensorflow as tf
from garage.sampler.utils import rollout
import time
from mylab.samplers.ast_vectorized_sampler import ASTVectorizedSampler
from garage.tf.algos.batch_polopt import BatchPolopt
from mylab.simulators.example_av_simulator import ExampleAVSimulator
from mylab.rewards.example_av_reward import ExampleAVReward
from mylab.spaces.example_av_spaces import ExampleAVSpaces
import pdb

In [108]:
with tf.Session() as sess:
    with tf.variable_scope('Loader', reuse=True):
        data = joblib.load('./data/debug/g5/itr_1000.pkl')

In [11]:
type(data)

dict

In [14]:
print(data.keys())

dict_keys(['baseline', 'env', 'policy', 'itr', 'paths'])


In [22]:
dir(data['policy'])

['_Serializable__args',
 '_Serializable__kwargs',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_cached_assign_ops',
 '_cached_assign_placeholders',
 '_cached_param_dtypes',
 '_cached_param_shapes',
 '_cached_params',
 '_env_spec',
 '_input_layers',
 '_output_layers',
 '_serializable_initialized',
 'action_dim',
 'action_space',
 'clone',
 'dist',
 'dist_info',
 'dist_info_sym',
 'distribution',
 'env_spec',
 'f_step_mean_std',
 'feature_network',
 'flat_to_params',
 'get_action',
 'get_actions',
 'get_param_dtypes',
 'get_param_shapes',
 'get_param_values',
 'get_params',
 'get_params_internal',
 'hidden_dim',
 'input_dim',
 'l_input',
 'l_log_std',
 'l

AttributeError: 'NoneType' object has no attribute 'run'

In [133]:
type(data['itr'])

int

In [19]:
type(data['paths'])

list

In [72]:
dir(data['env'].wrapped_env.wrapped_env)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_action',
 '_actions',
 '_done',
 '_first_step',
 '_info',
 '_init_state',
 '_reward',
 '_sample_init_state',
 '_step',
 'action_dim',
 'action_only',
 'action_space',
 'get_cache_list',
 'get_param_values',
 'horizon',
 'log',
 'log_diagnostics',
 'observation_space',
 'render',
 'reset',
 'reward_function',
 'set_param_values',
 'simulate',
 'simulator',
 'spaces',
 'spec',
 'step',
 'terminate']

In [64]:
with tf.Session() as sess:
    with tf.variable_scope('Loader', reuse=True):
        data = joblib.load('./data/debug/d_gen7/itr_990.pkl')
        start = time.clock()
        for i in range(200):
            x = rollout(data['env'],data['policy'], max_path_length=50, animated=False, speedup=1,always_return_paths=False)
        print(time.clock() - start)
# print(x)

23.791196


In [58]:
print(x.keys())

dict_keys(['actions', 'env_infos', 'rewards', 'agent_infos', 'observations'])


In [4]:
import time

import tensorflow as tf

from garage.algos import RLAlgorithm
import garage.misc.logger as logger
from garage.tf.plotter import Plotter
from garage.tf.samplers import BatchSampler
from garage.tf.samplers import OnPolicyVectorizedSampler


class BatchPoloptCustom(RLAlgorithm):
    """
    Base class for batch sampling-based policy optimization methods.
    This includes various policy gradient methods like vpg, npg, ppo, trpo, etc.
    """

    def __init__(
            self,
            env,
            policy,
            baseline,
            scope=None,
            n_itr=500,
            start_itr=0,
            batch_size=5000,
            max_path_length=500,
            discount=0.99,
            gae_lambda=1,
            plot=False,
            pause_for_plot=False,
            center_adv=True,
            positive_adv=False,
            store_paths=False,
            whole_paths=True,
            fixed_horizon=False,
            sampler_cls=None,
            sampler_args=None,
            force_batch_sampler=False,
            **kwargs
    ):
        """
        :param env: Environment
        :param policy: Policy
        :type policy: Policy
        :param baseline: Baseline
        :param scope: Scope for identifying the algorithm. Must be specified if running multiple algorithms
        simultaneously, each using different environments and policies
        :param n_itr: Number of iterations.
        :param start_itr: Starting iteration.
        :param batch_size: Number of samples per iteration.
        :param max_path_length: Maximum length of a single rollout.
        :param discount: Discount.
        :param gae_lambda: Lambda used for generalized advantage estimation.
        :param plot: Plot evaluation run after each iteration.
        :param pause_for_plot: Whether to pause before contiuing when plotting.
        :param center_adv: Whether to rescale the advantages so that they have mean 0 and standard deviation 1.
        :param positive_adv: Whether to shift the advantages so that they are always positive. When used in
        conjunction with center_adv the advantages will be standardized before shifting.
        :param store_paths: Whether to save all paths data to the snapshot.
        :return:
        """
        self.env = env
        self.policy = policy
        self.baseline = baseline
        self.scope = scope
        self.n_itr = n_itr
        self.start_itr = start_itr
        self.batch_size = batch_size
        self.max_path_length = max_path_length
        self.discount = discount
        self.gae_lambda = gae_lambda
        self.plot = plot
        self.pause_for_plot = pause_for_plot
        self.center_adv = center_adv
        self.positive_adv = positive_adv
        self.store_paths = store_paths
        self.whole_paths = whole_paths
        self.fixed_horizon = fixed_horizon
        if sampler_cls is None:
            if self.policy.vectorized and not force_batch_sampler:
                sampler_cls = VectorizedSampler
            else:
                sampler_cls = BatchSampler
        if sampler_args is None:
            sampler_args = dict()
        self.sampler = sampler_cls(self, **sampler_args)
        self.init_opt()

    def start_worker(self):
        self.sampler.start_worker()

    def shutdown_worker(self):
        self.sampler.shutdown_worker()

    def obtain_samples(self, itr):
        return self.sampler.obtain_samples(itr)

    def process_samples(self, itr, paths):
        return self.sampler.process_samples(itr, paths)

    def test(self, sess=None, checkpoint=None):
        created_session = True if (sess is None) else False
        if sess is None:
            sess = tf.Session()
            sess.__enter__()
            
        sess.run(tf.global_variables_initializer())
        if checkpoint is not None:
            pdb.set_trace()
            saver = tf.train.Saver()
            saver.restore(sess, checkpoint)
        self.start_worker()
        start_time = time.time()
        itr_start_time = time.time()
#         logger.log("Obtaining samples...")
        paths = self.obtain_samples(0)
#         logger.log("Processing samples...")
        samples_data = self.process_samples(0, paths)
#         logger.log("Logging diagnostics...")
#         self.log_diagnostics(paths)
#             logger.log("Optimizing policy...")
#             self.optimize_policy(itr, samples_data)
#             logger.log("Saving snapshot...")
#             params = self.get_itr_snapshot(itr, samples_data)  # , **kwargs)
#         if self.store_paths:
#             params["paths"] = samples_data["paths"]
#             logger.save_itr_params(itr, params)
#             logger.log("Saved")
#             logger.record_tabular('Time', time.time() - start_time)
#             logger.record_tabular('ItrTime', time.time() - itr_start_time)
#             logger.dump_tabular(with_prefix=False)
#             if self.plot:
#                 rollout(self.env, self.policy, animated=True, max_path_length=self.max_path_length)
#                 if self.pause_for_plot:
#                     input("Plotting evaluation run: Press Enter to "
#                           "continue...")
        self.shutdown_worker()
        if created_session:
            sess.close()
        return samples_data

    def log_diagnostics(self, paths):
        self.env.log_diagnostics(paths)
        self.policy.log_diagnostics(paths)
        self.baseline.log_diagnostics(paths)

    def init_opt(self):
        """
        Initialize the optimization procedure. If using tensorflow, this may
        include declaring all the variables and compiling functions
        """
        return None

    def get_itr_snapshot(self, itr, samples_data):
        """
        Returns all the data that should be saved in the snapshot for this
        iteration.
        """
        return samples_data

    def optimize_policy(self, itr, samples_data):
        return None



In [125]:
sampler_cls = ASTVectorizedSampler
algo = BatchPoloptCustom(
    env=data['env'],
    policy=data['policy'],
    baseline=data['baseline'],
    batch_size=50000,
    step_size=0.0,
    n_itr=1,
    store_paths=True,
    optimizer=None,
    max_path_length=50,
    sampler_cls=sampler_cls,
    sampler_args={"interactive":False,
                  "sim": data['env'].wrapped_env.wrapped_env.simulator,
                  "reward_function": data['env'].wrapped_env.wrapped_env.reward_function})
with tf.Session() as sess:
    with tf.variable_scope('Loader', reuse=True):
        data = joblib.load('./data/debug/g5/itr_1000.pkl')
        start = time.clock()
        for i in range(1):
            x = algo.test(sess=sess)
        print(time.clock() - start)
max(x['rewards'][:,-1])

2018-12-12 20:41:33.468855 PST | Obtaining samples for iteration 0...


0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:10


2018-12-12 20:41:54.570549 PST | fitting baseline...
2018-12-12 20:41:54.625193 PST | fitted
24.686548000000016


-85233.145623913893

In [128]:
test = data['env'].reset()

In [96]:
print(x.keys())

dict_keys(['rewards', 'env_infos', 'valids', 'returns', 'agent_infos', 'observations', 'actions', 'advantages', 'paths'])


In [98]:
len(x['paths'])

1000

In [114]:
np.max(x['rewards'])

-0.88322380728369576

In [116]:
dir(sampler_cls)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'obtain_samples',
 'process_samples',
 'shutdown_worker',
 'slice_dict',
 'start_worker']

In [138]:
with tf.Session() as sess:
    with tf.variable_scope('Loader', reuse=True):
        data = joblib.load('./data/debug/g5/itr_1000.pkl')
        data['policy'].reset()
        vals = data['policy'].get_param_values()

In [22]:
with tf.Session() as sess:
    with tf.variable_scope('', reuse=True):
        data = joblib.load('./data/debug/g1/itr_100.pkl')
        sampler_cls = ASTVectorizedSampler
        algo = BatchPoloptCustom(
        env=data['env'],
        policy=data['policy'],
        baseline=data['baseline'],
        batch_size=50000,
        step_size=0.0,
        n_itr=1,
        store_paths=True,
        optimizer=None,
        max_path_length=50,
        sampler_cls=sampler_cls,
        sampler_args={"interactive":False,
                      "sim": data['env'].env.env.simulator,
                      "reward_function": data['env'].env.env.reward_function})
        start = time.clock()
        for i in range(1):
            x = algo.test(sess=sess, checkpoint='./data/debug/g1/model.ckpt')
        print(time.clock() - start)
    max(x['rewards'][:,-1])

> <ipython-input-4-9fb0159c3f37>(111)test()
-> saver = tf.train.Saver()
(Pdb) saver = tf.train.Saver()
(Pdb) saver.restore(sess, checkpoint)
INFO:tensorflow:Restoring parameters from ./data/debug/g1/model.ckpt
(Pdb) c
INFO:tensorflow:Restoring parameters from ./data/debug/g1/model.ckpt
2018-12-13 22:38:32 | Obtaining samples for iteration 0...


0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:13


91.596936


In [13]:
with tf.Session() as sess:
    with tf.variable_scope('', reuse=True):
        data = joblib.load('./data/debug/g1/itr_100.pkl')

In [18]:
dir(data['env'].env.env)

['_Serializable__args',
 '_Serializable__kwargs',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_action',
 '_actions',
 '_done',
 '_first_step',
 '_info',
 '_init_state',
 '_reward',
 '_sample_init_state',
 '_serializable_initialized',
 '_step',
 'action_only',
 'action_space',
 'clone',
 'close',
 'get_cache_list',
 'log',
 'log_diagnostics',
 'metadata',
 'observation_space',
 'quick_init',
 'render',
 'reset',
 'reward_function',
 'reward_range',
 'seed',
 'simulate',
 'simulator',
 'spaces',
 'spec',
 'step',
 'unwrapped']

In [31]:
y = np.sum(x['rewards'], axis=1)


(1000,)

In [29]:
x['rewards'].shape

(1000, 50)