##Setup

You will need to make a copy of this notebook in your Google Drive before you can edit the homework files. You can do so with **File &rarr; Save a copy in Drive**.

In [None]:

import os
import time

from cs285.infrastructure.rl_trainer import RL_Trainer
from cs285.agents.mb_agent import MBAgent

In [None]:
#@title set up virtual display

from pyvirtualdisplay import Display

display = Display(visible=0, size=(1400, 900))
display.start()

# For later
from cs285.infrastructure.local_utils_video import (
    wrap_env,
    show_video
)

In [None]:
#@title test virtual display

#@markdown If you see a video of a four-legged ant fumbling about, setup is complete!

import gym
import matplotlib
matplotlib.use('Agg')

env = wrap_env(gym.make("Ant-v2"))

observation = env.reset()
for i in range(10):
    env.render(mode='rgb_array')
    obs, rew, term, _ = env.step(env.action_space.sample() ) 
    if term:
      break;
            
env.close()
print('Loading video...')
show_video()

## Editing Code

To edit code, click the folder icon on the left menu. Navigate to the corresponding file (`cs285_f2021/...`). Double click a file to open an editor. There is a timeout of about ~12 hours with Colab while it is active (and less if you close your browser window). We sync your edits to Google Drive so that you won't lose your work in the event of an instance timeout, but you will need to re-mount your Google Drive and re-install packages with every new instance.

## Run MBRL

In [None]:
#@title imports
import os
import time

from cs285.infrastructure.rl_trainer import RL_Trainer
from cs285.agents.mb_agent import MBAgent

%load_ext autoreload
%autoreload 2

In [None]:
#@title runtime arguments

class Args:

  def __getitem__(self, key):
    return getattr(self, key)

  def __setitem__(self, key, val):
    setattr(self, key, val)

  def __contains__(self, key):
    return hasattr(self, key)

  env_name = "cheetah-cs285-v0" #@param ["cheetah-cs285-v0", "obstacles-cs285-v0", "reacher-cs285-v0"]
  exp_name = "TODO"#@param
  n_iter = 20 #@param {type:"integer"}

  if env_name == 'reacher-cs285-v0':
    ep_len = 200
  if env_name == 'cheetah-cs285-v0':
    ep_len = 500
  if env_name == 'obstacles-cs285-v0':
    ep_len = 100

  #@markdown batches and steps
  batch_size = 8000 #@param {type: "integer"}
  eval_batch_size = 400 #@param {type: "integer"}
  train_batch_size = 512 #@param {type: "integer"}
  batch_size_initial = 20000 #@param {type: "integer"}

  num_agent_train_steps_per_iter = 1000 #@param {type: "integer"}

  #@markdown MBRL parameters
  ensemble_size = 3 #@param {type:"integer"}
  mpc_horizon = 10 #@param {type:"integer"}
  mpc_num_action_sequences = 1000 #@param {type:"integer"}
  mpc_action_sampling_strategy = 'random' #@param ["random", "cem"]
  cem_iterations = 4 #@param {type: "integer"}
  cem_num_elites = 5 #@param {type: "integer"}
  cem_alpha = 1.0 #@param {type: "raw"}

  #@markdown Learning parameters
  learning_rate = 0.001 #@param {type:"raw"}
  n_layers = 2 #@param {type:"integer"}
  size = 250 #@param {type:"integer"}
  add_sl_noise = True #@param {type:"boolean"}

  #@markdown system
  save_params = False #@param {type: "boolean"}
  no_gpu = False #@param {type: "boolean"}
  which_gpu = 0 #@param {type: "integer"}
  seed = 1 #@param {type: "integer"}

  #@markdown logging
  ## default is to not log video so
  ## that logs are small enough to be
  ## uploaded to gradscope
  video_log_freq = -1 #@param {type: "integer"}
  scalar_log_freq = 1#@param {type: "integer"}


args = Args()

## ensure compatibility with hw1 code
args['train_batch_size'] = args['batch_size']

if args['video_log_freq'] > 0:
  import warnings
  warnings.warn(
      '''\nLogging videos will make eventfiles too'''
      '''\nlarge for the autograder. Set video_log_freq = -1'''
      '''\nfor the runs you intend to submit.''')

In [None]:
#@title create directories for logging

# data_path = '../../data/'

# if not (os.path.exists(data_path)):
#     os.makedirs(data_path)

# logdir = 'hw4_' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
# logdir = os.path.join(data_path, logdir)
# args['logdir'] = logdir
# if not(os.path.exists(logdir)):
#     os.makedirs(logdir)

# print("LOGGING TO: ", logdir)


In [None]:
#@title Define Model Based trainer

class MB_Trainer(object):

    def __init__(self, params):

        computation_graph_args = {
            'ensemble_size': params['ensemble_size'],
            'n_layers': params['n_layers'],
            'size': params['size'],
            'learning_rate': params['learning_rate'],
            }

        train_args = {
            'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
        }

        controller_args = {
            'mpc_horizon': params['mpc_horizon'],
            'mpc_num_action_sequences': params['mpc_num_action_sequences'],
            'mpc_action_sampling_strategy': params['mpc_action_sampling_strategy'],
            'cem_iterations': params['cem_iterations'],
            'cem_num_elites': params['cem_num_elites'],
            'cem_alpha': params['cem_alpha'],
        }

        agent_params = {**computation_graph_args, **train_args, **controller_args}

        self.params = params
        self.params['agent_class'] = MBAgent
        self.params['agent_params'] = agent_params

        self.rl_trainer = RL_Trainer(self.params)

    def run_training_loop(self):

        self.rl_trainer.run_training_loop(
            self.params['n_iter'],
            collect_policy = self.rl_trainer.agent.actor,
            eval_policy = self.rl_trainer.agent.actor,
            )


In [None]:
#@title run training

# trainer = MB_Trainer(args)
# trainer.run_training_loop()

In [None]:
#Q2
#python cs285/scripts/run_hw4_mb.py --exp_name q2_obstacles_singleiteration 
# --env_name obstacles-cs285-v0 --add_sl_noise --num_agent_train_steps_per_iter 20 --n_iter 1 
# --batch_size_initial 5000 --batch_size 1000 --mpc_horizon 10 --mpc_action_sampling_strategy 'random'

args = Args()
args.exp_name = 'q2_obstacles_singleiteration'
args.env_name = 'obstacles-cs285-v0'
args.add_sl_noise = True
args.num_agent_train_steps_per_iter = 20
args.n_iter = 1
args.batch_size_initial = 5000
args.batch_size = 1000
args.mpc_horizon = 10
args.mpc_action_sampling_strategy = 'random'

data_path = '../../data/'
logdir = 'hw4_' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
logdir = os.path.join(data_path, logdir)
args['logdir'] = logdir
if not(os.path.exists(logdir)):
    os.makedirs(logdir)

print("LOGGING TO: ", logdir)

trainer = MB_Trainer(args)
trainer.run_training_loop()


In [None]:
# Problem 3
# python cs285/scripts/run_hw4_mb.py --exp_name q3_obstacles --env_name obstacles-cs285-v0 
# --add_sl_noise --num_agent_train_steps_per_iter 20 --batch_size_initial 5000 --batch_size 1000 
# --mpc_horizon 10 --n_iter 12 --mpc_action_sampling_strategy 'random'
args = Args()
args.exp_name = 'q3_obstacles'
args.env_name = 'obstacles-cs285-v0'
args.add_sl_noise = True
args.num_agent_train_steps_per_iter = 20
args.batch_size_initial = 5000
args.batch_size = 1000
args.mpc_horizon = 10
args.n_iter = 12
args.mpc_action_sampling_strategy = 'random'

data_path = '../../data/'
logdir = 'hw4_' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
logdir = os.path.join(data_path, logdir)
args['logdir'] = logdir
if not(os.path.exists(logdir)):
    os.makedirs(logdir)

print("LOGGING TO: ", logdir)

trainer = MB_Trainer(args)
trainer.run_training_loop()


In [None]:
#Q3 2nd command
# python cs285/scripts/run_hw4_mb.py --exp_name q3_reacher 
# --env_name reacher-cs285-v0 --add_sl_noise --mpc_horizon 10 
# --num_agent_train_steps_per_iter 1000 --batch_size_initial 5000 
# --batch_size 5000 --n_iter 15 --mpc_action_sampling_strategy 'random

args = Args()
args.exp_name = 'q3_reacher'
args.env_name = 'reacher-cs285-v0'
args.add_sl_noise = True
args.mpc_horizon = 10
args.num_agent_train_steps_per_iter = 1000
args.batch_size_initial = 5000
args.batch_size = 5000
args.n_iter = 15
args.mpc_action_sampling_strategy = 'random'

data_path = '../../data/'
logdir = 'hw4_' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
logdir = os.path.join(data_path, logdir)
args['logdir'] = logdir
if not(os.path.exists(logdir)):
    os.makedirs(logdir)

print("LOGGING TO: ", logdir)

trainer = MB_Trainer(args)
trainer.run_training_loop()


In [None]:
#Q# - 3rd command
# python cs285/scripts/run_hw4_mb.py --exp_name q3_cheetah --env_name cheetah-cs285-v0 
# --mpc_horizon 15 --add_sl_noise --num_agent_train_steps_per_iter 1500 
# --batch_size_initial 5000 --batch_size 5000 --n_iter 20 
# --mpc_action_sampling_strategy 'random

args = Args()
args.exp_name = 'q3_cheetah'
args.env_name = 'cheetah-cs285-v0'
args.mpc_horizon = 15
args.add_sl_noise = True
args.num_agent_train_steps_per_iter = 1500
args.batch_size_initial = 5000
args.batch_size = 5000
args.n_iter = 20
args.mpc_action_sampling_strategy = 'random'

data_path = '../../data/'
logdir = 'hw4_' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
logdir = os.path.join(data_path, logdir)
args['logdir'] = logdir
if not(os.path.exists(logdir)):
    os.makedirs(logdir)

print("LOGGING TO: ", logdir)

trainer = MB_Trainer(args)
trainer.run_training_loop()

In [9]:
#Q4
data_path = '../../data/'

# python cs285/scripts/run_hw4_mb.py --exp_name q4_reacher_horizon5 --env_name reacher-cs285-v0 
# --add_sl_noise --mpc_horizon 5 --mpc_action_sampling_strategy 'random' 
# --num_agent_train_steps_per_iter 1000 --batch_size 800 --n_iter 15 
# --mpc_action_sampling_strategy 'random'
args = Args()
args.exp_name = 'q4_reacher_horizon5'
args.env_name = 'reacher-cs285-v0'
args.add_sl_noise = True
args.mpc_horizon = 5
args.num_agent_train_steps_per_iter = 1000
args.batch_size = 800
args.n_iter = 15
args.mpc_action_sampling_strategy = 'random'

logdir = 'hw4_' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
logdir = os.path.join(data_path, logdir)
args['logdir'] = logdir
if not(os.path.exists(logdir)):
    os.makedirs(logdir)

print("LOGGING TO: ", logdir)

trainer = MB_Trainer(args)
trainer.run_training_loop()

# 
# python cs285/scripts/run_hw4_mb.py --exp_name q4_reacher_horizon15 --env_name reacher-cs285-v0 
# --add_sl_noise --mpc_horizon 15 --num_agent_train_steps_per_iter 1000 --batch_size 800 
# --n_iter 15 --mpc_action_sampling_strategy 'random'

args = Args()
args.exp_name = 'q4_reacher_horizon15'
args.env_name = 'reacher-cs285-v0'
args.add_sl_noise = True
args.mpc_horizon = 15
args.num_agent_train_steps_per_iter = 1000
args.batch_size = 800
args.n_iter = 15
args.mpc_action_sampling_strategy = 'random'

logdir = 'hw4_' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
logdir = os.path.join(data_path, logdir)
args['logdir'] = logdir
if not(os.path.exists(logdir)):
    os.makedirs(logdir)

print("LOGGING TO: ", logdir)

trainer = MB_Trainer(args)
trainer.run_training_loop()

# python cs285/scripts/run_hw4_mb.py --exp_name q4_reacher_horizon30 --env_name reacher-cs285-v0 
# --add_sl_noise --mpc_horizon 30 --num_agent_train_steps_per_iter 1000 --batch_size 800 
# --n_iter 15 --mpc_action_sampling_strategy 'random'
# 
args = Args()
args.exp_name = 'q4_reacher_horizon30'
args.env_name = 'reacher-cs285-v0'
args.add_sl_noise = True
args.mpc_horizon = 30
args.num_agent_train_steps_per_iter = 1000
args.batch_size = 800
args.n_iter = 15
args.mpc_action_sampling_strategy = 'random'

logdir = 'hw4_' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
logdir = os.path.join(data_path, logdir)
args['logdir'] = logdir
if not(os.path.exists(logdir)):
    os.makedirs(logdir)

print("LOGGING TO: ", logdir)

trainer = MB_Trainer(args)
trainer.run_training_loop()



# python cs285/scripts/run_hw4_mb.py --exp_name q4_reacher_numseq100 --env_name reacher-cs285-v0 
# --add_sl_noise --mpc_horizon 10 --num_agent_train_steps_per_iter 1000 --batch_size 800 
# --n_iter 15 --mpc_num_action_sequences 100 --mpc_action_sampling_strategy 'random'
# 

args = Args()
args.exp_name = 'q4_reacher_numseq100'
args.env_name = 'reacher-cs285-v0'
args.add_sl_noise = True
args.mpc_horizon = 10
args.num_agent_train_steps_per_iter = 1000
args.batch_size = 800
args.n_iter = 15
args.mpc_num_action_sequences = 100
args.mpc_action_sampling_strategy = 'random'

logdir = 'hw4_' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
logdir = os.path.join(data_path, logdir)
args['logdir'] = logdir
if not(os.path.exists(logdir)):
    os.makedirs(logdir)

print("LOGGING TO: ", logdir)

trainer = MB_Trainer(args)
trainer.run_training_loop()

# python cs285/scripts/run_hw4_mb.py --exp_name q4_reacher_numseq1000 --env_name reacher-cs285-v0 
# --add_sl_noise --mpc_horizon 10 --num_agent_train_steps_per_iter 1000 --batch_size 800 
# --n_iter 15 --mpc_num_action_sequences 1000 --mpc_action_sampling_strategy 'random'
# 
args = Args()
args.exp_name = 'q4_reacher_numseq1000'
args.env_name = 'reacher-cs285-v0'
args.add_sl_noise = True
args.mpc_horizon = 10
args.num_agent_train_steps_per_iter = 1000
args.batch_size = 800
args.n_iter = 15
args.mpc_num_action_sequences = 1000
args.mpc_action_sampling_strategy = 'random'

logdir = 'hw4_' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
logdir = os.path.join(data_path, logdir)
args['logdir'] = logdir
if not(os.path.exists(logdir)):
    os.makedirs(logdir)

print("LOGGING TO: ", logdir)

trainer = MB_Trainer(args)
trainer.run_training_loop()



Training agent...

Training agent using sampled data from replay buffer...

Beginning logging procedure...

Collecting data for eval...
Eval_AverageReturn : -388.1238708496094
Eval_StdReturn : 0.0
Eval_MaxReturn : -388.1238708496094
Eval_MinReturn : -388.1238708496094
Eval_AverageEpLen : 500.0
Train_AverageReturn : -416.55743408203125
Train_StdReturn : 11.799728393554688
Train_MaxReturn : -404.7576904296875
Train_MinReturn : -428.3571472167969
Train_AverageEpLen : 500.0
Train_EnvstepsSoFar : 32000
TimeSinceStart : 1013.988255739212
Training Loss : 0.16945350170135498
Initial_DataCollection_AverageReturn : -4489.67822265625
Done logging...




********** Iteration 13 ************

Collecting data to be used for training...

Training agent...

Training agent using sampled data from replay buffer...

Beginning logging procedure...

Collecting data for eval...
Eval_AverageReturn : -483.6348876953125
Eval_StdReturn : 0.0
Eval_MaxReturn : -483.6348876953125
Eval_MinReturn : -483.63488769531

In [10]:


# python cs285/scripts/run_hw4_mb.py --exp_name q4_reacher_ensemble1 --env_name reacher-cs285-v0 
# --ensemble_size 1 --add_sl_noise --mpc_horizon 10 --num_agent_train_steps_per_iter 1000 
# --batch_size 800 --n_iter 15 --mpc_action_sampling_strategy 'random'
# 
args = Args()
args.exp_name = 'q4_reacher_ensemble1'
args.env_name = 'reacher-cs285-v0'
args.ensemble_size = 1
args.add_sl_noise = True
args.mpc_horizon = 10
args.num_agent_train_steps_per_iter = 1000
args.batch_size = 800
args.n_iter = 15
args.mpc_action_sampling_strategy = 'random'

logdir = 'hw4_' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
logdir = os.path.join(data_path, logdir)
args['logdir'] = logdir
if not(os.path.exists(logdir)):
    os.makedirs(logdir)

print("LOGGING TO: ", logdir)

trainer = MB_Trainer(args)
trainer.run_training_loop()

# python cs285/scripts/run_hw4_mb.py --exp_name q4_reacher_ensemble3 --env_name reacher-cs285-v0 
# --ensemble_size 3 --add_sl_noise --mpc_horizon 10 --num_agent_train_steps_per_iter 1000 
# --batch_size800 --n_iter 15 --mpc_action_sampling_strategy 'random'
# 
args = Args()
args.exp_name = 'q4_reacher_ensemble3'
args.env_name = 'reacher-cs285-v0'
args.ensemble_size = 3
args.add_sl_noise = True
args.mpc_horizon = 10
args.num_agent_train_steps_per_iter = 1000
args.batch_size = 800
args.n_iter = 15
args.mpc_action_sampling_strategy = 'random'

logdir = 'hw4_' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
logdir = os.path.join(data_path, logdir)
args['logdir'] = logdir
if not(os.path.exists(logdir)):
    os.makedirs(logdir)

print("LOGGING TO: ", logdir)

trainer = MB_Trainer(args)
trainer.run_training_loop()

# python cs285/scripts/run_hw4_mb.py --exp_name q4_reacher_ensemble5 --env_name reacher-cs285-v0 
# --ensemble_size 5 --add_sl_noise --mpc_horizon 10 --num_agent_train_steps_per_iter 1000 --batch_size800 --n_iter 15 
# --mpc_action_sampling_strategy 'random'

args = Args()
args.exp_name = 'q4_reacher_ensemble5'
args.env_name = 'reacher-cs285-v0'
args.ensemble_size = 5
args.add_sl_noise = True
args.mpc_horizon = 10
args.num_agent_train_steps_per_iter = 1000
args.batch_size = 800
args.n_iter = 15
args.mpc_action_sampling_strategy = 'random'

logdir = 'hw4_' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
logdir = os.path.join(data_path, logdir)
args['logdir'] = logdir
if not(os.path.exists(logdir)):
    os.makedirs(logdir)

print("LOGGING TO: ", logdir)

trainer = MB_Trainer(args)
trainer.run_training_loop()


LOGGING TO:  ../../data/hw4_q4_reacher_ensemble1_reacher-cs285-v0_23-10-2021_08-45-43
########################
logging outputs to  ../../data/hw4_q4_reacher_ensemble1_reacher-cs285-v0_23-10-2021_08-45-43
########################
Using GPU id 0
Using action sampling strategy: random


********** Iteration 0 ************

Collecting data to be used for training...

Training agent...

Training agent using sampled data from replay buffer...

Beginning logging procedure...

Collecting data for eval...
Eval_AverageReturn : -2279.636962890625
Eval_StdReturn : 0.0
Eval_MaxReturn : -2279.636962890625
Eval_MinReturn : -2279.636962890625
Eval_AverageEpLen : 500.0
Train_AverageReturn : -4489.67822265625
Train_StdReturn : 1366.086669921875
Train_MaxReturn : -2048.99365234375
Train_MinReturn : -6259.1376953125
Train_AverageEpLen : 500.0
Train_EnvstepsSoFar : 20000
TimeSinceStart : 7.638526916503906
Training Loss : 0.18329642713069916
Initial_DataCollection_AverageReturn : -4489.67822265625
Done logg

In [11]:
# python cs285/scripts/run_hw4_mb.py --exp_name q5_cheetah_cem_1000 --env_name 
# 'cheetah-cs285-v0' --mpc_horizon 15 --add_sl_noise --num_agent_train_steps_per_iter 1500
#  --batch_size_initial 5000 --batch_size 5000 --n_iter 5 
# --mpc_action_sampling_strategy 'random'
# 
args = Args()
args.exp_name = 'q5_cheetah_random'
args.env_name = 'cheetah-cs285-v0'
args.mpc_horizon = 15
args.add_sl_noise = True
args.num_agent_train_steps_per_iter = 1500
args.batch_size_initial = 5000
args.batch_size = 5000
args.n_iter = 5
args.mpc_action_sampling_strategy = 'random'

logdir = 'hw4_' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
logdir = os.path.join(data_path, logdir)
args['logdir'] = logdir
if not(os.path.exists(logdir)):
    os.makedirs(logdir)

print("LOGGING TO: ", logdir)

trainer = MB_Trainer(args)
trainer.run_training_loop()

# python cs285/scripts/run_hw4_mb.py --exp_name q5_cheetah_cem_2 --env_name 
# 'cheetah-cs285-v0' --mpc_horizon 15 --add_sl_noise --num_agent_train_steps_per_iter 1500
#  --batch_size_initial 5000 --batch_size 5000 --n_iter 5 
# --mpc_action_sampling_strategy 'cem' --cem_iterations 2

args = Args()
args.exp_name = 'q5_cheetah_cem_2'
args.env_name = 'cheetah-cs285-v0'
args.mpc_horizon = 15
args.add_sl_noise = True
args.num_agent_train_steps_per_iter = 1500
args.batch_size_initial = 5000
args.batch_size = 5000
args.n_iter = 5
args.mpc_action_sampling_strategy = 'cem'
args.cem_iterations = 2

logdir = 'hw4_' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
logdir = os.path.join(data_path, logdir)
args['logdir'] = logdir
if not(os.path.exists(logdir)):
    os.makedirs(logdir)

print("LOGGING TO: ", logdir)

trainer = MB_Trainer(args)
trainer.run_training_loop()


# 
# python cs285/scripts/run_hw4_mb.py --exp_name q5_cheetah_cem_4 --env_name 
# 'cheetah-cs285-v0' --mpc_horizon 15 --add_sl_noise --num_agent_train_steps_per_iter 1500
#  --batch_size_initial 5000 --batch_size 5000 --n_iter 5 
# --mpc_action_sampling_strategy 'cem' --cem_iterations 4

args = Args()
args.exp_name = 'q5_cheetah_cem_4'
args.env_name = 'cheetah-cs285-v0'
args.mpc_horizon = 15
args.add_sl_noise = True
args.num_agent_train_steps_per_iter = 1500
args.batch_size_initial = 5000
args.batch_size = 5000
args.n_iter = 5
args.mpc_action_sampling_strategy = 'cem'
args.cem_iterations = 4

logdir = 'hw4_' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
logdir = os.path.join(data_path, logdir)
args['logdir'] = logdir
if not(os.path.exists(logdir)):
    os.makedirs(logdir)

print("LOGGING TO: ", logdir)

trainer = MB_Trainer(args)
trainer.run_training_loop()


LOGGING TO:  ../../data/hw4_q5_cheetah_random_cheetah-cs285-v0_23-10-2021_09-06-48
########################
logging outputs to  ../../data/hw4_q5_cheetah_random_cheetah-cs285-v0_23-10-2021_09-06-48
########################
Using GPU id 0
Using action sampling strategy: random


********** Iteration 0 ************

Collecting data to be used for training...

Training agent...

Training agent using sampled data from replay buffer...

Beginning logging procedure...

Collecting data for eval...
Eval_AverageReturn : 186.33712768554688
Eval_StdReturn : 0.0
Eval_MaxReturn : 186.33712768554688
Eval_MinReturn : 186.33712768554688
Eval_AverageEpLen : 500.0
Train_AverageReturn : -2504.96630859375
Train_StdReturn : 321.12841796875
Train_MaxReturn : -2021.8564453125
Train_MinReturn : -2978.619140625
Train_AverageEpLen : 500.0
Train_EnvstepsSoFar : 5000
TimeSinceStart : 20.848231315612793
Training Loss : 0.07709154486656189
Initial_DataCollection_AverageReturn : -2504.96630859375
Done logging...





In [None]:
#@markdown You can visualize your runs with tensorboard from within the notebook

## requires tensorflow==2.3.0
%load_ext tensorboard
%tensorboard --logdir ../../data/