In [1]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import random

import torch

import context_changers
import ct_model
import dmc
import drqv2
import utils
import rl_model

In [15]:
task_name = 'reacher_hard'
expert_frame_stack = 3
action_repeat = 2
seed = 432335
xml_path = 'domain_xmls/reacher.xml'
episode_len = 1000
context_camera_ids = [0]
learner_camera_id = 0
im_w = 64
im_h = 64
n_video = 1
state_dim = 1024
cam_id = random.choice(context_camera_ids)

num_eval_episodes = 50

  and should_run_async(code)


In [3]:
expert: drqv2.DrQV2Agent = drqv2.DrQV2Agent.load('experts/reacher_hard.pt')
expert.train(training=False)

context_translator: ct_model.CTNet = ct_model.CTNet.load('ct/reacher_hard.pt').to(utils.device())
context_translator.eval()

mlp_context_translator: ct_model.CTNet = ct_model.CTNet.load('ct/reacher_hard_mlp.pt').to(utils.device())
context_translator.eval()

  and should_run_async(code)


CTNet(
  (enc1): EncoderNet(
    (leaky_relu): LeakyReLU(negative_slope=0.2)
    (flatten): Flatten(start_dim=1, end_dim=-1)
    (conv_1): Conv2d(3, 64, kernel_size=(5, 5), stride=(2, 2))
    (b_norm_1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv_2): Conv2d(64, 128, kernel_size=(5, 5), stride=(2, 2))
    (b_norm_2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv_3): Conv2d(128, 256, kernel_size=(5, 5), stride=(2, 2))
    (b_norm_3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv_4): Conv2d(256, 512, kernel_size=(5, 5), stride=(2, 2))
    (b_norm_4): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (fc1): Conv2d(512, 1024, kernel_size=(1, 1), stride=(1, 1))
    (b_norm_fc_1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (fc2): Conv2d(1024, 1024, kernel_size=(1, 1), stride=(1, 1)

In [4]:
expert_env = dmc.make(task_name, expert_frame_stack, action_repeat, seed, xml_path, episode_len=episode_len)

  and should_run_async(code)


In [13]:
def evaluate_expert():
    expert_reward = 0.
    for _ in range(num_eval_episodes):
        with torch.no_grad():
            time_step = expert_env.reset()
            expert_reward += time_step.reward
            while not time_step.last():
                action = expert.act(time_step.observation, 1, eval_mode=True)
                time_step = expert_env.step(action)
                expert_reward += time_step.reward
    expert_reward /= num_eval_episodes
    return expert_reward

def evaluate_agent(agent, context_translator, frame_stack, n_video):
    eval_env = dmc.make(task_name, frame_stack, action_repeat, seed + 1, xml_path, learner_camera_id, im_w, im_h, context_changers.ReacherHardContextChanger(), episode_len)
    eval_env = dmc.EncodeStackWrapper(
        eval_env, expert, context_translator, expert_env, context_camera_ids,
        n_video, im_w, im_h, state_dim, frame_stack, context_changers.ReacherHardContextChanger(), dist_reward=False)

    agent_reward = 0
    for _ in range(num_eval_episodes):
        time_step = eval_env.reset()
        agent_reward += time_step.reward
        episode_reward = time_step.reward
        while not time_step.last():
            with torch.no_grad(), utils.eval_mode(agent):
                state = torch.tensor(time_step.observation, device=utils.device(), dtype=torch.float)
                action = agent.act(state, 1, eval_mode=True)
            time_step = eval_env.step(action)
            episode_reward += time_step.reward
        agent_reward += episode_reward

    agent_reward /= num_eval_episodes
    return agent_reward

  and should_run_async(code)


## Evaluations

Expert

In [6]:
evaluate_expert()

  and should_run_async(code)


910.0333333333333

New Algo

In [7]:
agent_file = 'rl_exp_local/reacher_hard/na/snapshot.pt'
evaluate_agent(rl_model.RLAgent.load(agent_file), context_translator, frame_stack=1, n_video=1)

  and should_run_async(code)


3.3333333333333335

In [8]:
agent_file = 'rl_exp_local/reacher_hard/na_1/snapshot.pt'
evaluate_agent(rl_model.RLAgent.load(agent_file), context_translator, frame_stack=1, n_video=1)

  and should_run_async(code)


27.033333333333335

In [9]:
agent_file = 'rl_exp_local/reacher_hard/na_mlp/snapshot.pt'
evaluate_agent(rl_model.RLAgent.load(agent_file), mlp_context_translator, frame_stack=1, n_video=1)

  and should_run_async(code)


5.466666666666667

In [10]:
agent_file = 'rl_exp_local/reacher_hard/na_fs_3/snapshot.pt'
evaluate_agent(rl_model.RLAgent.load(agent_file), context_translator, frame_stack=3, n_video=1)

  and should_run_async(code)


1.9

In [11]:
agent_file = 'rl_exp_local/reacher_hard/na_fs_3_n_video_10/snapshot.pt'
evaluate_agent(rl_model.RLAgent.load(agent_file), context_translator, frame_stack=3, n_video=10)

  and should_run_async(code)


1.5666666666666667

In [12]:
agent_file = 'rl_exp_local/reacher_hard/na_fs_3_discount_0_7/snapshot.pt'
evaluate_agent(rl_model.RLAgent.load(agent_file), context_translator, frame_stack=3, n_video=1)

  and should_run_async(code)


0.4

In [16]:
agent_file = 'rl_exp_local/reacher_hard/na_1/snapshot.pt'
evaluate_agent(rl_model.RLAgent.load(agent_file), context_translator, frame_stack=1, n_video=1)

  and should_run_async(code)


0.0
0.0
0.0
0.0
9.0
0.0
0.0
0.0
0.0
7.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
19.0
0.0
0.0
0.0
4.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


0.78