In [None]:
# default_exp agents.navigate


# Basic MineRL Agent
> Run an agent for a navigation task

In [None]:
# export

import time
import gym
import minerl

# OpenCog
from opencog.atomspace import *
from opencog.logger import log
from opencog.pln import *
from opencog.type_constructors import *
from opencog.utilities import set_default_atomspace

# ROCCA
from rocca.envs.wrappers import GymWrapper
from rocca.envs.wrappers.utils import minerl_single_action
from rocca.agents import OpencogAgent
from rocca.agents.utils import *

from rocca.utils import *

## Agent definition

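The agent below is essentially `OpencogAgent`; the only difference is that `step` is overridden so that the selected action is converted with the `minerl_single_action` helper before it is passed to the environment.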

In [None]:
# export


class NavigateAgent(OpencogAgent):
    """`OpencogAgent` whose `step` sends MineRL-formatted actions.

    Construction is delegated unchanged to `OpencogAgent`.  The only
    customisation is in `step`, where the selected action is passed through
    `minerl_single_action` before being handed to the environment.
    """

    def __init__(self, env, action_space, p_goal, n_goal, log_level="info"):
        super().__init__(env, action_space, p_goal, n_goal, log_level)

    def step(self):
        """Run one cycle of observation, planning, decision and env update.

        Returns:
            bool: ``False`` once the environment reports the episode as
            done, ``True`` otherwise (i.e. ``True`` means "keep stepping").
        """
        debug = agent_log.debug

        # Timestamp every current observation at the current step count
        debug("atomese_obs = {}".format(self.observation))
        obs_record = []
        for percept in self.observation:
            obs_record.append(self.record(percept, self.step_count, tv=TRUE_TV))
        debug("obs_record = {}".format(obs_record))

        # Goal to pursue during this iteration
        goal = self.make_goal()
        debug("goal = {}".format(goal))

        # Planning: obtain cognitive schematics for that goal.  The expiry
        # handed to the planner is whatever `self.expiry` currently holds.
        cogscms = self.plan(goal, self.expiry)
        debug("cogscms = {}".format(cogscms))

        # Turn the plans into an action distribution
        mxmdl = self.deduce(cogscms)
        debug("mxmdl = {}".format(mxmdl_to_str(mxmdl)))

        # Pick the next action together with its probability of success
        action, success_prob = self.decide(mxmdl)
        debug(
            "action with probability of success = {}".format(
                act_pblt_to_str((action, success_prob))
            )
        )

        # Record the action (timestamped before the step counter moves on)
        action_record = self.record(action, self.step_count, tv=TRUE_TV)
        debug("action_record = {}".format(action_record))
        debug("action = {}".format(action))

        # Per-action usage statistics
        self.action_counter[action] += 1
        debug("action_counter = {}".format(self.action_counter))

        # Advance the step counter, then let the environment execute the
        # action once it has been converted by `minerl_single_action`.
        self.step_count += 1
        # TODO gather environment info.
        minerl_action = minerl_single_action(self.env, action)
        reward, self.observation, done = self.env.step(minerl_action)

        # The numeric reward is read from the name of the reward atom's
        # second outgoing element.
        self.accumulated_reward += float(reward.out[1].name)
        debug("observation = {}".format(self.observation))
        debug("reward = {}".format(reward))
        debug("accumulated reward = {}".format(self.accumulated_reward))

        # The reward is timestamped with the already incremented step count
        reward_record = self.record(reward, self.step_count, tv=TRUE_TV)
        debug("reward_record = {}".format(reward_record))

        return not done

## Experiment

For this experiment we use the basic _MineRLNavigateDense-v0_ environment of MineRL. The agent's task is to reach and touch a diamond block placed somewhere in its vicinity. This is the _Dense_ version of the environment, which means the agent receives a reward at every step, depending on whether its distance to the target decreased or increased.

In [None]:
# Instantiate the raw gym environment for the dense navigation task.
# NOTE(review): presumably relies on `import minerl` (imports cell) having
# registered the MineRL environment ids with gym -- confirm.
env = gym.make("MineRLNavigateDense-v0")


In [None]:
# Display the action space exposed by the raw environment (cell output only)
env.action_space

In [None]:
# Display the observation space exposed by the raw environment (cell output only)
env.observation_space

In [None]:
# Create a fresh AtomSpace and register it as the default one.
# NOTE(review): presumably the atom constructors used in the cells below
# (EvaluationLink, ExecutionLink, ...) insert into this default -- confirm.
atomspace = AtomSpace()
set_default_atomspace(atomspace)

In [None]:
# Wrap the gym environment in ROCCA's GymWrapper; the agent and the training
# loop below interact with this wrapper (step / restart), not with `env`.
wrapped_env = GymWrapper(env)

In [None]:
# Goals passed to the agent as `p_goal` / `n_goal`: both are evaluations of
# the "Reward" predicate, with value 100 for `pgoal` and 0 for `ngoal`.
pgoal = EvaluationLink(PredicateNode("Reward"), NumberNode("100"))
ngoal = EvaluationLink(PredicateNode("Reward"), NumberNode("0"))

In [None]:
# Set of candidate actions handed to the agent, each expressed as an
# ExecutionLink of a schema (the control name) applied to its argument.
action_space = {
    # Controls taking a "0" / "1" argument: both values are offered
    ExecutionLink(SchemaNode(control), NumberNode(value))
    for control in (
        "attack",
        "forward",
        "back",
        "left",
        "right",
        "jump",
        "sprint",
        "sneak",
    )
    for value in ("0", "1")
} | {
    # Block placement: "dirt" or "none"
    ExecutionLink(SchemaNode("place"), ConceptNode("dirt")),
    ExecutionLink(SchemaNode("place"), ConceptNode("none")),
    # Two fixed camera moves, each given as a pair of numbers
    ExecutionLink(SchemaNode("camera"), ListLink(NumberNode("2.5"), NumberNode("0.0"))),
    ExecutionLink(
        SchemaNode("camera"), ListLink(NumberNode("0.0"), NumberNode("-1.5"))
    ),
}

In [None]:
# Build the agent on top of the wrapped environment, with the action set and
# the positive / negative goals defined in the previous cells.
agent = NavigateAgent(wrapped_env, action_space, pgoal, ngoal)

In [None]:
from tensorboardX import SummaryWriter

# TensorBoard writer used to plot the reward collected in each epoch
tb_writer = SummaryWriter(comment="minerl-navigate")

epochs = 3  # Number of epochs (learning / interacting episodes)
epoch_len = 200  # Maximum number of agent steps per epoch

try:
    for i in range(epochs):
        wrapped_env.restart()
        agent.reset_action_counter()
        accreward = agent.accumulated_reward  # Keep track of the reward before

        # Learning phase: discover patterns to make more informed decisions
        log_msg(agent_log, f"Learning phase started. ({i + 1}/{epochs})")
        agent.learn()

        # Run agent to accumulate percepta
        log_msg(agent_log, f"Interaction phase started. ({i + 1}/{epochs})")
        for j in range(epoch_len):
            # `agent.step()` returns True while the episode should continue
            # and False once the environment reports it is done, so the
            # value must be inverted to obtain a "done" flag.
            done = not agent.step()
            time.sleep(0.01)
            log.info("step_count = {}".format(agent.step_count))
            if done:
                break

        # Reward gathered during this epoch only
        new_reward = agent.accumulated_reward - accreward
        tb_writer.add_scalar("train/accumulated_reward", new_reward, agent.step_count)
        log_msg(
            agent_log,
            "Accumulated reward during {}th epoch = {}".format(i + 1, new_reward),
        )
        log_msg(
            agent_log,
            "Action counter during {}th epoch:\n{}".format(i + 1, agent.action_counter),
        )  # TODO: make the action counter look good
finally:
    # Flush and close the event file even if the run is interrupted
    tb_writer.close()
