In [None]:
#default_exp agents.cartpole

# CartPole Agent
> An agent for solving the CartPole-v1 environment

In [None]:
#export

# OpenCog
from opencog.pln import *
from opencog.type_constructors import *
from opencog.utilities import set_default_atomspace

# ROCCA
from rocca.envs.wrappers import CartPoleWrapper
from rocca.agents import OpencogAgent
from rocca.agents.utils import *

from rocca.utils import *

In [None]:
import gym
import time
import logging

from opencog.logger import log
from rocca.agents.core import logger as ac_logger
from tensorboardX import SummaryWriter

## Agent Definition

### Fixed Rule Agent

In [None]:
#export

class FixedCartPoleAgent(OpencogAgent):
    """CartPole agent whose plans are four hardwired cognitive schematics.

    The constructor builds the action space and the positive/negative
    goals and hands them to ``OpencogAgent``.  ``plan`` is overridden to
    return a fixed set of rules relating the sign of the pole angle and
    the chosen push direction to the reward.
    """

    def __init__(self, env: CartPoleWrapper, atomspace: AtomSpace):
        """Set up the action space and goals, then call the super ctor.

        Args:
            env: wrapped CartPole environment; its ``action_names`` are
                turned into the action space.
            atomspace: atomspace made default before any atom is created
                here, and passed on to ``OpencogAgent``.
        """
        set_default_atomspace(atomspace)

        # Create Action Space. The set of allowed actions an agent can take.
        # TODO take care of action parameters.
        action_space = {ExecutionLink(SchemaNode(a)) for a in env.action_names}

        # Create Goal
        pgoal = EvaluationLink(PredicateNode("Reward"), NumberNode("1"))
        ngoal = EvaluationLink(PredicateNode("Reward"), NumberNode("0"))

        # Call super ctor
        super().__init__(env, atomspace, action_space, pgoal, ngoal)

    def plan(self, goal, expiry) -> list:
        """Plan the next actions given a goal and its expiry time offset

        Return a python list of cognitive schematics.  Whole cognitive
        schematics are output (instead of action plans) in order to
        make a decision based on their truth values.  Alternatively it
        could return a pair (action plan, tv), where tv has been
        evaluated to take into account the truth value of the context
        as well (which would differ from the truth value of rule in
        case the context is uncertain).

        The format for a cognitive schematic is as follows

        BackPredictiveImplicationScope <tv>
          <vardecl>
          <expiry>
          And (or SimultaneousAnd?)
            <context>
            Execution
              <action>
              <input> [optional]
              <output> [optional]
          <goal>

        Note: this fixed-rule implementation does not read ``goal`` or
        ``expiry``; the same four schematics are returned on every call,
        in the order [left/left, right/right, right/left, left/right]
        (pole side / push direction).
        """

        # Shared atoms used by all four rules.
        angle = VariableNode("$angle")
        numt = TypeNode("NumberNode")
        time_offset = to_nat(1)
        pole_angle = PredicateNode("Pole Angle")
        go_right = SchemaNode("Go Right")
        go_left = SchemaNode("Go Left")
        reward = PredicateNode("Reward")
        # The comparisons use a small dead band (+/- 0.01) rather than 0.
        epsilon = NumberNode("0.01")
        mepsilon = NumberNode("-0.01")
        unit = NumberNode("1")
        # Arbitrary truth values
        hTV = TruthValue(0.9, 0.1)  # High TV
        lTV = TruthValue(0.1, 0.1)  # Low TV

        def schematic(greater, lesser, action, tv):
            # Build one cognitive schematic of the form
            #
            # BackPredictiveImplicationScope <tv>
            #   TypedVariable
            #     Variable "$angle"
            #     Type "NumberNode"
            #   <time offset of 1>
            #   And
            #     Evaluation
            #       Predicate "Pole Angle"
            #       Variable "$angle"
            #     GreaterThan
            #       <greater>
            #       <lesser>
            #     Execution
            #       <action>
            #   Evaluation
            #     Predicate "Reward"
            #     Number "1"
            return BackPredictiveImplicationScopeLink(
                TypedVariableLink(angle, numt),
                time_offset,
                AndLink(
                    # Context
                    EvaluationLink(pole_angle, angle),
                    GreaterThanLink(greater, lesser),
                    # Action
                    ExecutionLink(action),
                ),
                # Goal
                EvaluationLink(reward, unit),
                # TV
                tv=tv,
            )

        # For now we provide 2 hardwired rules with a high TV
        #
        # 1. Push cart to the right if angle is above 0.01
        # 2. Push cart to the left if angle is below -0.01
        cs_rr = schematic(angle, epsilon, go_right, hTV)
        cs_ll = schematic(mepsilon, angle, go_left, hTV)

        # To cover all possibilities we shouldn't forget the complementary
        # actions, i.e. going left when the pole is falling to the right
        # and such, which should make the situation worse (low TV).
        cs_rl = schematic(angle, epsilon, go_left, lTV)
        cs_lr = schematic(mepsilon, angle, go_right, lTV)

        # Ideally we want to return only relevant cognitive schematics
        # (i.e. with contexts probabilistically currently true); for
        # now however we return everything and let the deduction
        # process deal with it, as it should be able to.
        return [cs_ll, cs_rr, cs_rl, cs_lr]

In [None]:
def seed_with(agent, knowledge):  # TODO: figure out how to pass atoms to be inserted.
    """Insert four hand-written pole-angle rules into ``agent.cognitive_schematics``.

    The agent's atomspace is made the default one, the rules are created
    in it, and the resulting links are added to the agent's
    ``cognitive_schematics`` collection.

    Args:
        agent: object exposing ``atomspace`` and ``cognitive_schematics``.
        knowledge: currently ignored (see the TODO on the signature).
    """
    set_default_atomspace(agent.atomspace)

    # Atoms shared by every rule
    angle_var = VariableNode("$angle")
    number_type = TypeNode("NumberNode")
    lag = to_nat(1)
    angle_pred = PredicateNode("Pole Angle")
    push_right = SchemaNode("Go Right")
    push_left = SchemaNode("Go Left")
    reward_pred = PredicateNode("Reward")
    pos_eps = NumberNode("0.01")
    neg_eps = NumberNode("-0.01")
    one = NumberNode("1")
    high_tv = TruthValue(0.9, 0.1)  # High TV
    low_tv = TruthValue(0.1, 0.1)  # Low TV

    # One row per rule: (GreaterThan operands, action, truth value)
    rule_table = (
        ((angle_var, pos_eps), push_right, high_tv),  # angle > 0.01, go right
        ((neg_eps, angle_var), push_left, high_tv),   # angle < -0.01, go left
        ((angle_var, pos_eps), push_left, low_tv),    # angle > 0.01, go left
        ((neg_eps, angle_var), push_right, low_tv),   # angle < -0.01, go right
    )

    seeded = set()
    for (greater, lesser), action, rule_tv in rule_table:
        context_and_action = AndLink(
            # Context
            EvaluationLink(angle_pred, angle_var),
            GreaterThanLink(greater, lesser),
            # Action
            ExecutionLink(action),
        )
        seeded.add(
            BackPredictiveImplicationScopeLink(
                TypedVariableLink(angle_var, number_type),
                lag,
                context_and_action,
                # Goal
                EvaluationLink(reward_pred, one),
                # TV
                tv=rule_tv,
            )
        )

    # TODO: the code should update Python-side automatically.
    agent.cognitive_schematics.update(seeded)

### Learning Agent

In [None]:
#export

class LearningCartPoleAgent(OpencogAgent):
    """OpencogAgent subclass for CartPole with three agent options switched off.

    After the base constructor has run, ``monoaction_general_succeedent_mining``,
    ``polyaction_mining`` and ``temporal_deduction`` are all set to ``False``.
    """

    def __init__(self, env: CartPoleWrapper, atomspace: AtomSpace, log_level="debug"):
        """Build the action space and goals, then initialise the base agent.

        Args:
            env: wrapped CartPole environment providing ``action_names``.
            atomspace: atomspace made default and forwarded to the base class.
            log_level: forwarded to ``OpencogAgent`` as ``log_level``.
        """
        set_default_atomspace(atomspace)

        # Action space: one Execution link per action name offered by the env.
        # TODO take care of action parameters.
        actions = set()
        for action_name in env.action_names:
            actions.add(ExecutionLink(SchemaNode(action_name)))

        # Goals: reward of 1 is the positive goal, reward of 0 the negative one.
        reward_predicate = PredicateNode("Reward")
        positive_goal = EvaluationLink(reward_predicate, NumberNode("1"))
        negative_goal = EvaluationLink(reward_predicate, NumberNode("0"))

        super().__init__(
            env,
            atomspace,
            actions,
            positive_goal,
            negative_goal,
            log_level=log_level,
        )

        # Overwrite some OpencogAgent parameters
        self.monoaction_general_succeedent_mining = False
        self.polyaction_mining = False
        self.temporal_deduction = False

## Experiment

In [None]:
# Create the raw gym CartPole-v1 environment; it is wrapped with an AtomSpace
# separately for each experiment below.
env = gym.make("CartPole-v1")

In [None]:
# Display the action space of the raw environment.
env.action_space

In [None]:
# Display the observation space of the raw environment.
env.observation_space

### Fixed Agent

In [None]:
# Fresh AtomSpace for the fixed-rule experiment, installed as the default one.
atomspace = AtomSpace()
set_default_atomspace(atomspace)
# Wrap the raw gym environment together with this AtomSpace.
wrapped_env = CartPoleWrapper(env, atomspace)

In [None]:
# Optional TensorBoard logging (disabled); enable together with the
# add_scalar call inside the loop.
# tb_writer = SummaryWriter(comment="-cartpole-fixed")

cpa = FixedCartPoleAgent(wrapped_env, atomspace)
cpa.delta = 1.0e-16

# Keep running control cycles until one of them reports completion
while True:
    if cpa.control_cycle():
        break
    time.sleep(0.1)
    log.debug("cycle_count = {}".format(cpa.cycle_count))
    # tb_writer.add_scalar(
    #     "accumulated_reward", cpa.accumulated_reward, cpa.cycle_count
    # )

log_msg(agent_log, f"The final reward is {cpa.accumulated_reward}.")

### Learning Agent

In [None]:
# Fresh AtomSpace for the learning experiment (replaces the one used by the
# fixed-rule experiment), installed as the default one.
atomspace = AtomSpace()
set_default_atomspace(atomspace)
# Re-wrap the same raw gym environment with the new AtomSpace.
wrapped_env = CartPoleWrapper(env, atomspace)

In [None]:
# Instantiate the learning agent with the "fine" log level.
agent = LearningCartPoleAgent(wrapped_env, atomspace, log_level="fine")
# Override the agent's `delta` attribute (its meaning is defined by
# OpencogAgent, which is not shown in this notebook).
agent.delta = 1.0e-16

# Optional seeding with the hand-written rules (disabled).
# seed_with(agent, ["fixme"])

In [None]:
tb_writer = SummaryWriter(comment="-cartpole-learning-seeded")
ac_logger.setLevel(logging.DEBUG)  # The agents.core logger

epochs = 1  # Number of epochs (learning / interacting episodes)
epoch_len = 200  # Maximum number of control cycles per epoch

# Baseline taken before training so that the final average only counts reward
# earned by this run of the cell (re-running the cell on the same agent would
# otherwise inflate it).
reward_before_training = agent.accumulated_reward

try:
    for i in range(epochs):
        wrapped_env.restart()
        agent.reset_action_counter()
        accreward = agent.accumulated_reward  # Keep track of the reward before

        # Learning phase: discover patterns to make more informed decisions
        log_msg(agent_log, f"Learning phase started. ({i + 1}/{epochs})")
        agent.learn()

        # Run agent to accumulate percepta
        log_msg(agent_log, f"Interaction phase started. ({i + 1}/{epochs})")
        for j in range(epoch_len):
            done = agent.control_cycle()
            # wrapped_env.render()  # uncomment to see the rendered env
            time.sleep(0.01)
            log.debug("cycle_count = {}".format(agent.cycle_count))
            if done:
                break

        # Reward earned during this epoch only
        new_reward = agent.accumulated_reward - accreward
        tb_writer.add_scalar("train/accumulated_reward", new_reward, agent.cycle_count)
        log_msg(agent_log, "Accumulated reward during {}th epoch = {}".format(i + 1, new_reward))
        log_msg(agent_log, "Action counter during {}th epoch:\n{}".format(i + 1, agent.action_counter))  # TODO: make the action counter look good
finally:
    # Flush pending scalars and release the event file, even if an epoch raised.
    tb_writer.close()

training_reward = agent.accumulated_reward - reward_before_training
log_msg(agent_log, f"The average total reward over {epochs} trials (training): {training_reward / epochs}.")
# TODO: add a separate testing loop and measure average total reward.