## RDP Learning

In this notebook, we show how to perform PAC (Probably Approximately Correct) learning
over Regular Decision Processes (RDPs).

### Experiment 1: Rotating MAB

In [1]:
from functools import partial
from typing import Callable, Sequence
import gym
import numpy as np
from gym.wrappers import TimeLimit
from notebooks.utils import render_digraph
from src import NonMarkovianRotatingMAB
from src.learn_pdfa.balle.core import learn_subgraph
from src.learn_pdfa.base import learn_pdfa, Algorithm
from src.learn_pdfa.common import Generator, MultiprocessedGenerator
from src.pdfa import PDFA
from src.pdfa.render import to_graphviz_from_graph
from src.pdfa.types import Word

# Instantiate the rotating bandit with winning probabilities 0.9 and 0.2.
env = NonMarkovianRotatingMAB(winning_probs=[0.9, 0.2])

print(f"Observation space: {env.observation_space}")
print(f"Action space: {env.action_space}")

s = env.reset()
print(f"Initial state: {s}")

# Play the same action twice in a row and report the outcome of each step.
action = 0
for _step in range(2):
    sp, reward, _, _ = env.step(action)
    print(
        "-" * 10,
        f"Action: {action}",
        f"Next state: {sp}",
        f"Reward: {reward}",
        sep="\n",
    )


Observation space: Discrete(2)
Action space: Discrete(2)
Initial state: 0
----------
Action: 0
Next state: 1
Reward: 1.0
----------
Action: 0
Next state: 1
Reward: 1.0


### Learning

In [2]:
class RDPGenerator(Generator):
    """Generate traces by running a policy against a Gym environment.

    Each step of an episode is the triple ``(action, reward, observation)``.
    Every triple is flattened into a single integer symbol with
    ``np.ravel_multi_index`` over the dimensions
    ``(action_dim, nb_rewards, obs_space_dim)``; a trace (word) is the list of
    such symbols for one episode. ``decoder`` maps a symbol back to its triple.

    The environment is expected to expose discrete observation and action
    spaces (their ``.n`` attribute is read). Rewards are truncated with
    ``int()`` and must land in ``range(nb_rewards)`` to be encodable.
    """

    def __init__(
        self,
        env: gym.Env,
        policy: Callable,
        stop_probability: float = 0.05,
        nb_rewards: int = 2,
    ):
        """Initialize the generator.

        :param env: the environment to sample episodes from.
        :param policy: zero-argument callable returning the next action.
        :param stop_probability: probability of ending the episode before each step.
        :param nb_rewards: number of distinct (integer) reward values to encode.
        """
        self._env = env
        self._policy = policy
        self._stop_probability = stop_probability

        self.obs_space_dim = self._env.observation_space.n
        self.action_dim = self._env.action_space.n
        self.nb_rewards = nb_rewards
        dims = (self.action_dim, self.nb_rewards, self.obs_space_dim)
        self.encoder = partial(np.ravel_multi_index, dims=dims)
        # np.unravel_index names this argument `shape`; the old `dims` keyword
        # has been deprecated since NumPy 1.16 and is rejected by newer releases.
        self.decoder = partial(np.unravel_index, shape=dims)

    def alphabet_size(self) -> int:
        """Get the alphabet size, i.e. the number of distinct encoded symbols."""
        return int(np.prod([self.action_dim, self.nb_rewards, self.obs_space_dim]))

    def sample(self, n: int = 1) -> Sequence[Word]:
        """Sample ``n`` words, one per independent episode."""
        return [self._sample_word() for _ in range(n)]

    def _should_stop(self) -> bool:
        """Return True if the current episode should stop, false otherwise."""
        return bool(np.random.random() < self._stop_probability)

    def _sample_word(self) -> Word:
        """Sample one word by rolling out a single episode."""
        initial_state = self._env.reset()
        # The first symbol pairs the initial observation with a placeholder
        # action and reward, both 0.
        trace = [(0, 0, initial_state)]
        done = False
        # The stop coin is flipped only while the episode is still running,
        # and always before the next action is taken.
        while not done and not self._should_stop():
            action = self._policy()
            obs, reward, done, _ = self._env.step(action)
            trace.append((action, int(reward), obs))

        return [self.encoder(step) for step in trace]

In [28]:
def exploration_policy(env: gym.Env):
    """Sample an action from the action space of the given environment."""
    return env.action_space.sample()


# Experiment configuration.
stop_probability = 0.2
winning_probs = (0.7, 0.3)
optimal_avg_reward = max(winning_probs)

# Wrap the registered environment so that every episode lasts at most 15 steps.
env = TimeLimit(
    gym.make("NonMarkovianRotatingMAB-v0", winning_probs=winning_probs),
    max_episode_steps=15,
)

# Bind the environment so the policy can be called without arguments.
policy = partial(exploration_policy, env)
rdp_generator = RDPGenerator(
    env,
    policy=policy,
    stop_probability=stop_probability,
)

In [29]:
examples = rdp_generator.sample(n=1000)

# Machine epsilon in the denominator guards against a zero stop probability.
expected_length = 1 / (stop_probability + np.finfo(float).eps)
average_length = np.mean([len(word) for word in examples])

print(f"Apriori expected length of traces: 1/stop_prob = {expected_length}")
print(f"Average length of traces: {average_length}")

Apriori expected length of traces: 1/stop_prob = 4.999999999999994
Average length of traces: 4.679


In [30]:
N = 100_000
mp_rdp_generator = MultiprocessedGenerator(rdp_generator, nb_processes=8)


def symbol_to_str(c):
    """Label an encoded symbol with the string form of its decoded tuple."""
    return str(rdp_generator.decoder(c))


# Learn the automaton with the Balle algorithm from N sampled traces.
learning_params = dict(
    algorithm=Algorithm.BALLE,
    nb_samples=N,
    sample_generator=mp_rdp_generator,
    alphabet_size=rdp_generator.alphabet_size(),
    delta=0.1,
    n=4,
)
v, t = learn_pdfa(**learning_params)

# Kept as the last expression so the notebook displays the rendered graph.
render_digraph(to_graphviz_from_graph(v, t, char2str=symbol_to_str))

[2020-11-06 12:05:36,772][src.learn_pdfa][INFO] Parameters: ('BalleParams(sample_generator=<src.learn_pdfa.common.MultiprocessedGenerator '
 'object at 0x7fc7794bd6d0>, nb_samples=100000, n=4, alphabet_size=8, '
 'delta=0.1)')
[2020-11-06 12:05:43,290][src.learn_pdfa][INFO] Iteration 0
[2020-11-06 12:05:44,441][src.learn_pdfa][INFO] Iteration 1
[2020-11-06 12:05:46,010][src.learn_pdfa][INFO] Iteration 2
[2020-11-06 12:05:47,473][src.learn_pdfa][INFO] Iteration 3
[2020-11-06 12:05:48,653][src.learn_pdfa][INFO] Iteration 4
[2020-11-06 12:05:49,758][src.learn_pdfa][INFO] Iteration 5
[2020-11-06 12:05:50,887][src.learn_pdfa][INFO] Iteration 6
[2020-11-06 12:05:52,432][src.learn_pdfa][INFO] Iteration 7
[2020-11-06 12:05:53,658][src.learn_pdfa][INFO] Iteration 8
[2020-11-06 12:05:54,780][src.learn_pdfa][INFO] Iteration 9
  # This is added back by InteractiveShellApp.init_path()
[2020-11-06 12:05:55,584][graphviz.files][DEBUG] write 332 bytes to '/tmp/tmp90iqdnit/output'
[2020-11-06 12:05:55,