## RDP Learning

In this notebook, we show how to perform PAC (probably approximately
correct) learning over Regular Decision Processes (RDPs).

### Experiment 1: Rotating MAB

In [1]:
from functools import partial
from typing import Tuple, Set, Dict

import gym
import numpy as np
from gym.wrappers import TimeLimit
from notebooks.utils import render_digraph

from src import NonMarkovianRotatingMAB
from src.learn_pdfa.base import learn_pdfa, Algorithm
from src.learn_pdfa.common import MultiprocessedGenerator
from src.learn_rdps import random_exploration_policy, RDPGenerator
from src.pdfa.render import to_graphviz_from_graph
from src.pdfa.types import Character

# Instantiate the environment class directly with two winning probabilities.
env = NonMarkovianRotatingMAB(winning_probs=[0.9, 0.2])

print(f"Observation space: {env.observation_space}")
print(f"Action space: {env.action_space}")

s = env.reset()
print(f"Initial state: {s}")

# Apply the same action twice in a row and report each transition.
action = 0
separator = "-" * 10
for _step in range(2):
    sp, reward, _, _ = env.step(action)
    print(separator)
    print(f"Action: {action}")
    print(f"Next state: {sp}")
    print(f"Reward: {reward}")


Observation space: Discrete(2)
Action space: Discrete(2)
Initial state: 0
----------
Action: 0
Next state: 1
Reward: 1.0
----------
Action: 0
Next state: 1
Reward: 1.0


### Learning

In [2]:
def learning_rotating_mab(
    stop_probability: float,
    winning_probabilities: Tuple[float, ...],
    max_episode_steps: int,
    nb_samples: int,
    delta: float,
    n_upperbound: int,
    nb_processes: int = 8,
    nb_diagnostic_samples: int = 1000,
) -> Tuple[RDPGenerator, Tuple[Set[int], Dict[int, Dict[Character, int]]]]:
    """
    Learn a PDFA from traces sampled on a Rotating MAB environment.

    The function builds the "NonMarkovianRotatingMAB-v0" gym environment,
    wraps it in a TimeLimit, samples traces with a random exploration policy
    through an RDPGenerator, prints trace-length diagnostics, and finally
    runs ``learn_pdfa`` with ``Algorithm.BALLE`` on a multiprocessed version
    of the generator.

    :param stop_probability: forwarded to RDPGenerator as ``stop_probability``;
        also used to print the a-priori expected trace length 1/stop_prob.
    :param winning_probabilities: forwarded to the environment as
        ``winning_probs``.
    :param max_episode_steps: step limit applied by the TimeLimit wrapper.
    :param nb_samples: forwarded to ``learn_pdfa`` as ``nb_samples``.
    :param delta: forwarded to ``learn_pdfa`` as ``delta``
        (presumably the PAC confidence parameter; verify against learn_pdfa).
    :param n_upperbound: forwarded to ``learn_pdfa`` as ``n``
        (presumably an upper bound on the number of states; verify).
    :param nb_processes: number of processes for MultiprocessedGenerator.
    :param nb_diagnostic_samples: number of traces sampled only to print the
        empirical average trace length. If not positive, no diagnostic traces
        are sampled and the average is not printed.
    :return: the (single-process) RDP generator and the pair ``(v, t)``
        returned by ``learn_pdfa``.
    """
    env = gym.make("NonMarkovianRotatingMAB-v0", winning_probs=winning_probabilities)
    env = TimeLimit(env, max_episode_steps=max_episode_steps)

    policy = partial(random_exploration_policy, env)

    rdp_generator = RDPGenerator(
        env, policy=policy, nb_rewards=2, stop_probability=stop_probability
    )

    # Diagnostics only: these traces are not reused by the learning step below.
    run_diagnostics = nb_diagnostic_samples > 0
    examples = rdp_generator.sample(n=nb_diagnostic_samples) if run_diagnostics else []

    # Machine epsilon keeps the division finite when stop_probability is 0.
    print(
        f"Apriori expected length of traces: 1/stop_prob = {1 / (stop_probability + np.finfo(float).eps)}"
    )
    if run_diagnostics:
        # Guarded: np.mean of an empty list would yield nan with a RuntimeWarning.
        print(f"Average length of traces: {np.mean([len(e) for e in examples])}")

    mp_rdp_generator = MultiprocessedGenerator(rdp_generator, nb_processes=nb_processes)
    v, t = learn_pdfa(
        algorithm=Algorithm.BALLE,
        nb_samples=nb_samples,
        sample_generator=mp_rdp_generator,
        alphabet_size=rdp_generator.alphabet_size(),
        delta=delta,
        n=n_upperbound,
    )
    return rdp_generator, (v, t)

In [3]:
# Experiment with two winning probabilities (0.7, 0.3).
rdp_generator, (v, t) = learning_rotating_mab(
    stop_probability=0.2,
    winning_probabilities=(0.7, 0.3),
    max_episode_steps=15,
    nb_samples=100_000,
    delta=0.1,
    n_upperbound=4,
    nb_processes=8,
)

# Characters are converted to strings through the generator's decoder.
learned_graph = to_graphviz_from_graph(
    v, t, char2str=lambda c: str(rdp_generator.decoder(c))
)
render_digraph(learned_graph)


Apriori expected length of traces: 1/stop_prob = 4.999999999999994
Average length of traces: 3.679


[2020-11-06 19:23:32,990][src.learn_pdfa][INFO] Parameters: ('BalleParams(sample_generator=<src.learn_pdfa.common.MultiprocessedGenerator '
 'object at 0x7f4941412450>, nb_samples=100000, n=4, alphabet_size=8, '
 'delta=0.1)')
[2020-11-06 19:23:38,987][src.learn_pdfa][INFO] Iteration 0
[2020-11-06 19:23:39,789][src.learn_pdfa][INFO] Iteration 1
[2020-11-06 19:23:40,866][src.learn_pdfa][INFO] Iteration 2
[2020-11-06 19:23:41,900][src.learn_pdfa][INFO] Iteration 3
[2020-11-06 19:23:42,979][src.learn_pdfa][INFO] Iteration 4
[2020-11-06 19:23:43,834][src.learn_pdfa][INFO] Iteration 5
[2020-11-06 19:23:44,617][src.learn_pdfa][INFO] Iteration 6
[2020-11-06 19:23:45,420][src.learn_pdfa][INFO] Iteration 7
[2020-11-06 19:23:46,228][src.learn_pdfa][INFO] Iteration 8
  # Remove the CWD from sys.path while we load stuff.
[2020-11-06 19:23:46,823][graphviz.files][DEBUG] write 301 bytes to '/tmp/tmpc7el2jz4/output'
[2020-11-06 19:23:46,823][graphviz.backend][DEBUG] run ['dot', '-Tsvg', '-O', 'output

In [4]:
# Experiment with three winning probabilities (1.0, 0.0, 0.0).
rdp_generator, (v, t) = learning_rotating_mab(
    stop_probability=0.2,
    winning_probabilities=(1.0, 0.0, 0.0),
    max_episode_steps=15,
    nb_samples=500_000,
    delta=0.1,
    n_upperbound=5,
    nb_processes=8,
)

# Characters are converted to strings through the generator's decoder.
learned_graph = to_graphviz_from_graph(
    v, t, char2str=lambda c: str(rdp_generator.decoder(c))
)
render_digraph(learned_graph)

Apriori expected length of traces: 1/stop_prob = 4.999999999999994
Average length of traces: 3.679


[2020-11-06 19:23:47,080][src.learn_pdfa][INFO] Parameters: ('BalleParams(sample_generator=<src.learn_pdfa.common.MultiprocessedGenerator '
 'object at 0x7f4951e6ab10>, nb_samples=500000, n=5, alphabet_size=12, '
 'delta=0.1)')
[2020-11-06 19:23:52,604][src.learn_pdfa][INFO] Iteration 0
[2020-11-06 19:23:53,283][src.learn_pdfa][INFO] Iteration 1
[2020-11-06 19:23:54,183][src.learn_pdfa][INFO] Iteration 2
[2020-11-06 19:23:54,946][src.learn_pdfa][INFO] Iteration 3
[2020-11-06 19:23:55,600][src.learn_pdfa][INFO] Iteration 4
[2020-11-06 19:23:56,255][src.learn_pdfa][INFO] Iteration 5
[2020-11-06 19:23:56,972][src.learn_pdfa][INFO] Iteration 6
[2020-11-06 19:23:57,699][src.learn_pdfa][INFO] Iteration 7
[2020-11-06 19:23:58,450][src.learn_pdfa][INFO] Iteration 8
[2020-11-06 19:23:59,249][src.learn_pdfa][INFO] Iteration 9
  # Remove the CWD from sys.path while we load stuff.
[2020-11-06 19:23:59,802][graphviz.files][DEBUG] write 332 bytes to '/tmp/tmp8lc65vbv/output'
[2020-11-06 19:23:59,803

In [5]:
# Experiment with three winning probabilities (0.1, 0.2, 0.9), a longer
# episode limit and a smaller delta than the previous runs.
rdp_generator, (v, t) = learning_rotating_mab(
    stop_probability=0.2,
    winning_probabilities=(0.1, 0.2, 0.9),
    max_episode_steps=100,
    nb_samples=5_000_000,
    delta=0.05,
    n_upperbound=6,
    nb_processes=8,
)

# Characters are converted to strings through the generator's decoder.
learned_graph = to_graphviz_from_graph(
    v, t, char2str=lambda c: str(rdp_generator.decoder(c))
)
render_digraph(learned_graph)

Apriori expected length of traces: 1/stop_prob = 4.999999999999994
Average length of traces: 3.84


[2020-11-06 19:24:00,015][src.learn_pdfa][INFO] Parameters: ('BalleParams(sample_generator=<src.learn_pdfa.common.MultiprocessedGenerator '
 'object at 0x7f4940fb0950>, nb_samples=5000000, n=6, alphabet_size=12, '
 'delta=0.05)')
[2020-11-06 19:24:05,603][src.learn_pdfa][INFO] Iteration 0
[2020-11-06 19:24:06,195][src.learn_pdfa][INFO] Iteration 1
[2020-11-06 19:24:06,865][src.learn_pdfa][INFO] Iteration 2
[2020-11-06 19:24:07,541][src.learn_pdfa][INFO] Iteration 3
[2020-11-06 19:24:08,207][src.learn_pdfa][INFO] Iteration 4
[2020-11-06 19:24:08,954][src.learn_pdfa][INFO] Iteration 5
[2020-11-06 19:24:09,672][src.learn_pdfa][INFO] Iteration 6
[2020-11-06 19:24:10,401][src.learn_pdfa][INFO] Iteration 7
[2020-11-06 19:24:11,194][src.learn_pdfa][INFO] Iteration 8
[2020-11-06 19:24:11,989][src.learn_pdfa][INFO] Iteration 9
[2020-11-06 19:24:12,758][src.learn_pdfa][INFO] Iteration 10
[2020-11-06 19:24:13,576][src.learn_pdfa][INFO] Iteration 11
[2020-11-06 19:24:14,384][src.learn_pdfa][INFO] 