## RDP Learning

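In this notebook, we show how to perform PAC (Probably Approximately Correct)
learning over RDPs (Regular Decision Processes): traces are sampled from the
environment under a random exploration policy, and a PDFA is learned from them.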

### Experiment 1: Rotating MAB

In [1]:
from functools import partial
from typing import Tuple, Set, Dict

import gym
import numpy as np
from gym.wrappers import TimeLimit
from notebooks.utils import render_digraph

from src import NonMarkovianRotatingMAB
from src.learn_pdfa.base import learn_pdfa, Algorithm
from src.learn_pdfa.utils.generator import MultiprocessedGenerator
from src.learn_rdps import random_exploration_policy, RDPGenerator
from src.pdfa.render import to_graphviz_from_graph
from src.pdfa.types import Character

# Instantiate a two-armed rotating bandit and show its observation/action spaces.
env = NonMarkovianRotatingMAB(winning_probs=[0.9, 0.2])

print(f"Observation space: {env.observation_space}")
print(f"Action space: {env.action_space}")

s = env.reset()
print(f"Initial state: {s}")

# Play the same action twice in a row and report each transition.
# The loop variable `_` is immediately overwritten by the unpacking below,
# which discards the last two elements of the step tuple.
action = 0
for _ in range(2):
    sp, reward, _, _ = env.step(action)
    print(
        "-" * 10,
        f"Action: {action}",
        f"Next state: {sp}",
        f"Reward: {reward}",
        sep="\n",
    )


Observation space: Discrete(2)
Action space: Discrete(2)
Initial state: 0
----------
Action: 0
Next state: 1
Reward: 1.0
----------
Action: 0
Next state: 1
Reward: 1.0


### Learning

In [2]:
def learning_rotating_mab(
    stop_probability: float,
    winning_probabilities: Tuple[float, ...],
    max_episode_steps: int,
    nb_samples: int,
    delta: float,
    n_upperbound: int,
    nb_processes: int = 8,
) -> Tuple[RDPGenerator, Tuple[Set[int], Dict[int, Dict[Character, int]]]]:
    """
    Learn a PDFA from traces of a Rotating MAB explored at random.

    The environment "NonMarkovianRotatingMAB-v0" is created through gym,
    wrapped in a ``TimeLimit`` and explored with ``random_exploration_policy``.
    Before learning, 1000 traces are sampled once to print the a-priori
    expected trace length next to the empirical average.

    :param stop_probability: forwarded to ``RDPGenerator``; also used for the
        printed ``1/stop_prob`` estimate.
    :param winning_probabilities: one winning probability per arm, passed to
        the environment as ``winning_probs``.
    :param max_episode_steps: step cap given to the ``TimeLimit`` wrapper.
    :param nb_samples: forwarded to ``learn_pdfa`` as ``nb_samples``.
    :param delta: forwarded to ``learn_pdfa`` as ``delta``.
    :param n_upperbound: forwarded to ``learn_pdfa`` as ``n``
        (presumably an upper bound on the number of states -- TODO confirm).
    :param nb_processes: number of processes for ``MultiprocessedGenerator``.
    :return: the single-process RDP generator and the pair returned by
        ``learn_pdfa`` (per the annotation, a set of states and a
        transition mapping).
    """
    bandit = TimeLimit(
        gym.make("NonMarkovianRotatingMAB-v0", winning_probs=winning_probabilities),
        max_episode_steps=max_episode_steps,
    )
    explorer = partial(random_exploration_policy, bandit)
    rdp_generator = RDPGenerator(
        bandit, policy=explorer, nb_rewards=2, stop_probability=stop_probability
    )

    # Sanity check: compare theoretical and observed trace lengths.
    preview_traces = rdp_generator.sample(n=1000)
    # Machine epsilon keeps the division defined when stop_probability is 0.
    expected_length = 1 / (stop_probability + np.finfo(float).eps)
    print(f"Apriori expected length of traces: 1/stop_prob = {expected_length}")
    observed_length = np.mean(list(map(len, preview_traces)))
    print(f"Average length of traces: {observed_length}")

    # Learning draws its samples through a multi-process wrapper of the generator.
    parallel_generator = MultiprocessedGenerator(
        rdp_generator, nb_processes=nb_processes
    )
    learned_states, learned_transitions = learn_pdfa(
        algorithm=Algorithm.BALLE,
        nb_samples=nb_samples,
        sample_generator=parallel_generator,
        alphabet_size=rdp_generator.alphabet_size(),
        delta=delta,
        n=n_upperbound,
    )
    return rdp_generator, (learned_states, learned_transitions)

##### Rotating MAB, 2 arms, with probabilities (0.7, 0.3)

In [3]:
# Learn the automaton of the 2-arm bandit with winning probabilities (0.7, 0.3).
rdp_generator, (v, t) = learning_rotating_mab(
    stop_probability=0.2,
    winning_probabilities=(0.7, 0.3),
    max_episode_steps=15,
    nb_samples=100000,
    delta=0.1,
    n_upperbound=4,
    nb_processes=8,
)

# Build the Graphviz graph first, labelling edges via the generator's decoder,
# then render it as the cell's output.
automaton_graph = to_graphviz_from_graph(
    v,
    t,
    char2str=lambda c: str(rdp_generator.decoder(c)),
)
render_digraph(automaton_graph)

Apriori expected length of traces: 1/stop_prob = 4.999999999999994
Average length of traces: 3.679


[2020-11-08 12:54:07,489][src.learn_pdfa][INFO] Parameters: ('BalleParams(sample_generator=<src.learn_pdfa.utils.generator.MultiprocessedGenerator '
 'object at 0x7f167b42acd0>, nb_samples=100000, n=4, alphabet_size=8, '
 'delta=0.1)')
[2020-11-08 12:54:11,040][src.learn_pdfa][INFO] Iteration 0
[2020-11-08 12:54:11,649][src.learn_pdfa][INFO] Iteration 1
[2020-11-08 12:54:12,479][src.learn_pdfa][INFO] Iteration 2
[2020-11-08 12:54:13,311][src.learn_pdfa][INFO] Iteration 3
[2020-11-08 12:54:14,135][src.learn_pdfa][INFO] Iteration 4
[2020-11-08 12:54:15,004][src.learn_pdfa][INFO] Iteration 5
[2020-11-08 12:54:15,918][src.learn_pdfa][INFO] Iteration 6
[2020-11-08 12:54:16,812][src.learn_pdfa][INFO] Iteration 7
[2020-11-08 12:54:17,779][src.learn_pdfa][INFO] Iteration 8
  # Remove the CWD from sys.path while we load stuff.
[2020-11-08 12:54:18,397][graphviz.files][DEBUG] write 301 bytes to '/tmp/tmpboxrh6gv/output'
[2020-11-08 12:54:18,398][graphviz.backend][DEBUG] run ['dot', '-Tsvg', '-O'

##### Rotating MAB, 3 arms, with probabilities (1.0, 0.0, 0.0)

In [4]:
# Learn the automaton of the 3-arm bandit with winning probabilities (1.0, 0.0, 0.0).
rdp_generator, (v, t) = learning_rotating_mab(
    stop_probability=0.2,
    winning_probabilities=(1.0, 0.0, 0.0),
    max_episode_steps=15,
    nb_samples=100000,
    delta=0.1,
    n_upperbound=5,
    nb_processes=8,
)

# Build the Graphviz graph first, labelling edges via the generator's decoder,
# then render it as the cell's output.
automaton_graph = to_graphviz_from_graph(
    v,
    t,
    char2str=lambda c: str(rdp_generator.decoder(c)),
)
render_digraph(automaton_graph)

Apriori expected length of traces: 1/stop_prob = 4.999999999999994
Average length of traces: 3.679


[2020-11-08 12:54:22,135][src.learn_pdfa][INFO] Parameters: ('BalleParams(sample_generator=<src.learn_pdfa.utils.generator.MultiprocessedGenerator '
 'object at 0x7f1691b426d0>, nb_samples=100000, n=5, alphabet_size=12, '
 'delta=0.1)')
[2020-11-08 12:54:29,322][src.learn_pdfa][INFO] Iteration 0
[2020-11-08 12:54:30,325][src.learn_pdfa][INFO] Iteration 1
[2020-11-08 12:54:31,111][src.learn_pdfa][INFO] Iteration 2
[2020-11-08 12:54:32,047][src.learn_pdfa][INFO] Iteration 3
[2020-11-08 12:54:33,005][src.learn_pdfa][INFO] Iteration 4
[2020-11-08 12:54:34,463][src.learn_pdfa][INFO] Iteration 5
[2020-11-08 12:54:35,604][src.learn_pdfa][INFO] Iteration 6
[2020-11-08 12:54:36,919][src.learn_pdfa][INFO] Iteration 7
[2020-11-08 12:54:38,296][src.learn_pdfa][INFO] Iteration 8
[2020-11-08 12:54:39,616][src.learn_pdfa][INFO] Iteration 9
  # Remove the CWD from sys.path while we load stuff.
[2020-11-08 12:54:40,487][graphviz.files][DEBUG] write 332 bytes to '/tmp/tmpf6kgp4ej/output'
[2020-11-08 12:

##### Rotating MAB, 3 arms, with probabilities (0.1, 0.2, 0.9)

In [5]:
# Learn the automaton of the 3-arm bandit with winning probabilities (0.1, 0.2, 0.9).
# This run uses more samples, a smaller delta and a much larger step cap
# than the previous experiments.
rdp_generator, (v, t) = learning_rotating_mab(
    stop_probability=0.2,
    winning_probabilities=(0.1, 0.2, 0.9),
    max_episode_steps=1000,
    nb_samples=1000000,
    delta=0.05,
    n_upperbound=6,
    nb_processes=8,
)

# Build the Graphviz graph first, labelling edges via the generator's decoder,
# then render it as the cell's output.
automaton_graph = to_graphviz_from_graph(
    v,
    t,
    char2str=lambda c: str(rdp_generator.decoder(c)),
)
render_digraph(automaton_graph)

Apriori expected length of traces: 1/stop_prob = 4.999999999999994
Average length of traces: 3.84


[2020-11-08 12:55:01,338][src.learn_pdfa][INFO] Parameters: ('BalleParams(sample_generator=<src.learn_pdfa.utils.generator.MultiprocessedGenerator '
 'object at 0x7f167a771650>, nb_samples=1000000, n=6, alphabet_size=12, '
 'delta=0.05)')
[2020-11-08 12:56:11,080][src.learn_pdfa][INFO] Iteration 0
[2020-11-08 12:56:27,630][src.learn_pdfa][INFO] Iteration 1
[2020-11-08 12:56:52,818][src.learn_pdfa][INFO] Iteration 2
[2020-11-08 12:57:22,732][src.learn_pdfa][INFO] Iteration 3
[2020-11-08 12:57:46,535][src.learn_pdfa][INFO] Iteration 4
[2020-11-08 12:58:09,572][src.learn_pdfa][INFO] Iteration 5
[2020-11-08 12:58:35,244][src.learn_pdfa][INFO] Iteration 6
[2020-11-08 12:59:01,623][src.learn_pdfa][INFO] Iteration 7
[2020-11-08 12:59:22,787][src.learn_pdfa][INFO] Iteration 8
[2020-11-08 12:59:47,634][src.learn_pdfa][INFO] Iteration 9
[2020-11-08 13:00:04,774][src.learn_pdfa][INFO] Iteration 10
[2020-11-08 13:00:21,536][src.learn_pdfa][INFO] Iteration 11
[2020-11-08 13:00:38,450][src.learn_pdf

##### Rotating MAB, 4 arms, with probabilities (1.0, 0.0, 0.0, 0.0)


In [6]:
# Learn the automaton of the 4-arm bandit with winning probabilities (1.0, 0.0, 0.0, 0.0).
rdp_generator, (v, t) = learning_rotating_mab(
    stop_probability=0.2,
    winning_probabilities=(1.0, 0.0, 0.0, 0.0),
    max_episode_steps=50,
    nb_samples=300000,
    delta=0.1,
    n_upperbound=5,
    nb_processes=8,
)

# Build the Graphviz graph first, labelling edges via the generator's decoder,
# then render it as the cell's output.
automaton_graph = to_graphviz_from_graph(
    v,
    t,
    char2str=lambda c: str(rdp_generator.decoder(c)),
)
render_digraph(automaton_graph)


Apriori expected length of traces: 1/stop_prob = 4.999999999999994
Average length of traces: 3.84


[2020-11-08 13:18:24,806][src.learn_pdfa][INFO] Parameters: ('BalleParams(sample_generator=<src.learn_pdfa.utils.generator.MultiprocessedGenerator '
 'object at 0x7f16441e26d0>, nb_samples=300000, n=5, alphabet_size=16, '
 'delta=0.1)')
[2020-11-08 13:18:38,799][src.learn_pdfa][INFO] Iteration 0
[2020-11-08 13:18:41,588][src.learn_pdfa][INFO] Iteration 1
[2020-11-08 13:18:44,534][src.learn_pdfa][INFO] Iteration 2
[2020-11-08 13:18:48,375][src.learn_pdfa][INFO] Iteration 3
[2020-11-08 13:18:53,714][src.learn_pdfa][INFO] Iteration 4
[2020-11-08 13:18:58,224][src.learn_pdfa][INFO] Iteration 5
[2020-11-08 13:19:03,683][src.learn_pdfa][INFO] Iteration 6
[2020-11-08 13:19:08,777][src.learn_pdfa][INFO] Iteration 7
[2020-11-08 13:19:13,830][src.learn_pdfa][INFO] Iteration 8
[2020-11-08 13:19:18,694][src.learn_pdfa][INFO] Iteration 9
[2020-11-08 13:19:23,430][src.learn_pdfa][INFO] Iteration 10
[2020-11-08 13:19:27,585][src.learn_pdfa][INFO] Iteration 11
[2020-11-08 13:19:33,148][src.learn_pdfa]

##### Rotating MAB, 5 arms, with probabilities (1.0, 0.0, 0.0, 0.0, 0.0)


In [3]:
# NOTE(review): this 5-arm experiment is disabled -- every line below is commented out.
# The output stored under this cell is from an earlier run (log dated 2020-11-06,
# execution count 3) and names `src.learn_pdfa.common.MultiprocessedGenerator`,
# while the import cell now uses `src.learn_pdfa.utils.generator`.
# Uncomment and re-run to refresh that output, or clear it before sharing.
# rdp_generator, (v, t) = learning_rotating_mab(
#     stop_probability=0.15,
#     winning_probabilities=(1.0, 0.0, 0.0, 0.0, 0.0),
#     max_episode_steps=100,
#     nb_samples=1000000,
#     delta=0.05,
#     n_upperbound=10,
#     nb_processes=6)
#
# render_digraph(to_graphviz_from_graph(v, t, char2str=lambda c: str(rdp_generator.decoder(c))))

Apriori expected length of traces: 1/stop_prob = 6.666666666666657
Average length of traces: 5.611


[2020-11-06 22:11:12,955][src.learn_pdfa][INFO] Parameters: ('BalleParams(sample_generator=<src.learn_pdfa.common.MultiprocessedGenerator '
 'object at 0x7f3400f48910>, nb_samples=1000000, n=10, alphabet_size=20, '
 'delta=0.05)')
[2020-11-06 22:12:16,380][src.learn_pdfa][INFO] Iteration 0
[2020-11-06 22:12:25,422][src.learn_pdfa][INFO] Iteration 1
[2020-11-06 22:12:35,280][src.learn_pdfa][INFO] Iteration 2
[2020-11-06 22:12:45,929][src.learn_pdfa][INFO] Iteration 3
[2020-11-06 22:12:59,253][src.learn_pdfa][INFO] Iteration 4
[2020-11-06 22:13:14,098][src.learn_pdfa][INFO] Iteration 5
[2020-11-06 22:13:28,238][src.learn_pdfa][INFO] Iteration 6
[2020-11-06 22:13:43,299][src.learn_pdfa][INFO] Iteration 7
[2020-11-06 22:13:58,909][src.learn_pdfa][INFO] Iteration 8
[2020-11-06 22:14:15,204][src.learn_pdfa][INFO] Iteration 9
[2020-11-06 22:14:32,561][src.learn_pdfa][INFO] Iteration 10
[2020-11-06 22:14:48,810][src.learn_pdfa][INFO] Iteration 11
[2020-11-06 22:15:05,389][src.learn_pdfa][INFO]