GLIE: Greedy in the Limit with Infinite Exploration

In [1]:
import itertools

from rl.monte_carlo import *

from rl.function_approx import Tabular
from rl.distribution import Choose
from rl.chapter3.simple_inventory_mdp_cap import InventoryState
from rl.chapter10.prediction_utils import *

from rl.chapter3.simple_inventory_mdp_cap import SimpleInventoryMDPCap
from rl.dynamic_programming import value_iteration_result

from rl.distribution import Constant
from rl.dynamic_programming import V
import rl.iterate as iterate
from rl.markov_decision_process import FiniteMarkovDecisionProcess
from rl.policy import FiniteDeterministicPolicy
from rl.chapter11.control_utils import get_vf_and_policy_from_qvf

In [2]:
# Inventory MDP configuration.
# `gamma` is not consumed by the MDP constructor; it is handed to the solvers
# in later cells (value iteration and the GLIE MC control runs).
gamma: float = 0.9

# Parameters forwarded by keyword to SimpleInventoryMDPCap.
capacity: int = 2
poisson_lambda: float = 1.0
holding_cost: float = 1.0
stockout_cost: float = 10.0

si_mdp: SimpleInventoryMDPCap = SimpleInventoryMDPCap(
    capacity=capacity,
    poisson_lambda=poisson_lambda,
    holding_cost=holding_cost,
    stockout_cost=stockout_cost,
)

In [3]:
# Dynamic-programming baseline: value_iteration_result returns a
# (value function, policy) pair that the Monte Carlo results below are
# compared against by eye.
true_opt_vf, true_opt_policy = value_iteration_result(si_mdp, gamma=gamma)

print("True Optimal Value Function")
pprint(true_opt_vf)
# One print call with a newline separator emits the heading and the policy
# on consecutive lines, exactly like two separate print calls.
print("True Optimal Policy", true_opt_policy, sep="\n")

True Optimal Value Function
{NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.66095964467877,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -27.99189950444479,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.66095964467877,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -28.99189950444479,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -29.991899504444792,
 NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -34.894855194671294}
True Optimal Policy
For State InventoryState(on_hand=0, on_order=0): Do Action 1
For State InventoryState(on_hand=0, on_order=1): Do Action 1
For State InventoryState(on_hand=0, on_order=2): Do Action 0
For State InventoryState(on_hand=1, on_order=0): Do Action 1
For State InventoryState(on_hand=1, on_order=1): Do Action 0
For State InventoryState(on_hand=2, on_order=0): Do Action 0



## Implementing MC Control with GLIE

In [4]:
# Hyper-parameters and helpers shared by the GLIE Monte Carlo control runs.

# Forwarded to the control routines as `episode_length_tolerance`.
episode_length_tolerance: float = 1e-5


# GLIE exploration schedule: epsilon = 1 / k for episode count k (k >= 1).
def epsilon_as_func_of_episodes(k: int) -> float:
    return k ** -1


# Inputs of the learning-rate schedule.
initial_learning_rate: float = 0.1
half_life: float = 10000.0
exponent: float = 1.0

# Step-size schedule; the test cell passes it to the tabular approximation
# as `count_to_weight_func`.
learning_rate_func: Callable[[int], float] = learning_rate_schedule(
    initial_learning_rate=initial_learning_rate,
    half_life=half_life,
    exponent=exponent
)

# Initial Q-values: 0.0 for every (non-terminal state, action) pair of si_mdp.
initial_qvf_dict = {
    (state, action): 0.
    for state in si_mdp.non_terminal_states
    for action in si_mdp.actions(state)
}

# Alias for a Tabular approximation keyed by (non-terminal state, action).
QTabular = Tabular[Tuple[NonTerminal[S], A]]

### Tabular MC Control

In [5]:
# Tabular Monte Carlo Control:
def tabular_glie_mc_control(
    mdp: MarkovDecisionProcess[S, A],
    states: NTStateDistribution[S],
    approx_0: QTabular[S, A],
    γ: float,
    ϵ_as_func_of_episodes: Callable[[int], float],
    episode_length_tolerance: float = 1e-6
) -> Iterator[QTabular[S, A]]:
    """GLIE Monte Carlo control over a tabular Q-value function.

    This is an infinite generator. It first yields `approx_0` unchanged, then
    repeats forever:

    1. simulate one trace with `mdp.simulate_actions(states, policy)` under
       the current epsilon-greedy policy;
    2. for every step produced by `returns(trace, γ, episode_length_tolerance)`
       update the table with the sample `((state, action), return_)`;
    3. rebuild the epsilon-greedy policy from the updated table with
       epsilon = `ϵ_as_func_of_episodes(k)`, k being the number of traces
       simulated so far;
    4. yield the updated table.

    Args:
        mdp: process to simulate and to build epsilon-greedy policies for.
        states: passed to `mdp.simulate_actions` as its first argument
            (a start-state distribution, per the annotation).
        approx_0: initial Q-value table.
        γ: forwarded to `returns`.
        ϵ_as_func_of_episodes: maps the episode count (starting at 1) to the
            epsilon used for the next policy.
        episode_length_tolerance: forwarded to `returns`.

    Yields:
        The current Q-value table: `approx_0`, then one table per episode.
        Callers must bound the iteration themselves (e.g. `itertools.islice`).
    """
    q_table: QTabular[S, A] = approx_0
    # The first episode is generated with epsilon = 1.0.
    behaviour: Policy[S, A] = epsilon_greedy_policy(q_table, mdp, 1.0)
    yield q_table

    for episode_number in itertools.count(1):
        episode: Iterable[TransitionStep[S, A]] = \
            mdp.simulate_actions(states, behaviour)
        for visit in returns(episode, γ, episode_length_tolerance):
            sample = ((visit.state, visit.action), visit.return_)
            q_table = q_table.update([sample])
        # The policy is refreshed only between episodes, never inside one.
        behaviour = epsilon_greedy_policy(
            q_table,
            mdp,
            ϵ_as_func_of_episodes(episode_number)
        )
        yield q_table

### Test Tabular MC Control

In [9]:
# Test Tabular MC Control:
# Run GLIE MC control on the inventory MDP with start states chosen uniformly
# from its non-terminal states, then derive a value function and policy from
# the final Q-table.

qvfs = tabular_glie_mc_control(
    mdp=si_mdp,
    states=Choose(si_mdp.non_terminal_states),
    approx_0=QTabular(
        values_map=initial_qvf_dict,
        count_to_weight_func=learning_rate_func
    ),
    γ=gamma,
    ϵ_as_func_of_episodes=epsilon_as_func_of_episodes,
    episode_length_tolerance=episode_length_tolerance
)

num_episodes = 10000
# The generator yields approx_0 before simulating anything, so
# num_episodes + 1 items must be consumed for the last one to reflect
# num_episodes completed episodes.
final_qvf: QTabular[InventoryState, int] = \
    iterate.last(itertools.islice(qvfs, num_episodes + 1))

# get_vf_and_policy_from_qvf presumably takes, per non-terminal state, the
# max / argmax of Q over the available actions — confirm against
# rl.chapter11.control_utils.
opt_vf, opt_policy = get_vf_and_policy_from_qvf(
    mdp=si_mdp,
    qvf=final_qvf
)
# f-strings so the episode count is interpolated instead of printed literally.
print(f"GLIE MC Optimal Value Function with {num_episodes:d} episodes")
pprint(opt_vf)
print(f"GLIE MC Optimal Policy with {num_episodes:d} episodes")
print(opt_policy)

GLIE MC Optimal Value Function with {num_episodes:d} episodes
{NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.737845976848455,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -28.890752606818577,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.94435700606091,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -29.273299704154397,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -30.745836703595206,
 NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -36.00349163670859}
GLIE MC Optimal Policy with {num_episodes:d} episodes
For State InventoryState(on_hand=0, on_order=0): Do Action 2
For State InventoryState(on_hand=0, on_order=1): Do Action 1
For State InventoryState(on_hand=0, on_order=2): Do Action 0
For State InventoryState(on_hand=1, on_order=0): Do Action 1
For State InventoryState(on_hand=1, on_order=1): Do Action 0
For State InventoryState(on_hand=2, on_order=0): Do Action 0



### General MC Control

In [10]:
# General Monte Carlo Control:
def glie_mc_control(
    mdp: MarkovDecisionProcess[S, A],
    states: NTStateDistribution[S],
    approx_0: QValueFunctionApprox[S, A],
    γ: float,
    ϵ_as_func_of_episodes: Callable[[int], float],
    episode_length_tolerance: float = 1e-6
) -> Iterator[QValueFunctionApprox[S, A]]:
    """GLIE Monte Carlo control for an arbitrary Q-value function approximation.

    Apart from its type annotations this performs the same steps as
    `tabular_glie_mc_control`: yield `approx_0`, then loop forever over
    "simulate one trace under the current epsilon-greedy policy, update the
    approximation with one `((state, action), return_)` sample per step of
    `returns(...)`, rebuild the policy, yield the approximation".

    NOTE(review): `from rl.monte_carlo import *` may already provide a
    function with this name; if so, this definition shadows it — confirm
    which one is intended.

    Args:
        mdp: process to simulate and to build epsilon-greedy policies for.
        states: passed to `mdp.simulate_actions` as its first argument.
        approx_0: initial Q-value function approximation.
        γ: forwarded to `returns`.
        ϵ_as_func_of_episodes: maps the number of finished episodes (>= 1)
            to the epsilon of the next policy.
        episode_length_tolerance: forwarded to `returns`.

    Yields:
        `approx_0` first, then the approximation after each episode. The
        generator never terminates on its own.
    """
    current_q: QValueFunctionApprox[S, A] = approx_0
    # Exploration starts with epsilon = 1.0.
    policy: Policy[S, A] = epsilon_greedy_policy(current_q, mdp, 1.0)
    yield current_q

    episodes_completed: int = 0
    while True:
        episode_steps = returns(
            mdp.simulate_actions(states, policy),
            γ,
            episode_length_tolerance
        )
        for step in episode_steps:
            current_q = current_q.update(
                [((step.state, step.action), step.return_)]
            )
        episodes_completed += 1
        policy = epsilon_greedy_policy(
            current_q,
            mdp,
            ϵ_as_func_of_episodes(episodes_completed)
        )
        yield current_q

## Set up the AssetAllocDiscrete test problem

In [11]:
from rl.chapter7.asset_alloc_discrete import *
from pprint import pprint

In [12]:
# Scalar parameters of the asset-allocation problem.
steps: int = 4                  # length of the two return sequences built below
μ: float = 0.13                 # mean of every Gaussian risky return
σ: float = 0.2                  # standard deviation of every Gaussian risky return
r: float = 0.07                 # riskless return repeated for every step
a: float = 1.0                  # coefficient of the utility U(x) = -exp(-a * x) / a
init_wealth: float = 1.0
init_wealth_stdev: float = 0.1

# Candidate risky allocations: 11 evenly spaced points between 2/3 and 4/3 of
# base_alloc = (μ - r) / (a * σ²).
excess: float = μ - r
var: float = σ * σ
base_alloc: float = excess / (a * var)
alloc_choices: Sequence[float] = np.linspace(
    2 / 3 * base_alloc,
    4 / 3 * base_alloc,
    num=11
)

# Per-step return inputs (one entry per step).
risky_ret: Sequence[Gaussian] = [Gaussian(μ=μ, σ=σ) for _ in range(steps)]
riskless_ret: Sequence[float] = [r] * steps

utility_function: Callable[[float], float] = lambda x: - np.exp(-a * x) / a

# Features of a pair w_x: constant, w_x[0], w_x[1] and w_x[1] squared.
# NOTE(review): w_x is presumably (wealth, allocation) — confirm.
feature_funcs: Sequence[Callable[[Tuple[float, float]], float]] = [
    lambda _: 1.,
    lambda w_x: w_x[0],
    lambda w_x: w_x[1],
    lambda w_x: w_x[1] * w_x[1],
]

# Network spec with an empty `neurons` list, no bias and identity hidden
# activation; the output activation is -sign(a) * exp(-x). Its derivative is
# written in terms of the activation output y.
dnn: DNNSpec = DNNSpec(
    neurons=[],
    bias=False,
    hidden_activation=lambda x: x,
    hidden_activation_deriv=lambda y: np.ones_like(y),
    output_activation=lambda x: - np.sign(a) * np.exp(-x),
    output_activation_deriv=lambda y: -y
)

init_wealth_distr: Gaussian = Gaussian(μ=init_wealth, σ=init_wealth_stdev)

aad: AssetAllocDiscrete = AssetAllocDiscrete(
    risky_return_distributions=risky_ret,
    riskless_returns=riskless_ret,
    utility_func=utility_function,
    risky_alloc_choices=alloc_choices,
    feature_functions=feature_funcs,
    dnn_spec=dnn,
    initial_wealth_distribution=init_wealth_distr
)

In [13]:
# MDP object used by the GLIE MC control experiment further down.
# NOTE(review): the argument 2 is presumably a time-step index (steps = 4
# above) — confirm. The later call aad.get_states_distribution(1) uses a
# different index, which looks inconsistent.
aa_mdp = aad.get_mdp(2)

In [14]:
# Reference solution via backward induction (ADP).
# The value-function features are a constant and the wealth itself.
vf_ff: Sequence[Callable[[NonTerminal[float]], float]] = [
    lambda _: 1.,
    lambda w: w.state,
]
it_vf: Iterator[Tuple[DNNApprox[NonTerminal[float]], DeterministicPolicy[float, float]]] = \
    aad.backward_induction_vf_and_pi(vf_ff)

print("Backward Induction: VF And Policy")
print("---------------------------------")
print()
# One (value function, policy) pair per enumerated time index; each is
# evaluated at the initial wealth.
for t, (v, p) in enumerate(it_vf):
    # Heading followed by one blank line.
    print(f"Time {t:d}", end="\n\n")
    opt_alloc: float = p.action_for(init_wealth)
    val: float = v(NonTerminal(init_wealth))
    print(f"Opt Risky Allocation = {opt_alloc:.2f}, Opt Val = {val:.3f}")
    print("Weights")
    for w in v.weights:
        print(w.weights)
    print()

Backward Induction: VF And Policy
---------------------------------

Time 0

Opt Risky Allocation = 1.00, Opt Val = -0.212
Weights
[[0.23875784 1.31068248]]

Time 1

Opt Risky Allocation = 1.50, Opt Val = -0.246
Weights
[[0.17572502 1.22807366]]

Time 2

Opt Risky Allocation = 1.40, Opt Val = -0.283
Weights
[[0.11566499 1.14756727]]

Time 3

Opt Risky Allocation = 1.20, Opt Val = -0.323
Weights
[[0.05855587 1.07062364]]



## Test General MC Control on AssetAllocDiscrete

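## Open questions: which start-state distribution to use, and how to extract a value function and policy from the learned Q-function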

In [38]:
# Run GLIE MC control on the asset-allocation MDP.
#
# The start states are drawn for the same index that aa_mdp was built with
# (aa_mdp = aad.get_mdp(2)); the original call used index 1.
# NOTE(review): both arguments are presumably time-step indices and are meant
# to match — confirm against AssetAllocDiscrete.
# NOTE(review): if get_mdp(t) only pays a reward on the final step
# (t == steps - 1), an MDP built for t = 2 yields all-zero returns, which
# would explain value estimates of the order 1e-18; in that case build both
# the MDP and the start states with steps - 1 — confirm.
# NOTE(review): gamma (0.9) and num_episodes are reused from the inventory
# experiment — confirm that is intended here.
qvfs = glie_mc_control(
    mdp=aa_mdp,
    states=aad.get_states_distribution(2),
    approx_0=aad.get_qvf_func_approx(),
    γ=gamma,
    ϵ_as_func_of_episodes=epsilon_as_func_of_episodes,
    episode_length_tolerance=episode_length_tolerance
)

# States (wealth) and actions (risky allocation) are floats in this problem,
# hence [float, float] rather than the inventory types.
# The generator yields its initial approximation before the first episode, so
# num_episodes + 1 items are consumed to complete num_episodes episodes.
final_qvf: QValueFunctionApprox[float, float] = \
    iterate.last(itertools.islice(qvfs, num_episodes + 1))

  output_activation=lambda x: - np.sign(a) * np.exp(-x),


In [39]:
# Greedy value function and policy read off the learned Q-function on a
# fixed grid of wealth levels: 10 evenly spaced points in [0, 1].
# NOTE(review): init_wealth is 1.0, so this grid only reaches the initial
# wealth at its upper end — confirm the intended range.

nonterminal_states = [NonTerminal(wealth) for wealth in np.linspace(0, 1, 10)]

# V(s) = max over a of Q(s, a)
opt_vf: V[float] = {
    nt_wealth: max(
        final_qvf((nt_wealth, alloc)) for alloc in aa_mdp.actions(nt_wealth)
    )
    for nt_wealth in nonterminal_states
}

# pi(s) = action component of the (state, action) pair picked by argmax.
opt_policy: FiniteDeterministicPolicy[float, float] = \
    FiniteDeterministicPolicy({
        nt_wealth.state: final_qvf.argmax(
            (nt_wealth, alloc) for alloc in aa_mdp.actions(nt_wealth)
        )[1]
        for nt_wealth in nonterminal_states
    })

In [40]:
# Show the greedy value function and policy computed in the previous cell.
print("GLIE MC Optimal Value Function")
pprint(opt_vf)
# Heading and policy on consecutive lines, as with two separate print calls.
print("GLIE MC Optimal Policy", opt_policy, sep="\n")

GLIE MC Optimal Value Function
{NonTerminal(state=0.5555555555555556): -2.6798010817957413e-18,
 NonTerminal(state=0.4444444444444444): -2.685339600919735e-18,
 NonTerminal(state=0.2222222222222222): -2.696451003278292e-18,
 NonTerminal(state=1.0): -2.6577610020234417e-18,
 NonTerminal(state=0.6666666666666666): -2.674273985880222e-18,
 NonTerminal(state=0.0): -2.7076083824147325e-18,
 NonTerminal(state=0.8888888888888888): -2.6632539694816263e-18,
 NonTerminal(state=0.7777777777777777): -2.668758289612784e-18,
 NonTerminal(state=0.3333333333333333): -2.6908895668612913e-18,
 NonTerminal(state=0.1111111111111111): -2.7020239338775146e-18}
GLIE MC Optimal Policy
For State 0.0: Do Action 1.9999999999999993
For State 0.1111111111111111: Do Action 1.9999999999999993
For State 0.2222222222222222: Do Action 1.9999999999999993
For State 0.3333333333333333: Do Action 1.9999999999999993
For State 0.4444444444444444: Do Action 1.9999999999999993
For State 0.5555555555555556: Do Action 1.99999999