GLIE: Greedy in the Limit with Infinite Exploration

In [1]:
from rl.monte_carlo import *

from rl.function_approx import Tabular
from rl.distribution import Choose
from rl.chapter3.simple_inventory_mdp_cap import InventoryState
from rl.chapter10.prediction_utils import *

from rl.chapter3.simple_inventory_mdp_cap import SimpleInventoryMDPCap
from rl.dynamic_programming import value_iteration_result

from rl.distribution import Constant
from rl.dynamic_programming import V
import itertools
import rl.iterate as iterate
from rl.markov_decision_process import FiniteMarkovDecisionProcess
from rl.policy import FiniteDeterministicPolicy

In [2]:
# Problem parameters. The first four are forwarded by keyword to
# SimpleInventoryMDPCap below; gamma is not passed to the constructor and is
# instead supplied later as the gamma/γ argument of value iteration and
# GLIE MC control.
capacity: int = 2
poisson_lambda: float = 1.0
holding_cost: float = 1.0
stockout_cost: float = 10.0
gamma: float = 0.9
# The MDP instance shared by every later cell in this notebook.
si_mdp: SimpleInventoryMDPCap = SimpleInventoryMDPCap(
    capacity=capacity,
    poisson_lambda=poisson_lambda,
    holding_cost=holding_cost,
    stockout_cost=stockout_cost
)

In [3]:

# value_iteration_result yields a (value function, policy) pair for si_mdp.
# Presumably kept as the reference that the GLIE MC estimates further down
# are compared against -- the notebook prints both for visual comparison.
true_opt_vf, true_opt_policy = value_iteration_result(si_mdp, gamma=gamma)
print("True Optimal Value Function")
pprint(true_opt_vf)
print("True Optimal Policy")
print(true_opt_policy)

True Optimal Value Function
{NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -34.894855194671294,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.66095964467877,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -27.99189950444479,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.66095964467877,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -28.99189950444479,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -29.991899504444792}
True Optimal Policy
For State InventoryState(on_hand=0, on_order=0): Do Action 1
For State InventoryState(on_hand=0, on_order=1): Do Action 1
For State InventoryState(on_hand=0, on_order=2): Do Action 0
For State InventoryState(on_hand=1, on_order=0): Do Action 1
For State InventoryState(on_hand=1, on_order=1): Do Action 0
For State InventoryState(on_hand=2, on_order=0): Do Action 0



In [8]:
# --- GLIE Monte-Carlo control set-up -----------------------------------------

# Forwarded unchanged to glie_mc_control below.
episode_length_tolerance: float = 1e-5
# Exploration rate as a function of the episode count k: epsilon = k^(-1/2).
epsilon_as_func_of_episodes: Callable[[int], float] = lambda k: k ** -0.5

# Inputs of the learning-rate schedule.
initial_learning_rate: float = 0.1
half_life: float = 10000.0
exponent: float = 1.0

# Q-value table with one 0.0 entry for every (non-terminal state, action) pair.
initial_qvf_dict = dict.fromkeys(
    (
        (s, a)
        for s in si_mdp.non_terminal_states
        for a in si_mdp.actions(s)
    ),
    0.
)

# Used by Tabular as its count_to_weight_func.
learning_rate_func: Callable[[int], float] = learning_rate_schedule(
    initial_learning_rate=initial_learning_rate,
    half_life=half_life,
    exponent=exponent
)

# qvfs is the stream of Q-value approximations produced by GLIE MC control;
# later cells pull items from it with itertools.islice.
qvfs = glie_mc_control(
    mdp=si_mdp,
    # Uniform sampling across state space:
    states=Choose(si_mdp.non_terminal_states),
    approx_0=Tabular(
        values_map=initial_qvf_dict,
        count_to_weight_func=learning_rate_func
    ),
    γ=gamma,
    ϵ_as_func_of_episodes=epsilon_as_func_of_episodes,
    episode_length_tolerance=episode_length_tolerance
)

In [9]:
# Type alias: a Tabular function approximation whose inputs are
# (NonTerminal[S], A) pairs, i.e. a tabular Q-value function. Subscripted as
# QTabular[S, A] in the annotations below.
QTabular = Tabular[Tuple[NonTerminal[S],A]]

In [10]:
# Tabular Monte-Carlo Control

# Number of items to take from the qvfs stream created by glie_mc_control.
num_episodes = 10000
# Advance qvfs by num_episodes items and keep only the final one
# (iterate.last presumably returns the last element of the iterator).
final_qvf: QTabular[InventoryState, int] = \
    iterate.last(itertools.islice(qvfs, num_episodes))

def tabular_get_vf_and_policy_from_qvf(
    mdp: FiniteMarkovDecisionProcess[S, A],
    qvf: QTabular[S, A]
) -> Tuple[V[S], FiniteDeterministicPolicy[S, A]]:
    """Derive the greedy value function and greedy policy from a tabular Q.

    Args:
        mdp: finite MDP supplying ``non_terminal_states`` and ``actions(s)``.
        qvf: Q-value approximation; called on ``(state, action)`` pairs and
            queried through ``argmax`` over such pairs.

    Returns:
        A pair ``(vf, policy)``: ``vf`` maps each non-terminal state to the
        largest Q-value over its actions, and ``policy`` is a
        ``FiniteDeterministicPolicy`` built from a dict keyed by the
        underlying ``.state`` of each non-terminal state.
    """
    # Pass 1: state value = maximum Q-value over the actions of that state.
    greedy_values: V[S] = {}
    for nt_state in mdp.non_terminal_states:
        greedy_values[nt_state] = max(
            qvf((nt_state, act)) for act in mdp.actions(nt_state)
        )

    # Pass 2: qvf.argmax is expected to hand back the best-scoring
    # (state, action) pair; element [1] of that pair is the action.
    greedy_actions = {}
    for nt_state in mdp.non_terminal_states:
        best_pair = qvf.argmax(
            (nt_state, act) for act in mdp.actions(nt_state)
        )
        greedy_actions[nt_state.state] = best_pair[1]

    return greedy_values, FiniteDeterministicPolicy(greedy_actions)
# Greedy value function and policy implied by the final tabular Q estimate.
opt_vf, opt_policy = tabular_get_vf_and_policy_from_qvf(
    mdp=si_mdp,
    qvf=final_qvf
)
# The headings must be f-strings: without the f prefix the placeholder
# "{num_episodes:d}" is printed literally instead of the episode count.
print(f"GLIE MC Optimal Value Function with {num_episodes:d} episodes")
pprint(opt_vf)
print(f"GLIE MC Optimal Policy with {num_episodes:d} episodes")
print(opt_policy)

GLIE MC Optimal Value Function with {num_episodes:d} episodes
{NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -34.96476984562754,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.53625227302578,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -28.586970994259534,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.581369243292652,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -28.825334895696155,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -30.1696604747163}
GLIE MC Optimal Policy with {num_episodes:d} episodes
For State InventoryState(on_hand=0, on_order=0): Do Action 1
For State InventoryState(on_hand=0, on_order=1): Do Action 1
For State InventoryState(on_hand=0, on_order=2): Do Action 0
For State InventoryState(on_hand=1, on_order=0): Do Action 1
For State InventoryState(on_hand=1, on_order=1): Do Action 0
For State InventoryState(on_hand=2, on_order=0): Do Action 0



In [None]:
# Monte-Carlo Control with Generic Function Approximation

# NOTE(review): this cell rebinds num_episodes and final_qvf from the tabular
# cell and draws from the same qvfs object that cell already advanced. If qvfs
# is a one-shot iterator (it is consumed through itertools.islice), the result
# here continues from where the tabular cell stopped rather than starting
# fresh -- confirm this is intended, or recreate qvfs first.
num_episodes = 10000
final_qvf: QValueFunctionApprox[InventoryState, int] = \
    iterate.last(itertools.islice(qvfs, num_episodes))

def get_vf_and_policy_from_qvf(
    mdp: FiniteMarkovDecisionProcess[S, A],
    qvf: QValueFunctionApprox[S, A]
) -> Tuple[V[S], FiniteDeterministicPolicy[S, A]]:
    """Extract the greedy value function and policy from a Q-value approximation.

    Args:
        mdp: finite MDP supplying ``non_terminal_states`` and ``actions(s)``.
        qvf: Q-value approximation; callable on ``(state, action)`` pairs and
            exposing ``argmax`` over an iterable of such pairs.

    Returns:
        ``(vf, policy)`` where ``vf[s]`` is the maximum of ``qvf((s, a))`` over
        the actions available in ``s``, and ``policy`` is a
        ``FiniteDeterministicPolicy`` keyed by each state's ``.state``.
    """
    def candidate_pairs(state):
        # All (state, action) inputs to qvf for one non-terminal state.
        return ((state, action) for action in mdp.actions(state))

    state_values: V[S] = {
        state: max(map(qvf, candidate_pairs(state)))
        for state in mdp.non_terminal_states
    }
    # argmax is expected to return the winning (state, action) pair; [1]
    # selects its action component.
    greedy_policy: FiniteDeterministicPolicy[S, A] = FiniteDeterministicPolicy({
        state.state: qvf.argmax(candidate_pairs(state))[1]
        for state in mdp.non_terminal_states
    })
    return state_values, greedy_policy
# Greedy value function and policy implied by the final Q-value approximation.
opt_vf, opt_policy = get_vf_and_policy_from_qvf(
    mdp=si_mdp,
    qvf=final_qvf
)
# The headings must be f-strings: without the f prefix the placeholder
# "{num_episodes:d}" is printed literally instead of the episode count.
print(f"GLIE MC Optimal Value Function with {num_episodes:d} episodes")
pprint(opt_vf)
print(f"GLIE MC Optimal Policy with {num_episodes:d} episodes")
print(opt_policy)

### Copy the above as the generic "Function Approximation" case. Replace this version with the Tabular setting. This might require a reclassification of Tabular, as was done for QValueFunctionApprox[S, A].

In [None]:
## Implement Asset_alloc_discrete test