In [16]:
####### TABULAR MC PART #########

from typing import Sequence, Iterable, Callable
from rl.function_approx import AdamGradient
from rl.function_approx import LinearFunctionApprox
from rl.approximate_dynamic_programming import ValueFunctionApprox
from rl.distribution import Choose
from rl.markov_decision_process import NonTerminal
from rl.chapter2.simple_inventory_mrp import SimpleInventoryMRPFinite
from rl.chapter2.simple_inventory_mrp import InventoryState
from rl.chapter10.prediction_utils import (
    mc_prediction_learning_rate,
    td_prediction_learning_rate
)
import numpy as np
from itertools import islice

# Parameters of the simple inventory problem; all four are passed by keyword
# to SimpleInventoryMRPFinite further down in this cell.
capacity: int = 2
poisson_lambda: float = 1.0
holding_cost: float = 1.0
stockout_cost: float = 10.0

# Discount factor. 0.8 is used (rather than 0.9) for fast convergence.
gamma: float = 0.8

## CLASSICAL SIMPLE INVENTORY
# Build the finite simple-inventory MRP from the parameters defined above.
si_mrp = SimpleInventoryMRPFinite(
    capacity=capacity, poisson_lambda=poisson_lambda,
    holding_cost=holding_cost, stockout_cost=stockout_cost
)

# Value-function vector for the chosen discount factor; it is treated as the
# ground truth when the RMSE of the learned approximation is reported below.
true_vf = si_mrp.get_value_function_vec(gamma=gamma)
# Non-terminal states; `true_vf` is indexed in this same order further down.
all_states = si_mrp.non_terminal_states


# Passed as `episode_length_tolerance` to mc_prediction_learning_rate below.
mc_episode_length_tol: float = 1.0e-6
# NOTE(review): not referenced in this cell (the MC loop uses `mc_episodes`);
# kept because other cells may read it.
num_episodes: int = 10_000

# TD / learning-rate-schedule settings. None of these is read by the MC run
# in this cell (Adam is configured separately with its own learning rate).
td_episode_length: int = 100
initial_learning_rate: float = 0.03
half_life: float = 1_000.0
exponent: float = 0.5

#### FUNCTION APPROXIMATION WITH STATE INDICATOR FEATURES  ####
# Adam settings handed to the linear function approximation below.
ag = AdamGradient(
    learning_rate=0.05,
    decay1=0.9,
    decay2=0.999
)

# One indicator feature per non-terminal state: the k-th feature returns 1.0
# when the queried state's `.state` equals that of the k-th entry of
# `all_states`, and 0.0 otherwise.
# The `s=s` default argument binds each state when its lambda is created; a
# plain closure over the loop variable would make every feature test the
# last state only.
ffs: Sequence[Callable[[NonTerminal[InventoryState]], float]] = \
    [(lambda x, s=s: float(x.state == s.state)) for s in all_states]

# Linear approximation over the indicator features, trained by gradient steps
# (direct_solve=False). No regularization_coeff is passed, so the library
# default applies.
lfa = LinearFunctionApprox.create(
        feature_functions=ffs,
        adam_gradient=ag,
        direct_solve=False
)

# Iterator of successive value-function approximations returned by
# mc_prediction_learning_rate, starting from `lfa` with start states drawn
# via Choose(all_states).
it_mc: Iterable[ValueFunctionApprox[InventoryState]] = \
    mc_prediction_learning_rate(
        mrp=si_mrp,
        start_state_distribution=Choose(all_states),
        gamma=gamma,
        episode_length_tolerance=mc_episode_length_tol,
        initial_func_approx=lfa
    )


def _rmse_vs_true_vf(vf: ValueFunctionApprox[InventoryState]) -> float:
    """Return the root-mean-square error of `vf` against `true_vf`.

    `vf` is evaluated at every entry of `all_states` and compared with the
    entry of `true_vf` at the same position.
    """
    squared_errors = [
        (vf(state) - exact) ** 2 for state, exact in zip(all_states, true_vf)
    ]
    return np.sqrt(sum(squared_errors) / len(all_states))


mc_episodes: int = 3000
for i, mc_vf in enumerate(islice(it_mc, mc_episodes)):
    # The RMSE is only needed when it is printed, so skip the per-state
    # evaluations on all other iterations.
    if i % 500 == 0:
        mc_rmse: float = _rmse_vs_true_vf(mc_vf)
        print(f"MC: Iteration = {i:d}, RMSE = {mc_rmse:.3f}")
# Leave `mc_rmse` describing the final approximation.
mc_rmse = _rmse_vs_true_vf(mc_vf)
print(f"mc_vf {mc_vf.weights.weights}")
print(f"True VF : {true_vf}")

MC: Iteration = 0, RMSE = 15.431
MC: Iteration = 500, RMSE = 0.460
MC: Iteration = 1000, RMSE = 0.623
MC: Iteration = 1500, RMSE = 0.488
MC: Iteration = 2000, RMSE = 0.837
MC: Iteration = 2500, RMSE = 0.397
mc_vf [-21.0675816  -12.79093864 -13.87750496 -14.4325741  -14.90124206
 -16.08667323]
True VF : [-20.69406782 -13.25880178 -13.36758477 -14.25880178 -14.36758477
 -15.36758477]


In [18]:
####### TABULAR TD PART #########

from typing import Sequence, Iterable, Callable
from rl.function_approx import AdamGradient
from rl.function_approx import LinearFunctionApprox
from rl.approximate_dynamic_programming import ValueFunctionApprox
from rl.distribution import Choose
from rl.markov_decision_process import NonTerminal
from rl.chapter2.simple_inventory_mrp import SimpleInventoryMRPFinite
from rl.chapter2.simple_inventory_mrp import InventoryState
from rl.chapter10.prediction_utils import (
    mc_prediction_learning_rate,
    td_prediction_learning_rate
)
import numpy as np
from itertools import islice


# Parameters of the simple inventory problem; all four are passed by keyword
# to SimpleInventoryMRPFinite further down in this cell.
capacity: int = 2
poisson_lambda: float = 1.0
holding_cost: float = 1.0
stockout_cost: float = 10.0

# Discount factor. 0.8 is used (rather than 0.9) for fast convergence.
gamma: float = 0.8

## CLASSICAL SIMPLE INVENTORY
# Build the finite simple-inventory MRP from the parameters defined above.
si_mrp = SimpleInventoryMRPFinite(
    capacity=capacity, poisson_lambda=poisson_lambda,
    holding_cost=holding_cost, stockout_cost=stockout_cost
)

# Value-function vector for the chosen discount factor; it is treated as the
# ground truth when the RMSE of the learned approximation is reported below.
true_vf = si_mrp.get_value_function_vec(gamma=gamma)
# Non-terminal states; `true_vf` is indexed in this same order further down.
all_states = si_mrp.non_terminal_states


# NOTE(review): the next two values are not read by the TD run in this cell;
# kept because other cells may read them.
mc_episode_length_tol: float = 1.0e-6
num_episodes: int = 10_000

# Passed as `episode_length` to td_prediction_learning_rate below.
td_episode_length: int = 100
# NOTE(review): the three schedule values below are not passed to anything in
# this cell (Adam is configured separately with its own learning rate).
initial_learning_rate: float = 0.03
half_life: float = 1_000.0
exponent: float = 0.5

#### FUNCTION APPROXIMATION WITH STATE INDICATOR FEATURES  ####
# Adam settings handed to the linear function approximation below.
ag = AdamGradient(
    learning_rate=0.05,
    decay1=0.9,
    decay2=0.999
)

# One indicator feature per non-terminal state: the k-th feature returns 1.0
# when the queried state's `.state` equals that of the k-th entry of
# `all_states`, and 0.0 otherwise.
# The `s=s` default argument binds each state when its lambda is created; a
# plain closure over the loop variable would make every feature test the
# last state only.
ffs: Sequence[Callable[[NonTerminal[InventoryState]], float]] = \
    [(lambda x, s=s: float(x.state == s.state)) for s in all_states]

# Linear approximation over the indicator features, trained by gradient steps
# (direct_solve=False). No regularization_coeff is passed, so the library
# default applies.
lfa = LinearFunctionApprox.create(
        feature_functions=ffs,
        adam_gradient=ag,
        direct_solve=False
)


# Iterator of successive value-function approximations returned by
# td_prediction_learning_rate, starting from `lfa` with start states drawn
# via Choose(all_states).
it_td: Iterable[ValueFunctionApprox[InventoryState]] = \
    td_prediction_learning_rate(
        mrp=si_mrp,
        start_state_distribution=Choose(all_states),
        gamma=gamma,
        episode_length=td_episode_length,
        initial_func_approx=lfa
    )


def _rmse_vs_true_vf(vf: ValueFunctionApprox[InventoryState]) -> float:
    """Return the root-mean-square error of `vf` against `true_vf`.

    `vf` is evaluated at every entry of `all_states` and compared with the
    entry of `true_vf` at the same position.
    """
    squared_errors = [
        (vf(state) - exact) ** 2 for state, exact in zip(all_states, true_vf)
    ]
    return np.sqrt(sum(squared_errors) / len(all_states))


td_experiences: int = 300000
for i, td_vf in enumerate(islice(it_td, td_experiences)):
    # The RMSE is only needed when it is printed, so skip the per-state
    # evaluations on the other iterations (the vast majority of the run).
    if i % 25000 == 0:
        td_rmse: float = _rmse_vs_true_vf(td_vf)
        print(f"TD: Iteration = {i:d}, RMSE = {td_rmse:.3f}")
# Leave `td_rmse` describing the final approximation.
td_rmse = _rmse_vs_true_vf(td_vf)

print(f"td_vf {td_vf.weights.weights}")
print(f"True VF : {true_vf}")

TD: Iteration = 0, RMSE = 15.431
TD: Iteration = 5000, RMSE = 0.413
TD: Iteration = 10000, RMSE = 0.757
TD: Iteration = 15000, RMSE = 0.312
TD: Iteration = 20000, RMSE = 0.748
TD: Iteration = 25000, RMSE = 0.124
TD: Iteration = 30000, RMSE = 0.688
TD: Iteration = 35000, RMSE = 0.414
TD: Iteration = 40000, RMSE = 0.518
TD: Iteration = 45000, RMSE = 0.521
TD: Iteration = 50000, RMSE = 0.611
TD: Iteration = 55000, RMSE = 0.360
TD: Iteration = 60000, RMSE = 0.123
TD: Iteration = 65000, RMSE = 0.257
TD: Iteration = 70000, RMSE = 0.234
TD: Iteration = 75000, RMSE = 0.418
TD: Iteration = 80000, RMSE = 0.423
TD: Iteration = 85000, RMSE = 0.556
TD: Iteration = 90000, RMSE = 0.294
TD: Iteration = 95000, RMSE = 0.391
TD: Iteration = 100000, RMSE = 0.352
TD: Iteration = 105000, RMSE = 0.339
TD: Iteration = 110000, RMSE = 0.423
TD: Iteration = 115000, RMSE = 0.610
TD: Iteration = 120000, RMSE = 0.403
TD: Iteration = 125000, RMSE = 0.510
TD: Iteration = 130000, RMSE = 0.665
TD: Iteration = 135000, R