In [1]:
from rl.monte_carlo import *
from rl.td import *
from rl.function_approx import Tabular
from rl.function_approx import learning_rate_schedule
from rl.distribution import Constant
import rl.iterate as iterate

In [2]:
# Prediction for estimating the value function
# Traces is an iterable of trace experiences
# A trace experience is an iterable of atomic experiences
# An atomic experience is a TransitionStep[S], which contains
# the current state, next state, and reward
def tabular_mc_prediction(
    traces: Iterable[Iterable[mp.TransitionStep[S]]],
    tab_0: Tabular[S],
    γ: float,
    episode_length_tolerance: float = 1e-6
) -> Iterator[Tabular[S]]:
    '''Tabular Monte Carlo prediction of a value function.

    Yields tab_0 unchanged first, and then one Tabular[S] per trace
    experience: the approximation obtained after every (state, return)
    pair of that trace has been pushed through the table's updates.
    A new table is only produced once a whole trace has been consumed,
    because the return of a state depends on the rewards that follow it.

    Arguments:
      traces -- iterable of trace experiences; each trace experience is
                itself an iterable of TransitionStep[S] atomic
                experiences (state, next state, reward)
      tab_0 -- initial tabular approximation of the value function
      γ -- discount factor, forwarded to `returns`
      episode_length_tolerance -- forwarded to `returns`; presumably the
                threshold under which discounted future rewards are
                ignored, so that a trace can be cut short when γ < 1
                (TODO confirm against rl.returns)
    '''
    # Obtain the trace iterator before the first yield so that a
    # non-iterable `traces` is reported on the very first request.
    trace_stream = iter(traces)

    current_vf = tab_0
    yield current_vf

    for trace in trace_stream:
        # `returns` turns the transition steps of one trace into return
        # steps; their `state` and `return_` fields form the targets.
        return_steps = returns(trace, γ, episode_length_tolerance)

        # One single-pair batch per step, i.e. one table update per
        # visited state occurrence.
        single_pair_batches = (
            [(return_step.state, return_step.return_)]
            for return_step in return_steps
        )

        # Only the table reached at the end of the trace is kept.
        current_vf = last(current_vf.iterate_updates(single_pair_batches))
        yield current_vf


In [3]:
# Tabular td does not take trace experiences as input,
# but just takes a single trace experience, which is an
# iterable of atomic experiences (aka TransitionSteps)

def tabular_td_prediction(
        transitions: Iterable[mp.TransitionStep[S]],
        approx_0: Tabular[S],
        γ: float
) -> Iterator[Tabular[S]]:
    '''Tabular TD(0) evaluation of an MRP from a stream of transitions.

    Every value produced by the returned iterator is the value function
    approximation after one more transition has been folded in.

    Arguments:
      transitions -- transitions sampled from an MRP; they need not be
                     in order or come from the same simulation
      approx_0 -- initial approximation of the value function
      γ -- discount rate (0 < γ ≤ 1)

    '''
    def td_update(
            current_vf: Tabular[S],
            observed: mp.TransitionStep[S]
    ) -> Tabular[S]:
        # Bootstrapped target: immediate reward plus the discounted
        # current estimate at the successor state. `extended_vf`
        # presumably evaluates to 0 for terminal successors -- confirm
        # in the rl library.
        td_target = observed.reward + \
            γ * extended_vf(current_vf, observed.next_state)
        return current_vf.update([(observed.state, td_target)])

    return iterate.accumulate(transitions, td_update, initial=approx_0)

In [4]:
from rl.chapter2.simple_inventory_mrp import *

In [5]:
# Parameters of the simple inventory MRP; each one is handed to the
# constructor keyword of the same name (minus the `user_` prefix).
user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0

# Discount factor shared by the reference value function and by the
# MC / TD estimates computed further down.
user_gamma = 0.9

si_mrp = SimpleInventoryMRPFinite(
    capacity=user_capacity,
    poisson_lambda=user_poisson_lambda,
    holding_cost=user_holding_cost,
    stockout_cost=user_stockout_cost,
)

In [6]:
# Reference ("correct") value function of the inventory MRP, against
# which the MC and TD estimates below are compared.

print("Value Function", "--------------", sep="\n")
si_mrp.display_value_function(gamma=user_gamma)
print()

Value Function
--------------
{NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -35.511,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.932,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -28.345,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.932,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -29.345,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -30.345}



In [7]:
# Monte Carlo Value Function:

# 1) Create a tabular value function approximation
# 2) Generate an iterable of trace experiences via simulation
# 3) Run these trace experiences through the update function of tabular_mc_prediction
# 4) Display the value function

# If we continue the inventory MRP long enough, we will fully explore
# the state-space, so there isn't a need to create different starting
# states: every trace starts from the empty-inventory state.
dist = Constant(
    NonTerminal(InventoryState(on_hand=0, on_order=0))
)
trace_experiences = si_mrp.reward_traces(dist)

# num = 0
# for trace in trace_experiences:
#     #print(trace)
#     #print(list(trace))
#     for atomic in trace:
#         print("printing atomic experiences")
#         print(atomic)
#         num += 1
#         if num > 5:
#             break
#     if num > 5:
#         break

In [8]:
# Start from a default-constructed table and build the (lazy) stream of
# MC value function approximations; nothing is simulated until it is
# iterated.
mc_approx = Tabular()
value_functs_mc = tabular_mc_prediction(
    traces=trace_experiences,
    tab_0=mc_approx,
    γ=user_gamma
)

In [9]:
# Pull approximations from the MC stream: the initial table plus up to
# `num_iterations` updated tables (num_iterations + 1 items in total).
num_iterations = 1000
processed_value_funcs_mc = []
v_num = 0
for v_num, i in enumerate(value_functs_mc, start=1):
    processed_value_funcs_mc.append(i)
    if v_num > num_iterations:
        break

In [10]:
# Show the last approximation collected from the MC stream.
processed_value_funcs_mc[-1]

Tabular(values_map={NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -35.43502647787239, NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -28.260178640684643, NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.872788886873348, NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -29.271489683011694, NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -30.279778587325566, NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.91741748237547}, counts_map={NonTerminal(state=InventoryState(on_hand=0, on_order=0)): 16080, NonTerminal(state=InventoryState(on_hand=0, on_order=2)): 15943, NonTerminal(state=InventoryState(on_hand=1, on_order=0)): 21234, NonTerminal(state=InventoryState(on_hand=1, on_order=1)): 20793, NonTerminal(state=InventoryState(on_hand=2, on_order=0)): 21189, NonTerminal(state=InventoryState(on_hand=0, on_order=1)): 35761}, count_to_weight_func=<function Tabular.<lambda>.<locals>.<lambda> at 0x7fe04a5825e0>)

## We see that tabular_mc_prediction's value function after 1000 trace experiences (`num_iterations = 1000`) is close to the true value function.

# TD Test

In [11]:
# TD consumes transitions rather than whole traces. Despite its name,
# `trace_experiences` here holds the result of `simulate_reward`
# (presumably one stream of transition steps -- confirm in the rl
# library), started from the empty-inventory state.
dist = Constant(
    NonTerminal(InventoryState(on_hand=0, on_order=0))
)
trace_experiences = si_mrp.simulate_reward(dist)

In [12]:
# Learning-rate schedule for the TD table. `learning_rate_schedule` is
# imported in the notebook's first cell together with the other rl
# imports, so all dependencies are visible in one place.
initial_learning_rate = 0.03
H = 1000  # passed as `half_life` below
exponent = 0.5

learning_rate_func = learning_rate_schedule(
    initial_learning_rate=initial_learning_rate,
    half_life=H,
    exponent=exponent
)

In [13]:
# Table whose update weights follow the learning-rate schedule, and the
# (lazy) stream of TD(0) approximations built on top of it.
td_approx = Tabular(count_to_weight_func=learning_rate_func)
value_functs_td = tabular_td_prediction(
    transitions=trace_experiences,
    approx_0=td_approx,
    γ=user_gamma
)

In [14]:
# Pull approximations from the TD stream, stopping once
# num_iterations + 1 items have been collected.
num_iterations = 1000
processed_value_funcs_td = []
v_num = 0
for v_num, i in enumerate(value_functs_td, start=1):
    processed_value_funcs_td.append(i)
    if v_num > num_iterations:
        break

In [15]:
# Show the last approximation collected from the TD stream.
processed_value_funcs_td[-1]

Tabular(values_map={NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -16.244694637938228, NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -10.103064269554238, NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -10.729857334857188, NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -10.295541254475278, NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -10.015064995266933, NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -12.187547454534348}, counts_map={NonTerminal(state=InventoryState(on_hand=0, on_order=0)): 125, NonTerminal(state=InventoryState(on_hand=0, on_order=2)): 125, NonTerminal(state=InventoryState(on_hand=1, on_order=0)): 152, NonTerminal(state=InventoryState(on_hand=0, on_order=1)): 264, NonTerminal(state=InventoryState(on_hand=1, on_order=1)): 151, NonTerminal(state=InventoryState(on_hand=2, on_order=0)): 183}, count_to_weight_func=<function learning_rate_schedule.<locals>.lr_func at 0x7fe04a635280>)

### 1000 transitions (one TD update each) don't appear to be enough for TD. Increasing the number of transitions to 100,000:

In [16]:
# Fresh transition stream and learning-rate schedule for the longer TD
# run; the settings are the same as in the first TD attempt.
dist = Constant(
    NonTerminal(InventoryState(on_hand=0, on_order=0))
)
trace_experiences = si_mrp.simulate_reward(dist)

initial_learning_rate = 0.03
H = 1000  # passed as `half_life` below
exponent = 0.5

learning_rate_func = learning_rate_schedule(
    initial_learning_rate=initial_learning_rate,
    half_life=H,
    exponent=exponent
)

In [17]:
# Rebuild the table and the TD(0) stream so the longer run starts from
# scratch rather than from the previous estimates.
td_approx = Tabular(count_to_weight_func=learning_rate_func)
value_functs_td = tabular_td_prediction(
    transitions=trace_experiences,
    approx_0=td_approx,
    γ=user_gamma
)

In [18]:
# Same collection loop as before, with a much larger budget:
# num_iterations + 1 items are pulled from the TD stream.
num_iterations = 100000
processed_value_funcs_td = []
v_num = 0
for v_num, i in enumerate(value_functs_td, start=1):
    processed_value_funcs_td.append(i)
    if v_num > num_iterations:
        break

In [19]:
# Show the last approximation collected from the longer TD run.
processed_value_funcs_td[-1]

Tabular(values_map={NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -35.513771716787154, NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -28.41429742597726, NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -29.077505773982626, NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -29.87785938383371, NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -30.466668400102723, NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.997003375166425}, counts_map={NonTerminal(state=InventoryState(on_hand=0, on_order=0)): 11610, NonTerminal(state=InventoryState(on_hand=0, on_order=2)): 11610, NonTerminal(state=InventoryState(on_hand=1, on_order=0)): 16285, NonTerminal(state=InventoryState(on_hand=1, on_order=1)): 16284, NonTerminal(state=InventoryState(on_hand=2, on_order=0)): 16142, NonTerminal(state=InventoryState(on_hand=0, on_order=1)): 28069}, count_to_weight_func=<function learning_rate_schedule.<locals>.lr_func at 0x7fe04a5820d0>)

### That does it. 

### Reminder: MC is unbiased but high variance. TD is biased, but much lower variance.