In [None]:
from rl.monte_carlo import *
from rl.function_approx import Tabular
from rl.distribution import Constant

In [None]:
# Prediction for estimating the value function
# Traces is an iterable of trace experiences
# A trace experience is an iterable of atomic experiences
# An atomic experience is a TransitionStep[S], which contains
# the current state, next state, and reward
def tabular_mc_prediction(
    traces: Iterable[Iterable[mp.TransitionStep[S]]],
    tab_0: Tabular[S],
    γ: float,
    episode_length_tolerance: float = 1e-6
) -> Iterator[Tabular[S]]:
    
    '''
    Returns: Produces as output an Iterator of
    Tabular[S], i.e. an updated function approximation
    of the Value function at the end of each trace 
    experience. The updates can only be done at the end
    of trace experiences because trace experience returns
    are only available at the end of trace experiences.
    '''
    
    episodes: Iterator[Iterator[mp.ReturnStep[S]]] = \
        (returns(trace, γ, episode_length_tolerance) for trace in traces)
    f = tab_0
    
    num = 0
    for trace in traces:
        print(trace)
        print(list(trace))
        num += 1
        if num > 10:
            break
    
    yield f
    
    
    # iterate_updates calls the method update of FunctionApprox
    # for a single (state, return) data point. 
    for episode in episodes:
        f = last(f.iterate_updates(
            [(step.state, step.return_)] for step in episode
        ))
        yield f


In [None]:
def td_prediction(
        transitions: Iterable[mp.TransitionStep[S]],
        approx_0: Tabular[S],
        γ: float
) -> Iterator[Tabular[S]]:
    '''Evaluate an MRP using TD(0) using the given sequence of
    transitions.

    Each value this function yields represents the approximated value
    function for the MRP after an additional transition.

    Arguments:
      transitions -- a sequence of transitions from an MRP which don't
                     have to be in order or from the same simulation
      approx_0 -- initial approximation of value function
      γ -- discount rate (0 < γ ≤ 1)

    '''
    def step(
            v: Tabular[S],
            transition: mp.TransitionStep[S]
    ) -> tabular[S]:
        return v.update([(
            transition.state,
            transition.reward + γ * extended_vf(v, transition.next_state)
        )])
    return iterate.accumulate(transitions, step, initial=approx_0)

In [None]:
from rl.chapter2.simple_inventory_mrp import *

In [None]:
user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0

user_gamma = 0.9

si_mrp = SimpleInventoryMRPFinite(
    capacity=user_capacity,
    poisson_lambda=user_poisson_lambda,
    holding_cost=user_holding_cost,
    stockout_cost=user_stockout_cost
)

In [None]:
# Correct Value Function:

print("Value Function")
print("--------------")
si_mrp.display_value_function(gamma=user_gamma)
print()

In [None]:
# Monte Carlo Value Function: 

# 1) Create a tabular value function approximation
# 2) Generate an iterable of trace experiences via simulation
# 3) Run thse trace experiences through the update function of tabular_mc_prediction
# 4) Display the value function


# Generate Trace Experiences:
#initial_distribution = Categorical({Inventory_State(a,b): 1
#                                   for a in range(user_capacity)
#                                   for b in range(user_capacity - a)})

# We will just use a single trace experience, so just a single starting point

initial_dist = Constant(InventoryState(0,1))
trace_experiences = si_mrp.reward_traces(initial_dist)

num = 1
for trace in trace_experiences:
    print(trace)
    print(list(trace))
    for atomic in trace:
        print("printing atomic experiences")
        print(atomic)
        num += 1
        if num > 1:
            break
    if num > 1:
        break

In [None]:
# I guess a single trace experience never actually ends...
# So we should only call 

tv_approx = Tabular()
value_functs = tabular_mc_prediction(trace_experiences, tv_approx, user_gamma)

In [None]:
for i in value_functs:
    print(i)

In [None]:
# Now call the last of the generator?

In [None]:
def infinite_sequence():
    num = 0
    while True:
        yield num
        num += 1

In [None]:
a = infinite_sequence()

In [None]:
a

In [3]:
from rl.chapter2.simple_inventory_mrp import *
from rl.distribution import Constant

In [2]:
user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0

user_gamma = 0.9

si_mrp = SimpleInventoryMRPFinite(
    capacity=user_capacity,
    poisson_lambda=user_poisson_lambda,
    holding_cost=user_holding_cost,
    stockout_cost=user_stockout_cost
)

from rl.markov_process import FiniteMarkovProcess
print("Transition Map")
print("--------------")
print(FiniteMarkovProcess(
    {s.state: Categorical({s1.state: p for s1, p in v.table().items()})
     for s, v in si_mrp.transition_map.items()}
))

print("Transition Reward Map")
print("---------------------")
print(si_mrp)

print("Stationary Distribution")
print("-----------------------")
si_mrp.display_stationary_distribution()
print()

print("Reward Function")
print("---------------")
si_mrp.display_reward_function()
print()

print("Value Function")
print("--------------")
si_mrp.display_value_function(gamma=user_gamma)
print()

Transition Map
--------------
From State InventoryState(on_hand=0, on_order=0):
  To State InventoryState(on_hand=0, on_order=2) with Probability 1.000
From State InventoryState(on_hand=0, on_order=1):
  To State InventoryState(on_hand=1, on_order=1) with Probability 0.368
  To State InventoryState(on_hand=0, on_order=1) with Probability 0.632
From State InventoryState(on_hand=0, on_order=2):
  To State InventoryState(on_hand=2, on_order=0) with Probability 0.368
  To State InventoryState(on_hand=1, on_order=0) with Probability 0.368
  To State InventoryState(on_hand=0, on_order=0) with Probability 0.264
From State InventoryState(on_hand=1, on_order=0):
  To State InventoryState(on_hand=1, on_order=1) with Probability 0.368
  To State InventoryState(on_hand=0, on_order=1) with Probability 0.632
From State InventoryState(on_hand=1, on_order=1):
  To State InventoryState(on_hand=2, on_order=0) with Probability 0.368
  To State InventoryState(on_hand=1, on_order=0) with Probability 0.368


In [20]:
dist = Constant(NonTerminal(InventoryState(0,0)))

a = si_mrp.simulate_reward(dist)

In [21]:
a

<generator object MarkovRewardProcess.simulate_reward at 0x7fd148088c10>

In [17]:
num = 0
for step in a:
    print("printing")
    print(step)
    num += 1
    if num > 10:
        break

In [23]:
list(a)

KeyboardInterrupt: 