In [9]:
from typing import Sequence, Tuple, Mapping
from typing import Sequence, Iterable, Callable
from rl.function_approx import AdamGradient
from rl.function_approx import LinearFunctionApprox
from rl.approximate_dynamic_programming import ValueFunctionApprox
from rl.distribution import Choose
from rl.markov_decision_process import NonTerminal
from rl.chapter2.simple_inventory_mrp import SimpleInventoryMRPFinite
from rl.chapter2.simple_inventory_mrp import InventoryState
from rl.chapter10.prediction_utils import (
    mc_prediction_learning_rate,
    td_prediction_learning_rate
)
import numpy as np
from itertools import islice
from rl.policy import *
from rl.distribution import (Categorical, Distribution, FiniteDistribution,
                             SampledDistribution)
from rl.markov_process import *


S = str
DataType = Sequence[Sequence[Tuple[S, float]]]
ProbFunc = Mapping[S, Mapping[S, float]]
RewardFunc = Mapping[S, float]
ValueFunc = Mapping[S, float]


def get_state_return_samples(
    data: DataType
) -> Sequence[Tuple[S, float]]:
    """
    prepare sequence of (state, return) pairs.
    Note: (state, return) pairs is not same as (state, reward) pairs.
    """
    return [(s, sum(r for (_, r) in l[i:]))
            for l in data for i, (s, _) in enumerate(l)]


def get_mc_value_function(
    state_return_samples: Sequence[Tuple[S, float]]
) -> ValueFunc:
    """
    Implement tabular MC Value Function compatible with the interface defined above.
    """
    ###### MY IMPLEMENTATION #######
    ################################
    all_states = set(x[0] for x in state_return_samples)
    appearances : Mapping={}
    summations: Mapping={}
    for state in all_states:
        appearances[state]=0
        summations[state]=0
    for state,ret in state_return_samples:
        appearances[state] += 1
        summations[state] += ret
    value_func:Mapping={}
    for state in all_states:
        value_func[state] = summations[state]/appearances[state]
    return value_func

def get_state_reward_next_state_samples(
    data: DataType
) -> Sequence[Tuple[S, float, S]]:
    """
    prepare sequence of (state, reward, next_state) triples.
    """
    return [(s, r, l[i+1][0] if i < len(l) - 1 else 'T')
            for l in data for i, (s, r) in enumerate(l)]


def get_probability_and_reward_functions(
    srs_samples: Sequence[Tuple[S, float, S]]
) -> Tuple[ProbFunc, RewardFunc]:
    """
    Implement code that produces the probability transitions and the
    reward function compatible with the interface defined above.
    """
    all_states = set(x[0] for x in srs_samples).union(set(x[2] for x in srs_samples))
    appearance:Mapping={}
    for state in all_states:
        appearance[state] = 0
    for x in srs_samples:
        appearance[x[0]] += 1
    
    prob:Mapping[S, Mapping[S, float]] = {}
    for state1 in all_states:
        prob[state1] = {}
        for state2 in all_states:
            prob[state1][state2] = 0
    for x in srs_samples:
        prob[x[0]][x[2]] += 1/appearance[x[0]]

    rew:Mapping[S, float] = {}
    for state in all_states:
        rew[state]=0
    for x in srs_samples:
        rew[x[0]] += x[1] / appearance[x[0]]
    return prob,rew 


def get_mrp_value_function(
    prob_func: ProbFunc,
    reward_func: RewardFunc
) -> ValueFunc:
    """
    Implement code that calculates the MRP Value Function from the probability
    transitions and reward function, compatible with the interface defined above.
    Hint: Use the MRP Bellman Equation and simple linear algebra
    """
    all_states = set(x[0] for x in reward_func.keys())
    all_states = list(all_states)
    n_state = len(all_states)
    P = np.zeros((n_state,n_state))
    R = np.zeros((n_state))
    for i,state in enumerate(all_states):
        P[i]
    for i in range(n_state):
        R[i] = reward_func[all_states[i]]
        for j in range(n_state):
            P[i,j] = prob_func[all_states[i]][all_states[j]]
    vstar = np.linalg.inv(np.eye(n_state)-P)@R
    vf: Mapping[S,float] = {}
    for i in range(n_state):
        vf[all_states[i]] = vstar[i]
    return vf



def get_td_value_function(
    srs_samples: Sequence[Tuple[S, float, S]],
    num_updates: int = 300000,
    learning_rate: float = 0.3,
    learning_rate_decay: int = 30
) -> ValueFunc:
    """
    Implement tabular TD(0) (with experience replay) Value Function compatible
    with the interface defined above. Let the step size (alpha) be:
    learning_rate * (updates / learning_rate_decay + 1) ** -0.5
    so that Robbins-Monro condition is satisfied for the sequence of step sizes.
    """


    ag = AdamGradient(
    learning_rate=0.05,
    decay1=0.9,
    decay2=0.999
    )
    all_state_names = set(x[0] for x in srs_samples)
    
    # ffs: Sequence[Callable[[NonTerminal[InventoryState]], float]] = \
    # [(lambda x, s=s: float(x.state == s.state)) for s in all_states]

    appearance={}
    for state in all_state_names:
        appearance[state] = 0
    for x in srs_samples:
        appearance[x[0]] += 1

    d={}
    for state in all_state_names:
        d[state]={}
    for x in srs_samples:
        d[x[0]][(x[2],x[1])]=1/appearance[x[0]]

    for state in all_state_names:
        d[state] = Categorical(d[state])
    # print(d)
    mrp=FiniteMarkovRewardProcess(d)
    # print(mrp.get_value_function_vec(gamma=1))
    # print(mrp.transition_map)
    td_episode_length = 1000
    gamma = 1
    all_states = mrp.non_terminal_states
    # print(all_states)
    ffs: Sequence[Callable[[NonTerminal[S]], float]] = \
    [(lambda x, s=s: float(x.state == s.state)) for s in all_states]
    
    lfa = LinearFunctionApprox.create(
        feature_functions=ffs,
        adam_gradient=ag,
        regularization_coeff=0.0001,
        direct_solve=False
)
    lfa = LinearFunctionApprox.create(
            feature_functions=ffs,
            adam_gradient=ag,
            direct_solve=False
    )
    it_td: Iterable[ValueFunctionApprox[S]] = \
        td_prediction_learning_rate(
            mrp=mrp,
            start_state_distribution=Choose(all_states),
            gamma=gamma,
            episode_length=td_episode_length,
            initial_func_approx=lfa
        )

    

    td_experiences: int = 5000
    for i, td_vf in enumerate(islice(it_td, td_experiences)):
        zzz=3
        # td_rmse: float = np.sqrt(sum(
        #     (td_vf(s) - true_vf[i]) ** 2 for i, s in enumerate(all_states)
        # ) / len(all_states))
        # if i%2000==0:
        #     print(f"TD: Iteration = {i:d}")
            # print(f"TD: Iteration = {i:d}, RMSE = {td_rmse:.3f}")
    # print(td_vf)
    print(f"td_vf {np.flip(td_vf.weights.weights)}")
    # print(f"True VF : {true_vf}")


def get_lstd_value_function(
    srs_samples: Sequence[Tuple[S, float, S]]
) -> ValueFunc:
    """
    Implement LSTD Value Function compatible with the interface defined above.
    Hint: Tabular is a special case of linear function approx where each feature
    is an indicator variables for a corresponding state and each parameter is
    the value function for the corresponding state.
    """


if __name__ == '__main__':
    given_data: DataType = [
        [('A', 2.), ('A', 6.), ('B', 1.), ('B', 2.)],
        [('A', 3.), ('B', 2.), ('A', 4.), ('B', 2.), ('B', 0.)],
        [('B', 3.), ('B', 6.), ('A', 1.), ('B', 1.)],
        [('A', 0.), ('B', 2.), ('A', 4.), ('B', 4.), ('B', 2.), ('B', 3.)],
        [('B', 8.), ('B', 2.)]
    ]

    sr_samps = get_state_return_samples(given_data)

    print("------------- MONTE CARLO VALUE FUNCTION --------------")
    print(get_mc_value_function(sr_samps))

    srs_samps = get_state_reward_next_state_samples(given_data)

    pfunc, rfunc = get_probability_and_reward_functions(srs_samps)
    print("-------------- MRP VALUE FUNCTION ----------")
    print(get_mrp_value_function(pfunc, rfunc))

    print("------------- TD VALUE FUNCTION --------------")
    # print(get_td_value_function(srs_samps))
    (get_td_value_function(srs_samps))


    ##### I COULD NOT FINISH THE LSTD PART
    #  
    # print("------------- LSTD VALUE FUNCTION --------------")
    # print(get_lstd_value_function(srs_samps))

------------- MONTE CARLO VALUE FUNCTION --------------
{'B': 5.642857142857143, 'A': 9.571428571428571}
-------------- MRP VALUE FUNCTION ----------
{'A': 12.933333333333323, 'T': 0.0, 'B': 9.599999999999994}
------------- TD VALUE FUNCTION --------------
td_vf [13.72070364 10.69307161]


In [31]:
print(td_vf)

NameError: name 'td_vf' is not defined