## Template Code from GitHub:

In [1]:
from typing import Sequence, Tuple, Mapping

S = str
DataType = Sequence[Sequence[Tuple[S, float]]]
ProbFunc = Mapping[S, Mapping[S, float]]
RewardFunc = Mapping[S, float]
ValueFunc = Mapping[S, float]

In [2]:
# DONE
def get_state_return_samples(
    data: DataType
) -> Sequence[Tuple[S, float]]:
    """
    prepare sequence of (state, return) pairs.
    Note: (state, return) pairs is not same as (state, reward) pairs.
    """
    return [(s, sum(r for (_, r) in l[i:]))
            for l in data for i, (s, _) in enumerate(l)]

In [12]:
# DONE:
from rl.monte_carlo import *
from rl.markov_process import *
from rl.function_approx import Tabular
from itertools import islice

def get_mc_value_function(
    state_return_samples: Sequence[Tuple[S, float]]
) -> ValueFunc:
    """
    Implement tabular MC Value Function compatible with the interface defined above.
    """
    
#     Generate list of ReturnStep objects
#     list_RS = []
#     for (i, tup) in enumerate(state_return_samples):
#         if i < len(state_return_samples) - 1:
#             current_state = tup[0]
#             return_ = tup[1]
#             next_state = state_return_samples[i+1][0]
#             list_RS.append(ReturnStep(state=NonTerminal(current_state), 
#                                       next_state = next_state, 
#                                       reward = None,
#                                       return_ = return_))
    
#     it: Iterator[ValueFunc] = mc_prediction(
#         traces = [list_RS],
#         approx_0 = Tabular(),
#         γ=1,
#         episode_length_tolerance=1e-6
#     )
    
#     num_traces = 60000
    
#     return last(islice(it, num_traces))


    # The above is not necessary. Once we discent the mc_prediction() function in rl.monte_carlo
    # we see that we really only need each state and return, so creating TransitionStep objects 
    # is wholly unnecessary. Doing so would be wrapping the relevant data in an object, just to
    # be unwrapped again. Further, the mc_prediction function wants to calculate returns from rewards,
    # where we only have returns directly and no access to individual rewards.
    def helper(approx_0, episode):
        f = approx_0
        f = last(f.iterate_updates(
            [(step[0], step[1])] for step in episode
        ))
        yield f
        
    num_traces = 60000
    it = helper(approx_0 = Tabular(),
               episode = state_return_samples)
    
    return last(islice(it, num_traces))
        

In [15]:
# DONE
def get_state_reward_next_state_samples(
    data: DataType
) -> Sequence[Tuple[S, float, S]]:
    """
    prepare sequence of (state, reward, next_state) triples.
    """
    return [(s, r, l[i+1][0] if i < len(l) - 1 else 'T')
            for l in data for i, (s, r) in enumerate(l)]

In [24]:
# TO DO:
from rl.markov_process import *

def get_probability_and_reward_functions(
    srs_samples: Sequence[Tuple[S, float, S]]
) -> Tuple[ProbFunc, RewardFunc]:
    """
    Implement code that produces the probability transitions and the
    reward function compatible with the interface defined above.
    """
    
    # Input is (State, Reward, Next_State)
    # ProbFunc = Mapping[S, Mapping[S, float]]
    # RewardFunc = Mapping[S, float]

    A_targets_map = {'A':0,'B':0,'T':0}
    B_targets_map = {'A':0,'B':0,'T':0}
    A_rewards = 0
    num_A = 0
    B_rewards = 0
    num_B = 0
    for samp in srs_samples:
        if samp[0] == 'A':
            A_targets_map[samp[2]] += 1
            A_rewards += samp[1]
            num_A += 1
        elif samp[0] == 'B':
            B_targets_map[samp[2]] += 1
            B_rewards += samp[1]
            num_B += 1
            
    
    A_total_transitions = sum(A_targets_map.values())
    for key in A_targets_map.keys():
        A_targets_map.update({key: A_targets_map[key] / A_total_transitions})
       
    B_total_transitions = sum(B_targets_map.values())
    for key in B_targets_map.keys():
        B_targets_map.update({key: B_targets_map[key] / B_total_transitions})
    
    my_ProbFunc = {'A' : A_targets_map,
                'B' : B_targets_map}
    
    my_RewardFunc = {'A': A_rewards / num_A,
                  'B': B_rewards / num_B}
    
    
    return (my_ProbFunc, my_RewardFunc)

In [30]:
# DONE
from rl.dynamic_programming import *
def get_mrp_value_function(
    prob_func: ProbFunc,
    reward_func: RewardFunc
) -> ValueFunc:
    """
    Implement code that calculates the MRP Value Function from the probability
    transitions and reward function, compatible with the interface defined above.
    Hint: Use the MRP Bellman Equation and simple linear algebra
    """
    
   # See pg. 132 
    gamma = 1
    
    # Generate transition matrix
    mat = np.zeros((2,2))
    for i, s1 in enumerate(['A','B']):
        for j, s2 in enumerate(['A','B']):
            mat[i,j] = prob_func[s1][s2]
    
    # Expected reward for each state
    reward_function_vec = np.array([reward_func['A'], reward_func['B']])
    
    # Direct solve for the value function vector
    value_func_vec = np.linalg.solve(
        np.eye(2) -
        gamma * mat,
        reward_function_vec
    )
    
    # Turn value function vector into a dictionary
    return {'A': value_func_vec[0], 'B': value_func_vec[1]}

In [None]:
# TO DO:
def get_td_value_function(
    srs_samples: Sequence[Tuple[S, float, S]],
    num_updates: int = 300000,
    learning_rate: float = 0.3,
    learning_rate_decay: int = 30
) -> ValueFunc:
    """
    Implement tabular TD(0) (with experience replay) Value Function compatible
    with the interface defined above. Let the step size (alpha) be:
    learning_rate * (updates / learning_rate_decay + 1) ** -0.5
    so that Robbins-Monro condition is satisfied for the sequence of step sizes.
    """
    
    

In [None]:
# TO DO:
def get_lstd_value_function(
    srs_samples: Sequence[Tuple[S, float, S]]
) -> ValueFunc:
    """
    Implement LSTD Value Function compatible with the interface defined above.
    Hint: Tabular is a special case of linear function approx where each feature
    is an indicator variables for a corresponding state and each parameter is
    the value function for the corresponding state.
    """

In [31]:
if __name__ == '__main__':
    given_data: DataType = [
        [('A', 2.), ('A', 6.), ('B', 1.), ('B', 2.)],
        [('A', 3.), ('B', 2.), ('A', 4.), ('B', 2.), ('B', 0.)],
        [('B', 3.), ('B', 6.), ('A', 1.), ('B', 1.)],
        [('A', 0.), ('B', 2.), ('A', 4.), ('B', 4.), ('B', 2.), ('B', 3.)],
        [('B', 8.), ('B', 2.)]
    ]

    sr_samps = get_state_return_samples(given_data)
    print(sr_samps)

    print("------------- MONTE CARLO VALUE FUNCTION --------------")
    print(get_mc_value_function(sr_samps))

    srs_samps = get_state_reward_next_state_samples(given_data)
    print(srs_samps)
    
    pfunc, rfunc = get_probability_and_reward_functions(srs_samps)
    print("-------------- MRP VALUE FUNCTION ----------")
    print(get_mrp_value_function(pfunc, rfunc))

    print("------------- TD VALUE FUNCTION --------------")
    print(get_td_value_function(srs_samps))

    print("------------- LSTD VALUE FUNCTION --------------")
    print(get_lstd_value_function(srs_samps))

[('A', 11.0), ('A', 9.0), ('B', 3.0), ('B', 2.0), ('A', 11.0), ('B', 8.0), ('A', 6.0), ('B', 2.0), ('B', 0.0), ('B', 11.0), ('B', 8.0), ('A', 2.0), ('B', 1.0), ('A', 15.0), ('B', 15.0), ('A', 13.0), ('B', 9.0), ('B', 5.0), ('B', 3.0), ('B', 10.0), ('B', 2.0)]
------------- MONTE CARLO VALUE FUNCTION --------------
Tabular(values_map={'A': 9.571428571428571, 'B': 5.642857142857142}, counts_map={'A': 7, 'B': 14}, count_to_weight_func=<function Tabular.<lambda>.<locals>.<lambda> at 0x7fa6080b41f0>)
[('A', 2.0, 'A'), ('A', 6.0, 'B'), ('B', 1.0, 'B'), ('B', 2.0, 'T'), ('A', 3.0, 'B'), ('B', 2.0, 'A'), ('A', 4.0, 'B'), ('B', 2.0, 'B'), ('B', 0.0, 'T'), ('B', 3.0, 'B'), ('B', 6.0, 'A'), ('A', 1.0, 'B'), ('B', 1.0, 'T'), ('A', 0.0, 'B'), ('B', 2.0, 'A'), ('A', 4.0, 'B'), ('B', 4.0, 'B'), ('B', 2.0, 'B'), ('B', 3.0, 'T'), ('B', 8.0, 'B'), ('B', 2.0, 'T')]
-------------- MRP VALUE FUNCTION ----------
{'A': 12.933333333333332, 'B': 9.6}
------------- TD VALUE FUNCTION --------------


NameError: name 'get_td_value_function' is not defined