## Template Code from GitHub:

In [1]:
from typing import Sequence, Tuple, Mapping

S = str
DataType = Sequence[Sequence[Tuple[S, float]]]
ProbFunc = Mapping[S, Mapping[S, float]]
RewardFunc = Mapping[S, float]
ValueFunc = Mapping[S, float]

In [2]:
def get_state_return_samples(
    data: DataType
) -> Sequence[Tuple[S, float]]:
    """
    prepare sequence of (state, return) pairs.
    Note: (state, return) pairs is not same as (state, reward) pairs.
    """
    return [(s, sum(r for (_, r) in l[i:]))
            for l in data for i, (s, _) in enumerate(l)]

In [3]:
from rl.monte_carlo import *
from rl.markov_process import *
from rl.function_approx import Tabular
from itertools import islice

def get_mc_value_function(
    state_return_samples: Sequence[Tuple[S, float]]
) -> ValueFunc:
    """
    Implement tabular MC Value Function compatible with the interface defined above.
    """
    

    # Once we discect the mc_prediction() function in rl.monte_carlo
    # we see that we really only need each state and return, so creating TransitionStep objects 
    # is wholly unnecessary. Doing so would be wrapping the relevant data in an object, just to
    # be unwrapped again. Further, the mc_prediction function wants to calculate returns from rewards,
    # where here we only have returns directly and no access to individual rewards.
    def helper(approx_0, episode):
        f = approx_0
        f = last(f.iterate_updates(
            [(step[0], step[1])] for step in episode
        ))
        yield f
        
    num_traces = 60000
    it = helper(approx_0 = Tabular(),
               episode = state_return_samples)
    
    return last(islice(it, num_traces))

In [4]:
def get_state_reward_next_state_samples(
    data: DataType
) -> Sequence[Tuple[S, float, S]]:
    """
    prepare sequence of (state, reward, next_state) triples.
    """
    return [(s, r, l[i+1][0] if i < len(l) - 1 else 'T')
            for l in data for i, (s, r) in enumerate(l)]

In [5]:
from rl.markov_process import *

def get_probability_and_reward_functions(
    srs_samples: Sequence[Tuple[S, float, S]]
) -> Tuple[ProbFunc, RewardFunc]:
    """
    Implement code that produces the probability transitions and the
    reward function compatible with the interface defined above.
    """
    
    # Input is (State, Reward, Next_State)
    # ProbFunc = Mapping[S, Mapping[S, float]]
    # RewardFunc = Mapping[S, float]

    A_targets_map = {'A':0,'B':0,'T':0}
    B_targets_map = {'A':0,'B':0,'T':0}
    A_rewards = 0
    num_A = 0
    B_rewards = 0
    num_B = 0
    for samp in srs_samples:
        if samp[0] == 'A':
            A_targets_map[samp[2]] += 1
            A_rewards += samp[1]
            num_A += 1
        elif samp[0] == 'B':
            B_targets_map[samp[2]] += 1
            B_rewards += samp[1]
            num_B += 1
            
    
    A_total_transitions = sum(A_targets_map.values())
    for key in A_targets_map.keys():
        A_targets_map.update({key: A_targets_map[key] / A_total_transitions})
       
    B_total_transitions = sum(B_targets_map.values())
    for key in B_targets_map.keys():
        B_targets_map.update({key: B_targets_map[key] / B_total_transitions})
    
    my_ProbFunc = {'A' : A_targets_map,
                'B' : B_targets_map}
    
    my_RewardFunc = {'A': A_rewards / num_A,
                  'B': B_rewards / num_B}
    
    
    return (my_ProbFunc, my_RewardFunc)

In [6]:
def get_mrp_value_function(
    prob_func: ProbFunc,
    reward_func: RewardFunc
) -> ValueFunc:
    """
    Implement code that calculates the MRP Value Function from the probability
    transitions and reward function, compatible with the interface defined above.
    Hint: Use the MRP Bellman Equation and simple linear algebra
    """
    
   # See pg. 132 for Bellman Equation
    gamma = 1
    
    # Generate transition matrix
    mat = np.zeros((2,2))
    for i, s1 in enumerate(['A','B']):
        for j, s2 in enumerate(['A','B']):
            mat[i,j] = prob_func[s1][s2]
    
    # Expected reward for each state
    reward_function_vec = np.array([reward_func['A'], reward_func['B']])
    
    # Direct solve for the value function vector
    value_func_vec = np.linalg.solve(
        np.eye(2) -
        gamma * mat,
        reward_function_vec
    )
    
    # Turn value function vector into a dictionary
    return {'A': value_func_vec[0], 'B': value_func_vec[1]}

In [7]:
import rl.iterate as iterate

def get_td_value_function(
    srs_samples: Sequence[Tuple[S, float, S]],
    num_updates: int = 300000,
    learning_rate: float = 0.3,
    learning_rate_decay: int = 30
) -> ValueFunc:
    """
    Implement tabular TD(0) (with experience replay) Value Function compatible
    with the interface defined above. Let the step size (alpha) be:
    learning_rate * (updates / learning_rate_decay + 1) ** -0.5
    so that Robbins-Monro condition is satisfied for the sequence of step sizes.
    """

    # Returns a reward of 0 if Terminal
    def custom_extended_vf(vf, s):
        if s not in ['A', 'B']:
            return 0.0
        else:
            return vf(s)
        
    # Meat of the td prediction algorithm. Adapted from td_prediction 
    # in td.py
    # The first for loop generates experience-replayed groupings
    def custom_td(srs_samples, approx_0):
        # Generate a new sequence with replacement
        transitions = []
        for i in range(len(srs_samples)):
            rnum = np.random.randint(0,len(srs_samples))
            transitions.append(srs_samples[rnum])
        
        # Apply TD updates
        def step(v,transition):
            return v.update([(
                transition[0],
                transition[1] + 1 * custom_extended_vf(v, transition[2])
            )])
        return iterate.accumulate(transitions, step, initial=approx_0)
        
        
        
    # Custom learning rate schedule, specified above
    def alpha_schedule(learning_rate: float,learning_rate_decay: float
    ) -> Callable[[int], float]:
        def lr_func(n: int) -> float:
            return learning_rate * (n / learning_rate_decay + 1) ** -0.5
        return lr_func
    
    learning_rate_func = alpha_schedule(
        learning_rate=learning_rate,
        learning_rate_decay=learning_rate_decay)

    # Create Tabular value approximation object
    td_approx = Tabular(count_to_weight_func=learning_rate_func)
    
    # Generates an Iterator of value functions
    td_value_functs = custom_td(srs_samples, td_approx)
    
    # Now we need to iterate over our iterator of value functions
    v_num = 0
    processed_value_funcs_td = []
    for i in td_value_functs:
        processed_value_funcs_td += [i]
        v_num += 1
        if v_num > num_updates:
            break
            
    # Return the last value function (that which we have iterated/updated
    # the most
    return processed_value_funcs_td[-1]
    

In [57]:
from rl.function_approx import LinearFunctionApprox, Weights

def get_lstd_value_function(
    srs_samples: Sequence[Tuple[S, float, S]]
) -> ValueFunc:
    """
    Implement LSTD Value Function compatible with the interface defined above.
    Hint: Tabular is a special case of linear function approx where each feature
    is an indicator variables for a corresponding state and each parameter is
    the value function for the corresponding state.
    """
    γ = 1
    ε = 1e-4
    
    feature_functions = [lambda x: 1*(x == 'A'), lambda x: 1*(x == 'B')]
    
    num_features: int = len(feature_functions)
    a_inv: np.ndarray = np.eye(num_features) / ε
    b_vec: np.ndarray = np.zeros(num_features)
    for samp in srs_samples:
        phi1: np.ndarray = np.array([f(samp[0]) for f in feature_functions])
        if isinstance(samp[2], NonTerminal):
            phi2 = phi1 - γ * np.array([f(samp[2])
                                        for f in feature_functions])
        else:
            phi2 = phi1
        temp: np.ndarray = a_inv.T.dot(phi2)
        a_inv = a_inv - np.outer(a_inv.dot(phi1), temp) / (1 + phi1.dot(temp))
        b_vec += phi1 * samp[1]

    opt_wts: np.ndarray = a_inv.dot(b_vec)
    lstd_func = LinearFunctionApprox.create(
        feature_functions=feature_functions,
        weights=Weights.create(opt_wts)
    )
    
    return lstd_func.evaluate(['A','B'])

In [58]:
if __name__ == '__main__':
    given_data: DataType = [
        [('A', 2.), ('A', 6.), ('B', 1.), ('B', 2.)],
        [('A', 3.), ('B', 2.), ('A', 4.), ('B', 2.), ('B', 0.)],
        [('B', 3.), ('B', 6.), ('A', 1.), ('B', 1.)],
        [('A', 0.), ('B', 2.), ('A', 4.), ('B', 4.), ('B', 2.), ('B', 3.)],
        [('B', 8.), ('B', 2.)]
    ]

    sr_samps = get_state_return_samples(given_data)
    print(sr_samps)

    print("------------- MONTE CARLO VALUE FUNCTION --------------")
    print(get_mc_value_function(sr_samps))

    srs_samps = get_state_reward_next_state_samples(given_data)
    #print(srs_samps)
    
    pfunc, rfunc = get_probability_and_reward_functions(srs_samps)
    print("-------------- MRP VALUE FUNCTION ----------")
    print(get_mrp_value_function(pfunc, rfunc))

    print("------------- TD VALUE FUNCTION --------------")
    print(get_td_value_function(srs_samps))

    print("------------- LSTD VALUE FUNCTION --------------")
    print(get_lstd_value_function(srs_samps))

[('A', 11.0), ('A', 9.0), ('B', 3.0), ('B', 2.0), ('A', 11.0), ('B', 8.0), ('A', 6.0), ('B', 2.0), ('B', 0.0), ('B', 11.0), ('B', 8.0), ('A', 2.0), ('B', 1.0), ('A', 15.0), ('B', 15.0), ('A', 13.0), ('B', 9.0), ('B', 5.0), ('B', 3.0), ('B', 10.0), ('B', 2.0)]
------------- MONTE CARLO VALUE FUNCTION --------------
Tabular(values_map={'A': 9.571428571428571, 'B': 5.642857142857142}, counts_map={'A': 7, 'B': 14}, count_to_weight_func=<function Tabular.<lambda>.<locals>.<lambda> at 0x7fb0718bca60>)
-------------- MRP VALUE FUNCTION ----------
{'A': 12.933333333333332, 'B': 9.6}
------------- TD VALUE FUNCTION --------------
Tabular(values_map={'B': 2.3780547053967886, 'A': 4.990493337255367}, counts_map={'B': 16, 'A': 5}, count_to_weight_func=<function get_td_value_function.<locals>.alpha_schedule.<locals>.lr_func at 0x7fb0718bc550>)
------------- LSTD VALUE FUNCTION --------------
[2.85710204 2.71426633]


### They don't give the same value function. Now, for all of them A has a larger value than B, but their values are not the same. 

### Theoretically, the MRP value function should be exact since it does a direct liner algebra solve with the true rewards. MC doesnt use any bootstrapping, so should be unbiased. TD uses bootstrapping, so it will be biased, but it should be lower variance. LSTD is just a direct (gradient free) linear algebra solution for the batch TD solution. Again, MC and MRP should be unbiased where TD and LSTD will be biased. I wouldn't expect the bias to be quite this big, however. 