In [1]:
from __future__ import annotations
from abc import ABC, abstractmethod
from collections import defaultdict
from dataclasses import dataclass
# import graphviz
import numpy as np
from pprint import pprint
from typing import (Callable, Dict, Iterable, Generic, Sequence, Tuple,
                    Mapping, TypeVar, Set)
from rl.policy import *
from rl.distribution import (Categorical, Distribution, FiniteDistribution,
                             SampledDistribution)

S = TypeVar('S')
X = TypeVar('X')

class State(ABC, Generic[S]):
    '''Base class for a wrapped state value of type S.

    The concrete subclasses in this file are `Terminal` and `NonTerminal`;
    both carry the raw value in the `state` attribute.
    '''
    state: S

    def on_non_terminal(
        self,
        f: Callable[[NonTerminal[S]], X],
        default: X
    ) -> X:
        '''Apply `f` to this state when it is a `NonTerminal`.

        Args:
            f: function called with `self` if `self` is a `NonTerminal`.
            default: value returned untouched for any other kind of state.

        Returns:
            `f(self)` for a non-terminal state, otherwise `default`.
        '''
        return f(self) if isinstance(self, NonTerminal) else default

    
@dataclass(frozen=True)
class Terminal(State[S]):
    '''Immutable wrapper marking the raw value `state` as a terminal state.

    Simulation loops in this file stop once they reach an object that is not
    a `NonTerminal`, which includes instances of this class.
    '''
    # Raw (unwrapped) state value.
    state: S


@dataclass(frozen=True)
class NonTerminal(State[S]):
    '''Immutable wrapper marking the raw value `state` as a non-terminal state.

    Being a frozen dataclass it is compared by value, and the file uses
    instances as dictionary keys (e.g. in transition maps).
    '''
    # Raw (unwrapped) state value.
    state: S


In [2]:
class MarkovProcess(ABC, Generic[S]):
    '''Abstract Markov process whose raw states have type S.

    Subclasses supply `transition`; sampling of traces is implemented here
    on top of it.
    '''
    @abstractmethod
    def transition(self, state: NonTerminal[S]) -> Distribution[State[S]]:
        '''Return the distribution of next states reachable from `state`.

        Only non-terminal states are passed in.  The end of a trace is
        signalled by the sampled next state being something other than a
        `NonTerminal` (see `simulate`), not by a special return value.
        '''

    def simulate(
        self,
        start_state_distribution: Distribution[NonTerminal[S]]
    ) -> Iterable[State[S]]:
        '''Generate one simulation trace of this process.

        The first state yielded is sampled from `start_state_distribution`.
        Each following state is sampled from `transition` of the previous
        one.  The generator finishes right after yielding a state that is
        not a `NonTerminal`; if that never happens it runs forever.
        '''
        current: State[S] = start_state_distribution.sample()
        while True:
            yield current
            if not isinstance(current, NonTerminal):
                # The terminal state has been emitted; the trace is over.
                return
            current = self.transition(current).sample()

    def traces(
            self,
            start_state_distribution: Distribution[NonTerminal[S]]
    ) -> Iterable[Iterable[State[S]]]:
        '''Yield an endless stream of independent traces.

        Every item is a fresh `simulate` generator, so each trace samples
        its own start state from `start_state_distribution`.
        '''
        while True:
            one_trace = self.simulate(start_state_distribution)
            yield one_trace


# Type alias: for every non-terminal state, the finite distribution over the
# (terminal or non-terminal) states that can follow it.
Transition = Mapping[NonTerminal[S], FiniteDistribution[State[S]]]


class FiniteMarkovProcess(MarkovProcess[S]):
    '''A Markov Process with a finite state space.

    Having a finite state space lets us use tabular methods to work
    with the process (ie dynamic programming).

    Attributes:
        non_terminal_states: the wrapped keys of `transition_map`, in
            insertion order; this order fixes the row/column order of the
            transition matrix.
        transition_map: for each non-terminal state, the distribution over
            wrapped next states.
    '''

    non_terminal_states: Sequence[NonTerminal[S]]
    transition_map: Transition[S]

    def __init__(self, transition_map: Mapping[S, FiniteDistribution[S]]):
        '''Build the process from a map over raw (unwrapped) states.

        Args:
            transition_map: raw state -> distribution over raw next states.
                A raw state is treated as non-terminal exactly when it is a
                key of this mapping; every other next state is wrapped as
                `Terminal`.
        '''
        non_terminals: Set[S] = set(transition_map.keys())
        self.transition_map = {
            NonTerminal(s): Categorical(
                {(NonTerminal(s1) if s1 in non_terminals else Terminal(s1)): p
                 for s1, p in v.table().items()}
            ) for s, v in transition_map.items()
        }
        self.non_terminal_states = list(self.transition_map.keys())

    def __repr__(self) -> str:
        '''Multi-line listing of every transition and its probability.'''
        display = ""

        for s, d in self.transition_map.items():
            display += f"From State {s.state}:\n"
            for s1, p in d:
                opt = "Terminal " if isinstance(s1, Terminal) else ""
                display += f"  To {opt}State {s1.state} with Probability {p:.3f}\n"

        return display

    def get_transition_matrix(self) -> np.ndarray:
        '''Return the square matrix of transition probabilities.

        Rows and columns both follow `non_terminal_states`; entry (i, j) is
        the probability of moving from state i to state j.  Transitions into
        terminal states have no column, so a row may sum to less than 1.
        '''
        sz = len(self.non_terminal_states)
        mat = np.zeros((sz, sz))

        for i, s1 in enumerate(self.non_terminal_states):
            # Look the row distribution up once instead of once per column.
            row = self.transition(s1)
            for j, s2 in enumerate(self.non_terminal_states):
                mat[i, j] = row.probability(s2)
        return mat

    def transition(self, state: NonTerminal[S])\
            -> FiniteDistribution[State[S]]:
        '''Return the stored next-state distribution for `state`.

        Raises:
            KeyError: if `state` is not a key of `transition_map`.
        '''
        return self.transition_map[state]

    def get_stationary_distribution(self) -> FiniteDistribution[S]:
        '''Return the stationary distribution over raw non-terminal states.

        The distribution is the (real part of the) eigenvector of the
        transposed transition matrix for the first eigenvalue within 1e-8
        of 1, rescaled so that its entries sum to 1.

        Raises:
            IndexError: if no eigenvalue is within 1e-8 of 1.
        '''
        eig_vals, eig_vecs = np.linalg.eig(self.get_transition_matrix().T)
        index_of_first_unit_eig_val = np.where(
            np.abs(eig_vals - 1) < 1e-8)[0][0]
        eig_vec_of_unit_eig_val = np.real(
            eig_vecs[:, index_of_first_unit_eig_val])
        return Categorical({
            self.non_terminal_states[i].state: ev
            for i, ev in enumerate(eig_vec_of_unit_eig_val /
                                   sum(eig_vec_of_unit_eig_val))
        })

    def display_stationary_distribution(self):
        '''Pretty-print the stationary distribution, rounded to 3 decimals.'''
        pprint({
            s: round(p, 3)
            for s, p in self.get_stationary_distribution()
        })

    def generate_image(self) -> 'graphviz.Digraph':
        '''Return a graphviz digraph of the process.

        One node is declared per non-terminal state and one edge per
        transition, labelled with its probability.

        Raises:
            ImportError: if the optional `graphviz` package is unavailable.
        '''
        # The module-level `import graphviz` is commented out in this
        # notebook, so the name is imported here, where it is actually used;
        # otherwise this method fails with a NameError.
        import graphviz

        d = graphviz.Digraph()

        for s in self.transition_map.keys():
            d.node(str(s))

        for s, v in self.transition_map.items():
            for s1, p in v:
                d.edge(str(s), str(s1), label=str(p))

        return d


In [3]:
from __future__ import annotations

from abc import ABC, abstractmethod
from collections import defaultdict
from dataclasses import dataclass
from typing import (DefaultDict, Dict, Iterable, Generic, Mapping,
                    Tuple, Sequence, TypeVar, Set)

from rl.distribution import (Categorical, Distribution, FiniteDistribution)

from rl.markov_process import (
    FiniteMarkovRewardProcess, MarkovRewardProcess, StateReward, State,
    NonTerminal, Terminal)
from rl.policy import FinitePolicy, Policy

A = TypeVar('A')
S = TypeVar('S')


@dataclass(frozen=True)
class TransitionStep(Generic[S, A]):
    '''One step of an MDP simulation.

    Fields:
    state -- non-terminal state the step starts from
    action -- action taken in that state
    next_state -- state reached after taking the action
    reward -- instantaneous reward received for this transition
    '''
    state: NonTerminal[S]
    action: A
    next_state: State[S]
    reward: float

    def add_return(self, γ: float, return_: float) -> ReturnStep[S, A]:
        '''Build a `ReturnStep` carrying the return of `state`.

        Args:
            γ: discount factor applied to the return of `next_state`.
            return_: return already computed for `next_state`.

        Returns:
            A `ReturnStep` with the same four fields as this step and
            `return_` set to `reward + γ * return_`.
        '''
        discounted_total = self.reward + γ * return_
        return ReturnStep(
            state=self.state,
            action=self.action,
            next_state=self.next_state,
            reward=self.reward,
            return_=discounted_total
        )


@dataclass(frozen=True)
class ReturnStep(TransitionStep[S, A]):
    '''A TransitionStep that also contains the total *return* for its
    starting state.

    Instances are produced by `TransitionStep.add_return`, which sets
    `return_` to the step's reward plus the discounted return of the next
    state.
    '''
    # Return accumulated from `state` onwards.
    return_: float


class MarkovDecisionProcess(ABC, Generic[S, A]):
    '''Abstract Markov decision process over raw states S and actions A.

    Subclasses supply `actions` and `step`; policy application and
    simulation are implemented here on top of them.
    '''

    def apply_policy(self, policy: Policy[S, A]) -> MarkovRewardProcess[S]:
        '''Return the reward process obtained by following `policy`.

        The returned process draws an action from `policy.act(state)` and
        maps it through this MDP's `step` via the distribution's `apply`.
        '''
        outer = self

        class RewardProcess(MarkovRewardProcess[S]):
            def transition_reward(
                self,
                state: NonTerminal[S]
            ) -> Distribution[Tuple[State[S], float]]:
                def outcome_of(a):
                    # Next-state/reward distribution for one concrete action.
                    return outer.step(state, a)

                chosen: Distribution[A] = policy.act(state)
                return chosen.apply(outcome_of)

        return RewardProcess()

    @abstractmethod
    def actions(self, state: NonTerminal[S]) -> Iterable[A]:
        '''Return the actions available in `state`.'''

    @abstractmethod
    def step(
        self,
        state: NonTerminal[S],
        action: A
    ) -> Distribution[Tuple[State[S], float]]:
        '''Return the distribution of (next state, reward) pairs that
        results from taking `action` in `state`.'''

    def simulate_actions(
            self,
            start_states: Distribution[NonTerminal[S]],
            policy: Policy[S, A]
    ) -> Iterable[TransitionStep[S, A]]:
        '''Simulate this MDP under `policy`, yielding one `TransitionStep`
        (state, action, next state, reward) per transition.

        The start state is sampled from `start_states`.  The generator ends
        once the current state is no longer a `NonTerminal`; it yields
        nothing if the sampled start state is already terminal.
        '''
        current: State[S] = start_states.sample()

        while isinstance(current, NonTerminal):
            chosen_action = policy.act(current).sample()
            successor, reward = self.step(current, chosen_action).sample()
            yield TransitionStep(
                state=current,
                action=chosen_action,
                next_state=successor,
                reward=reward
            )
            current = successor

    def action_traces(
            self,
            start_states: Distribution[NonTerminal[S]],
            policy: Policy[S, A]
    ) -> Iterable[Iterable[TransitionStep[S, A]]]:
        '''Yield an endless stream of traces, each a fresh
        `simulate_actions` generator for the same arguments.
        '''
        while True:
            one_trace = self.simulate_actions(start_states, policy)
            yield one_trace


# Type alias: for one state, each available action mapped to its outcome
# distribution.  `StateReward` is imported from rl.markov_process; in this
# file it is used as a finite distribution over (next state, reward) pairs.
ActionMapping = Mapping[A, StateReward[S]]
# Type alias: the full tabular MDP -- non-terminal state -> ActionMapping.
StateActionMapping = Mapping[NonTerminal[S], ActionMapping[A, S]]


class FiniteMarkovDecisionProcess(MarkovDecisionProcess[S, A]):
    '''A Markov Decision Process with finite state and action spaces.

    Attributes:
        mapping: wrapped state -> action -> distribution over
            (wrapped next state, reward) pairs.
        non_terminal_states: the keys of `mapping`, in insertion order.
    '''

    mapping: StateActionMapping[S, A]
    non_terminal_states: Sequence[NonTerminal[S]]

    def __init__(
        self,
        mapping: Mapping[S, Mapping[A, FiniteDistribution[Tuple[S, float]]]]
    ):
        '''Build the MDP from a state -> action -> outcome table.

        Args:
            mapping: state -> action -> distribution over
                (next state, reward) pairs.  A raw next state is wrapped as
                `NonTerminal` when it is a key of `mapping` and as
                `Terminal` otherwise; a next state that already is a
                `State` instance is stored unchanged.

        NOTE(review): every key of `mapping` is wrapped in `NonTerminal`
        unconditionally, so callers that pass already-wrapped keys (as the
        later cells of this notebook do) end up with doubly wrapped keys.
        The notebook's policy lookups appear to rely on this, so it is
        preserved here -- confirm before normalising.
        '''
        non_terminals: Set[S] = set(mapping.keys())

        def wrap(s1):
            # Previously both branches of the conditional returned `s1`
            # untouched, so raw next states were never wrapped and the
            # `.state` accesses in `__repr__`/`apply_finite_policy` failed.
            if isinstance(s1, State):
                return s1
            return NonTerminal(s1) if s1 in non_terminals else Terminal(s1)

        self.mapping = {
            NonTerminal(s): {
                a: Categorical(
                    {(wrap(s1), r): p for (s1, r), p in v.table().items()}
                )
                for a, v in d.items()
            }
            for s, d in mapping.items()
        }
        self.non_terminal_states = list(self.mapping.keys())

    def __repr__(self) -> str:
        '''Multi-line listing of every (state, action, outcome) entry.'''
        display = ""
        for s, d in self.mapping.items():
            display += f"From State {s.state}:\n"
            for a, d1 in d.items():
                display += f"  With Action {a}:\n"
                for (s1, r), p in d1:
                    opt = "Terminal " if isinstance(s1, Terminal) else ""
                    display += f"    To [{opt}State {s1.state} and "\
                        + f"Reward {r:.3f}] with Probability {p:.3f}\n"
        return display

    def step(self, state: NonTerminal[S], action: A) -> StateReward[S]:
        '''Return the stored outcome distribution for (`state`, `action`).

        Raises:
            KeyError: if `state` or `action` is not present in `mapping`.
        '''
        action_map: ActionMapping[A, S] = self.mapping[state]
        return action_map[action]

    def apply_finite_policy(self, policy: FinitePolicy[S, A])\
            -> FiniteMarkovRewardProcess[S]:
        '''Collapse this MDP into a finite reward process under `policy`.

        For each state, the probability of every (next state, reward)
        outcome is the sum over actions of the policy's action probability
        times the outcome probability under that action.  The result is
        keyed by `state.state` / `s1.state` and handed to
        `FiniteMarkovRewardProcess`.

        NOTE(review): when the constructor received already-wrapped keys,
        `state.state` is itself a `NonTerminal` while `s1.state` is a raw
        value, so the two sides of the table use different
        representations.  Presumably the reward process then classifies
        every successor as terminal -- verify against
        `FiniteMarkovRewardProcess.__init__`.
        '''
        transition_mapping: Dict[S, FiniteDistribution[Tuple[S, float]]] = {}

        for state in self.mapping:
            action_map: ActionMapping[A, S] = self.mapping[state]
            outcomes: DefaultDict[Tuple[S, float], float]\
                = defaultdict(float)
            actions = policy.act(state)
            for action, p_action in actions:
                for (s1, r), p in action_map[action].table().items():
                    outcomes[(s1.state, r)] += p_action * p

            transition_mapping[state.state] = Categorical(outcomes)

        return FiniteMarkovRewardProcess(transition_mapping)

    def actions(self, state: NonTerminal[S]) -> Iterable[A]:
        '''All the actions allowed for the given state.

        Raises:
            KeyError: if `state` is not a key of `mapping` (terminal states
                are never keys, so they raise rather than yield nothing).
        '''
        return self.mapping[state].keys()


In [51]:
# Problem size: states 1..n-1 are non-terminal, 0 and n are terminal.
n = 3

# Wrapped state objects, indexed by their raw integer value.
States: Dict[int, State] = {i: NonTerminal(i) for i in range(1, n)}
States[0] = Terminal(0)
States[n] = Terminal(n)

# Action 'a': move up to i+1 (reward 1) with probability (n-i)/n, or down to
# i-1 with probability i/n.  Moving down costs -1, except that landing on
# state 0 costs -n.
Dists_a: Dict[int, FiniteDistribution] = {}
for i in range(1, n):
    outcomes_a: Dict[Tuple[State, float], float] = {
        (States[i + 1], 1): (n - i) / n,
        (States[i - 1], -1 if i - 1 != 0 else -n): i / n,
    }
    Dists_a[i] = Categorical(outcomes_a)

# Action 'b': jump to any other state j with probability 1/n.  The reward is
# j - i, except that landing on state 0 costs -n.
Dists_b: Dict[int, FiniteDistribution] = {}
for i in range(1, n):
    outcomes_b: Dict[Tuple[State, float], float] = {
        (States[j], (j - i) if j != 0 else -n): 1 / n
        for j in range(0, n + 1)
        if j != i
    }
    Dists_b[i] = Categorical(outcomes_b)

# NOTE(review): keys and next states handed to the constructor are already
# wrapped State objects, and the constructor wraps its keys in NonTerminal
# again -- confirm this is intended.
state_action_map: Dict[State, Dict[str, FiniteDistribution]] = {
    States[i]: {'a': Dists_a[i], 'b': Dists_b[i]}
    for i in range(1, n)
}

mdp = FiniteMarkovDecisionProcess(state_action_map)

@dataclass(frozen=True)
class FinitePolicy(Policy[S, A]):
    '''Policy over finite state and action spaces, stored as a table.

    Note: this definition shadows the `FinitePolicy` imported from
    rl.policy earlier in the notebook, and is repeated in later cells.
    '''
    policy_map: Mapping[S, FiniteDistribution[A]]

    def __repr__(self) -> str:
        '''Multi-line listing of each state's action probabilities.'''
        pieces = []
        for s, d in self.policy_map.items():
            pieces.append(f"For State {s}:\n")
            pieces.extend(
                f"  Do Action {a} with Probability {p:.3f}\n" for a, p in d
            )
        return "".join(pieces)

    def act(self, state: NonTerminal[S]) -> FiniteDistribution[A]:
        '''Return the action distribution for `state`.

        The table is indexed by the value wrapped inside `state`
        (`state.state`), not by the `NonTerminal` object itself.
        '''
        unwrapped = state.state
        return self.policy_map[unwrapped]


class FiniteDeterministicPolicy(FinitePolicy[S, A]):
    '''Finite policy that always picks one fixed action per state.

    `action_for` keeps the plain state -> action table; the inherited
    `policy_map` holds the same choices as `Constant` distributions.
    '''
    action_for: Mapping[S, A]

    def __init__(self, action_for: Mapping[S, A]):
        '''Store `action_for` and derive the matching `policy_map`.'''
        self.action_for = action_for
        point_masses = {}
        for s, a in self.action_for.items():
            point_masses[s] = Constant(a)
        super().__init__(policy_map=point_masses)

    def __repr__(self) -> str:
        '''One line per state naming its chosen action.'''
        return "".join(
            f"For State {s}: Do Action {a}\n"
            for s, a in self.action_for.items()
        )



# Exhaustively enumerate all 2**(n-1) deterministic policies.  Policy i is
# described by i written as an n-character zero-padded binary string:
# character j (1 <= j < n) selects action 'a' when it is '1', else 'b'.
policies = []
action_tables = []   # plain state -> action dict behind each policy
for i in range(2**(n-1)):
    binary = format(i, 'b').zfill(n)
    policy = {States[j]: ('a' if binary[j] == '1' else 'b')
              for j in range(1, n)}
    action_tables.append(policy)
    policies.append(FiniteDeterministicPolicy(policy))

# Row i holds the value function of policy i (one column per state 1..n-1).
val_fs = np.zeros((2**(n-1), n-1))
for i in range(2**(n-1)):
    val_f = mdp.apply_finite_policy(policies[i]).get_value_function_vec(gamma=0.5)
    val_fs[i, :] = val_f

# State-wise best value over all policies; the optimal policy is the first
# one that attains it in every state.  Compare with a tolerance rather than
# exact float equality so rounding noise cannot hide the match.
c = np.max(val_fs, axis=0)
matching_rows = np.flatnonzero(
    np.isclose(val_fs, c, rtol=1e-9, atol=1e-12).all(axis=1)
)
if matching_rows.size == 0:
    # Previously this case surfaced later as a NameError on `ind`.
    raise ValueError("no single policy attains the state-wise maximum value")
ind = int(matching_rows[0])
print(f"Optimal Value Function: {val_fs[ind,:]}")

# Reuse the stored action table instead of decoding the bit pattern again.
optimal_policy = action_tables[ind]
print(f"Optimal Policy : {optimal_policy}")

Optimal Value Function: [-1.11022302e-16 -3.33333333e-01]
Optimal Policy : {NonTerminal(state=1): 'b', NonTerminal(state=2): 'a'}


In [49]:
# Problem size: states 1..n-1 are non-terminal, 0 and n are terminal.
n = 6

# Wrapped state objects, indexed by their raw integer value.
States: Dict[int, State] = {i: NonTerminal(i) for i in range(1, n)}
States[0] = Terminal(0)
States[n] = Terminal(n)

# Action 'a': move up to i+1 (reward 1) with probability (n-i)/n, or down to
# i-1 with probability i/n.  Moving down costs -1, except that landing on
# state 0 costs -n.
Dists_a: Dict[int, FiniteDistribution] = {}
for i in range(1, n):
    outcomes_a: Dict[Tuple[State, float], float] = {
        (States[i + 1], 1): (n - i) / n,
        (States[i - 1], -1 if i - 1 != 0 else -n): i / n,
    }
    Dists_a[i] = Categorical(outcomes_a)

# Action 'b': jump to any other state j with probability 1/n.  The reward is
# j - i, except that landing on state 0 costs -n.
Dists_b: Dict[int, FiniteDistribution] = {}
for i in range(1, n):
    outcomes_b: Dict[Tuple[State, float], float] = {
        (States[j], (j - i) if j != 0 else -n): 1 / n
        for j in range(0, n + 1)
        if j != i
    }
    Dists_b[i] = Categorical(outcomes_b)

# NOTE(review): keys and next states handed to the constructor are already
# wrapped State objects, and the constructor wraps its keys in NonTerminal
# again -- confirm this is intended.
state_action_map: Dict[State, Dict[str, FiniteDistribution]] = {
    States[i]: {'a': Dists_a[i], 'b': Dists_b[i]}
    for i in range(1, n)
}

mdp = FiniteMarkovDecisionProcess(state_action_map)

@dataclass(frozen=True)
class FinitePolicy(Policy[S, A]):
    '''Policy over finite state and action spaces, stored as a table.

    Note: this definition shadows the `FinitePolicy` imported from
    rl.policy earlier in the notebook, and is repeated in other cells.
    '''
    policy_map: Mapping[S, FiniteDistribution[A]]

    def __repr__(self) -> str:
        '''Multi-line listing of each state's action probabilities.'''
        pieces = []
        for s, d in self.policy_map.items():
            pieces.append(f"For State {s}:\n")
            pieces.extend(
                f"  Do Action {a} with Probability {p:.3f}\n" for a, p in d
            )
        return "".join(pieces)

    def act(self, state: NonTerminal[S]) -> FiniteDistribution[A]:
        '''Return the action distribution for `state`.

        The table is indexed by the value wrapped inside `state`
        (`state.state`), not by the `NonTerminal` object itself.
        '''
        unwrapped = state.state
        return self.policy_map[unwrapped]


class FiniteDeterministicPolicy(FinitePolicy[S, A]):
    '''Finite policy that always picks one fixed action per state.

    `action_for` keeps the plain state -> action table; the inherited
    `policy_map` holds the same choices as `Constant` distributions.
    '''
    action_for: Mapping[S, A]

    def __init__(self, action_for: Mapping[S, A]):
        '''Store `action_for` and derive the matching `policy_map`.'''
        self.action_for = action_for
        point_masses = {}
        for s, a in self.action_for.items():
            point_masses[s] = Constant(a)
        super().__init__(policy_map=point_masses)

    def __repr__(self) -> str:
        '''One line per state naming its chosen action.'''
        return "".join(
            f"For State {s}: Do Action {a}\n"
            for s, a in self.action_for.items()
        )



# Exhaustively enumerate all 2**(n-1) deterministic policies.  Policy i is
# described by i written as an n-character zero-padded binary string:
# character j (1 <= j < n) selects action 'a' when it is '1', else 'b'.
policies = []
action_tables = []   # plain state -> action dict behind each policy
for i in range(2**(n-1)):
    binary = format(i, 'b').zfill(n)
    policy = {States[j]: ('a' if binary[j] == '1' else 'b')
              for j in range(1, n)}
    action_tables.append(policy)
    policies.append(FiniteDeterministicPolicy(policy))

# Row i holds the value function of policy i (one column per state 1..n-1).
val_fs = np.zeros((2**(n-1), n-1))
for i in range(2**(n-1)):
    val_f = mdp.apply_finite_policy(policies[i]).get_value_function_vec(gamma=0.5)
    val_fs[i, :] = val_f

# State-wise best value over all policies; the optimal policy is the first
# one that attains it in every state.  Compare with a tolerance rather than
# exact float equality so rounding noise cannot hide the match.
c = np.max(val_fs, axis=0)
matching_rows = np.flatnonzero(
    np.isclose(val_fs, c, rtol=1e-9, atol=1e-12).all(axis=1)
)
if matching_rows.size == 0:
    # Previously this case surfaced later as a NameError on `ind`.
    raise ValueError("no single policy attains the state-wise maximum value")
ind = int(matching_rows[0])
print(f"Optimal Value Function: {val_fs[ind,:]}")

# Reuse the stored action table instead of decoding the bit pattern again.
optimal_policy = action_tables[ind]
print(f"Optimal Policy : {optimal_policy}")

Optimal Value Function: [ 1.5         0.5         0.         -0.33333333 -0.66666667]
Optimal Policy : {NonTerminal(state=1): 'b', NonTerminal(state=2): 'b', NonTerminal(state=3): 'a', NonTerminal(state=4): 'a', NonTerminal(state=5): 'a'}


In [50]:
# Problem size: states 1..n-1 are non-terminal, 0 and n are terminal.
n = 9

# Wrapped state objects, indexed by their raw integer value.
States: Dict[int, State] = {i: NonTerminal(i) for i in range(1, n)}
States[0] = Terminal(0)
States[n] = Terminal(n)

# Action 'a': move up to i+1 (reward 1) with probability (n-i)/n, or down to
# i-1 with probability i/n.  Moving down costs -1, except that landing on
# state 0 costs -n.
Dists_a: Dict[int, FiniteDistribution] = {}
for i in range(1, n):
    outcomes_a: Dict[Tuple[State, float], float] = {
        (States[i + 1], 1): (n - i) / n,
        (States[i - 1], -1 if i - 1 != 0 else -n): i / n,
    }
    Dists_a[i] = Categorical(outcomes_a)

# Action 'b': jump to any other state j with probability 1/n.  The reward is
# j - i, except that landing on state 0 costs -n.
Dists_b: Dict[int, FiniteDistribution] = {}
for i in range(1, n):
    outcomes_b: Dict[Tuple[State, float], float] = {
        (States[j], (j - i) if j != 0 else -n): 1 / n
        for j in range(0, n + 1)
        if j != i
    }
    Dists_b[i] = Categorical(outcomes_b)

# NOTE(review): keys and next states handed to the constructor are already
# wrapped State objects, and the constructor wraps its keys in NonTerminal
# again -- confirm this is intended.
state_action_map: Dict[State, Dict[str, FiniteDistribution]] = {
    States[i]: {'a': Dists_a[i], 'b': Dists_b[i]}
    for i in range(1, n)
}

mdp = FiniteMarkovDecisionProcess(state_action_map)

@dataclass(frozen=True)
class FinitePolicy(Policy[S, A]):
    '''Policy over finite state and action spaces, stored as a table.

    Note: this definition shadows the `FinitePolicy` imported from
    rl.policy earlier in the notebook, and is repeated in other cells.
    '''
    policy_map: Mapping[S, FiniteDistribution[A]]

    def __repr__(self) -> str:
        '''Multi-line listing of each state's action probabilities.'''
        pieces = []
        for s, d in self.policy_map.items():
            pieces.append(f"For State {s}:\n")
            pieces.extend(
                f"  Do Action {a} with Probability {p:.3f}\n" for a, p in d
            )
        return "".join(pieces)

    def act(self, state: NonTerminal[S]) -> FiniteDistribution[A]:
        '''Return the action distribution for `state`.

        The table is indexed by the value wrapped inside `state`
        (`state.state`), not by the `NonTerminal` object itself.
        '''
        unwrapped = state.state
        return self.policy_map[unwrapped]


class FiniteDeterministicPolicy(FinitePolicy[S, A]):
    '''Finite policy that always picks one fixed action per state.

    `action_for` keeps the plain state -> action table; the inherited
    `policy_map` holds the same choices as `Constant` distributions.
    '''
    action_for: Mapping[S, A]

    def __init__(self, action_for: Mapping[S, A]):
        '''Store `action_for` and derive the matching `policy_map`.'''
        self.action_for = action_for
        point_masses = {}
        for s, a in self.action_for.items():
            point_masses[s] = Constant(a)
        super().__init__(policy_map=point_masses)

    def __repr__(self) -> str:
        '''One line per state naming its chosen action.'''
        return "".join(
            f"For State {s}: Do Action {a}\n"
            for s, a in self.action_for.items()
        )



# Exhaustively enumerate all 2**(n-1) deterministic policies.  Policy i is
# described by i written as an n-character zero-padded binary string:
# character j (1 <= j < n) selects action 'a' when it is '1', else 'b'.
policies = []
action_tables = []   # plain state -> action dict behind each policy
for i in range(2**(n-1)):
    binary = format(i, 'b').zfill(n)
    policy = {States[j]: ('a' if binary[j] == '1' else 'b')
              for j in range(1, n)}
    action_tables.append(policy)
    policies.append(FiniteDeterministicPolicy(policy))

# Row i holds the value function of policy i (one column per state 1..n-1).
val_fs = np.zeros((2**(n-1), n-1))
for i in range(2**(n-1)):
    val_f = mdp.apply_finite_policy(policies[i]).get_value_function_vec(gamma=0.5)
    val_fs[i, :] = val_f

# State-wise best value over all policies; the optimal policy is the first
# one that attains it in every state.  Compare with a tolerance rather than
# exact float equality so rounding noise cannot hide the match.
c = np.max(val_fs, axis=0)
matching_rows = np.flatnonzero(
    np.isclose(val_fs, c, rtol=1e-9, atol=1e-12).all(axis=1)
)
if matching_rows.size == 0:
    # Previously this case surfaced later as a NameError on `ind`.
    raise ValueError("no single policy attains the state-wise maximum value")
ind = int(matching_rows[0])
print(f"Optimal Value Function: {val_fs[ind,:]}")

# Reuse the stored action table instead of decoding the bit pattern again.
optimal_policy = action_tables[ind]
print(f"Optimal Policy : {optimal_policy}")

Optimal Value Function: [ 3.          2.          1.          0.11111111 -0.11111111 -0.33333333
 -0.55555556 -0.77777778]
Optimal Policy : {NonTerminal(state=1): 'b', NonTerminal(state=2): 'b', NonTerminal(state=3): 'b', NonTerminal(state=4): 'a', NonTerminal(state=5): 'a', NonTerminal(state=6): 'a', NonTerminal(state=7): 'a', NonTerminal(state=8): 'a'}
