# Monte Carlo Methods

First-visit monte carlo

In [26]:
import numpy as np, matplotlib, random, pandas, seaborn

In [27]:
from dataclasses import dataclass

@dataclass
class Environment:
    states: list
    actions: list
    step: callable

In [249]:
class tape:
    def __init__(self, policy:  callable, environment: Environment, length: int):
        self.environment = environment
        self.policy = policy
        self.states = []
        self.actions = []
        self.rewards = [-1]
        self.length = length
        
    def playout(self, start_state):
        first_seen = dict()
        self.states = [start_state]
        self.actions = [self.policy[start_state]]
        self.rewards = [-1]
        s = self.states[0]
        a = self.actions[0]
        for i in range(1, self.length + 1): # the last s and a are ignored
            print(s,a)
            if s not in first_seen:
                first_seen.update( {s : i-1} )
            r,s = self.environment.step(s,a)
            if s is None:
                break
            self.rewards.append(r)
            self.states.append(s)
            a = self.policy[s]
            self.actions.append(a)
        self.first_seen = first_seen

In [263]:
class monte_carlo:
    def __init__(self, policy, environment, num_steps):
        self.num_steps = num_steps
        self.V = dict()
        self.V_count = dict()
        self.pi = dict()
        self.policy = policy
        self.environment= environment

    def estimate_v_first_visit(self, start_state):
        V = { s : random.random() for s in self.environment.states }
        V_counts = { s : 0 for s in self.environment.states }

        loop_counter = 0
        while True:
            loop_counter += 1
            if loop_counter > 5:
                break
            tape = tape(self.policy, self.environment, self.num_steps)
            tape.playout(start_state)
            G = 0
            for t in range(self.num_steps-1,-1,-1):
                G = gamma*G + tape.rewards[t+1]
                if tape.first_seen[t] == t:
                    s = tape.states[t]
                    n = V_counts[s]
                    v = V[s]
                    v = ((n * v) + G) /(n+1)
                    V[s] = v
                    V_counts[s] += 1

In [264]:
gambler_states = list(range(101))
gambler_actions = list(range(1,100))
gambler_policy = dict()
gambler_values = dict()

with open("gambler-policy.txt","r") as f:
    text = f.readlines()
    for l in text:
        l = l.strip()
        i, x = l.split(" ")
        gambler_policy.update({int(i):int(round(float(x)))})

with open("gambler-value.txt","r") as f:
    text = f.readlines()
    for l in text:
        i, x = l.split(" ")
        gambler_values.update({i:x})

def gambler_step(s,a):
    if (s==0 or s==100):
        return None, None
    if (random.random() < 0.4):
        s = s + a
    else:
        s = s - a
    r = 1 if s == 100 else 0
    if (s<0 or s>100):
        raise ValueError(f"Invalid state encountered: {s}, {a}")
    return r, s
    
gambler_environment = Environment(actions = gambler_actions, states = gambler_states, step=gambler_step)

Let's try it

In [252]:
tp = tape(gambler_policy, gambler_environment, 10)

In [260]:
x=tp.playout(80)

tp.states

80 5
75 25
50 50
0 0


[80, 75, 50, 0]

In [261]:
tp.actions

[5, 25, 50, 0]

In [262]:
tp.rewards

[-1, 0, 0, 0]

In [265]:
mc = monte_carlo(gambler_policy, gambler_environment, 10)

In [266]:
mc.estimate_v_first_visit(40)

UnboundLocalError: cannot access local variable 'tape' where it is not associated with a value