In [1]:
import gym
import numpy as np
from collections import defaultdict

In [2]:
# Build the CartPole-v1 environment once; the driver cell further down passes
# this shared `env` object to the Monte Carlo routine.
# NOTE(review): the reset()/step() return shapes depend on the installed gym
# version -- confirm which API the rest of the notebook expects.
env = gym.make('CartPole-v1')

In [4]:
def mc_prediction(policy, env, num_episodes, gamma=1.0):
    """Estimate the state-value function of ``policy`` with first-visit Monte Carlo.

    Each episode is rolled out to termination, then walked backwards to
    accumulate the discounted return ``G``. A state's running average is
    updated only at the time step where that state first occurs in the
    episode (first-visit MC).

    Args:
        policy: Callable mapping a state (tuple) to an action.
        env: Environment exposing ``reset()`` and ``step(action)``. Both the
            4-tuple ``(obs, reward, done, info)`` step API and the 5-tuple
            ``(obs, reward, terminated, truncated, info)`` step API are
            accepted; ``reset()`` may return ``obs`` or ``(obs, info)``.
        num_episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.

    Returns:
        ``defaultdict(float)`` mapping each visited state tuple to the mean of
        its first-visit returns.

    Note:
        States are keyed by exact tuple equality.
        NOTE(review): with continuous observations, identical states will
        rarely repeat across episodes -- consider discretizing before use.
    """
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)
    V = defaultdict(float)

    def _reset():
        # Newer gym/gymnasium returns (obs, info); older gym returns obs only.
        result = env.reset()
        if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
            result = result[0]
        return tuple(result)

    def _step(action):
        # Normalize both step signatures to (state, reward, done).
        result = env.step(action)
        if len(result) == 5:
            next_obs, reward, terminated, truncated, _ = result
            return tuple(next_obs), reward, bool(terminated or truncated)
        next_obs, reward, done, _ = result
        return tuple(next_obs), reward, done

    for _ in range(num_episodes):
        episode = []
        state = _reset()
        done = False
        while not done:
            action = policy(state)
            next_state, reward, done = _step(action)
            episode.append((state, action, reward))
            state = next_state

        # Index of the first occurrence of each state, built in one forward
        # pass so the backward pass can test "first visit" in O(1).
        first_visit = {}
        for t, (visited, _, _) in enumerate(episode):
            first_visit.setdefault(visited, t)

        G = 0
        for t in range(len(episode) - 1, -1, -1):
            visited, _, reward = episode[t]
            G = gamma * G + reward
            # Only the earliest occurrence of a state contributes its return.
            if first_visit[visited] == t:
                returns_sum[visited] += G
                returns_count[visited] += 1
                V[visited] = returns_sum[visited] / returns_count[visited]

    return V

In [7]:
def mc_epsilo_greedy(env, num_episodes, gamma=1.0, epsilon=0.1):
    """On-policy first-visit Monte Carlo control with an epsilon-greedy policy.

    Episodes are generated by acting epsilon-greedily with respect to the
    current action-value table ``Q``. After each episode, the discounted
    return of every (state, action) pair is averaged into ``Q`` at the pair's
    first occurrence in that episode.

    Args:
        env: Environment exposing ``action_space.n``, ``reset()`` and
            ``step(action)``. Both the 4-tuple and the 5-tuple
            (``terminated``/``truncated``) step APIs are accepted;
            ``reset()`` may return ``obs`` or ``(obs, info)``.
        num_episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.
        epsilon: Probability of choosing a uniformly random action.

    Returns:
        Tuple ``(Q, optimal_policy)`` where ``Q`` is a ``defaultdict`` mapping
        a state tuple to a NumPy array of action values, and
        ``optimal_policy(state)`` returns the greedy action under ``Q``.
    """
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)

    def policy(state):
        if np.random.rand() < epsilon:
            return np.random.choice(env.action_space.n)
        else:
            return np.argmax(Q[state])

    def _reset():
        # Newer gym/gymnasium returns (obs, info); older gym returns obs only.
        result = env.reset()
        if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
            result = result[0]
        return tuple(result)

    def _step(action):
        # Normalize both step signatures to (state, reward, done). The state is
        # always converted to a tuple so it can be used as a dictionary key.
        result = env.step(action)
        if len(result) == 5:
            next_obs, reward, terminated, truncated, _ = result
            return tuple(next_obs), reward, bool(terminated or truncated)
        next_obs, reward, done, _ = result
        return tuple(next_obs), reward, done

    for _ in range(num_episodes):
        episode = []
        state = _reset()
        done = False

        while not done:
            action = policy(state)
            next_state, reward, done = _step(action)
            episode.append((state, action, reward))
            state = next_state

        # Index of the first occurrence of each (state, action) pair, so the
        # backward pass can test "first visit" in O(1).
        first_visit = {}
        for t, (visited, taken, _) in enumerate(episode):
            first_visit.setdefault((visited, taken), t)

        G = 0
        for t in range(len(episode) - 1, -1, -1):
            visited, taken, reward = episode[t]
            G = gamma * G + reward
            pair = (visited, taken)
            # Only the earliest occurrence of a pair contributes its return.
            if first_visit[pair] == t:
                returns_sum[pair] += G
                returns_count[pair] += 1
                Q[visited][taken] = returns_sum[pair] / returns_count[pair]

    def optimal_policy(state):
        return np.argmax(Q[state])

    return Q, optimal_policy


In [None]:
if __name__ == "__main__":
    def sample_policy(state):
        """Return action 0 when the first state component is at least 20, else action 1."""
        if state[0] >= 20:
            return 0
        return 1

    # Evaluate the fixed policy over many sampled episodes.
    value_function = mc_prediction(sample_policy, env, num_episodes=50000)

    # Show only the first five entries to keep the cell output short.
    print(
        "Value function: ",
        {key: value for key, value in list(value_function.items())[:5]},
    )

SyntaxError: 'return' outside function (1206397702.py, line 4)