In [1]:
import gymnasium as gym
import numpy as np

In [2]:
class Agent:
    """Tabular TD(0) state-value learner over a discretised ``state[2]``.

    The agent follows a fixed, hand-written policy and estimates the value
    of each discretised state under that policy. Only the third component of
    the observation is used; it is bucketed with ``np.digitize`` against
    ``n_bins`` evenly spaced edges, which yields ``n_bins + 1`` cells
    (indices ``0`` and ``n_bins`` collect values outside the edges).

    Args:
        gamma: Discount factor applied to the bootstrapped next-state value.
        alpha: Step size of the TD(0) update.
        n_bins: Number of bin edges spread over
            ``[-angle_limit, angle_limit]``.
        angle_limit: Magnitude of the outermost bin edge.
            NOTE(review): the default 0.2095 presumably corresponds to
            CartPole's pole-angle termination threshold in radians --
            confirm against the environment documentation.
    """

    def __init__(self, gamma=0.99, alpha=0.1, n_bins=10, angle_limit=0.2095):
        self.bins = np.linspace(-angle_limit, angle_limit, n_bins)
        self.action_space = [0, 1]
        self.V = {}
        self.gamma = gamma
        self.alpha = alpha

        self.init_V()

    def init_V(self):
        """Reset the value table to 0.0 for every discretised state."""
        # np.digitize can return any index in 0..len(bins), hence the +1.
        for angle in range(len(self.bins) + 1):
            self.V[angle] = 0.0

    def _discretize(self, state):
        """Return the bin index of ``state[2]`` as a plain ``int`` dict key."""
        return int(np.digitize(state[2], self.bins))

    def policy(self, state):
        """Pick the first action when ``state[2]`` is negative, else the second."""
        return self.action_space[0] if state[2] < 0 else self.action_space[1]

    def update_V(self, state, reward, next_state, terminated=False):
        """Apply one TD(0) update for the transition ``state -> next_state``.

        Args:
            state: Observation before the step (only ``state[2]`` is read).
            reward: Reward received for the step.
            next_state: Observation after the step (only index 2 is read).
            terminated: Set to True when ``next_state`` is terminal. A
                terminal state has no future return, so its stored value is
                not bootstrapped from. Defaults to False, which reproduces
                the plain update ``V[s] += alpha * (r + gamma * V[s'] - V[s])``.
        """
        angle = self._discretize(state)

        if terminated:
            next_value = 0.0
        else:
            next_value = self.V[self._discretize(next_state)]

        self.V[angle] = self.V[angle] + self.alpha * (
            reward + self.gamma * next_value - self.V[angle]
        )

In [3]:
def main(n_episodes=50000, seed=None):
    """Run the fixed policy on CartPole-v1 and learn state values with TD(0).

    Prints a single-line progress counter while running, then the learned
    value of every discretised state.

    Args:
        n_episodes: Number of episodes to run.
        seed: Optional seed passed to the first ``env.reset`` call so that a
            run can be reproduced. ``None`` leaves the environment unseeded.
    """
    env = gym.make("CartPole-v1")
    agent = Agent()

    try:
        for episode in range(n_episodes):
            print("Executing episode:", episode + 1, end="\r")
            # Seeding only the first reset fixes the whole episode sequence;
            # later resets continue from the environment's own RNG.
            state, _ = env.reset(seed=seed if episode == 0 else None)

            terminated, truncated = False, False
            while not (terminated or truncated):
                action = agent.policy(state)
                state_, reward, terminated, truncated, _ = env.step(action)
                agent.update_V(state, reward, state_)
                state = state_
    finally:
        # Release the environment's resources even if a step raises.
        env.close()

    # Finish the "\r" progress line so the summary is not printed on top of it.
    print()
    print("Training Complete!")
    print("V:", list(agent.V.values()))

In [4]:
# Run training and print the learned state values.
main()

Training Complete!		0000
V: [0, 13.384196739302425, 22.102694224564456, 27.431829990418912, 30.925713615650473, 31.24005492133547, 28.34487423377081, 22.485453372676435, 11.623406512098327, 3.5035531245753764, 0]
