In [48]:
import gymnasium as gym
import random
from collections import defaultdict
from tqdm.notebook import tqdm

# Set learning parameters
# Weight given to the freshly computed target when blending it with the old
# Q-value in deploy_agent's update.
LEARNING_RATE = 0.75
# Multiplier applied to the best next-state Q-value in the update target.
DISCOUNT = 0.95
# Probability that deploy_agent picks a random action instead of the greedy one.
EPSILON = 0.1
# NOTE(review): not referenced by any cell visible in this notebook; the
# training cell uses its own episode count -- confirm which is intended.
EPISODES = 40000


In [None]:
# Discretization settings: each of the NUM_OBVS observation components is
# mapped onto bins of equal width spanning [MIN_VALS[i], MAX_VALS[i]].
NUM_BINS = 100
NUM_OBVS = 4
NUM_ACTS = 2

# Assumed velocity limits used only to size the bins.
peak_cart_velocity = 7
peak_pole_velocity = 3

# Upper bound per observation component; the lower bounds are the mirror image.
# presumably ordered as (cart position, cart velocity, pole angle,
# pole angular velocity) to match CartPole's observation -- TODO confirm
MAX_VALS = (3, peak_cart_velocity, 0.3, peak_pole_velocity)
MIN_VALS = tuple(-upper for upper in MAX_VALS)

# Width of a single bin for each observation component.
INTERVAL = tuple(
    (upper - lower) / NUM_BINS
    for lower, upper in zip(MIN_VALS[:NUM_OBVS], MAX_VALS[:NUM_OBVS])
)


def discretize_state(state):
    """Convert a continuous observation into a tuple of integer bin indices.

    Each of the first NUM_OBVS components is shifted by its MIN_VALS entry,
    divided by the matching INTERVAL bin width and rounded with ``round``.
    Values are not clipped, so a component outside [MIN_VALS, MAX_VALS]
    produces an index below 0 or above NUM_BINS.

    Args:
        state: indexable sequence with at least NUM_OBVS numeric components.

    Returns:
        A tuple of NUM_OBVS rounded bin indices, usable as a dictionary key.
    """
    bin_indices = []
    for i in range(NUM_OBVS):
        offset = state[i] - MIN_VALS[i]
        bin_indices.append(round(offset / INTERVAL[i]))
    return tuple(bin_indices)


def get_q_actions(q, state_inds):
    """Return the current Q-value of every action for one discretized state.

    Only slot 0 of each table entry (the value used for decisions) is read.
    When ``q`` is a defaultdict, as the table built in this notebook is,
    looking up an unseen (action, state) key inserts a fresh entry.

    Args:
        q: mapping from (action, state_inds) to an indexable entry.
        state_inds: hashable discretized state, e.g. from discretize_state.

    Returns:
        A list of NUM_ACTS values, indexed by action.
    """
    action_values = []
    for act in range(NUM_ACTS):
        entry = q[(act, state_inds)]
        action_values.append(entry[0])
    return action_values


def get_best_action(q, state_inds):
    """Return the index of the action with the highest current Q-value.

    Ties are resolved in favour of the lowest action index.

    Args:
        q: Q-table accepted by get_q_actions.
        state_inds: hashable discretized state.

    Returns:
        The integer action whose Q-value is largest for ``state_inds``.
    """
    action_values = get_q_actions(q, state_inds)
    # max() keeps the first maximal candidate, which matches list.index(max(...)).
    return max(range(len(action_values)), key=action_values.__getitem__)


def default_val():
    """Create the initial entry for an unseen (action, state) key.

    An entry is a two-element list: slot 0 holds the value read when acting
    and bootstrapping, slot 1 holds the value written by the latest update.
    Both slots start from the same random draw in [-2, 0]. If they started
    from two independent draws, copying slot 1 over slot 0 (as the training
    loop does between episodes) would replace the value of an entry that was
    never updated with an unrelated random number.

    Returns:
        A new ``[value, value]`` list; every call returns a distinct list.
    """
    initial = random.uniform(-2, 0)
    return [initial, initial]

In [None]:

def deploy_agent(environement, q_table):
    """Run one epsilon-greedy episode and record Q-learning updates.

    Q-values are read from slot 0 of each table entry and the updated value
    is written to slot 1, so updates made during the episode do not affect
    decisions within the same episode. The caller is expected to copy slot 1
    into slot 0 between episodes.

    Args:
        environement: Gymnasium-style environment whose ``reset()`` returns
            ``(observation, info)`` and whose ``step(action)`` returns
            ``(observation, reward, terminated, truncated, info)``.
        q_table: mapping from ``(action, state_inds)`` to a two-element list;
            it must create entries for unseen keys (e.g. a defaultdict).

    Returns:
        ``(q_table, num_steps)``: the same table object, mutated in place,
        and the number of steps taken in the episode.
    """
    state = environement.reset()[0]
    state_inds = discretize_state(state)

    terminated = False
    truncated = False
    num_steps = 0
    # End the episode on either signal: the environment must be reset after
    # a truncation (e.g. a time limit) just as after a termination.
    while not (terminated or truncated):

        # Choose action: explore with probability EPSILON, otherwise be greedy.
        if random.random() < EPSILON:
            action = random.randint(0, NUM_ACTS - 1)
        else:
            action = get_best_action(q_table, state_inds)

        # Current estimate for the chosen (action, state) pair.
        q_val = q_table[(action, state_inds)][0]

        # Take the step and observe the outcome.
        new_state, reward, terminated, truncated, _ = environement.step(action)
        new_state_inds = discretize_state(new_state)

        if terminated:
            # A terminal state has no future return, so do not bootstrap.
            target = reward
        else:
            # Truncation only cuts the episode short; the next state still
            # has value, so bootstrapping remains appropriate there.
            best_new_q_value = max(get_q_actions(q_table, new_state_inds))
            target = reward + DISCOUNT * best_new_q_value

        # Blend the old estimate with the new target and stage it in slot 1.
        updated_q_val = (1 - LEARNING_RATE) * q_val + LEARNING_RATE * target
        q_table[(action, state_inds)][1] = updated_q_val

        # Advance to the next state.
        state_inds = new_state_inds
        num_steps += 1

    return q_table, num_steps

In [None]:
# Show the width of one discretization bin for each observation component.
print(INTERVAL)

In [None]:
# Initialize the environment
# NOTE(review): render_mode="human" draws every step, which likely slows a
# run of this length considerably -- consider render_mode=None for training.
env = gym.make('CartPole-v1', render_mode = "human")

# Unseen (action, state) keys get their entry from default_val on first lookup.
q_table = defaultdict( default_val )

# Number of episodes averaged into each progress line.
REPORT_EVERY = 2000
tot_steps = 0

try:
    # NOTE(review): the config cell defines EPISODES = 40000, but this loop
    # runs 20000 episodes; count kept as-is -- confirm which is intended.
    for episode in range(20000):

        q_table, num_steps = deploy_agent(env, q_table)

        tot_steps += num_steps
        # Report after each full window of REPORT_EVERY episodes, so every
        # printed value is a true average and the final window is included.
        if (episode + 1) % REPORT_EVERY == 0:
            print(tot_steps/REPORT_EVERY)
            tot_steps = 0

        # Promote the values staged during the episode (slot 1) to the
        # values used for decisions (slot 0).
        for val in q_table.values():
            val[0] = val[1]
finally:
    # Release the environment even if training is interrupted or fails.
    env.close()

In [None]:
# Print every discretized state in the table whose first component fell
# into bin 1 or bin 10. Keys are (action, state_inds) pairs, so a state is
# printed once for each action stored under it.
for key in q_table:
    state_bins = key[1]
    if state_bins[0] in (1, 10):
        print(state_bins)
