# In [None]:

# Sample code for Q-learning on CartPole-v1
import numpy as np
import gym
import random
import matplotlib.pyplot as plt

# Initialize the CartPole environment.
# NOTE: render_mode="human" draws every step in a window, which slows training
# down a lot; switch to render_mode=None if only the reward curve is needed.
env = gym.make("CartPole-v1", render_mode="human")

# Define hyperparameters
alpha = 0.1          # learning rate
gamma = 0.99         # discount factor
epsilon = 0.1        # exploration rate
episodes = 1000      # number of episodes
max_timesteps = 200  # max timesteps per episode

# Discretize the state space into 10 bins for each of the 4 dimensions
# (Cart position, Cart velocity, Pole angle, Pole velocity).
n_bins = 10

# The observation space declares both velocity components as unbounded
# (float32 max or inf, depending on the Gym version). Feeding those bounds to
# np.linspace yields edges so far apart that every observed velocity lands in
# the same two bins (or NaN edges for inf), so the agent cannot tell velocities
# apart. Cap them at hand-picked finite limits instead; values outside the
# limits simply fall into the outermost bins.
bin_limits = np.array(env.observation_space.high, dtype=np.float64)[:4]
bin_limits[1] = 3.0  # cart velocity cap
bin_limits[3] = 3.5  # pole angular velocity cap
state_space = [np.linspace(-limit, limit, n_bins) for limit in bin_limits]

# Initialize the Q-table with zeros: one axis per state dimension plus one
# trailing axis for the actions.
n_actions = env.action_space.n
Q_table = np.zeros([n_bins] * 4 + [n_actions])

def discretize_state(state, bins=None):
    """Map a continuous observation to a tuple of discrete bin indices.

    Args:
        state: Sequence of observation components, one per dimension.
        bins: Optional sequence of ascending bin-edge arrays, one per
            dimension. Defaults to the module-level ``state_space``.

    Returns:
        A tuple of ints, one per component of ``state``, each clamped to
        ``[0, len(edges) - 1]`` so it is always a valid, non-negative index.
    """
    if bins is None:
        bins = state_space
    discrete_state = []
    for i, val in enumerate(state):
        edges = bins[i]
        # digitize() yields 0 for values below the first edge, so the
        # zero-indexed result would be -1 and silently wrap around to the
        # last cell of the Q-table. Clamp to the valid index range instead.
        index = int(np.digitize(val, edges)) - 1
        discrete_state.append(min(max(index, 0), len(edges) - 1))
    return tuple(discrete_state)

def select_action(state):
    """Pick an action for ``state`` with an epsilon-greedy policy.

    A uniform draw below ``epsilon`` samples a random action from the
    environment's action space; otherwise the action with the highest
    Q-value for ``state`` is returned.
    """
    explore = random.uniform(0, 1) < epsilon
    if not explore:
        # Exploit: greedy action according to the current Q-table.
        return np.argmax(Q_table[state])
    # Explore: uniformly random action.
    return env.action_space.sample()

def update_q_table(state, action, reward, next_state, done):
    """Apply one Q-learning step to ``Q_table`` in place.

    The entry for (``state``, ``action``) is moved by ``alpha`` times the
    temporal-difference error. The target is ``reward`` plus the discounted
    value of the greedy action in ``next_state``; the discounted term is
    multiplied by ``(1 - done)`` so it vanishes when ``done`` is true.
    """
    entry = state + (action,)
    # Value of the best action available from the successor state.
    greedy_next_value = Q_table[next_state].max()
    target = reward + gamma * greedy_next_value * (1 - done)
    td_error = target - Q_table[entry]
    Q_table[entry] += alpha * td_error

# Training loop
reward_list = []

try:
    for episode in range(episodes):
        # reset() returns (observation, info); only the observation is used.
        observation, _ = env.reset()
        state = discretize_state(observation)
        total_reward = 0
        for t in range(max_timesteps):
            # Choose an action
            action = select_action(state)

            # step() returns (observation, reward, terminated, truncated, info).
            observation, reward, terminated, truncated, _ = env.step(action)
            next_state = discretize_state(observation)

            # Only a real termination removes the bootstrap term from the
            # target; a time-limit truncation does not mean the successor
            # state is worthless, so `truncated` is not passed as `done`.
            update_q_table(state, action, reward, next_state, terminated)

            # Accumulate reward
            total_reward += reward

            # Transition to next state
            state = next_state

            # End the episode on termination or truncation; stepping an
            # environment after either flag is set is not valid.
            if terminated or truncated:
                break

        reward_list.append(total_reward)

        # Print progress
        if episode % 100 == 0:
            print(f"Episode {episode}, Total Reward: {total_reward}")
finally:
    # Close the environment even if training raises or is interrupted, so
    # the render window and its resources are always released.
    env.close()


# Plotting the rewards
plt.plot(reward_list)
plt.xlabel("Episode")
plt.ylabel("Total Reward")
plt.show()
