In [7]:
import gymnasium as gym
import numpy as np
import warnings
warnings.filterwarnings('ignore')

#### Question 1: Train a CartPole Agent

Dataset Problem: Use the OpenAI Gym's CartPole-v1 environment to train an agent using a simple reinforcement learning algorithm.  Assume hyperparameters, as per requirement and develop the model. Try to apply the concepts discussed in class 


In [15]:
env = gym.make('CartPole-v1')
np.random.seed(42)

In [9]:
n_actions = env.action_space.n       
n_states = env.observation_space.shape[0]  
theta = np.random.rand(n_actions, n_states) * 0.01  
alpha = 0.01       
gamma = 0.99

In [10]:
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

In [11]:
def choose_action(state):
    probs = softmax(theta @ state)
    action = np.random.choice(n_actions, p=probs)
    return action, probs

In [14]:
for episode in range(1000):
    state = env.reset()[0]
    done = False
    trajectory = []

    while not done:
        action, probs = choose_action(state)
        next_state, reward, done, truncated, info = env.step(action)
        trajectory.append((state, action, reward, probs))
        state = next_state

    G = 0
    returns = []
    for _, _, r, _ in reversed(trajectory):
        G = r + gamma * G
        returns.insert(0, G)

    for (s, a, r, probs), Gt in zip(trajectory, returns):
        grad_log = np.zeros_like(theta)
        grad_log[a] = s - np.sum(probs[:, None] * s, axis=0)
        theta += alpha * Gt * grad_log

    if (episode + 1) % 100 == 0:
        print(f"Episode {episode+1}, Return: {sum(r for _,_,r,_ in trajectory)}")

env.close()

Episode 100, Return: 17.0
Episode 200, Return: 47.0
Episode 300, Return: 18.0
Episode 400, Return: 11.0
Episode 500, Return: 14.0
Episode 600, Return: 28.0
Episode 700, Return: 36.0
Episode 800, Return: 16.0
Episode 900, Return: 43.0
Episode 1000, Return: 12.0


#### Question 2: Mountain Car with Q-Learning
Dataset Problem: Use OpenAI Gym's MountainCar-v0 environment to train a Q-learning agent.
Similar to the CartPole example, but with the Mountain Car environment. The Q-learning code will be similar, with adjustments to the state and action space to fit the Mountain Car environment.


In [16]:
env = gym.make("MountainCar-v0")
n_actions = env.action_space.n

In [17]:
n_bins = (18, 14)  
obs_low = env.observation_space.low
obs_high = env.observation_space.high
bins = [np.linspace(obs_low[i], obs_high[i], n_bins[i]) for i in range(len(n_bins))]

In [18]:
def discretize_state(state):
    return tuple(np.digitize(s, b) for s, b in zip(state, bins))

In [19]:
Q = np.zeros(n_bins + (n_actions,))
alpha = 0.1
gamma = 0.99
epsilon = 1.0
epsilon_decay = 0.995
epsilon_min = 0.01
episodes = 2000

In [20]:
for ep in range(episodes):
    state = discretize_state(env.reset()[0])
    done = False
    total_reward = 0

    while not done:
        if np.random.random() < epsilon:
            action = np.random.randint(n_actions)
        else:
            action = np.argmax(Q[state])

        next_state_continuous, reward, done, truncated, info = env.step(action)
        next_state = discretize_state(next_state_continuous)

        best_next_action = np.max(Q[next_state])
        Q[state][action] += alpha * (reward + gamma * best_next_action - Q[state][action])
        state = next_state
        total_reward += reward

    epsilon = max(epsilon * epsilon_decay, epsilon_min)
    if (ep + 1) % 100 == 0:
        print(f"Episode {ep+1}, Reward: {total_reward}")

env.close()

Episode 100, Reward: -346.0
Episode 200, Reward: -351.0
Episode 300, Reward: -212.0
Episode 400, Reward: -258.0
Episode 500, Reward: -372.0
Episode 600, Reward: -399.0
Episode 700, Reward: -150.0
Episode 800, Reward: -178.0
Episode 900, Reward: -165.0
Episode 1000, Reward: -225.0
Episode 1100, Reward: -149.0
Episode 1200, Reward: -197.0
Episode 1300, Reward: -131.0
Episode 1400, Reward: -167.0
Episode 1500, Reward: -158.0
Episode 1600, Reward: -156.0
Episode 1700, Reward: -163.0
Episode 1800, Reward: -151.0
Episode 1900, Reward: -177.0
Episode 2000, Reward: -159.0
