<a href="https://colab.research.google.com/github/manikanta-eng/Reinforcement-learning/blob/main/rml_lab_12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import gym
import numpy as np
import random

class QLearningAgent:
    """Tabular Q-learning agent for a two-dimensional continuous observation.

    Each observation ``[position, velocity]`` is mapped onto a
    ``num_bins x num_bins`` grid and one Q-value per action is stored for
    every grid cell.  Actions are chosen either ε-greedily or by Boltzmann
    (softmax) sampling over the Q-values of the current cell.

    Args:
        env: Environment object.  Only ``observation_space.low`` /
            ``observation_space.high`` (first two entries), ``action_space.n``
            and ``action_space.sample()`` are used.
        alpha: Learning rate of the Q-update.
        gamma: Discount factor.
        strategy: ``"epsilon_greedy"`` or ``"boltzmann"``.
        epsilon: Probability of a random action under ε-greedy.
        tau: Softmax temperature for Boltzmann exploration; must be > 0.
        num_bins: Number of bins per observation dimension.
    """

    def __init__(self, env, alpha=0.1, gamma=0.99, strategy="epsilon_greedy",
                 epsilon=0.1, tau=1.0, num_bins=20):
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.strategy = strategy
        self.epsilon = epsilon  # ε for ε-greedy
        self.tau = tau          # temperature for Boltzmann
        self.num_bins = num_bins
        self.q_table = np.zeros((num_bins, num_bins, env.action_space.n))

        # Bin edges for discretization (num_bins - 1 interior edges each).
        low = env.observation_space.low
        high = env.observation_space.high
        self.pos_space = self._interior_edges(low[0], high[0], num_bins)
        self.vel_space = self._interior_edges(low[1], high[1], num_bins)

    @staticmethod
    def _interior_edges(low, high, num_bins):
        """Return the ``num_bins - 1`` edges splitting [low, high] into equal bins.

        The outer bounds themselves are dropped: ``np.digitize`` already maps
        everything below the first edge to bin 0 and everything at or above
        the last edge to bin ``num_bins - 1``.  Keeping the bounds as edges
        would leave the two outermost bins (almost) unreachable.
        """
        return np.linspace(low, high, num_bins + 1)[1:-1]

    def discretize(self, obs):
        """Map an observation to its ``(pos_bin, vel_bin)`` grid cell.

        Args:
            obs: Array-like whose first two entries are position and velocity.
                Nested input is flattened first.  For backward compatibility a
                lone scalar (or 1-element sequence) is accepted and treated as
                the position with velocity ``0.0``.

        Returns:
            Tuple of two ints, each in ``[0, num_bins - 1]``.

        Raises:
            ValueError: If ``obs`` is empty or cannot be converted to numbers.
        """
        try:
            values = np.asarray(obs, dtype=float).ravel()
        except (TypeError, ValueError) as exc:
            raise ValueError(
                f"Could not convert observation to a 2-element array. "
                f"Original observation type: {type(obs)}, value: {obs}. Error: {exc}"
            ) from exc
        if values.size == 0:
            raise ValueError(
                f"Could not convert observation to a 2-element array. "
                f"Original observation type: {type(obs)}, value: {obs}. "
                f"Error: observation has no elements"
            )

        pos = values[0]
        vel = values[1] if values.size > 1 else 0.0

        # Clamp defensively so the result is always a valid q_table index.
        last_bin = self.num_bins - 1
        pos_bin = int(np.clip(np.digitize(pos, self.pos_space), 0, last_bin))
        vel_bin = int(np.clip(np.digitize(vel, self.vel_space), 0, last_bin))
        return (pos_bin, vel_bin)

    def choose_action(self, state):
        """Pick an action for ``state`` according to ``self.strategy``.

        Args:
            state: ``(pos_bin, vel_bin)`` tuple as returned by ``discretize``.

        Returns:
            The chosen action index.

        Raises:
            ValueError: If ``self.strategy`` is not a supported strategy.
        """
        q_values = self.q_table[state]

        if self.strategy == "epsilon_greedy":
            if random.random() < self.epsilon:
                return self.env.action_space.sample()
            return int(np.argmax(q_values))

        if self.strategy == "boltzmann":
            scaled = q_values / self.tau
            # Subtracting the maximum leaves the softmax unchanged but keeps
            # exp() from overflowing (inf/inf) or underflowing (0/0) to NaN.
            weights = np.exp(scaled - np.max(scaled))
            probs = weights / np.sum(weights)
            return int(np.random.choice(len(q_values), p=probs))

        raise ValueError(f"Unknown exploration strategy: {self.strategy!r}")

    def learn(self, state, action, reward, next_state, done):
        """Apply one Q-learning update for a single transition.

        Args:
            state: Grid cell the action was taken in.
            action: Index of the action taken.
            reward: Reward received for the transition.
            next_state: Grid cell reached after the action.
            done: When true, the value of ``next_state`` is not bootstrapped
                into the target (the target is just ``reward``).
        """
        future_value = 0.0 if done else self.gamma * np.max(self.q_table[next_state])
        td_target = reward + future_value
        td_error = td_target - self.q_table[state][action]
        self.q_table[state][action] += self.alpha * td_error

def train_agent(strategy="epsilon_greedy", episodes=300, log_every=50):
    """Train a ``QLearningAgent`` on MountainCar-v0 and return episode returns.

    Args:
        strategy: Exploration strategy handed to ``QLearningAgent``.
        episodes: Number of training episodes to run.
        log_every: Print the running average return every this many episodes;
            ``0`` disables progress output.

    Returns:
        List with the total (undiscounted) reward of each episode.
    """
    # Disable the environment checker and enable new step API
    env = gym.make("MountainCar-v0", disable_env_checker=True, new_step_api=True)
    rewards = []

    try:
        agent = QLearningAgent(env, strategy=strategy)

        for ep in range(episodes):
            # Depending on the gym version, reset() returns either the bare
            # observation or an (observation, info) tuple.  Blindly unpacking
            # a bare 2-element observation would split it into two scalars.
            reset_result = env.reset()
            obs = reset_result[0] if isinstance(reset_result, tuple) else reset_result

            state = agent.discretize(obs)
            total_reward = 0
            done = False

            while not done:
                action = agent.choose_action(state)
                next_obs, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                next_state = agent.discretize(next_obs)
                # Only a true terminal state stops bootstrapping; a time-limit
                # truncation ends the episode but the next state still has value.
                agent.learn(state, action, reward, next_state, terminated)
                state = next_state
                total_reward += reward

            rewards.append(total_reward)
            if log_every and (ep + 1) % log_every == 0:
                print(f"{strategy} Episode {ep+1}: Avg Reward (last {log_every} eps) = {np.mean(rewards[-log_every:]):.2f}")
    finally:
        # Release the environment even if training is interrupted by an error.
        env.close()

    return rewards

if __name__ == "__main__":
    # Run one training session per exploration strategy, announcing each
    # with its banner line before training starts.
    for banner, strategy_name in (
        ("Training with ε-greedy strategy:", "epsilon_greedy"),
        ("\nTraining with Boltzmann strategy:", "boltzmann"),
    ):
        print(banner)
        train_agent(strategy_name)

Training with ε-greedy strategy:
epsilon_greedy Episode 50: Avg Reward (last 50 eps) = -200.00
epsilon_greedy Episode 100: Avg Reward (last 50 eps) = -200.00
epsilon_greedy Episode 150: Avg Reward (last 50 eps) = -200.00
epsilon_greedy Episode 200: Avg Reward (last 50 eps) = -200.00
epsilon_greedy Episode 250: Avg Reward (last 50 eps) = -200.00
epsilon_greedy Episode 300: Avg Reward (last 50 eps) = -200.00

Training with Boltzmann strategy:
boltzmann Episode 50: Avg Reward (last 50 eps) = -200.00
boltzmann Episode 100: Avg Reward (last 50 eps) = -200.00
boltzmann Episode 150: Avg Reward (last 50 eps) = -200.00
boltzmann Episode 200: Avg Reward (last 50 eps) = -200.00
boltzmann Episode 250: Avg Reward (last 50 eps) = -200.00
boltzmann Episode 300: Avg Reward (last 50 eps) = -200.00
