In [2]:
import gymnasium as gym
from stable_baselines3 import DQN

# Headless training environment: no render_mode is requested here, so
# nothing is drawn while the agent learns.
train_env = gym.make("MountainCar-v0")

# All DQN settings gathered in one mapping so they can be inspected or
# tweaked in a single place before the model is constructed.
dqn_settings = dict(
    policy="MlpPolicy",
    env=train_env,
    verbose=1,  # print the rollout/train tables during learning
    # Optimisation
    learning_rate=1e-3,
    batch_size=64,
    gamma=0.99,
    # Replay buffer: gradient updates start after 1000 collected steps
    buffer_size=50_000,
    learning_starts=1000,
    train_freq=1,
    # Target network: tau=1.0 is a full (hard) copy per the SB3 DQN docs,
    # performed every 500 environment steps
    tau=1.0,
    target_update_interval=500,
    # Epsilon-greedy schedule: anneal 1.0 -> 0.01 over the first 10% of
    # the training budget
    exploration_initial_eps=1.0,
    exploration_final_eps=0.01,
    exploration_fraction=0.1,
)
model = DQN(**dqn_settings)

# Run the learning loop for the full timestep budget.
print("Training the model...")
model.learn(total_timesteps=350_000)
print("Training complete!")

# Visualize agent performance (this part renders).
# A separate environment is created with render_mode="human" so the
# rollouts are displayed; the training environment above is left untouched.
eval_env = gym.make("MountainCar-v0", render_mode="human")

print("\nRunning evaluation episodes...")
try:
    for episode in range(5):
        obs, info = eval_env.reset()
        total_reward = 0
        terminated = truncated = False

        # Step until the environment itself ends the episode instead of
        # hard-coding a step count: MountainCar-v0 truncates episodes on
        # its own (the training log reports ep_len_mean == 200), and
        # relying on that signal guarantees every episode gets reported.
        while not (terminated or truncated):
            # deterministic=True: use the greedy action, no exploration.
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = eval_env.step(action)
            total_reward += reward

        # obs[0] is the first observation component (the car's position
        # per the Gymnasium MountainCar docs); 0.5 is the flag threshold
        # this script uses to decide success.
        if obs[0] >= 0.5:
            print(f"Episode {episode+1}: Reached the flag! Total reward: {total_reward}")
        else:
            print(f"Episode {episode+1}: Didn't reach flag! Total reward: {total_reward}")
finally:
    # Always release the render window, even if a rollout raises or the
    # user interrupts the cell mid-episode.
    eval_env.close()


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Training the model...
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 200      |
|    ep_rew_mean      | -200     |
|    exploration_rate | 0.977    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 17572    |
|    time_elapsed     | 0        |
|    total_timesteps  | 800      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 200      |
|    ep_rew_mean      | -200     |
|    exploration_rate | 0.955    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 3547     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1600     |
| train/              |          |
|    learning_rate    | 0.001    |
|    loss             | 9.6e-05  |
|    n_updates        | 599      |
---------