<a href="https://colab.research.google.com/github/lscblack/Deep-Q-Learning_Reforcement_Learning/blob/Tamanda_10/TamandaKaunda_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1) Setup & Requirements

Install:
pip install stable-baselines3 gymnasium[atari] ale-py autorom opencv-python torch torchvision tensorboard pandas matplotlib

AutoROM (optional) can download ROMs but requires internet.


In [None]:
!pip install 'stable-baselines3[extra]'
!pip install gymnasium[atari] ale-py
!pip install autorom

In [None]:
!AutoROM

In [78]:
# Imports
import os, time, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import gymnasium as gym
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack, VecTransposeImage
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback, CallbackList
from stable_baselines3.common.evaluation import evaluate_policy

from stable_baselines3.common.atari_wrappers import AtariWrapper


# RESULTS_DIR = '/mnt/data/dqn_atari_results'
# os.makedirs(RESULTS_DIR, exist_ok=True)
RESULTS_DIR = os.path.expanduser('~/dqn_atari_results')

def save_json(obj, path):
    with open(path, 'w') as f:
        json.dump(obj, f, indent=2)


In [79]:
ENV_ID = "ALE/IceHockey-v5"
NUM_ENV_FRAMES = 4


## 2) Hyperparameter experiments (10 combos)
The experiment table (10 rows) below includes lr, gamma, batch_size, eps_start, eps_end, eps_decay, policy.


In [80]:
import os
import pandas as pd

# Hyperparameter sets
hp_table = [
    {"lr":2.5e-4, "gamma":0.995, "batch_size":32, "eps_start":1.0, "eps_end":0.01, "eps_decay":2e5, "policy":"MlpPolicy"},
    {"lr":3e-4, "gamma":0.99, "batch_size":32, "eps_start":1.0, "eps_end":0.02, "eps_decay":7e5, "policy":"CnnPolicy"},
    {"lr":1e-3, "gamma":0.95, "batch_size":128, "eps_start":1.0, "eps_end":0.01, "eps_decay":2e5, "policy":"CnnPolicy"},
    {"lr":5e-5, "gamma":0.998, "batch_size":32, "eps_start":1.0, "eps_end":0.001, "eps_decay":1e6, "policy":"CnnPolicy"},
    {"lr":2e-4, "gamma":0.0, "batch_size":64, "eps_start":1.0, "eps_end":0.05, "eps_decay":1e5, "policy":"CnnPolicy"},
    {"lr":2e-4, "gamma":0.995, "batch_size":16, "eps_start":1.0, "eps_end":0.01, "eps_decay":2e5, "policy":"CnnPolicy"},
    {"lr":2e-4, "gamma":0.995, "batch_size":128, "eps_start":1.0, "eps_end":0.01, "eps_decay":2e5, "policy":"CnnPolicy"},
    {"lr":2.5e-4, "gamma":0.995, "batch_size":32, "eps_start":1.0, "eps_end":0.01, "eps_decay":5e4, "policy":"CnnPolicy"},
    {"lr":2.5e-4, "gamma":0.995, "batch_size":32, "eps_start":1.0, "eps_end":0.01, "eps_decay":3e6, "policy":"CnnPolicy"},
    {"lr":1e-4, "gamma":0.99, "batch_size":64, "eps_start":0.5, "eps_end":0.01, "eps_decay":2e5, "policy":"CnnPolicy"},
]
hp_df = pd.DataFrame(hp_table)

# FIX: Ensure directory exists
RESULTS_DIR = "dqn_atari_results"
os.makedirs(RESULTS_DIR, exist_ok=True)

# Save table
hp_df.to_csv(os.path.join(RESULTS_DIR, "hyperparameter_table.csv"), index=False)

hp_df


Unnamed: 0,lr,gamma,batch_size,eps_start,eps_end,eps_decay,policy
0,0.00025,0.995,32,1.0,0.01,200000.0,MlpPolicy
1,0.0003,0.99,32,1.0,0.02,700000.0,CnnPolicy
2,0.001,0.95,128,1.0,0.01,200000.0,CnnPolicy
3,5e-05,0.998,32,1.0,0.001,1000000.0,CnnPolicy
4,0.0002,0.0,64,1.0,0.05,100000.0,CnnPolicy
5,0.0002,0.995,16,1.0,0.01,200000.0,CnnPolicy
6,0.0002,0.995,128,1.0,0.01,200000.0,CnnPolicy
7,0.00025,0.995,32,1.0,0.01,50000.0,CnnPolicy
8,0.00025,0.995,32,1.0,0.01,3000000.0,CnnPolicy
9,0.0001,0.99,64,0.5,0.01,200000.0,CnnPolicy


  return datetime.utcnow().replace(tzinfo=utc)


## 3) Callbacks
We'll create checkpoint + eval callbacks to save intermediate and best models.


In [81]:
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback, CallbackList
def make_callbacks(eval_env, exp_name):
    checkpoint_dir = os.path.join(RESULTS_DIR, exp_name)
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_cb = CheckpointCallback(save_freq=100000, save_path=checkpoint_dir, name_prefix='dqn_ckpt')
    eval_cb = EvalCallback(eval_env, best_model_save_path=checkpoint_dir, log_path=checkpoint_dir,
                           eval_freq=50000, deterministic=True, render=False)
    return CallbackList([checkpoint_cb, eval_cb])


In [None]:
TIMESTEPS = int(2e5)  # Total timesteps for each experiment
results_summary = []

# Assuming:
# 1. ENV_ID = "ALE/IceHockey-v5"
# 2. hp_df contains your 10 unique experiments
# 3. AtariWrapper and VecFrameStack are imported

for i, row in hp_df.iterrows():
    # 1. Define Policy and Experiment Name (Fixes the NameError)
    policy = row['policy']
    exp_name = f"exp_{i+1}_{policy}_lr{row['lr']}_g{row['gamma']}_b{int(row['batch_size'])}"
    print('===', exp_name, '===')

    # 2. Define the environment creator using the reliable SB3 AtariWrapper
    def make_env():
        # Make the base environment
        # Note: frameskip=1 is recommended when using the wrapper that handles frame skipping internally
        env = gym.make(ENV_ID, frameskip=1)

        # Use the SB3 AtariWrapper for preprocessing (grayscale, resize, life-loss handling)
        env = AtariWrapper(env, terminal_on_life_loss=True)
        return env

    # 3. Create Vectorized Environments and Apply VecFrameStack (Crucial for CNN input)
    train_env = DummyVecEnv([make_env])
    train_env = VecFrameStack(train_env, n_stack=4) # Stacks 4 frames

    eval_env = DummyVecEnv([make_env])
    eval_env = VecFrameStack(eval_env, n_stack=4) # Stacks 4 frames for evaluation

    # 4. Instantiate the DQN model
    model = DQN(
        policy=policy,
        env=train_env,
        learning_rate=row['lr'],
        gamma=row['gamma'],
        batch_size=int(row['batch_size']),
        exploration_initial_eps=row['eps_start'],
        exploration_final_eps=row['eps_end'],
        exploration_fraction=min(1.0, row['eps_decay']/TIMESTEPS),
        verbose=1,
        tensorboard_log=os.path.join(RESULTS_DIR, exp_name+'_tb')
    )

    callbacks = make_callbacks(eval_env, exp_name) if 'make_callbacks' in globals() else None

    # 5. Train and Save
    start = time.time()
    model.learn(total_timesteps=TIMESTEPS, callback=callbacks)
    elapsed = time.time() - start

    model_path = os.path.join(RESULTS_DIR, f'dqn_model_{exp_name}.zip')
    model.save(model_path)

    # 6. Evaluate and Record Results
    mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=5)
    results_summary.append({
        'exp': exp_name,
        'mean_reward': float(mean_reward),
        'std_reward': float(std_reward),
        'timesteps': TIMESTEPS,
        'elapsed_s': elapsed,
        'model_path': model_path
    })

    save_json(results_summary, os.path.join(RESULTS_DIR, 'results_summary.json'))

    print(f'Saved {model_path}, Mean Reward: {mean_reward:.2f}')

=== exp_1_MlpPolicy_lr0.00025_g0.995_b32 ===
Using cpu device
Wrapping the env in a VecTransposeImage.




Logging to dqn_atari_results/exp_1_MlpPolicy_lr0.00025_g0.995_b32_tb/DQN_1


  return datetime.utcnow().replace(tzinfo=utc)


----------------------------------
| rollout/            |          |
|    exploration_rate | 0.935    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 77       |
|    time_elapsed     | 167      |
|    total_timesteps  | 13033    |
| train/              |          |
|    learning_rate    | 0.00025  |
|    loss             | 0.000164 |
|    n_updates        | 3233     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.87     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 76       |
|    time_elapsed     | 342      |
|    total_timesteps  | 26363    |
| train/              |          |
|    learning_rate    | 0.00025  |
|    loss             | 0.000285 |
|    n_updates        | 6565     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat



Eval num_timesteps=50000, episode_reward=-11.80 +/- 1.17
Episode length: 3594.80 +/- 119.43
----------------------------------
| eval/               |          |
|    mean_ep_length   | 3.59e+03 |
|    mean_reward      | -11.8    |
| rollout/            |          |
|    exploration_rate | 0.753    |
| time/               |          |
|    total_timesteps  | 50000    |
| train/              |          |
|    learning_rate    | 0.00025  |
|    loss             | 0.016    |
|    n_updates        | 12474    |
----------------------------------
New best mean reward!
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.737    |
| time/               |          |
|    episodes         | 16       |
|    fps              | 70       |
|    time_elapsed     | 749      |
|    total_timesteps  | 53121    |
| train/              |          |
|    learning_rate    | 0.00025  |
|    loss             | 6.3e-05  |
|    n_updates        | 13255    |
-----------



Logging to dqn_atari_results/exp_2_CnnPolicy_lr0.0003_g0.99_b32_tb/DQN_1


