In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from ppo_train import PPO
from losses import Clip_Loss, No_clip_Loss, KL_Penalty_Loss
from constants import seven_env_names

In [2]:
experiment_configs = [
    {
        "name": "No clipping or penalty",
        "loss_class": No_clip_Loss,
        "loss_kwargs": {},
    },
    {
        "name": "Clipping, ε = 0.1",
        "loss_class": Clip_Loss,
        "loss_kwargs": {"clip_coef": 0.1},
    },
    {
        "name": "Clipping, ε = 0.2",
        "loss_class": Clip_Loss,
        "loss_kwargs": {"clip_coef": 0.2},
    },
    {
        "name": "Clipping, ε = 0.3",
        "loss_class": Clip_Loss,
        "loss_kwargs": {"clip_coef": 0.3},
    },
    {
        "name": "Adaptive KL, dtarg = 0.003",
        "loss_class": KL_Penalty_Loss,
        "loss_kwargs": {"beta": 1.0, "adaptive_kl": True, "d_targ": 0.003},
    },
    {
        "name": "Adaptive KL, dtarg = 0.01",
        "loss_class": KL_Penalty_Loss,
        "loss_kwargs": {"beta": 1.0, "adaptive_kl": True, "d_targ": 0.01},
    },
    {
        "name": "Adaptive KL, dtarg = 0.03",
        "loss_class": KL_Penalty_Loss,
        "loss_kwargs": {"beta": 1.0, "adaptive_kl": True, "d_targ": 0.03},
    },
    {
        "name": "Fixed KL, β = 0.3",
        "loss_class": KL_Penalty_Loss,
        "loss_kwargs": {"beta": 0.3},
    },
    {
        "name": "Fixed KL, β = 1.0",
        "loss_class": KL_Penalty_Loss,
        "loss_kwargs": {"beta": 1.0},
    },
    {
        "name": "Fixed KL, β = 3.0",
        "loss_class": KL_Penalty_Loss,
        "loss_kwargs": {"beta": 3.0},
    },
    {
        "name": "Fixed KL, β = 10.0",
        "loss_class": KL_Penalty_Loss,
        "loss_kwargs": {"beta": 10.0},
    },
]

In [3]:
environments = seven_env_names
seeds = [1, 2, 3]
TOTAL_TIMESTEPS = 1_000_000

In [None]:
all_results = []

total_runs = len(experiment_configs) * len(environments) * len(seeds)
pbar = tqdm(total=total_runs, desc="Overall Progress")

for config in experiment_configs:
    for env_name in environments:
        for seed in seeds:
            print(f"--- Running: {config['name']} on {env_name} with seed {seed} ---")
            
            try:
                agent = PPO(
                    env_name=env_name,
                    num_envs=1,
                    seed=seed,
                    total_timesteps=TOTAL_TIMESTEPS,
                    loss_fn_class=config['loss_class'],
                    loss_kwargs=config['loss_kwargs'],
                    verbose=False
                )
                final_reward = agent.train()
                
                result = {
                    "algorithm": config['name'],
                    "environment": env_name,
                    "seed": seed,
                    "final_reward": final_reward
                }
                all_results.append(result)
                
                print(f"Finished: {config['name']} on {env_name} with seed {seed}. Final Reward: {final_reward:.2f}")

            except Exception as e:
                print(f"Failed: {config['name']} on {env_name} with seed {seed}. Error: {e} !!!")
                result = {
                    "algorithm": config['name'],
                    "environment": env_name,
                    "seed": seed,
                    "final_reward": -np.inf
                }
                all_results.append(result)

            pbar.update(1)

pbar.close()

results_df = pd.DataFrame(all_results)
results_df.to_csv("ppo_raw_results.csv", index=False)
print("\nAll experiments complete. Raw results saved to ppo_raw_results.csv")

Overall Progress:   0%|          | 0/231 [00:00<?, ?it/s]

--- Running: No clipping or penalty on Walker2d-v5 with seed 1 ---
--- Finished: No clipping or penalty on Walker2d-v5 with seed 1. Final Reward: 22.59 ---
--- Running: No clipping or penalty on Walker2d-v5 with seed 2 ---
--- Finished: No clipping or penalty on Walker2d-v5 with seed 2. Final Reward: -31.17 ---
--- Running: No clipping or penalty on Walker2d-v5 with seed 3 ---
--- Finished: No clipping or penalty on Walker2d-v5 with seed 3. Final Reward: -105.83 ---
--- Running: No clipping or penalty on HalfCheetah-v5 with seed 1 ---
--- Finished: No clipping or penalty on HalfCheetah-v5 with seed 1. Final Reward: -1068947.80 ---
--- Running: No clipping or penalty on HalfCheetah-v5 with seed 2 ---
--- Finished: No clipping or penalty on HalfCheetah-v5 with seed 2. Final Reward: -5930.08 ---
--- Running: No clipping or penalty on HalfCheetah-v5 with seed 3 ---
--- Finished: No clipping or penalty on HalfCheetah-v5 with seed 3. Final Reward: -3702422.94 ---
--- Running: No clipping or 

In [None]:
import gymnasium as gym

def get_random_policy_score(env_name, num_episodes=100):
    env = gym.make(env_name)
    total_rewards = []
    
    for i in range(num_episodes):
        obs, _ = env.reset(seed=i)
        done = False
        episode_reward = 0
        while not done:
            action = env.action_space.sample()
            obs, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            episode_reward += reward
        total_rewards.append(episode_reward)
        
    env.close()
    return np.mean(total_rewards)

random_scores = {}
for env_name in tqdm(environments, desc="Calculating Random Scores"):
    score = get_random_policy_score(env_name)
    random_scores[env_name] = score
    print(f"Random score for {env_name}: {score:.2f}")

random_scores_df = pd.DataFrame(list(random_scores.items()), columns=['environment', 'random_score'])

Calculating Random Scores:   0%|          | 0/7 [00:00<?, ?it/s]

Random score for Walker2d-v5: 1.30
Random score for HalfCheetah-v5: -277.97
Random score for Hopper-v5: 19.20
Random score for InvertedDoublePendulum-v5: 47.46
Random score for InvertedPendulum-v5: 4.51
Random score for Reacher-v5: -42.82
Random score for Swimmer-v5: 0.02


In [None]:
best_scores_df = results_df.groupby('environment')['final_reward'].max().reset_index()
worst_scores_df = results_df.groupby('environment')['final_reward'].min().reset_index()
best_scores_df.rename(columns={'final_reward': 'best_score'}, inplace=True)
worst_scores_df.rename(columns={'final_reward': 'worst_score'}, inplace=True)

env_scores = pd.merge(best_scores_df, random_scores_df, on='environment')
env_scores = pd.merge(env_scores, worst_scores_df, on='environment')

normalized_df = pd.merge(results_df, env_scores, on='environment')

normalized_df['normalized_score'] = np.where(
    (normalized_df['best_score'] - normalized_df['random_score']) == 0,
    0,
    (normalized_df['final_reward'] - normalized_df['random_score']) / (normalized_df['best_score'] - normalized_df['random_score'])
)

In [None]:
final_table = normalized_df.groupby('algorithm')['normalized_score'].median().reset_index()
final_table.rename(columns={'normalized_score': 'avg. normalized score'}, inplace=True)

algorithm_order = [config['name'] for config in experiment_configs]
final_table['algorithm'] = pd.Categorical(final_table['algorithm'], categories=algorithm_order, ordered=True)
final_table = final_table.sort_values('algorithm')


print("Table 1: Results from continuous control benchmark.")
print("Average normalized scores (over 21 runs of the algorithm, on 7 environments) for each algorithm / hyperparameter setting.")
display(final_table.round(2))

Table 1: Results from continuous control benchmark.
Average normalized scores (over 21 runs of the algorithm, on 7 environments) for each algorithm / hyperparameter setting.


Unnamed: 0,algorithm,avg. normalized score
10,No clipping or penalty,-0.02
3,"Clipping, ε = 0.1",0.69
4,"Clipping, ε = 0.2",0.79
5,"Clipping, ε = 0.3",0.64
0,"Adaptive KL, dtarg = 0.003",0.84
1,"Adaptive KL, dtarg = 0.01",0.8
2,"Adaptive KL, dtarg = 0.03",0.65
6,"Fixed KL, β = 0.3",0.09
7,"Fixed KL, β = 1.0",0.38
9,"Fixed KL, β = 3.0",0.5


In [None]:
cliping_results = pd.DataFrame()
cliping_results = normalized_df[normalized_df['algorithm'] == 'Clipping, ε = 0.2'].groupby('environment')['final_reward'].max().reset_index()
print(cliping_results.to_string())


                 environment  final_reward
0             HalfCheetah-v5   1639.476342
1                  Hopper-v5   1562.328899
2  InvertedDoublePendulum-v5   5651.123639
3        InvertedPendulum-v5    975.100000
4                 Reacher-v5     -5.696826
5                 Swimmer-v5     84.402427
6                Walker2d-v5    982.731343


In [2]:
all_results_v2 = []

experiment_configs = [
    {
        "name": "Clipping, ε = 0.2",
        "loss_class": Clip_Loss,
        "loss_kwargs": {"clip_coef": 0.2},
    }
]

environments = seven_env_names
seeds = [1]
TOTAL_TIMESTEPS = 10_000_000

original_results = {'Walker2d-v5': 3000,
                   'HalfCheetah-v5': 2000,
                   'Hopper-v5': 2300,
                   'InvertedDoublePendulum-v5': 8000,
                   'InvertedPendulum-v5': 1000,
                   'Reacher-v5': -10,
                   'Swimmer-v5': 110}

total_runs = len(experiment_configs) * len(environments) * len(seeds)
pbar = tqdm(total=total_runs, desc="Overall Progress")

for config in experiment_configs:
    for env_name in environments:
        for seed in seeds:
            print(f"--- Running: {config['name']} on {env_name} with seed {seed} ---")
            
            try:
                agent = PPO(
                    env_name=env_name,
                    num_envs=10,
                    seed=seed,
                    total_timesteps=TOTAL_TIMESTEPS,
                    loss_fn_class=config['loss_class'],
                    loss_kwargs=config['loss_kwargs'],
                    verbose=False
                )
                final_reward, _, steps_taken = agent.train(target_score=original_results[env_name])
                
                result = {
                    "algorithm": config['name'],
                    "environment": env_name,
                    "seed": seed,
                    "final_reward": final_reward,
                    "steps_taken": steps_taken
                }
                all_results_v2.append(result)
                
                print(f"Finished: {config['name']} on {env_name} with seed {seed}. Final Reward: {final_reward:.2f} steps: {steps_taken}")

            except Exception as e:
                print(f"Failed: {config['name']} on {env_name} with seed {seed}. Error: {e}")
                result = {
                    "algorithm": config['name'],
                    "environment": env_name,
                    "seed": seed,
                    "final_reward": -np.inf,
                    "steps_taken": -np.inf
                }
                all_results_v2.append(result)

            pbar.update(1)

pbar.close()

results_df = pd.DataFrame(all_results_v2)
results_df.to_csv("ppo_raw_results_v2.csv", index=False)
print("\nAll experiments complete. Raw results saved to ppo_raw_results_v2.csv")

Overall Progress:   0%|          | 0/7 [00:00<?, ?it/s]

--- Running: Clipping, ε = 0.2 on Walker2d-v5 with seed 1 ---
Target score reached: 3003.423880224996
Finished: Clipping, ε = 0.2 on Walker2d-v5 with seed 1. Final Reward: 3003.42 steps: 4239360
--- Running: Clipping, ε = 0.2 on HalfCheetah-v5 with seed 1 ---
Target score reached: 2036.5806756811214
Finished: Clipping, ε = 0.2 on HalfCheetah-v5 with seed 1. Final Reward: 2036.58 steps: 4136960
--- Running: Clipping, ε = 0.2 on Hopper-v5 with seed 1 ---
Target score reached: 2386.4269804049277
Finished: Clipping, ε = 0.2 on Hopper-v5 with seed 1. Final Reward: 2386.43 steps: 1331200
--- Running: Clipping, ε = 0.2 on InvertedDoublePendulum-v5 with seed 1 ---
Target score reached: 8792.76895001977
Finished: Clipping, ε = 0.2 on InvertedDoublePendulum-v5 with seed 1. Final Reward: 8792.77 steps: 368640
--- Running: Clipping, ε = 0.2 on InvertedPendulum-v5 with seed 1 ---
Failed: Clipping, ε = 0.2 on InvertedPendulum-v5 with seed 1. Error: not enough values to unpack (expected 3, got 2)
---

In [5]:
print(results_df)

           algorithm                environment  seed  final_reward  \
0  Clipping, ε = 0.2                Walker2d-v5     1   3024.946599   
1  Clipping, ε = 0.2             HalfCheetah-v5     1          -inf   
2  Clipping, ε = 0.2                  Hopper-v5     1   2305.800322   
3  Clipping, ε = 0.2  InvertedDoublePendulum-v5     1          -inf   
4  Clipping, ε = 0.2        InvertedPendulum-v5     1          -inf   
5  Clipping, ε = 0.2                 Reacher-v5     1     -9.698386   
6  Clipping, ε = 0.2                 Swimmer-v5     1    110.079151   

   steps_taken  
0    7297024.0  
1         -inf  
2    2060288.0  
3         -inf  
4         -inf  
5     122880.0  
6    3293184.0  


In [None]:
all_results_v2 = []

experiment_configs = [
    {
        "name": "Clipping, ε = 0.2",
        "loss_class": Clip_Loss,
        "loss_kwargs": {"clip_coef": 0.2},
    }
]

environments = seven_env_names
seeds = [1]
TOTAL_TIMESTEPS = 5_000_000

original_results = {'Walker2d-v5': 3000,
                   'HalfCheetah-v5': 2000,
                   'Hopper-v5': 2300,
                   'InvertedDoublePendulum-v5': 8000,
                   'InvertedPendulum-v5': 1000,
                   'Reacher-v5': -10,
                   'Swimmer-v5': 110}

total_runs = len(experiment_configs) * len(environments) * len(seeds)
pbar = tqdm(total=total_runs, desc="Overall Progress")

for config in experiment_configs:
    for env_name in environments:
        for seed in seeds:
            print(f"--- Running: {config['name']} on {env_name} with seed {seed} ---")
            
            try:
                agent = PPO(
                    env_name=env_name,
                    num_envs=12,
                    seed=seed,
                    total_timesteps=TOTAL_TIMESTEPS,
                    loss_fn_class=config['loss_class'],
                    loss_kwargs=config['loss_kwargs'],
                    verbose=False
                )
                final_reward, _, steps_taken, one_million_reward = agent.train(original_results[env_name])
                
                result = {
                    "algorithm": config['name'],
                    "environment": env_name,
                    "seed": seed,
                    "final_reward": final_reward,
                    "steps_taken": steps_taken,
                    "one_million_reward": one_million_reward
                }
                all_results_v2.append(result)
                
                print(f"Finished: {config['name']} on {env_name} with seed {seed}. Final Reward: {final_reward:.2f}, one million reward: {one_million_reward:.2f}, steps: {steps_taken}")

            except Exception as e:
                print(f"Failed: {config['name']} on {env_name} with seed {seed}. Error: {e}")
                result = {
                    "algorithm": config['name'],
                    "environment": env_name,
                    "seed": seed,
                    "final_reward": -np.inf,
                    "steps_taken": -np.inf,
                    "one_million_reward": -np.inf
                }
                all_results_v2.append(result)

            pbar.update(1)

pbar.close()

results_df_1M = pd.DataFrame(all_results_v2)
results_df_1M.to_csv("ppo_raw_results_v2_1M.csv", index=False)
print("\nAll experiments complete. Raw results saved to ppo_raw_results_v2_1M.csv")

Overall Progress:   0%|          | 0/7 [00:00<?, ?it/s]

--- Running: Clipping, ε = 0.2 on Walker2d-v5 with seed 1 ---
