In [42]:
import gym 
import numpy as np 
from stable_baselines3 import A2C, DQN, PPO 

# RL algorithm classes that take part in the benchmark
algorithms = [
    A2C,
    DQN,
    PPO,
]

# Gym environment ids each algorithm is trained and evaluated on
env_names = [
    'MountainCar-v0',
    'CartPole-v1',
    'Acrobot-v1',
]

# Comma-separated numeric data read from disk; it is handed to
# evaluate_algorithm_on_env for every (algorithm, environment) pair
data = np.loadtxt(fname='DATA.txt', delimiter=',')

# Define a function to evaluate the performance of an RL algorithm on a specific environment
def evaluate_algorithm_on_env(algorithm, env_name, data):
    """Train ``algorithm`` on ``env_name`` and return its mean episode reward.

    A new environment is created with ``gym.make``, a model using the
    ``'MlpPolicy'`` policy is trained for 10000 timesteps, and the trained
    model is then rolled out for ``data.shape[0]`` episodes. The environment
    is always closed before returning, even if training or rollout raises.

    Args:
        algorithm: Class invoked as ``algorithm('MlpPolicy', env)``; the
            resulting model must provide ``learn`` and ``predict``. Its
            ``__name__`` is used to build the log name.
        env_name: Environment id passed to ``gym.make``.
        data: Object with a ``shape`` attribute. Only ``data.shape[0]`` is
            read; it sets the number of evaluation episodes.

    Returns:
        ``np.mean`` of the total reward collected in each evaluation episode.
    """
    env = gym.make(env_name)
    try:
        model = algorithm('MlpPolicy', env)
        model.learn(
            total_timesteps=10000,
            tb_log_name=f"{algorithm.__name__}_{env_name}",
        )

        episode_rewards = []
        for _ in range(data.shape[0]):
            # NOTE: this loop relies on reset() returning the observation
            # directly and step() returning a 4-tuple.
            obs = env.reset()
            done = False
            episode_reward = 0
            while not done:
                action, _state = model.predict(obs)
                obs, reward, done, _info = env.step(action)
                episode_reward += reward
            episode_rewards.append(episode_reward)
        return np.mean(episode_rewards)
    finally:
        # Release the environment's resources on every exit path; the
        # original never closed it.
        env.close()

# Evaluate the performance of each RL algorithm on each environment and print the results
# Walk every (algorithm, environment) pair, algorithm-major, and report the
# mean evaluation reward as soon as each pair has been trained and evaluated.
for algorithm, env_name in (
    (algo, name) for algo in algorithms for name in env_names
):
    mean_reward = evaluate_algorithm_on_env(algorithm, env_name, data)
    print(f"{algorithm.__name__} on {env_name}: {mean_reward}")


A2C on MountainCar-v0: -200.0
A2C on CartPole-v1: 153.54722638680659
A2C on Acrobot-v1: -500.0
DQN on MountainCar-v0: -200.0
DQN on CartPole-v1: 9.615192403798101
DQN on Acrobot-v1: -339.47726136931533
PPO on MountainCar-v0: -200.0
PPO on CartPole-v1: 135.23338330834582
PPO on Acrobot-v1: -441.0849575212394
