1. Import Dependencies

In [None]:
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
#import random

2.1. Load Environment

In [None]:
# Environment id shared by every gym.make call in this notebook.
environment_name = 'BipedalWalker-v3'
# Disabled alternative: the same environment created with render_mode="human".
#env = gym.make(environment_name, render_mode="human")
# Active setup: create the environment with render_mode "rgb_array".
env = gym.make(environment_name, render_mode = "rgb_array")

In [None]:
# Baseline: play a few episodes with uniformly sampled random actions and
# print the total reward collected in each one.
episodes = 5
for episode in range(1, episodes + 1):
    state, info = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, terminated, truncated, info = env.step(action)
        score += reward
        # Gymnasium signals the end of an episode through either flag:
        # `terminated` (terminal state reached) or `truncated` (e.g. time
        # limit). Stopping on both keeps one score per episode; the reset at
        # the top of the outer loop starts the next episode.
        done = terminated or truncated

    print('Episode:{} Score:{}'.format(episode, score))
env.close()

2.2. Understanding the Environment

In [None]:
# Draw one random action from the environment's action space.
# NOTE: per the Gymnasium docs, BipedalWalker-v3 actions are continuous:
# 4 motor-speed values in [-1, 1] for the hip and knee joints of both legs
# (not a discrete left/right push as in CartPole).
env.action_space.sample()

In [None]:
# Show the environment's observation space.
# NOTE: per the Gymnasium docs, a BipedalWalker-v3 observation is a
# 24-dimensional vector: hull angle and velocities, joint angles and speeds,
# leg ground-contact flags and 10 lidar range readings
# (not CartPole's 4-value cart/pole state).
env.observation_space

3. Train an RL Model

3.1. Make your directories first:

In [None]:
# Relative directory that is passed to PPO as `tensorboard_log` when the model is built.
PPO_path_TB_log = os.path.join('Training', 'Saved Models', 'PPO_Biped_model_TB_Log')

In [None]:
# Build the training setup: a single BipedalWalker instance wrapped in a
# DummyVecEnv, and a PPO agent with an MLP policy that logs to TensorBoard.
# The factory lambda creates the environment itself, so it does not depend on
# the `env` name that is rebound on the same line.
env = DummyVecEnv([lambda: gym.make(environment_name, render_mode="rgb_array")])
model = PPO(
    policy='MlpPolicy',
    env=env,
    verbose=1,
    tensorboard_log=PPO_path_TB_log,
)

In [None]:
# Stable-Baselines3 trains on PyTorch, so the devices that matter for this
# notebook are the ones PyTorch can see (not TensorFlow's).
import torch


def get_available_devices():
    """Return the names of the devices PyTorch can run on.

    Returns:
        list[str]: "cpu" first, followed by "cuda:<index>" for every CUDA
        device reported by ``torch.cuda.device_count()``.
    """
    devices = ["cpu"]
    devices.extend("cuda:{}".format(index) for index in range(torch.cuda.device_count()))
    return devices


print(get_available_devices())

In [None]:
# Train the PPO agent for 750,000 environment timesteps.
# NOTE(review): likely the long-running cell of this notebook.
model.learn(total_timesteps=750000)

4. Save and Reload Model

In [None]:
# Release the resources held by the training environment.
# NOTE(review): later cells call evaluate_policy and step `env` again; make
# sure a live environment is available to them.
env.close()

In [None]:
# Relative path used by model.save and PPO.load in the following cells.
PPO_Path_trained = os.path.join('Training', 'Saved Models', 'PPO_Biped_Model')

In [None]:
# Persist the trained PPO model to disk at PPO_Path_trained.
model.save(PPO_Path_trained)

In [None]:
# Drop the in-memory model so that the reload in the next cell starts from the saved file.
del model

In [None]:
# Reload the saved model from PPO_Path_trained.
# Disabled alternative: also attach `env` to the model at load time.
#model = PPO.load(PPO_Path_trained, env=env)
# Active variant: load without an environment; `env` is passed explicitly
# where it is needed (e.g. to evaluate_policy).
model = PPO.load(PPO_Path_trained)

5. Evaluation

In [None]:
# The environment used for training has already been closed by this point, so
# build a fresh vectorised environment before evaluating the reloaded model.
env = DummyVecEnv([lambda: gym.make(environment_name, render_mode="rgb_array")])

# Mean and standard deviation of the episode reward over 10 evaluation episodes.
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)

In [None]:
# Display the mean episode reward returned by evaluate_policy.
mean_reward

In [None]:
# Roll out the trained policy for a few episodes and print each episode's
# total reward. `env` is a vectorised environment (DummyVecEnv) here, so
# reset() returns only the observations and step() returns batched
# (obs, rewards, dones, infos) arrays with one entry per sub-environment.
episodes = 5
for episode in range(1, episodes + 1):
    obs = env.reset()
    done = False
    score = 0.0

    while not done:
        env.render()
        action, _ = model.predict(obs)  # Now using model here
        obs, rewards, dones, infos = env.step(action)
        # Only one sub-environment is wrapped, so index 0 is the episode we track.
        score += float(rewards[0])
        # `dones` already covers both termination and truncation, and the
        # VecEnv resets the finished sub-environment by itself, so no manual
        # reset (and no separate `truncated` flag) is needed here.
        done = bool(dones[0])

    print('Episode:{} Score:{}'.format(episode, score))

In [None]:
# Close the environment now that the rollouts above are finished.
env.close()

In [None]:
# Create a new single-instance vectorised environment for another round of
# rollouts. To watch the agent in a window, switch render_mode to "human".
env = DummyVecEnv(
    [lambda: gym.make(environment_name, render_mode="rgb_array")]
)

In [None]:
# Roll out the loaded policy on the current vectorised environment and print
# each episode's total reward. With a DummyVecEnv, reset() returns only the
# observations and step() returns batched (obs, rewards, dones, infos) arrays
# with one entry per sub-environment.
episodes = 5
for episode in range(1, episodes + 1):
    obs = env.reset()
    done = False
    score = 0.0

    while not done:
        env.render()
        action, _ = model.predict(obs)  # Now using model here
        obs, rewards, dones, infos = env.step(action)
        # A single sub-environment is wrapped, so index 0 is the episode we track.
        score += float(rewards[0])
        # `dones` combines termination and truncation, and the VecEnv resets
        # the finished sub-environment automatically; no manual reset or
        # separate `truncated` flag is required.
        done = bool(dones[0])

    print('Episode:{} Score:{}'.format(episode, score))

In [None]:
# Close the environment now that the final rollouts are finished.
env.close()