In [None]:
!pip3 install swig
!pip3 install "gymnasium[box2d]"
!pip3 install "stable-baselines3[extra]"
!pip3 install tensorboard

In [14]:
import gymnasium as gym

from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback

from gymnasium.wrappers import RecordVideo

In [5]:
def create_environment(env_name='CarRacing-v2', render_mode='rgb_array'):
    """Build one Gymnasium environment wrapped in a ``Monitor``.

    Args:
        env_name: Environment id forwarded to ``gym.make``.
        render_mode: Render mode forwarded to ``gym.make``.

    Returns:
        The environment returned by ``gym.make``, wrapped in ``Monitor``.
    """
    return Monitor(gym.make(env_name, render_mode=render_mode))

In [6]:
def setup_train_environment(env_name='CarRacing-v2', n_stack=4):
    """Create the vectorized, frame-stacked environment used for training.

    Args:
        env_name: Environment id forwarded to ``create_environment``.
        n_stack: Number of frames stacked by ``VecFrameStack``.

    Returns:
        A ``VecFrameStack`` around a single-environment ``DummyVecEnv``.
    """
    monitored_env = create_environment(env_name)
    # DummyVecEnv expects a list of factories; hand it the already-built env.
    vec_env = DummyVecEnv([lambda: monitored_env])
    return VecFrameStack(vec_env, n_stack=n_stack)

In [16]:
def setup_recording_env(env_name='CarRacing-v2', video_folder='./videos/', n_stack=4):
    """Create a frame-stacked environment that records every episode to video.

    Args:
        env_name: Environment id forwarded to ``create_environment``.
        video_folder: Directory handed to ``RecordVideo`` for the video files.
        n_stack: Number of frames stacked by ``VecFrameStack``.

    Returns:
        A ``VecFrameStack`` around a single-environment ``DummyVecEnv`` whose
        inner environment is wrapped in ``RecordVideo``.
    """
    def record_every_episode(episode_index):
        # Trigger recording unconditionally, whatever the episode index is.
        return True

    recorded_env = RecordVideo(
        create_environment(env_name),
        video_folder=video_folder,
        episode_trigger=record_every_episode,
    )
    vec_env = DummyVecEnv([lambda: recorded_env])
    return VecFrameStack(vec_env, n_stack=n_stack)

In [8]:
def linear_schedule_with_end(initial_value, final_value):
    """Return a schedule that blends linearly between two values.

    Args:
        initial_value: Value produced when ``progress_remaining`` is 1.
        final_value: Value produced when ``progress_remaining`` is 0.

    Returns:
        A one-argument callable mapping ``progress_remaining`` to
        ``initial_value * progress_remaining
        + final_value * (1 - progress_remaining)``.
    """
    def schedule(progress_remaining):
        progress_done = 1 - progress_remaining
        return initial_value * progress_remaining + final_value * progress_done

    return schedule

In [9]:
def first_stage_model(env):
    """Build the PPO model used for the first training stage.

    The learning rate and the clip range are both given as schedules produced
    by ``linear_schedule_with_end`` (3e-4 -> 1e-4 and 0.2 -> 0.1 respectively).

    Args:
        env: Environment the PPO model is constructed with.

    Returns:
        The newly constructed, untrained ``PPO`` model.
    """
    model = PPO('CnnPolicy',
                env=env,
                learning_rate=linear_schedule_with_end(3e-4, 1e-4),
                n_steps=2048,
                clip_range=linear_schedule_with_end(0.2, 0.1),
                verbose=1,
                batch_size=128,
                n_epochs=10,
                stats_window_size=10,
                tensorboard_log="./ppo_carracing_tensorboard_n_stack_4_v2/")
    # Hand the model back to the caller; without this the function returned
    # None and first_stage_learn failed on `model.learn`.
    return model



In [10]:
def first_stage_learn(env, total_timesteps=200000, eval_env=None):
    """Run the first training stage with periodic evaluation and checkpoints.

    Args:
        env: Training environment passed to ``first_stage_model``.
        total_timesteps: Number of timesteps passed to ``model.learn``.
        eval_env: Environment used by ``EvalCallback``. When ``None`` a
            separate one is created with ``setup_train_environment()`` (its
            defaults) and closed when training ends. Pass one explicitly if
            ``env`` was built with non-default settings.
    """
    # Evaluate on a dedicated environment: running evaluation episodes on the
    # training env would reset and step it in the middle of rollout collection.
    owns_eval_env = eval_env is None
    if owns_eval_env:
        eval_env = setup_train_environment()
    try:
        eval_callback = EvalCallback(eval_env=eval_env,
                                     best_model_save_path='./best_model_n_stack_4_v2_200000',
                                     log_path='./best_model_n_stack_4_v2_200000',
                                     deterministic=False,
                                     eval_freq=5000)
        checkpoint_callback = CheckpointCallback(save_freq=5000,
                                                 save_path='./models_n_stack_4_v2_200000/',
                                                 name_prefix='ppo_carracing')
        model = first_stage_model(env)
        model.learn(total_timesteps=total_timesteps,
                    callback=[checkpoint_callback, eval_callback])
    finally:
        # Only close the evaluation env if this function created it.
        if owns_eval_env:
            eval_env.close()

In [11]:
def second_stage_model(env, path_to_model):
    """Build the second-stage PPO model initialised from saved parameters.

    A fresh PPO model is created with the second-stage hyperparameters
    (constant learning rate 1e-5, constant clip range 0.1) and the parameters
    stored at ``path_to_model`` are then loaded into it.

    Args:
        env: Environment the new PPO model is constructed with.
        path_to_model: Path to a saved model whose parameters are loaded.

    Returns:
        The ``PPO`` model carrying the loaded parameters.
    """
    model = PPO('CnnPolicy',
                env=env,
                learning_rate=1e-5,
                n_steps=2048,
                clip_range=0.1,
                verbose=1,
                batch_size=128,
                n_epochs=10,
                stats_window_size=10,
                tensorboard_log="./ppo_carracing_tensorboard_n_stack_4_v4/")
    # set_parameters reads the saved file directly, so there is no need to
    # load the whole model and re-save it to a temporary file first.
    model.set_parameters(path_to_model)
    return model

In [12]:
def second_stage_learn(env, total_timesteps=200000, eval_env=None,
                       path_to_model='best_model_n_stack_4_v2_200000/best_model.zip'):
    """Run the second training stage, starting from a first-stage checkpoint.

    Args:
        env: Training environment passed to ``second_stage_model``.
        total_timesteps: Number of timesteps passed to ``model.learn``.
        eval_env: Environment used by ``EvalCallback``. When ``None`` a
            separate one is created with ``setup_train_environment()`` (its
            defaults) and closed when training ends. Pass one explicitly if
            ``env`` was built with non-default settings.
        path_to_model: Saved model whose parameters initialise this stage;
            defaults to the best model written by the first stage.
    """
    # Evaluate on a dedicated environment: running evaluation episodes on the
    # training env would reset and step it in the middle of rollout collection.
    owns_eval_env = eval_env is None
    if owns_eval_env:
        eval_env = setup_train_environment()
    try:
        eval_callback = EvalCallback(eval_env,
                                     best_model_save_path='./best_model_n_stack_4_v4_200000',
                                     log_path='./best_model_n_stack_4_v4_200000',
                                     deterministic=False,
                                     eval_freq=5000)
        checkpoint_callback = CheckpointCallback(save_freq=5000,
                                                 save_path='./models_n_stack_4_v4_200000/',
                                                 name_prefix='ppo_carracing')
        model = second_stage_model(env, path_to_model)
        model.learn(total_timesteps=total_timesteps,
                    callback=[checkpoint_callback, eval_callback])
    finally:
        # Only close the evaluation env if this function created it.
        if owns_eval_env:
            eval_env.close()

In [None]:
# Stage 1: train for 200,000 timesteps on a freshly created frame-stacked
# environment (default environment id and stack size).
first_stage_learn(setup_train_environment(), total_timesteps=200000)

In [None]:
# Stage 2: continue for another 200,000 timesteps on a freshly created
# environment; second_stage_learn loads the best model saved by stage 1.
second_stage_learn(setup_train_environment(), total_timesteps=200000)

In [None]:
def vizual(path_to_model):
    """Run a saved model for one episode in the video-recording environment.

    Actions are sampled with ``model.predict`` using its default arguments.
    The loop stops as soon as the first (only) sub-environment reports
    ``done``, and the environment is closed afterwards even if an error occurs.

    Args:
        path_to_model: Path of the saved PPO model to load.
    """
    test_env = setup_recording_env()
    try:
        model = PPO.load(path_to_model)
        obs = test_env.reset()
        while True:
            action, _ = model.predict(obs)
            # Frames are captured by the RecordVideo wrapper during step(), so
            # no explicit render() call is needed here.
            obs, _, done, _ = test_env.step(action)
            if done[0]:
                print("Episode terminated")
                break
    finally:
        # Always release the environment so the video recorder is finalised.
        # NOTE(review): the vectorized env appears to reset itself when an
        # episode ends, so closing may also write a short second video.
        test_env.close()

In [27]:
# Record one episode played by the best second-stage model.
vizual('best_model_n_stack_4_v4_200000/best_model.zip')

  logger.warn(


Moviepy - Building video /Users/nik-dergunov/RL_car_racing_cource_work1/RL_car_racing_cource_work1/videos/rl-video-episode-0.mp4.
Moviepy - Writing video /Users/nik-dergunov/RL_car_racing_cource_work1/RL_car_racing_cource_work1/videos/rl-video-episode-0.mp4



                                                                 

Moviepy - Done !
Moviepy - video ready /Users/nik-dergunov/RL_car_racing_cource_work1/RL_car_racing_cource_work1/videos/rl-video-episode-0.mp4
Episode terminated
Moviepy - Building video /Users/nik-dergunov/RL_car_racing_cource_work1/RL_car_racing_cource_work1/videos/rl-video-episode-1.mp4.
Moviepy - Writing video /Users/nik-dergunov/RL_car_racing_cource_work1/RL_car_racing_cource_work1/videos/rl-video-episode-1.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /Users/nik-dergunov/RL_car_racing_cource_work1/RL_car_racing_cource_work1/videos/rl-video-episode-1.mp4


In [12]:
# Serve the first-stage TensorBoard logs. The shell command keeps this cell
# busy until it is interrupted (the recorded output ends with ^C).
# NOTE(review): PPO_8 looks like a run-specific subfolder — adjust it to the
# run you want to inspect.
!tensorboard --logdir=ppo_carracing_tensorboard_n_stack_4_v2/PPO_8

TensorFlow installation not found - running with reduced feature set.
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.18.0 at http://localhost:6006/ (Press CTRL+C to quit)
^C


In [25]:
# Serve the second-stage TensorBoard logs. The shell command keeps this cell
# busy until it is interrupted (the recorded output ends with ^C).
# NOTE(review): PPO_5 looks like a run-specific subfolder — adjust it to the
# run you want to inspect.
!tensorboard --logdir=ppo_carracing_tensorboard_n_stack_4_v4/PPO_5

TensorFlow installation not found - running with reduced feature set.
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.18.0 at http://localhost:6061/ (Press CTRL+C to quit)
^C


In [15]:
# Evaluate the best first-stage model over 100 episodes with sampled
# (non-deterministic) actions. Requires the definition cells above to have
# been run in this kernel.
eval_env = setup_train_environment()
try:
    evaluate = evaluate_policy(PPO.load("best_model_n_stack_4_v2_200000/best_model.zip"),
                               eval_env,
                               n_eval_episodes=100,
                               render=False,
                               deterministic=False)
finally:
    # Release the evaluation environment instead of leaving it open.
    eval_env.close()
print(evaluate)

NameError: name 'setup_train_environment' is not defined

In [None]:
# Evaluate the best second-stage model over 100 episodes with sampled
# (non-deterministic) actions. Requires the definition cells above to have
# been run in this kernel.
eval_env = setup_train_environment()
try:
    evaluate = evaluate_policy(PPO.load("best_model_n_stack_4_v4_200000/best_model.zip"),
                               eval_env,
                               n_eval_episodes=100,
                               render=False,
                               deterministic=False)
finally:
    # Release the evaluation environment instead of leaving it open.
    eval_env.close()
print(evaluate)