# Cart Pole control problem, we want to keep the pole straight  

https://www.gymlibrary.dev/environments/classic_control/cart_pole/

In [None]:
!pip install stable-baselines3[extra]

In [None]:
# Show which Stable-Baselines3 release the kernel actually picked up.
import stable_baselines3
print(stable_baselines3.__version__)

In [None]:
# Report whether PyTorch can see a CUDA device, and which one is current.
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
# Falls back to the literal 'CPU' when no CUDA device is available.
print(f"Current device: {torch.cuda.current_device() if torch.cuda.is_available() else 'CPU'}")
if torch.cuda.is_available():
    print(f"Device name: {torch.cuda.get_device_name()}")

In [None]:
import gymnasium as gym
from stable_baselines3 import DQN, PPO   #algorithms
from stable_baselines3.common.evaluation import evaluate_policy #performance evaluation
from stable_baselines3.common.monitor import Monitor    #records episode statistics
from stable_baselines3.common.vec_env import DummyVecEnv    #environment

In [None]:
# Show the installed Gymnasium version (imported above as `gym`).
print(gym.__version__)

## Environment

In [None]:
# Baseline: play a few episodes with uniformly random actions and print the
# total reward collected in each one.
env = gym.make("CartPole-v1", render_mode="human")
episodes = 5
try:
    for episode in range(1, episodes + 1):
        # Gymnasium's reset() returns (observation, info); the observation is
        # what an agent would be given.
        state, info = env.reset()
        done = False
        score = 0

        while not done:
            # No explicit env.render() here: with render_mode="human" the
            # environment already draws itself on every reset()/step().
            action = env.action_space.sample()  #generate random action from action space
                                                #which is Discrete(2) for CartPole-v1
                                                # -> 0 or 1
            n_state, reward, terminated, truncated, info = env.step(action)   #next step
            score += reward
            done = terminated or truncated  #episode ends on failure or on the time limit

        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Close the render window even if the loop is interrupted.
    env.close()

What each value in the action and observation spaces means:
https://www.gymlibrary.dev/environments/classic_control/cart_pole/

In [None]:
# Display the set of actions the agent may choose from.
env.action_space

In [None]:
# Display the bounds and dtype of the observations returned by the environment.
env.observation_space #4 values, float32

## Train the model

In [None]:
import os

# Relative directory for training logs; it is handed to the models as
# `tensorboard_log` further down. Displayed as the cell output.
log_path = os.path.join("Training", "Logs")
log_path

In [None]:
# Build the environment inside the factory instead of closing over the name
# `env`: the original `lambda: env` referred to a variable that is rebound to
# the DummyVecEnv on the same line, so it only worked because DummyVecEnv
# happens to call the factory before the assignment completes.
env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
model = PPO('MlpPolicy', env, verbose=1) #PPO alg, MlpPolicy is the neural network, env, verbose (print)

In [None]:
# Train for 20k environment steps; progress is printed because verbose=1.
model.learn(total_timesteps=20_000)

The losses are not too large, and the explained variance is about 58%.

## Save and reload model

In [None]:
# Where the trained PPO model is written by the save cell below.
PPO_path = os.path.join(
    "Training",
    "Saved Models",
    "PPO_model",
)

In [None]:
# Persist the trained model to disk at PPO_path.
model.save(PPO_path)

In [None]:
#delete model and load it again
#del model
#model = PPO.load(PPO_path, env=env)

## Evaluate

In [None]:
# Wrap the evaluation env in Monitor: evaluate_policy reads episode returns and
# lengths from it and warns when the wrapper is missing.
env = Monitor(gym.make("CartPole-v1", render_mode="human"))
# render=True shows the agent in action (slower)
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10, render=True)
# Cell output: (mean episode reward, standard deviation)
mean_reward, std_reward

In [None]:
# Release the evaluation environment and its render window.
env.close()

^ The output is the mean episode reward (500 is the maximum) and its standard deviation (0 is best) -> perfect!

## Test model

In [None]:
# Watch the trained agent play a few episodes and print each episode's score.
env = gym.make("CartPole-v1", render_mode="human")
episodes = 5
try:
    for episode in range(1, episodes + 1):
        obs, info = env.reset()
        done = False
        score = 0

        while not done:
            # No explicit env.render(): render_mode="human" already draws on
            # every reset()/step().
            # deterministic=True picks the policy's most likely action rather
            # than sampling, which is what we want when testing the agent.
            action, _ = model.predict(obs, deterministic=True) #now using model
            obs, reward, terminated, truncated, info = env.step(action)   #next step
            score += reward
            done = terminated or truncated  #episode ends on failure or on the time limit

        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Close the render window even if the loop is interrupted.
    env.close()

In [None]:
# Fresh environment for poking at a single prediction by hand.
env = gym.make("CartPole-v1", render_mode="human")
obs, info = env.reset()

# What the trained agent proposes for the initial observation ...
prediction = model.predict(obs)
print(prediction)

# ... compared with a random draw from the action space (shown as cell output).
env.action_space.sample()

In [None]:
# Ask the agent for an action on the current observation instead of relying on
# whatever `action` was left over from an earlier cell's loop.
action, _ = model.predict(obs, deterministic=True)
# Gymnasium's step() returns (observation, reward, terminated, truncated, info);
# keeping `obs` up to date lets this cell be re-run to advance step by step.
obs, reward, terminated, truncated, info = env.step(action)
#reward is 1 for every successful step
obs, reward, terminated, truncated, info

In [None]:
# Release the manually stepped environment and its render window.
env.close()

### View logs in tensorboard

In [None]:
#training_log_path = os.path.join(log_path, 'PPO_3')
#!tensorboard --logdir={training_log_path}

## Adding a callback to the training stage


In [None]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
import os

In [None]:
log_path = os.path.join("Training", "Logs")            # passed to the models as tensorboard_log
save_path = os.path.join("Training", "Saved Models")   # passed to EvalCallback as best_model_save_path

In [None]:
# Training environment. Deliberately created WITHOUT render_mode="human":
# a human-rendered env draws a frame on every step, which throttles training
# to the display frame rate. Rendering belongs in a dedicated evaluation env.
# The env is built inside the factory so the lambda does not close over the
# name `env`, which is rebound on this very line.
env = DummyVecEnv([lambda: gym.make("CartPole-v1")])

In [None]:
#setup callbacks
# Stop training as soon as an evaluation reports a mean reward of at least 200.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)

# Evaluate on a separate environment: running evaluation episodes on the
# training env would reset it in the middle of the rollout being collected.
# Monitor records the episode statistics that the evaluation reads.
eval_env = Monitor(gym.make("CartPole-v1"))
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=stop_callback, #runs on each new best model; stops training once the threshold is reached
                             eval_freq=10000,   #evaluate every 10000 callback steps
                             best_model_save_path=save_path,    #best model so far is saved here
                             verbose=1)

In [None]:
# Fresh PPO agent that also writes TensorBoard logs under log_path.
model = PPO(
    policy="MlpPolicy",
    env=env,
    verbose=1,
    tensorboard_log=log_path,
)

In [None]:
# Train for up to 20k steps; the evaluation callback may end training earlier.
model.learn(
    total_timesteps=20_000,
    callback=eval_callback,
)

^ Stops after the mean reward goes above 200

## Changing policies

In [None]:
# Separate 4x128 networks for the policy (pi) and the value function (vf).
# Passed as a plain dict: the list-wrapped form `[dict(pi=..., vf=...)]` has
# been deprecated since SB3 1.8 and only triggers a warning before being unwrapped.
net_arch = dict(pi=[128, 128, 128, 128], vf=[128, 128, 128, 128])

In [None]:
# PPO agent with the custom network architecture defined above.
model = PPO(
    policy="MlpPolicy",
    env=env,
    verbose=1,
    tensorboard_log=log_path,
    policy_kwargs=dict(net_arch=net_arch),
)

In [None]:
# Use a fresh EvalCallback for this model. A callback object keeps its step
# counter and best-mean-reward from the previous run, so reusing it would
# evaluate at the wrong steps and only stop/save if this model beats the
# previous model's best score.
eval_callback = EvalCallback(
    Monitor(gym.make("CartPole-v1")),  #separate evaluation env, not the training env
    callback_on_new_best=StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1),
    eval_freq=10000,
    #own sub-folder so this run does not overwrite another model's best checkpoint
    best_model_save_path=os.path.join(save_path, 'PPO_custom_arch'),
    verbose=1,
)
model.learn(total_timesteps=20000, callback=eval_callback)

## Using another algorithm

In [None]:
# Same task, different algorithm: a DQN agent with TensorBoard logging.
model = DQN(
    policy="MlpPolicy",
    env=env,
    verbose=1,
    tensorboard_log=log_path,
)

In [None]:
# Use a fresh EvalCallback for the DQN run. A callback object keeps its step
# counter and best-mean-reward from earlier runs, so reusing the PPO callback
# would evaluate at the wrong steps and compare DQN against PPO's best score.
eval_callback = EvalCallback(
    Monitor(gym.make("CartPole-v1")),  #separate evaluation env, not the training env
    callback_on_new_best=StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1),
    eval_freq=10000,
    #own sub-folder so the DQN checkpoint does not overwrite a PPO one
    best_model_save_path=os.path.join(save_path, 'DQN'),
    verbose=1,
)
model.learn(total_timesteps=20000, callback=eval_callback)

In [None]:
# Where the trained DQN model is written by the save cell below.
dqn_path = os.path.join(
    "Training",
    "Saved Models",
    "DQN_model",
)

In [None]:
# Persist the trained DQN model to disk at dqn_path.
model.save(dqn_path)

In [None]:
# Reload the saved DQN model from dqn_path and attach the current environment to it.
model = DQN.load(dqn_path, env=env)

In [None]:
# Evaluate on a dedicated, Monitor-wrapped environment with on-screen
# rendering, so evaluation neither depends on nor disturbs the training env.
render_env = Monitor(gym.make("CartPole-v1", render_mode="human"))
try:
    dqn_mean_reward, dqn_std_reward = evaluate_policy(model, render_env, n_eval_episodes=10, render=True)
finally:
    # Close the render window even if evaluation is interrupted.
    render_env.close()
# Cell output: (mean episode reward, standard deviation)
dqn_mean_reward, dqn_std_reward

In [None]:
# Release the environment used in this section.
env.close()