In [3]:
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import reinforce_algorithm_utils as rl_monte_carlo
import gym
import torch
from torch.optim import Adam

import imageio

Using device: cpu.


In [4]:
# Select the compute device: the first CUDA GPU when PyTorch reports one, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"Using device: {device}.")

Using device: cpu.


## CartPole Environment

In [3]:
# Gym id used for both the training and the evaluation environment.
env_id = "CartPole-v1"

# Two separately created instances of the same task:
# one is handed to the training loop, the other to the evaluation routine.
env_cartpole = gym.make(env_id)
eval_env_cartpole = gym.make(env_id)

# Problem dimensions: first entry of the observation shape and `.n` of the action space.
s_size, a_size = env_cartpole.observation_space.shape[0], env_cartpole.action_space.n

In [4]:
# Report the observation dimensionality together with one randomly sampled observation.
print("_____OBSERVATION SPACE_____ \n")
print(f"The State Space is:  {s_size!s}")
print(f"Sample observation {env_cartpole.observation_space.sample()!s}")  # random draw from the space

_____OBSERVATION SPACE_____ 

The State Space is:  4
Sample observation [ 3.2461052e+00  1.8677396e+38 -3.1095046e-01  1.7249017e+38]


In [5]:
# Report the number of actions together with one randomly sampled action.
print("\n _____ACTION SPACE_____ \n")
print(f"The Action Space is:  {a_size!s}")
print(f"Action Space Sample {env_cartpole.action_space.sample()!s}")  # random draw from the space


 _____ACTION SPACE_____ 

The Action Space is:  2
Action Space Sample 0


In [6]:
# All CartPole settings in one place; later cells read them by key.
cartpole_hyperparameters = dict(
    h_sizes=[16],               # third argument of Softmax_Policy_Dense_Layers
    n_training_episodes=1000,   # passed to reinforce_algorithm
    n_evaluation_episodes=10,   # passed to evaluate_agent
    max_t=1000,                 # passed to both reinforce_algorithm and evaluate_agent
    gamma=1.0,                  # passed to reinforce_algorithm
    lr=1e-2,                    # learning rate handed to Adam
    env_id=env_id,
    state_space=s_size,
    action_space=a_size,
)

In [7]:
# Build the policy from the configured sizes and move it onto the selected device.
cartpole_policy = rl_monte_carlo.Softmax_Policy_Dense_Layers(
    cartpole_hyperparameters["state_space"],
    cartpole_hyperparameters["action_space"],
    cartpole_hyperparameters["h_sizes"],
).to(device)

# Adam updates the policy's parameters using the configured learning rate.
cartpole_optimizer = Adam(
    params=cartpole_policy.parameters(),
    lr=cartpole_hyperparameters["lr"],
)

In [10]:
# Train the CartPole policy; the return value is kept in `scores`.
# Arguments are positional because the helper's parameter names are not visible here.
scores = rl_monte_carlo.reinforce_algorithm(
    env_cartpole,
    cartpole_policy,
    cartpole_optimizer,
    cartpole_hyperparameters["n_training_episodes"],
    cartpole_hyperparameters["max_t"],
    cartpole_hyperparameters["gamma"],
    100,  # NOTE(review): presumably the logging interval (output reports every 100 episodes) — confirm
)

Episode 100	Average Score: 57.23
Episode 200	Average Score: 65.36
Episode 300	Average Score: 77.41
Episode 400	Average Score: 70.61
Episode 500	Average Score: 93.36
Episode 600	Average Score: 130.68
Episode 700	Average Score: 135.55
Episode 800	Average Score: 153.57
Episode 900	Average Score: 154.12
Episode 1000	Average Score: 181.62


In [11]:
# Evaluate the trained CartPole policy on the separate evaluation environment.
# Left as the cell's last expression so the notebook displays the returned tuple.
rl_monte_carlo.evaluate_agent(
    eval_env_cartpole,
    cartpole_hyperparameters["max_t"],
    cartpole_hyperparameters["n_evaluation_episodes"],
    cartpole_policy,
)

(239.4, 110.30883917438348)

## Pixelcopter Environment

In [5]:
# NOTE(review): gym_pygame is never referenced by name below; presumably the import
# is what makes the Pixelcopter id available to gym.make — confirm.
import gym_pygame

# The names env_id, s_size and a_size are rebound here, replacing the CartPole values.
env_id = "Pixelcopter-PLE-v0"

# Separate instances for training and for evaluation.
env_pixel_copter = gym.make(env_id)
eval_env = gym.make(env_id)

# Problem dimensions: first entry of the observation shape and `.n` of the action space.
s_size, a_size = env_pixel_copter.observation_space.shape[0], env_pixel_copter.action_space.n

pygame 2.1.2 (SDL 2.0.18, Python 3.7.13)
Hello from the pygame community. https://www.pygame.org/contribute.html
couldn't import doomish
Couldn't import doom


In [6]:
# Report the observation dimensionality together with one randomly sampled observation.
print("_____OBSERVATION SPACE_____ \n")
print(f"The State Space is:  {s_size!s}")
print(f"Sample observation {env_pixel_copter.observation_space.sample()!s}")  # random draw from the space

_____OBSERVATION SPACE_____ 

The State Space is:  7
Sample observation [ 1.4245529   1.2038699  -1.0818822  -0.6148519   0.43378115 -1.9250286
 -0.38153526]


In [8]:
# Report the number of actions together with one randomly sampled action.
print("\n _____ACTION SPACE_____ \n")
print(f"The Action Space is:  {a_size!s}")
print(f"Action Space Sample {env_pixel_copter.action_space.sample()!s}")  # random draw from the space


 _____ACTION SPACE_____ 

The Action Space is:  2
Action Space Sample 1


In [9]:
# All Pixelcopter settings in one place; later cells read them by key.
# NOTE(review): the hidden-layer key is "h_size" here but "h_sizes" in the CartPole
# config; the policy-construction cell reads "h_size", so rename both together if at all.
pixelcopter_hyperparameters = dict(
    h_size=[64, 32, 16],         # third argument of Softmax_Policy_Dense_Layers
    n_training_episodes=50000,   # passed to reinforce_algorithm
    n_evaluation_episodes=10,    # passed to evaluate_agent
    max_t=10000,                 # passed to both reinforce_algorithm and evaluate_agent
    gamma=0.99,                  # passed to reinforce_algorithm
    lr=1e-4,                     # learning rate handed to Adam
    env_id=env_id,
    state_space=s_size,
    action_space=a_size,
)

In [10]:
# Build the policy from the configured sizes and move it onto the selected device.
# Seeding is currently disabled; NOTE(review): re-enabling the line below would presumably
# fix the policy's initial weights across runs — confirm before relying on it.
# torch.manual_seed(50)
pixelcopter_policy = rl_monte_carlo.Softmax_Policy_Dense_Layers(
    pixelcopter_hyperparameters["state_space"],
    pixelcopter_hyperparameters["action_space"],
    pixelcopter_hyperparameters["h_size"],
).to(device)

# Adam updates the policy's parameters using the configured learning rate.
pixelcopter_optimizer = Adam(
    params=pixelcopter_policy.parameters(),
    lr=pixelcopter_hyperparameters["lr"],
)

In [11]:
# Train the Pixelcopter policy; this rebinds `scores`, replacing the CartPole training result.
# Arguments are positional because the helper's parameter names are not visible here.
scores = rl_monte_carlo.reinforce_algorithm(
    env_pixel_copter,
    pixelcopter_policy,
    pixelcopter_optimizer,
    pixelcopter_hyperparameters["n_training_episodes"],
    pixelcopter_hyperparameters["max_t"],
    pixelcopter_hyperparameters["gamma"],
    100,  # NOTE(review): presumably the logging interval (output reports every 100 episodes) — confirm
)

Episode 100	Average Score: -5.00
Episode 200	Average Score: -4.99
Episode 300	Average Score: -5.00
Episode 400	Average Score: -5.00
Episode 500	Average Score: -5.00
Episode 600	Average Score: -4.99
Episode 700	Average Score: -4.99
Episode 800	Average Score: -5.00
Episode 900	Average Score: -5.00
Episode 1000	Average Score: -4.99
Episode 1100	Average Score: -5.00
Episode 1200	Average Score: -5.00
Episode 1300	Average Score: -5.00
Episode 1400	Average Score: -5.00
Episode 1500	Average Score: -5.00
Episode 1600	Average Score: -4.98
Episode 1700	Average Score: -5.00
Episode 1800	Average Score: -4.99
Episode 1900	Average Score: -4.99
Episode 2000	Average Score: -4.99
Episode 2100	Average Score: -4.98
Episode 2200	Average Score: -4.99
Episode 2300	Average Score: -5.00
Episode 2400	Average Score: -4.98
Episode 2500	Average Score: -5.00
Episode 2600	Average Score: -4.98
Episode 2700	Average Score: -4.99
Episode 2800	Average Score: -5.00
Episode 2900	Average Score: -4.99
Episode 3000	Average Sc

In [12]:
# Evaluate the trained Pixelcopter policy on the separate evaluation environment.
# Left as the cell's last expression so the notebook displays the returned tuple.
rl_monte_carlo.evaluate_agent(
    eval_env,
    pixelcopter_hyperparameters["max_t"],
    pixelcopter_hyperparameters["n_evaluation_episodes"],
    pixelcopter_policy,
)

(8.4, 9.232551110067032)