In [None]:
import sys

# Extend the module search path with the parent directory. Presumably this is
# what lets the `RL` package imported in the next cell resolve -- confirm
# against the repository layout.
sys.path += ['..']

In [None]:
import gym
import numpy as np
import torch

from RL.utils import train, test, plot
from RL.environments.highlow import HighLow

# Build the three environments used in this notebook, in the same order as
# before (CartPole, Pendulum, Acrobot). Each one is stored via `.unwrapped`;
# presumably this bypasses gym's default wrappers because the per-episode step
# limit is handed to train() explicitly below -- confirm.
env_cartpole, env_pendulum, env_acrobot = (
    gym.make(env_id).unwrapped
    for env_id in ('CartPole-v1', 'Pendulum-v0', 'Acrobot-v1')
)
# Further environments, currently disabled:
# mountaincarcontinuous_env = gym.make('MountainCarContinuous-v0').unwrapped
# highlow_env = HighLow()

In [None]:
# universal params -- shared by the experiment cells below, which forward them
# to train() under the keys 'gamma', 'hidden_dim', 'learning_rate', 'device'
# and 'dtype'.
gamma = .99
hidden_dim = 128
lr = 1e-3
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
dtype = 'float32'

# agents that sample from memory
memory_maxlen = int(1e5) # float causes error

# training params
save_freq = 10  # forwarded to every train() call as save_freq

# CartPole

In [None]:
# Settings read by the CartPole cells that follow.
save_dir = './experiments/cartpole/'  # passed to train() as save_dir
max_steps = 10 ** 5                   # passed to train() as 'max_steps'
max_episode_steps = 10 ** 3           # passed to train() as 'max_episode_steps'
e_verbose = 100                       # passed to train() as e_verbose

### Random

In [None]:
%%time
params = {'file_name':'random', 'max_steps':max_steps, 'max_episode_steps':max_episode_steps, 'start_at':0}
train('random', env_cartpole, e_verbose=e_verbose, save_freq=save_freq, save_dir=save_dir, **params)

### TabularQ

In [None]:
%%time
params = {'file_name':'tabularq - td', 'max_steps':max_steps, 'max_episode_steps':max_episode_steps, 'start_at':0, 
          'online':True, 'gamma':0.95, 'learning_rate':0.1, 'target_update_freq':10, 
          'bin_range': [(-4.8, 4.8), (-4, 4), (-4.18e-01, 4.18e-01), (-4, 4)], 
          'split':None, 'environment': 'CartPole-v1', 'resample': False, 'resample_e':False, 
          'epsilon':1, 'epsilon_min':0.05, 'epsilon_decay':0.99995, 'memory_maxlen':memory_maxlen, 'numbins':20}
train('tabularq', env_cartpole, e_verbose=e_verbose, save_freq=save_freq, save_dir=save_dir, **params)

In [None]:
%%time
params = {'file_name':'tabularq - td', 'max_steps':max_steps, 'max_episode_steps':max_episode_steps, 'start_at':0, 
          'online':True, 'gamma':0.95, 'learning_rate':0.1, 'target_update_freq':10, 'bin_range': None, 
          'split':0.05, 'environment': 'CartPole-v1', 'resample': False, 'resample_e':False, 
          'epsilon':1, 'epsilon_min':0.05, 'epsilon_decay':0.99995, 'memory_maxlen':memory_maxlen, 'numbins':20}
train('tabularq', env_cartpole, e_verbose=e_verbose, save_freq=save_freq, save_dir=save_dir, **params)

In [None]:
%%time
params = {'file_name':'tabularq - mc', 'max_steps':max_steps, 'max_episode_steps':max_episode_steps, 'start_at':0, 
          'online':False, 'gamma':0.95, 'learning_rate':0.1, 'target_update_freq':None, 
          'bin_range': [(-4.8, 4.8), (-4, 4), (-4.18e-01, 4.18e-01), (-4, 4)], 
          'split': None, 'environment': 'CartPole-v1', 'resample': False, 'resample_e':False, 
          'epsilon':1, 'epsilon_min':0.05, 'epsilon_decay':0.99995, 'memory_maxlen':memory_maxlen, 'numbins':20}
train('tabularq', env_cartpole, e_verbose=e_verbose, save_freq=save_freq, save_dir=save_dir, **params)

### A2C

In [None]:
%%time
params = {'file_name':'a2c - mc, separate net', 'max_steps':max_steps, 'max_episode_steps':max_episode_steps, 'start_at':0,
          'gamma':gamma, 'hidden_dim':hidden_dim, 'learning_rate':lr, 'device':device, 'dtype':dtype,
          'activation':'relu', 'beta':0.1, 'clip':False, 'shared_network':False, 'target_update_freq':None}
train('a2c', env_cartpole, e_verbose=e_verbose, save_freq=save_freq, save_dir=save_dir, **params)

In [None]:
%%time
params = {'file_name':'a2c - mc, shared net', 'max_steps':max_steps, 'max_episode_steps':max_episode_steps, 'start_at':0,
          'gamma':gamma, 'hidden_dim':hidden_dim, 'learning_rate':lr, 'device':device, 'dtype':dtype,
          'activation':'relu', 'beta':0.1, 'clip':False, 'shared_network':True, 'target_update_freq':None}
train('a2c', env_cartpole, e_verbose=e_verbose, save_freq=save_freq, save_dir=save_dir, **params)

In [None]:
%%time
params = {'file_name':'a2c - td, separate net', 'max_steps':max_steps, 'max_episode_steps':max_episode_steps, 'start_at':0,
          'gamma':gamma, 'hidden_dim':hidden_dim, 'learning_rate':lr, 'device':device, 'dtype':dtype,
          'activation':'relu', 'beta':0.1, 'clip':False, 'shared_network':False, 'target_update_freq':100}
train('a2c', env_cartpole, e_verbose=e_verbose, save_freq=save_freq, save_dir=save_dir, **params)

In [None]:
%%time
params = {'file_name':'a2c - td, shared net', 'max_steps':max_steps, 'max_episode_steps':max_episode_steps, 'start_at':0,
          'gamma':gamma, 'hidden_dim':hidden_dim, 'learning_rate':lr, 'device':device, 'dtype':dtype,
          'activation':'relu', 'beta':0.1, 'clip':False, 'shared_network':True, 'target_update_freq':100}
train('a2c', env_cartpole, e_verbose=e_verbose, save_freq=save_freq, save_dir=save_dir, **params)

### DQN Offline

In [None]:
%%time
params = {'file_name':'dqn - offline', 'max_steps':max_steps, 'max_episode_steps':max_episode_steps, 'start_at':1e4,
          'online':False, 'gamma':gamma, 'hidden_dim':hidden_dim, 'learning_rate':lr, 'device':device, 'dtype':dtype,
          'batch_size':32, 'clip':False, 'dueling':False, 'epsilon':1.0, 'epsilon_min':.05, 'epsilon_decay':.99, 'memory_maxlen':memory_maxlen, 'per':False, 'memory_alpha':None, 'memory_beta':None, 'target_update_freq':None}
train('dqn', env_cartpole, e_verbose=e_verbose, save_freq=save_freq, save_dir=save_dir, **params)

In [None]:
%%time
params = {'file_name':'dqn - offline, double', 'max_steps':max_steps, 'max_episode_steps':max_episode_steps, 'start_at':1e4,
          'online':False, 'gamma':gamma, 'hidden_dim':hidden_dim, 'learning_rate':lr, 'device':device, 'dtype':dtype,
          'batch_size':32, 'clip':False, 'dueling':False, 'epsilon':1.0, 'epsilon_min':.05, 'epsilon_decay':.99, 'memory_maxlen':memory_maxlen, 'per':False, 'memory_alpha':None, 'memory_beta':None, 'target_update_freq':100}
train('dqn', env_cartpole, e_verbose=e_verbose, save_freq=save_freq, save_dir=save_dir, **params)

In [None]:
%%time
params = {'file_name':'dqn - offline, double, dueling', 'max_steps':max_steps, 'max_episode_steps':max_episode_steps, 'start_at':1e4,
          'online':False, 'gamma':gamma, 'hidden_dim':hidden_dim, 'learning_rate':lr, 'device':device, 'dtype':dtype,
          'batch_size':32, 'clip':False, 'dueling':True, 'epsilon':1.0, 'epsilon_min':.05, 'epsilon_decay':.99, 'memory_maxlen':memory_maxlen, 'per':False, 'memory_alpha':None, 'memory_beta':None, 'target_update_freq':100}
train('dqn', env_cartpole, e_verbose=e_verbose, save_freq=save_freq, save_dir=save_dir, **params)

### DQN Online

In [None]:
%%time
params = {'file_name':'dqn - online, double', 'max_steps':max_steps, 'max_episode_steps':max_episode_steps, 'start_at':1e4,
          'online':True, 'gamma':gamma, 'hidden_dim':hidden_dim, 'learning_rate':lr, 'device':device, 'dtype':dtype,
          'batch_size':32, 'clip':False, 'dueling':False, 'epsilon':1.0, 'epsilon_min':.05, 'epsilon_decay':.99, 'memory_maxlen':memory_maxlen, 'per':False, 'memory_alpha':None, 'memory_beta':None, 'target_update_freq':100}
train('dqn', env_cartpole, e_verbose=e_verbose, save_freq=save_freq, save_dir=save_dir, **params)

In [None]:
%%time
params = {'file_name':'dqn - online, double, dueling, per', 'max_steps':max_steps, 'max_episode_steps':max_episode_steps, 'start_at':1e4,
          'online':True, 'gamma':gamma, 'hidden_dim':hidden_dim, 'learning_rate':lr, 'device':device, 'dtype':dtype,
          'batch_size':32, 'clip':False, 'dueling':True, 'epsilon':1.0, 'epsilon_min':.05, 'epsilon_decay':.99, 'memory_maxlen':memory_maxlen, 'per':True, 'memory_alpha':.5, 'memory_beta':.4, 'target_update_freq':100}
train('dqn', env_cartpole, e_verbose=e_verbose, save_freq=save_freq, save_dir=save_dir, **params)

### Plots

In [None]:
# Plot the results stored in the CartPole experiment directory.
plot(
    './experiments/cartpole/',
    ep_avg=False,
    ma_len=100,
    verbose=False,
)

### Test

In [None]:
# Replay the saved 'dqn - online, double, dueling, per' run on CartPole.
# The meaning of the positional `1` is not visible here -- see RL.utils.test.
pkl_file = './experiments/cartpole/dqn - online, double, dueling, per.pkl'
pth_file = './experiments/cartpole/dqn - online, double, dueling, per.pth'
test(
    pkl_file, pth_file, env_cartpole, 1,
    display=True,
    video_dir='./experiments/cartpole/monitor',
)

# Acrobot

In [None]:
# Settings read by the Acrobot cells that follow; these rebind the names set
# in the CartPole section.
save_dir = './experiments/acrobot/'  # passed to train() as save_dir
max_steps = 10 ** 5                  # passed to train() as 'max_steps'
max_episode_steps = 500              # passed to train() as 'max_episode_steps'
e_verbose = 10                       # passed to train() as e_verbose

### TabularQ

In [None]:
%%time
params = {'file_name':'tabularq - td', 'max_steps':max_steps, 'max_episode_steps':max_episode_steps, 'start_at':0, 
          'online':True, 'gamma':gamma, 'learning_rate':0.1, 'target_update_freq': 10, 'environment': 'Acrobot-v1', 
          'split': 0.05, 'resample': False, 'resample_e': 0, 
          'epsilon':1, 'epsilon_min':0.01, 'epsilon_decay':0.9995, 'memory_maxlen':memory_maxlen, 'numbins':20}
train('tabularq', env_acrobot, e_verbose=e_verbose, save_freq=save_freq, save_dir=save_dir, **params)

In [None]:
%%time
params = {'file_name':'tabularq - mc', 'max_steps':max_steps, 'max_episode_steps':max_episode_steps, 'start_at':0, 
          'online':False, 'gamma':gamma, 'learning_rate':0.1, 'target_update_freq': None, 'environment': 'Acrobot-v1', 
          'split': 0.05, 'resample': False, 'resample_e': 0, 
          'epsilon':1, 'epsilon_min':0.01, 'epsilon_decay':0.9995, 'memory_maxlen':memory_maxlen, 'numbins':20}
train('tabularq', env_acrobot, e_verbose=e_verbose, save_freq=save_freq, save_dir=save_dir, **params)

### Plots

In [None]:
# Plot the results stored in the Acrobot experiment directory.
plot(
    './experiments/acrobot/',
    ep_avg=True,
    ma_len=100,
    verbose=False,
)

### Test

In [None]:
# Replay the saved 'tabularq - td' run on Acrobot.
# The meaning of the positional `1` is not visible here -- see RL.utils.test.
pkl_file = './experiments/acrobot/tabularq - td.pkl'
pth_file = './experiments/acrobot/tabularq - td.pth'
test(
    pkl_file, pth_file, env_acrobot, 1,
    display=True,
    video_dir='./experiments/acrobot/monitor',
)

# Pendulum

In [None]:
# Settings read by the Pendulum cells that follow; these rebind the names set
# in the earlier sections.
save_dir = './experiments/pendulum/'  # passed to train() as save_dir
max_steps = int(1e5)                  # passed to train() as 'max_steps'
max_episode_steps = 200               # passed to train() as 'max_episode_steps'
# Set e_verbose explicitly so this section does not depend on which earlier
# section ran last. 10 is the value a top-to-bottom run already ends up with
# (it is the Acrobot section's setting).
e_verbose = 10

### Random

In [None]:
# Run the 'random' agent on Pendulum; results are stored under file_name 'random'.
params = dict(
    file_name='random',
    max_steps=max_steps,
    max_episode_steps=max_episode_steps,
    start_at=0,
)
train(
    'random', env_pendulum,
    e_verbose=e_verbose, save_freq=save_freq, save_dir=save_dir,
    **params
)

### TD3

In [None]:
%%time
params = {'file_name':'td3', 'max_steps':max_steps, 'max_episode_steps':max_episode_steps, 'start_at':1e4,
          'gamma':gamma, 'hidden_dim':hidden_dim, 'learning_rate':lr, 'device':device, 'dtype':dtype,
          'batch_size':100, 'expl_noise':.1, 'memory_maxlen':memory_maxlen, 'noise_clip':.5, 'policy_freq':2, 'policy_noise':.2, 'tau':.005}
train('td3', env_pendulum, e_verbose=e_verbose, save_freq=save_freq, save_dir=save_dir, **params)

### A2C

In [None]:
%%time
params = {'file_name':'a2c - mc, separate network', 'max_steps':max_steps, 'max_episode_steps':max_episode_steps, 'start_at':0,
          'gamma':gamma, 'hidden_dim':hidden_dim, 'learning_rate':lr, 'device':device, 'dtype':dtype,
          'activation':'tanh', 'beta':.78, 'clip':False, 'shared_network':False, 'target_update_freq':None}
train('a2c', env_pendulum, e_verbose=e_verbose, save_freq=save_freq, save_dir=save_dir, **params)

In [None]:
%%time
params = {'file_name':'a2c - mc, shared network', 'max_steps':max_steps, 'max_episode_steps':max_episode_steps, 'start_at':0,
          'gamma':gamma, 'hidden_dim':hidden_dim, 'learning_rate':lr, 'device':device, 'dtype':dtype,
          'activation':'tanh', 'beta':.78, 'clip':False, 'shared_network':True, 'target_update_freq':None}
train('a2c', env_pendulum, e_verbose=e_verbose, save_freq=save_freq, save_dir=save_dir, **params)

In [None]:
%%time
params = {'file_name':'a2c - td, separate network', 'max_steps':max_steps, 'max_episode_steps':max_episode_steps, 'start_at':0,
          'gamma':gamma, 'hidden_dim':hidden_dim, 'learning_rate':lr, 'device':device, 'dtype':dtype,
          'activation':'tanh', 'beta':.78, 'clip':False, 'shared_network':False, 'target_update_freq':100}
train('a2c', env_pendulum, e_verbose=e_verbose, save_freq=save_freq, save_dir=save_dir, **params)

In [None]:
%%time
params = {'file_name':'a2c - td, shared network', 'max_steps':max_steps, 'max_episode_steps':max_episode_steps, 'start_at':0,
          'gamma':gamma, 'hidden_dim':hidden_dim, 'learning_rate':lr, 'device':device, 'dtype':dtype,
          'activation':'tanh', 'beta':.78, 'clip':False, 'shared_network':True, 'target_update_freq':100}
train('a2c', env_pendulum, e_verbose=e_verbose, save_freq=save_freq, save_dir=save_dir, **params)

### Plots

In [None]:
# Plot the results stored in the Pendulum experiment directory.
plot(
    './experiments/pendulum/',
    ep_avg=True,
    ma_len=100,
    verbose=False,
)

### Test

In [None]:
# Replay the saved 'td3' run on Pendulum.
# The meaning of the positional `1` is not visible here -- see RL.utils.test.
pkl_file = './experiments/pendulum/td3.pkl'
pth_file = './experiments/pendulum/td3.pth'
test(
    pkl_file, pth_file, env_pendulum, 1,
    display=True,
    video_dir='./experiments/pendulum/monitor',
)