# Stochastic Variational Method with RL algorithms

In [1]:
import numpy as np
import gym
import torch
import subprocess
import os
import h5py

## Exploring the environment

In [2]:
# Instantiate the SVM environment; file_sigmas is the path reported at the
# end of this cell as the place where the sigmas are stored.
env = gym.make('svm_env:svmEnv-v1', file_sigmas="./svmCodeSVD/sigmas.dat")
print('### Env Name ######', env.unwrapped.spec.id)

# Spaces and their sizes (last axis of each shape); state_size and act_size
# are reused by later cells to size the agents and the action noise.
obs_space = env.observation_space
state_size = obs_space.shape[-1]
act_space = env.action_space
act_size = act_space.shape[-1]

summary = (
    ('###### Observation space ####### \n', obs_space),
    ('###### Size of observation space ####### \n', state_size),
    ('###### Action space ####### \n', act_space),
    ('###### Number of actions ####### \n', act_size),
)
for label, value in summary:
    print(label, value)

# Reset once to look at the initial state returned by the environment.
state = env.reset()
print('##### State after reset ###### \n', state)

print('##### File where will be stored sigmas \n', env.file_sigmas)



### Env Name ###### svmEnv-v1
###### Observation space ####### 
 Box(-inf, inf, (1,), float32)
###### Size of observation space ####### 
 1
###### Action space ####### 
 Box(-1.0, 1.0, (3,), float32)
###### Number of actions ####### 
 3
#### CALL RESET ####
Action chosen at reset:  [0.]
Actions taken at reset:  []
Energies got at reset:  [0.0]
##### State after reset ###### 
 [0.]
##### File where will be stored sigmas 
 ./svmCodeSVD/sigmas.dat


## Saving folders

In [None]:
# Output locations: one checkpoint folder per algorithm and a log folder
# (logdir is passed as tensorboard_log to the stable-baselines3 models).
models_dir_ppo = 'models/PPO'
models_dir_td3 = 'models/TD3'

logdir = 'logs'

# exist_ok=True makes the creation idempotent and avoids the check-then-create
# race of testing os.path.exists() first, so the cell can be re-run safely.
for folder in (models_dir_ppo, models_dir_td3, logdir):
    os.makedirs(folder, exist_ok=True)

## DDPG (a special case of Twin Delayed DDPG, TD3) from `stable_baselines3`

In [None]:
from stable_baselines3 import DDPG
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

# Exploration noise for DDPG: zero-mean Gaussian with sigma 0.2 on each of
# the act_size action components.
action_noise = NormalActionNoise(
    mean=np.zeros(act_size),
    sigma=np.ones(act_size) * 0.2,
)

# Undiscounted (gamma=1.0) DDPG with an MLP policy, logging to TensorBoard.
model = DDPG(
    "MlpPolicy",
    env,
    action_noise=action_noise,
    batch_size=64,
    gamma=1.0,
    verbose=1,
    seed=0,
    tensorboard_log=logdir,
)

# Constructor signature, kept for reference:
#   DDPG(policy, env, learning_rate=0.001, buffer_size=1000000,
#        learning_starts=100, batch_size=100, tau=0.005, gamma=0.99,
#        train_freq=(1, 'episode'), gradient_steps=-1, action_noise=None,
#        replay_buffer_class=None, replay_buffer_kwargs=None,
#        optimize_memory_usage=False, tensorboard_log=None,
#        create_eval_env=False, policy_kwargs=None, verbose=0, seed=None,
#        device='auto', _init_setup_model=True)

model.learn(total_timesteps=300, log_interval=5, n_eval_episodes=1)

# learn() signature, kept for reference:
#   learn(total_timesteps, callback=None, log_interval=4, eval_env=None,
#         eval_freq=-1, n_eval_episodes=5, tb_log_name='DDPG',
#         eval_log_path=None, reset_num_timesteps=True)

## PPO with GAE from `stable_baselines3`

In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

# Number of environment steps requested from each learn() call below.
total_t_steps = 20

# Undiscounted (gamma=1.0) PPO with an MLP policy, logging to TensorBoard.
model = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    gamma=1.0,
    tensorboard_log=logdir,
    batch_size=2,
    n_steps=2,
)

# Constructor signature, kept for reference:
#   stable_baselines3.ppo.PPO(policy, env, learning_rate=0.0003, n_steps=2048,
#       batch_size=64, n_epochs=10, gamma=0.99, gae_lambda=0.95,
#       clip_range=0.2, clip_range_vf=None, ent_coef=0.0, vf_coef=0.5,
#       max_grad_norm=0.5, use_sde=False, sde_sample_freq=-1, target_kl=None,
#       tensorboard_log=None, create_eval_env=False, policy_kwargs=None,
#       verbose=0, seed=None, device='auto', _init_setup_model=True)
#
# learn() signature, kept for reference:
#   learn(total_timesteps, callback=None, log_interval=1, eval_env=None,
#         eval_freq=-1, n_eval_episodes=5, tb_log_name='PPO',
#         eval_log_path=None, reset_num_timesteps=True)

# Train in nine consecutive chunks without resetting the timestep counter,
# saving a checkpoint named after the requested step total after each chunk.
for i in range(1, 10):
    model.learn(total_timesteps=total_t_steps, reset_num_timesteps=False)
    model.save(f"{models_dir_ppo}/{total_t_steps*i}")

## DDPG from my own `ddpg_agent.py` code

In [3]:
# DDPG agent implemented in the local ddpg_agent.py module.
from ddpg_agent import DDPG_agent
# Sized from the environment: state_size and act_size are the last-axis sizes
# of the observation and action spaces computed in the exploration cell.
# The seed is fixed at 0.
agent = DDPG_agent(state_size, act_size, seed = 0)

In [4]:
## Helpers that save all rewards, energies and principal dimensions to an HDF5 file during training
def create_info_h5(agent, env):
    """Create a fresh ``run_<i>.hdf5`` file and write the run metadata into it.

    The first index ``i`` for which ``run_<i>.hdf5`` does not exist in the
    current directory is used. The file receives:

    * an ``info`` dataset (created without shape or data) whose attributes
      hold the algorithm name and the environment id;
    * a ``hyperparams`` dataset (created without shape or data) whose
      attributes hold the agent's hyper-parameters;
    * empty groups for sigmas, rewards, energies, principal dimensions and
      the actor/critic models, to be filled once per episode.

    Args:
        agent: object exposing ``name`` and the hyper-parameter attributes
            listed in the body.
        env: environment exposing ``unwrapped.spec.id``.

    Returns:
        The open ``h5py.File`` handle (opened in ``'a'`` mode); the caller is
        responsible for closing it.
    """
    # Find the first run index that is not taken yet.
    run_idx = 0
    while os.path.exists(f'run_{run_idx}.hdf5'):
        run_idx += 1
    h5_file = h5py.File(f'run_{run_idx}.hdf5', 'a')

    # General information about the run, stored as attributes of 'info'.
    info = {'alg': agent.name, 'env': env.unwrapped.spec.id}
    info_ds = h5_file.create_dataset('info', dtype=h5py.string_dtype(encoding='utf-8'))
    for key, value in info.items():
        info_ds.attrs[key] = value

    # Hyper-parameters of the model, stored as attributes of 'hyperparams'.
    # Each attribute is named after the agent attribute it is read from.
    hyper_names = (
        'batch_size', 'bootstrap_size', 'gamma', 'tau', 'lr_critic',
        'lr_actor', 'update_every', 'transfer_every', 'num_update',
        'add_noise_every',
    )
    hyperparams = {name: getattr(agent, name) for name in hyper_names}
    hyper_ds = h5_file.create_dataset('hyperparams', dtype='f')
    for key, value in hyperparams.items():
        hyper_ds.attrs[key] = value

    # One group per kind of per-episode record.
    for group_name in ('sigmas', 'rewards', 'energies', 'princip_dims',
                       'actor_models', 'critic_models'):
        h5_file.create_group(group_name)

    return h5_file

def _store_state_dict(group, ds_name, checkpoint_path):
    """Load a saved state dict and attach every entry to a new dataset in *group*.

    The dataset ``ds_name`` is created without shape or data and only serves
    as a holder: each state-dict entry becomes one attribute on it, named
    after its key.
    """
    state_dict = torch.load(checkpoint_path)
    holder = group.create_dataset(ds_name, dtype='f')
    for param_name, tensor in state_dict.items():
        # .detach() is the recommended replacement for the legacy .data
        # accessor; the values written are the same.
        holder.attrs.create(name=param_name, data=tensor.detach().cpu().numpy())


def save_all(dat_file, i_ep, sigmas_i_ep, rew_i_ep, en_i_ep, pri_dim_i_ep, act_model_i_ep, cr_model_i_ep):
    """Store everything recorded during episode ``i_ep`` in the HDF5 file.

    Args:
        dat_file: open HDF5 file containing the groups ``sigmas``,
            ``rewards``, ``energies``, ``princip_dims``, ``actor_models``
            and ``critic_models``.
        i_ep: episode index, used as suffix of every dataset name.
        sigmas_i_ep: sigmas of the episode (stored as float).
        rew_i_ep: rewards of the episode (stored as float).
        en_i_ep: energies of the episode (stored as float).
        pri_dim_i_ep: principal dimensions of the episode (stored as int).
        act_model_i_ep: path of the saved actor state dict.
        cr_model_i_ep: path of the saved critic state dict.
    """
    # Per-step records of the episode, one dataset per quantity.
    dat_file['sigmas'].create_dataset(f'sigmas_ep_{i_ep}', dtype='f', data=sigmas_i_ep)
    dat_file['rewards'].create_dataset(f'rew_ep_{i_ep}', dtype='f', data=rew_i_ep)
    dat_file['energies'].create_dataset(f'en_ep_{i_ep}', dtype='f', data=en_i_ep)
    dat_file['princip_dims'].create_dataset(f'pri_dim_ep_{i_ep}', dtype='i', data=pri_dim_i_ep)

    # Network parameters at this episode: actor first, then critic.
    _store_state_dict(dat_file['actor_models'], f'act_mod_{i_ep}', act_model_i_ep)
    _store_state_dict(dat_file['critic_models'], f'cri_mod_{i_ep}', cr_model_i_ep)

def close_file(dat_file, actor_model_file, critic_model_file, file_sigmas):
    """Close the data file and delete the temporary files of the run.

    Args:
        dat_file: open file object to close.
        actor_model_file: path of the temporary actor checkpoint.
        critic_model_file: path of the temporary critic checkpoint.
        file_sigmas: path of the sigmas file.

    A path that does not exist is skipped, so one missing file no longer
    prevents the remaining ones from being removed.
    """
    dat_file.close()
    for path in (actor_model_file, critic_model_file, file_sigmas):
        try:
            os.remove(path)
        except FileNotFoundError:
            # Nothing to clean up for this path; keep going with the others.
            pass

In [5]:
## Run the DDPG algorithm
def run_ddpg(max_t_step=250, n_episodes=400):
    """Train the module-level ``agent`` on the module-level ``env`` and log each episode.

    For every episode the rewards, energies (first component of the state),
    principal dimensions, the actions taken and the actor/critic parameters
    are written to a new ``run_<i>.hdf5`` file as soon as the episode ends.

    Args:
        max_t_step: maximum number of environment steps per episode.
        n_episodes: number of episodes to run.

    Returns:
        The HDF5 file handle created by ``create_info_h5``. It has already
        been closed by ``close_file`` when this function returns, so the
        file must be reopened to read the results.
    """
    actor_ckpt = 'checkpoint_actor.pth'
    critic_ckpt = 'checkpoint_critic.pth'

    # Create the h5 file and store info about the algorithm and its hyper-parameters
    dat_file = create_info_h5(agent, env)

    try:
        for i_ep in range(n_episodes):
            state = env.reset()
            agent.reset()
            rew_i_ep = []
            en_i_ep = []
            pri_dim_i_ep = []

            ## Training loop of each episode
            for _ in range(max_t_step):
                action = agent.act(state)
                next_state, reward, done, _info = env.step(action)
                agent.step(state, action, reward, next_state, done)
                state = next_state

                # Record reward, energy and principal dimension of this step
                rew_i_ep.append(reward)
                en_i_ep.append(state[0])
                pri_dim_i_ep.append(env.princp_dim)
                if done:
                    break

            # The checkpoints are only read by save_all() below, so they are
            # written once per episode instead of after every single step.
            torch.save(agent.actor_local.state_dict(), actor_ckpt)
            torch.save(agent.critic_local.state_dict(), critic_ckpt)

            ## Save data during training (to not lose the work done)
            save_all(dat_file=dat_file, i_ep=i_ep, sigmas_i_ep=env.actions_taken,
                     rew_i_ep=rew_i_ep, en_i_ep=en_i_ep, pri_dim_i_ep=pri_dim_i_ep,
                     act_model_i_ep=actor_ckpt, cr_model_i_ep=critic_ckpt)

            print('Episode {} ... Score: {:.3f}'.format(i_ep, np.sum(rew_i_ep)))
    except BaseException:
        # Training failed or was interrupted: close the HDF5 file so the
        # episodes stored so far are kept, then let the error propagate.
        dat_file.close()
        raise

    close_file(dat_file, actor_ckpt, critic_ckpt, env.file_sigmas)
    return dat_file

In [None]:
# Short run: the positional arguments are (max_t_step, n_episodes), i.e.
# 10 episodes of at most 10 steps each.
# NOTE: run_ddpg closes the HDF5 file before returning it, so all_data is a
# closed handle; reopen the run_<i>.hdf5 file to inspect the stored data.
all_data = run_ddpg(10, 10)

#### CALL RESET ####
Action chosen at reset:  [0.]
Actions taken at reset:  []
Energies got at reset:  [0.0]
#### CALL STEP #### 1
Action chosen at step:  [58.236084 36.820633 56.90082 ]
Basis size (it should be the same of full dim) =   1
With this action the energy is:  0.0257158
With this action the full dim is:  1  and princip dim is:  1
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is  -2.2487993359676643
#### CALL STEP #### 2
Action chosen at step:  [50.291565 40.510654 65.03306 ]
Basis size (it should be the same of full dim) =   2
With this action the energy is:  0.0252435
With this action the full dim is:  2  and princip dim is:  2
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is  -2.2159558473083756
#### CALL STEP #### 3
Action chosen at step:  [45.21618  34.25939  60.410885]
Basis size (it should be the same of full dim) =   3
With this action the energy is:  0.0250003
With this action the full dim is:  3  and princip dim is:  3
##

With this action the energy is:  0.0112388
With this action the full dim is:  5  and princip dim is:  5
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is  -1.2420765213995963
#### CALL STEP #### 6
Action chosen at step:  [79.3582   45.096256 61.160923]
Basis size (it should be the same of full dim) =   6
With this action the energy is:  0.0104849
With this action the full dim is:  6  and princip dim is:  6
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is  -1.1896507197899506
#### CALL STEP #### 7
Action chosen at step:  [96.427505 38.841225 71.24548 ]
Basis size (it should be the same of full dim) =   7
With this action the energy is:  0.0104566
With this action the full dim is:  7  and princip dim is:  7
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is  -1.1876827529700105
#### CALL STEP #### 8
Action chosen at step:  [76.66647 34.76061 65.12565]
Basis size (it should be the same of full dim) =   8
With this action the e

With this action the energy is:  0.0033804
With this action the full dim is:  10  and princip dim is:  10
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is  -0.6956076006286036
Episode 4 ... Score: -13.139
#### CALL RESET ####
Action chosen at reset:  [0.]
Actions taken at reset:  []
Energies got at reset:  [0.0]
#### CALL STEP #### 1
Action chosen at step:  [64.1879   55.270073 61.90925 ]
Basis size (it should be the same of full dim) =   1
With this action the energy is:  0.0242055
With this action the full dim is:  1  and princip dim is:  1
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is  -2.143773884089363
#### CALL STEP #### 2
Action chosen at step:  [47.73578  54.236473 58.88013 ]
Basis size (it should be the same of full dim) =   2
With this action the energy is:  0.0240952
With this action the full dim is:  2  and princip dim is:  2
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is  -2.136103681254049
#### CALL ST

## Random search, as in the original SVM

In [None]:
# Baseline: sample actions uniformly from the action space until the
# environment reports the episode as done.
state = env.reset()
scores = []   # cumulative reward after each step
step = 0
score = 0.0

done = False
while not done:
    print(".....STEP.....", step)
    action = env.action_space.sample()
    next_state, reward, done, info = env.step(action)
    step += 1
    score += reward
    scores.append(score)
    state = next_state