# Stochastic Variational Method with RL algorithms

In [3]:
import numpy as np
import gym
import torch
import subprocess

# Use the first CUDA device when torch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

## Exploring the environment

In [4]:
# Build the SVM environment; `file_sigmas` is forwarded to the environment constructor.
SIGMAS_PATH = "./svmCodeSVD/sigmas1.dat"
env = gym.make('svm_env:svmEnv-v1', file_sigmas=SIGMAS_PATH)

# Look the spaces up once and take their sizes from the last axis of each shape.
obs_space = env.observation_space
act_space = env.action_space
state_size = obs_space.shape[-1]
act_size = act_space.shape[-1]

# Report the spaces and their sizes.
for banner, value in (
    ('###### Observation space ####### \n', obs_space),
    ('###### Size of observation space ####### \n', state_size),
    ('###### Action space ####### \n', act_space),
    ('###### Number of actions ####### \n', act_size),
):
    print(banner, value)

# Reset once to inspect the initial state.
state = env.reset()
print('##### State after reset ###### \n', state)

print('##### File where will be stored sigmas \n', env.file_sigmas)

###### Observation space ####### 
 Box(-inf, inf, (1,), float32)
###### Size of observation space ####### 
 1
###### Action space ####### 
 Box(-1.0, 1.0, (3,), float32)
###### Number of actions ####### 
 3
#### CALL RESET ####
Action chosen at reset:  [0.]
Actions taken at reset:  []
Energies got at reset:  [0.0]
##### State after reset ###### 
 [0.]
##### File where will be stored sigmas 
 ./svmCodeSVD/sigmas1.dat


## DDPG (Deep Deterministic Policy Gradient) from `stable_baselines3`

In [None]:
from stable_baselines3 import DDPG
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

# Exploration noise for DDPG: one zero-mean Gaussian component per action, sigma = 0.2.
noise_mean = np.zeros(act_size)
noise_sigma = 0.2 * np.ones(act_size)
action_noise = NormalActionNoise(mean=noise_mean, sigma=noise_sigma)

# Hyper-parameters that differ from the library defaults listed in the reference below.
ddpg_kwargs = dict(
    action_noise=action_noise,
    batch_size=64,
    gamma=1.0,
    verbose=1,
    seed=2,
)
model = DDPG("MlpPolicy", env, **ddpg_kwargs)

# Reference signature of the DDPG constructor:
# (policy, env, learning_rate=0.001, buffer_size=1000000,learning_starts=100, batch_size=100, 
# tau=0.005, gamma=0.99, train_freq=(1, 'episode'),  gradient_steps=- 1, action_noise=None, 
# replay_buffer_class=None, replay_buffer_kwargs=None,  optimize_memory_usage=False, 
# tensorboard_log=None, create_eval_env=False, policy_kwargs=None,  verbose=0, seed=None, 
# device='auto', _init_setup_model=True)

# Train for 300 environment steps, logging every 5 episodes.
model.learn(total_timesteps=300, log_interval=5, n_eval_episodes=1)

# Reference signature of learn:
# learn(total_timesteps, callback=None, log_interval=4, eval_env=None, eval_freq=- 1,
# n_eval_episodes=5, tb_log_name='DDPG', eval_log_path=None, reset_num_timesteps=True)

## PPO with GAE from `stable_baselines3`

In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

# Hyper-parameters that are passed explicitly; everything else keeps the library default
# shown in the reference signature below.
ppo_kwargs = dict(
    verbose=1,
    n_steps=2,
    batch_size=64,
    gamma=1.0,
)
model = PPO("MlpPolicy", env, **ppo_kwargs)

# Reference signature of the PPO constructor:
# class stable_baselines3.ppo.PPO(policy, env, learning_rate=0.0003, n_steps=2048, 
#         batch_size=64, n_epochs=10, gamma=0.99, gae_lambda=0.95, clip_range=0.2, 
#         clip_range_vf=None, ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5, 
#         use_sde=False, sde_sample_freq=- 1, target_kl=None, tensorboard_log=None, 
#         create_eval_env=False, policy_kwargs=None, verbose=0, seed=None, device='auto', 
#         _init_setup_model=True)

# Train for 300 environment steps.
model.learn(total_timesteps=300, n_eval_episodes=1)

# Reference signature of learn:
# learn(total_timesteps, callback=None, log_interval=1, eval_env=None, eval_freq=- 1, 
#       n_eval_episodes=5, tb_log_name='PPO', eval_log_path=None, reset_num_timesteps=True)

# Persist the trained policy so the evaluation cell can reload it.
model.save("ppo_svm")

In [None]:
# Reload the saved PPO policy and roll it out for exactly one episode.
model = PPO.load("ppo_svm")

obs = env.reset()
rewards = []   # reward returned by every step of the episode
score = 0.0    # running sum of those rewards
done = False

# `env` is a plain gym environment here, so the loop has to stop by itself
# when the environment reports `done`; otherwise it would never terminate.
while not done:
    action, _states = model.predict(obs)
    obs, reward, done, info = env.step(action)
    rewards.append(reward)
    score += reward
    env.render()

print('Episode score:', score)
    

## From my `ddpg_agent_bootstrap.py` code

In [5]:
from ddpg_agent_bootstrap import DDPG_agent, ActionNoise, ParameterNoise, ReplayBuffer, OUNoise
# Build the custom DDPG agent and attach an `ActionNoise` instance to it.
# Both objects receive the same device and the same seed (0).
shared_kwargs = {"device": device, "seed": 0}

agent = DDPG_agent(state_size, act_size, **shared_kwargs)
noise = ActionNoise(act_size, **shared_kwargs)
agent.set_noise(noise)

In [6]:
## Save all rewards, energies and princip dims in files during training

def _write_episode_rows(path, episodes):
    """Write one text row per episode: values formatted with ``%f``, separated by spaces."""
    # The context manager closes the file even when formatting a value fails.
    with open(path, 'w') as out_file:
        for episode in episodes:
            # Flatten so that a bare scalar entry is written as a one-value row.
            values = np.ravel(np.asarray(episode, dtype=float))
            out_file.write(' '.join('%f' % value for value in values) + '\n')


def save_all(agent, rewards, energies, princip_dims):
    """Checkpoint the agent's networks and dump the training histories to text files.

    Parameters
    ----------
    agent : object exposing ``actor_local`` and ``critic_local``, each with a
        ``state_dict()`` method; the two state dicts are saved with ``torch.save``
        to 'checkpoint_actor.pth' and 'checkpoint_critic.pth'.
    rewards, energies, princip_dims : sequence of per-episode sequences of numbers.
        Written to 'rewards_RL_0.out', 'energies_RL_0.out' and
        'princip_dims_RL_0.out' respectively, one row per episode.

    Notes
    -----
    Episodes may have different lengths (the training loop stops an episode as
    soon as the environment reports ``done``), so rows are written one by one
    instead of going through ``np.savetxt``, which requires a rectangular array.
    For equally long episodes the output is the same text ``np.savetxt`` would
    produce with ``fmt="%f"``.
    All files are overwritten on every call.
    """
    torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
    torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')

    _write_episode_rows('rewards_RL_0.out', rewards)
    _write_episode_rows('energies_RL_0.out', energies)
    _write_episode_rows('princip_dims_RL_0.out', princip_dims)

def run_ddpg(max_t_step = 300, n_episodes=700):
    """Train the notebook-level ``agent`` on the notebook-level ``env``.

    Each episode resets both the environment and the agent, then runs at most
    ``max_t_step`` steps, stopping early when the environment reports ``done``.
    After every episode the full histories are written to disk through
    ``save_all`` and the episode's summed reward is printed.

    Parameters
    ----------
    max_t_step : maximum number of environment steps per episode.
    n_episodes : number of episodes to run.

    Returns
    -------
    (rewards, energies, princip_dims) : three lists with one inner list per
        episode, holding per step the reward, the first component of the new
        state, and ``env.princp_dim`` respectively.
    """
    rewards, energies, princip_dims = [], [], []

    for i_episode in range(n_episodes):
        state = env.reset()
        agent.reset()
        episode_rewards, episode_energies, episode_dims = [], [], []

        for _ in range(max_t_step):
            action = agent.act(state)
            next_state, reward, done, info = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state

            # Record what this step produced.
            episode_rewards.append(reward)
            episode_energies.append(next_state[0])
            episode_dims.append(env.princp_dim)

            if done:
                break

        rewards.append(episode_rewards)
        energies.append(episode_energies)
        princip_dims.append(episode_dims)

        # Write everything after each episode so an interrupted run keeps its data.
        save_all(agent, rewards, energies, princip_dims)

        print('Episode {} ... Score: {:.3f}'.format(i_episode, np.sum(episode_rewards)))

    return rewards, energies, princip_dims


In [7]:
# Run the training with its default settings and unpack the three histories it returns.
# NOTE(review): the recorded output of this cell ends in
# "TypeError: unsupported operand type(s) for +=: 'numpy.ndarray' and 'Tensor'";
# the failing `+=` is not in the training code shown in this notebook, so it is
# presumably inside the imported agent or the environment; confirm.
training_history = run_ddpg()
all_rewards, all_energies, all_princip_dim = training_history

#### CALL RESET ####
Action chosen at reset:  [0.]
Actions taken at reset:  []
Energies got at reset:  [0.0]


TypeError: unsupported operand type(s) for +=: 'numpy.ndarray' and 'Tensor'

In [None]:
import matplotlib.pyplot as plt

# Text logs to read back.
# NOTE(review): `save_all` in this notebook writes 'rewards_RL_0.out',
# 'energies_RL_0.out' and 'princip_dims_RL_0.out'; no visible cell writes the
# three files below, so confirm they exist before running this cell.
SCORES_LOG = 'scores_RL.out'
ENERGIES_LOG = 'energies_RL.out'
DIM_LOG = 'princip_dim_RL.out'

scores = np.loadtxt(SCORES_LOG)
energies = np.loadtxt(ENERGIES_LOG)
dim = np.loadtxt(DIM_LOG)

# Scores against their row index in the log.
episode_axis = np.arange(len(scores))
plt.plot(episode_axis, scores)
plt.ylabel('Score')
plt.xlabel('Episode #')

In [None]:
# Energies loaded in the previous cell, plotted against their row index in the log.
plt.plot(np.arange(len(energies)), energies)
plt.ylabel('Energies (mK)')  # axis label spelling corrected
plt.xlabel('Episode #')

In [None]:
# Plot the loaded `dim` series against its row index in the log.
# NOTE(review): the y-label carries the unit "(mK)", the same as the energy plot;
# confirm that a unit is really intended for this quantity.
episode_axis = np.arange(len(dim))
plt.plot(episode_axis, dim)
plt.ylabel('dim (mK)')
plt.xlabel('Episode #')

In [None]:
# Scratch check of the per-episode printing and saving logic on synthetic data:
# 10 fake episodes whose step rewards are 0, 1, ..., 19.
rewards = []

for i in range(10):
    rew_i_episode = list(range(20))
    rewards.append(rew_i_episode)
    print('Episode {} ... Score: {:.3f}'.format(i, np.sum(rewards[i])))

name_rewards = 'rewards_RL.out'
# The context manager closes the file even if np.savetxt raises.
with open(name_rewards, 'w') as file_rewards:
    np.savetxt(file_rewards, rewards, fmt="%f")

## Random search as in original SVM

In [None]:
# Baseline: drive the environment with randomly sampled actions until it reports
# `done`, keeping the cumulative reward after every step in `scores`.
state = env.reset()
scores = []
step = 0
score = 0.0
done = False

while not done:
    print(".....STEP.....", step)
    action = env.action_space.sample()
    next_state, reward, done, info = env.step(action)
    step += 1
    score += reward
    scores.append(score)
    state = next_state