# Stochastic Variational Method with RL algorithms

In [1]:
import numpy as np
import gym
import svm_env as svm
import torch
import subprocess

## Exploring the environment

In [2]:
# Create the SVM environment, telling it which file to use for the sigmas.
env = gym.make('svm_env:svmEnv-v0', file_sigmas="./svmCodeSVD/sigmas.dat")

# Collect both spaces and the length of their last axis up front; the
# later cells reuse `act_space` and `act_size`.
obs_space = env.observation_space
act_space = env.action_space
state_size = obs_space.shape[-1]
act_size = act_space.shape[-1]

print('###### Observation space ####### \n', obs_space)
print('###### Size of observation space ####### \n', state_size)
print('###### Action space ####### \n', act_space)
print('###### Number of actions ####### \n', act_size)

# Start an episode and look at the initial observation.
state = env.reset()
print('##### State after reset ###### \n', state)
print('##### File where will be stored sigmas \n', env.file_sigmas)

# Draw one random action; as the last expression of the cell it is displayed.
t = act_space.sample()
t

###### Observation space ####### 
 Box(-inf, inf, (1,), float32)
###### Size of observation space ####### 
 1
###### Action space ####### 
 Box(-1.0, 1.0, (3,), float32)
###### Number of actions ####### 
 3
#### CALL RESET ####
Action chosen at reset:  [0.]
Actions taken at reset:  []
Energies got at reset:  [0.0]
##### State after reset ###### 
 [0.]
##### File where will be stored sigmas 
 ./svmCodeSVD/sigmas.dat


array([ 0.9306232 ,  0.6120736 , -0.04571519], dtype=float32)

In [None]:
# Scratch check: the first observation component is exactly zero
# (this is the value printed for the state right after reset()).
if state[0] == 0.0:
    print('**Ciao')

In [None]:
def write_sigmas(env):
    """Overwrite ``env.file_sigmas`` with every entry of ``env.actions_taken``.

    The file handle is stored on ``env.sigmas``, exactly as the inline code
    this helper replaces did, but it is closed in a ``finally`` clause so a
    failing ``np.savetxt`` cannot leave the file open.

    Params Input
    ==========
        env: object exposing ``file_sigmas`` (path), ``actions_taken``
             (sequence of actions) and a writable ``sigmas`` attribute
    """
    env.sigmas = open(env.file_sigmas, 'w')
    try:
        np.savetxt(env.sigmas, env.actions_taken, fmt="%f")
    finally:
        env.sigmas.close()


# --- First random action -----------------------------------------------------
a1 = act_space.sample()
print(a1)

env.reset()
print(env.sigmas)

env.actions_taken.append(a1)
write_sigmas(env)

# --- Second random action ----------------------------------------------------
# The file is rewritten from scratch ('w' mode), so it always holds the whole
# list of actions taken so far, not only the newest one.
a2 = act_space.sample()
print(a2)

env.actions_taken.append(a2)
write_sigmas(env)

print(env.file_sigmas)

# --- Run the external SVM solver on the sigmas written above -----------------
# `subprocess` is already imported in the first cell of the notebook.
raw_lines = subprocess.check_output(
    ['./svmCodeSVD/svmThree', './svmCodeSVD/remmy.input', env.file_sigmas]
).splitlines()

print(raw_lines)

# The lines are parsed as floats and read positionally below, so the solver
# has to print at least three numeric lines.
result = np.array(raw_lines, dtype=float)
result_en = result[0]

princp_dim = int(result[1])
full_dim = int(result[2])
print(princp_dim, full_dim, len(env.actions_taken))

## DDPG from `stable_baselines3`

In [None]:
from stable_baselines3 import DDPG
from stable_baselines3.common.noise import (
    NormalActionNoise,
    OrnsteinUhlenbeckActionNoise,
)

# Exploration noise for DDPG: zero-mean Gaussian with sigma 0.1 on each of
# the `act_size` action components.
action_noise = NormalActionNoise(
    mean=np.zeros(act_size),
    sigma=np.full(act_size, 0.1),
)

# Constructor signature, kept for reference:
#   DDPG(policy, env, learning_rate=0.001, buffer_size=1000000,
#        learning_starts=100, batch_size=100, tau=0.005, gamma=0.99,
#        train_freq=(1, 'episode'), gradient_steps=-1, action_noise=None,
#        replay_buffer_class=None, replay_buffer_kwargs=None,
#        optimize_memory_usage=False, tensorboard_log=None,
#        create_eval_env=False, policy_kwargs=None, verbose=0, seed=None,
#        device='auto', _init_setup_model=True)
model = DDPG(
    policy="MlpPolicy",
    env=env,
    action_noise=action_noise,
    batch_size=128,
    gamma=1.0,
    verbose=1,
)

# learn() signature, kept for reference:
#   learn(total_timesteps, callback=None, log_interval=4, eval_env=None,
#         eval_freq=-1, n_eval_episodes=5, tb_log_name='DDPG',
#         eval_log_path=None, reset_num_timesteps=True)
model.learn(total_timesteps=1000, log_interval=5)

## PPO with GAE from `stable_baselines3`

In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

# Constructor signature, kept for reference:
#   PPO(policy, env, learning_rate=0.0003, n_steps=2048, batch_size=64,
#       n_epochs=10, gamma=0.99, gae_lambda=0.95, clip_range=0.2,
#       clip_range_vf=None, ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5,
#       use_sde=False, sde_sample_freq=-1, target_kl=None,
#       tensorboard_log=None, create_eval_env=False, policy_kwargs=None,
#       verbose=0, seed=None, device='auto', _init_setup_model=True)
# Only `n_steps` and `verbose` differ from those defaults here.
model = PPO(
    policy="MlpPolicy",
    env=env,
    n_steps=400,
    verbose=1,
)

# learn() signature, kept for reference:
#   learn(total_timesteps, callback=None, log_interval=1, eval_env=None,
#         eval_freq=-1, n_eval_episodes=5, tb_log_name='PPO',
#         eval_log_path=None, reset_num_timesteps=True)
model.learn(total_timesteps=1000)

# Persist the trained policy under the name "ppo_svm".
model.save("ppo_svm")

Using cpu device
Wrapping the env in a DummyVecEnv.
#### CALL RESET ####
Action chosen at reset:  [0.]
Actions taken at reset:  []
Energies got at reset:  [0.0]
#### CALL STEP ####
Action chosen at step:  [ 28.92881     6.0894737 110.       ]
Basis size (it should be the same of full dim) =   1
With this action the energy is:  0.709763
With this action the full dim is:  1  and princip dim is:  1
#### THE ENERGY IS GREATER THEN THE PREVIOUS ONE #### --> Set reward:  -8.09763
Store the energy got and sigmas!
#### CALL STEP ####
Action chosen at step:  [ 0.      59.73591  0.     ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [ 21.858734  29.405764 110.      ]
Basis size (it should be the same of full dim) =   2
With this action the energy is:  -0.034855
With this action the full dim is:  2  and princip dim is:  2
#### THE ACTION IS A GOOD ONE #### --> Store the ene

With this action the energy is:  -0.0627567
With this action the full dim is:  10  and princip dim is:  10
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is positive! 10.01285
Calculate the diff between dim: 
Diff 2 is:  0
#### CALL STEP ####
Action chosen at step:  [110.        0.       81.05438]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [ 5.6064224 22.199787  17.211151 ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [ 0.       29.324411 61.94048 ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [47.65933  57.549637  0.      ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVE

With this action the energy is:  -0.0876984
With this action the full dim is:  16  and princip dim is:  16
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is positive! 16.052239999999998
Calculate the diff between dim: 
Diff 2 is:  0
#### CALL STEP ####
Action chosen at step:  [61.809902  0.       70.97414 ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [  0.        39.258224 104.28931 ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [110. 110.   0.]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [  8.436375  99.798096 109.13034 ]
Basis size (it should be the same of full dim) =   17
With this action

With this action the energy is:  -0.143896
With this action the full dim is:  26  and princip dim is:  26
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is positive! 26.00104
Calculate the diff between dim: 
Diff 2 is:  0
#### CALL STEP ####
Action chosen at step:  [ 25.41736  39.87423 108.80887]
Basis size (it should be the same of full dim) =   27
With this action the energy is:  -0.143914
With this action the full dim is:  27  and princip dim is:  27
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is positive! 27.004859999999997
Calculate the diff between dim: 
Diff 2 is:  0
#### CALL STEP ####
Action chosen at step:  [87.23604 45.05281 78.89728]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [ 77.531494  36.41444  105.9547  ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and si

With this action the energy is:  -0.146304
With this action the full dim is:  37  and princip dim is:  37
#### THE ENERGY IS GREATER THEN THE PREVIOUS ONE #### --> Set reward:  -1.0
Store the energy got and sigmas!
#### CALL STEP ####
Action chosen at step:  [  0.      103.72279   0.     ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [57.011288 43.81432  42.55438 ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [15.677704 50.65246  42.98219 ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [  0.       66.39885 110.     ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas,

With this action the energy is:  -0.146547
With this action the full dim is:  45  and princip dim is:  45
#### THE ENERGY IS GREATER THEN THE PREVIOUS ONE #### --> Set reward:  -1.0
Store the energy got and sigmas!
#### CALL STEP ####
Action chosen at step:  [ 57.141113 105.56732   66.55561 ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [ 73.41958  98.42705 110.     ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [48.927338  0.       46.698547]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [ 0.       54.36096  14.426769]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigm

With this action the energy is:  -0.146946
With this action the full dim is:  51  and princip dim is:  51
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is positive! 51.10607999999999
Calculate the diff between dim: 
Diff 2 is:  0
#### CALL STEP ####
Action chosen at step:  [45.479362 28.560253 14.833157]
Basis size (it should be the same of full dim) =   52
With this action the energy is:  -0.147118
With this action the full dim is:  52  and princip dim is:  52
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is positive! 52.08944000000001
Calculate the diff between dim: 
Diff 2 is:  0
#### CALL STEP ####
Action chosen at step:  [ 0.      28.28413 83.20441]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [51.910927 56.967167 25.516321]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken a

With this action the energy is:  -0.147381
With this action the full dim is:  60  and princip dim is:  60
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is positive! 60.019200000000005
Calculate the diff between dim: 
Diff 2 is:  0
#### CALL STEP ####
Action chosen at step:  [84.54381 93.85034 61.8299 ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [110.        46.770645  23.507866]
Basis size (it should be the same of full dim) =   61
With this action the energy is:  -0.1474
With this action the full dim is:  61  and princip dim is:  61
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is positive! 61.01159
Calculate the diff between dim: 
Diff 2 is:  0
#### CALL STEP ####
Action chosen at step:  [ 0.          0.27539825 18.028866  ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken an

With this action the energy is:  -0.147421
With this action the full dim is:  67  and princip dim is:  67
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is positive! 67.00670000000001
Calculate the diff between dim: 
Diff 2 is:  0
#### CALL STEP ####
Action chosen at step:  [58.505463   0.         4.2065353]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [104.585815    0.5519562   0.       ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [60.99921  23.072094  0.      ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [0.        4.1682663 5.689522 ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This ac

With this action the energy is:  -0.147452
With this action the full dim is:  74  and princip dim is:  74
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is positive! 74.00074000000001
Calculate the diff between dim: 
Diff 2 is:  0
#### CALL STEP ####
Action chosen at step:  [35.89598  0.       0.     ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [93.170975 25.571173  0.      ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [110.   0.   0.]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [82.41399  73.42864  17.523952]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from action

With this action the energy is:  -0.148921
With this action the full dim is:  84  and princip dim is:  84
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is positive! 84.00924
Calculate the diff between dim: 
Diff 2 is:  0
#### CALL STEP ####
Action chosen at step:  [ 39.06659 110.      110.     ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [18.007343 36.191216 67.02509 ]
Basis size (it should be the same of full dim) =   85
With this action the energy is:  -0.148924
With this action the full dim is:  85  and princip dim is:  85
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is positive! 85.00255
Calculate the diff between dim: 
Diff 2 is:  0
#### CALL STEP ####
Action chosen at step:  [38.881527 15.938    53.24577 ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the 

With this action the energy is:  -0.149166
With this action the full dim is:  97  and princip dim is:  97
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is positive! 97.21049000000001
Calculate the diff between dim: 
Diff 2 is:  0
#### CALL STEP ####
Action chosen at step:  [48.379623   2.7685242  0.       ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [ 4.681492   1.9850883 91.73239  ]
Basis size (it should be the same of full dim) =   98
With this action the energy is:  -0.149169
With this action the full dim is:  98  and princip dim is:  98
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is positive! 98.00294
Calculate the diff between dim: 
Diff 2 is:  0
#### CALL STEP ####
Action chosen at step:  [32.894646 48.527596 42.202988]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken a

With this action the energy is:  -0.149173
With this action the full dim is:  106  and princip dim is:  106
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is positive! 106.00106000000001
Calculate the diff between dim: 
Diff 2 is:  0
#### CALL STEP ####
Action chosen at step:  [14.746563 48.66565   0.      ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [35.890446 70.10835  47.213726]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [24.300049 56.56802  56.786392]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [ 0.      39.10328  0.     ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS R

With this action the energy is:  -0.149199
With this action the full dim is:  115  and princip dim is:  114
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is positive! 114.00228000000001
Calculate the diff between dim: 
Diff 2 is:  1
Add a small PENALTY on the rewards!!
Reward is slightly negative:  -22.800456000000004
#### CALL STEP ####
Action chosen at step:  [ 51.385788 102.61675   45.07635 ]
Basis size (it should be the same of full dim) =   116
With this action the energy is:  -0.149198
With this action the full dim is:  116  and princip dim is:  115
#### THE ENERGY IS GREATER THEN THE PREVIOUS ONE #### --> Set reward:  -1.00001
Store the energy got and sigmas!
#### CALL STEP ####
Action chosen at step:  [ 71.36051  110.        16.628319]
Basis size (it should be the same of full dim) =   117
With this action the energy is:  -0.149199
With this action the full dim is:  117  and princip dim is:  116
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!


With this action the energy is:  -0.149405
With this action the full dim is:  129  and princip dim is:  124
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is positive! 124.22816
Calculate the diff between dim: 
Diff 2 is:  3
Add a small PENALTY on the rewards!!
Reward is slightly negative:  -74.53689600000001
#### CALL STEP ####
Action chosen at step:  [16.419556 86.43947  51.94674 ]
Basis size (it should be the same of full dim) =   130
With this action the energy is:  -0.149406
With this action the full dim is:  130  and princip dim is:  125
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is positive! 125.00125000000001
Calculate the diff between dim: 
Diff 2 is:  0
#### CALL STEP ####
Action chosen at step:  [59.03829  89.755646 81.70368 ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [110. 110.   0.]
**** ILLEGAL ACTIO

With this action the energy is:  -0.149408
With this action the full dim is:  141  and princip dim is:  134
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is positive! 134.00268000000003
Calculate the diff between dim: 
Diff 2 is:  2
Add a small PENALTY on the rewards!!
Reward is slightly negative:  -53.601072000000016
#### CALL STEP ####
Action chosen at step:  [110.       110.        76.172874]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [ 82.958984  27.272837 110.      ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [54.98607  50.483562 78.56632 ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step: 

With this action the energy is:  -0.149411
With this action the full dim is:  151  and princip dim is:  142
#### THE ENERGY IS GREATER THEN THE PREVIOUS ONE #### --> Set reward:  -1.0
Store the energy got and sigmas!
#### CALL STEP ####
Action chosen at step:  [ 35.7482      7.1151657 102.3874   ]
Basis size (it should be the same of full dim) =   152
With this action the energy is:  -0.149411
With this action the full dim is:  152  and princip dim is:  143
#### THE ENERGY IS GREATER THEN THE PREVIOUS ONE #### --> Set reward:  -1.0
Store the energy got and sigmas!
#### CALL STEP ####
Action chosen at step:  [  0.        58.276245 110.      ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [82.23364  16.591022 60.921646]
Basis size (it should be the same of full dim) =   153
With this action the energy is:  -0.149412
With this action the full dim is:  153  and princ

With this action the energy is:  -0.149435
With this action the full dim is:  161  and princip dim is:  149
#### THE ENERGY IS GREATER THEN THE PREVIOUS ONE #### --> Set reward:  -1.0
Store the energy got and sigmas!
#### CALL STEP ####
Action chosen at step:  [110.        42.386753 110.      ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [ 0.7193756 38.515167  61.63408  ]
Basis size (it should be the same of full dim) =   162
With this action the energy is:  -0.149435
With this action the full dim is:  162  and princip dim is:  150
#### THE ENERGY IS GREATER THEN THE PREVIOUS ONE #### --> Set reward:  -1.0
Store the energy got and sigmas!
#### CALL STEP ####
Action chosen at step:  [64.05558    3.6096725 44.48127  ]
Basis size (it should be the same of full dim) =   163
With this action the energy is:  -0.149435
With this action the full dim is:  163  and princ

With this action the energy is:  -0.149438
With this action the full dim is:  169  and princip dim is:  154
#### THE ENERGY IS GREATER THEN THE PREVIOUS ONE #### --> Set reward:  -1.00001
Store the energy got and sigmas!
#### CALL STEP ####
Action chosen at step:  [ 8.142887 69.68777   0.      ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [110.       84.41571   0.     ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [57.872345 80.50345  68.885185]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [35.629333  0.        0.      ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and s

With this action the energy is:  -0.149459
With this action the full dim is:  179  and princip dim is:  161
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is positive! 161.00966
Calculate the diff between dim: 
Diff 2 is:  1
Add a small PENALTY on the rewards!!
Reward is slightly negative:  -32.201932
#### CALL STEP ####
Action chosen at step:  [109.91231   0.      110.     ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [110.       110.        32.698112]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [66.21844  31.295305 35.582382]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [110.   0.   0.]
***

With this action the energy is:  -0.149463
With this action the full dim is:  187  and princip dim is:  165
#### THE ENERGY IS GREATER THEN THE PREVIOUS ONE #### --> Set reward:  -1.0
Store the energy got and sigmas!
#### CALL STEP ####
Action chosen at step:  [ 61.909634 110.        93.968254]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [15.447433 69.2099   67.7636  ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [15.213711 68.59108   4.70393 ]
Basis size (it should be the same of full dim) =   188
With this action the energy is:  -0.149469
With this action the full dim is:  188  and princip dim is:  166
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is positive! 166.00995999999995
Calculate the diff between dim: 
Di

With this action the energy is:  -0.149655
With this action the full dim is:  195  and princip dim is:  168
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is positive! 168.01680000000005
Calculate the diff between dim: 
Diff 2 is:  2
Add a small PENALTY on the rewards!!
Reward is slightly negative:  -67.20672000000002
#### CALL STEP ####
Action chosen at step:  [ 77.60808 110.      110.     ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [51.54504  55.092197 90.59973 ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [66.17936   8.129768 79.00102 ]
Basis size (it should be the same of full dim) =   196
With this action the energy is:  -0.149655
With this action the full dim is:  196  and princip dim is:  169
#### THE ENERG

With this action the energy is:  -0.149646
With this action the full dim is:  207  and princip dim is:  176
#### THE ACTION IS A GOOD ONE #### --> Store the energy got!
Reward is positive! 176.00176000000002
Calculate the diff between dim: 
Diff 2 is:  1
Add a small PENALTY on the rewards!!
Reward is slightly negative:  -35.200352
#### CALL STEP ####
Action chosen at step:  [ 47.535343    1.4769783 110.       ]
Basis size (it should be the same of full dim) =   208
With this action the energy is:  -0.149646
With this action the full dim is:  208  and princip dim is:  177
#### THE ENERGY IS GREATER THEN THE PREVIOUS ONE #### --> Set reward:  -1.0
Store the energy got and sigmas!
#### CALL STEP ####
Action chosen at step:  [72.909096 70.06908  37.61917 ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [ 52.414494 110.       110.      ]
**** ILLEGAL ACTION **** --> Se

With this action the energy is:  -0.149641
With this action the full dim is:  212  and princip dim is:  178
#### THE ENERGY IS GREATER THEN THE PREVIOUS ONE #### --> Set reward:  -1.00003
Store the energy got and sigmas!
#### CALL STEP ####
Action chosen at step:  [94.41521  94.43801  85.993744]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [ 0.36003494  0.         57.601097  ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [39.50439  53.75067  62.442524]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [  0.   0. 110.]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, th

With this action the energy is:  -0.149634
With this action the full dim is:  221  and princip dim is:  180
#### THE ENERGY IS GREATER THEN THE PREVIOUS ONE #### --> Set reward:  -1.00004
Store the energy got and sigmas!
#### CALL STEP ####
Action chosen at step:  [ 0.       0.      72.43876]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [ 64.772606 110.       110.      ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [ 0.       0.      50.46089]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [105.63199    5.541088   0.      ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and s

With this action the energy is:  -0.149632
With this action the full dim is:  230  and princip dim is:  183
#### THE ENERGY IS GREATER THEN THE PREVIOUS ONE #### --> Set reward:  -1.0
Store the energy got and sigmas!
#### CALL STEP ####
Action chosen at step:  [  0.        32.584652 110.      ]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [89.9527    0.       24.339607]
**** ILLEGAL ACTION **** --> Set reward: -10.0
This action IS REMOVED from actions taken and sigmas, the energy is NOT STORED!
#### CALL STEP ####
Action chosen at step:  [34.26177  20.325157 71.969986]
Basis size (it should be the same of full dim) =   231
With this action the energy is:  -0.149632
With this action the full dim is:  231  and princip dim is:  183
#### THE ENERGY IS GREATER THEN THE PREVIOUS ONE #### --> Set reward:  -1.0
Store the energy got and sigmas!
#### CALL STEP ####
Action

## From my `ddpg_agent.py` code

In [None]:
from ddpg_agent import Agent

In [None]:
# Re-create the environment, this time without passing `file_sigmas`.
# NOTE(review): presumably the environment then falls back to its own default
# sigma file - confirm in svm_env.
env = gym.make('svm_env:svmEnv-v0')
# Instance of the ddpg agent
# NOTE(review): the positional 1 and 3 presumably are the state and action
# sizes (they match the sizes printed by the exploration cell) - confirm
# against the Agent signature in ddpg_agent.py.
agent = Agent(1, 3, random_seed=2)

### Training loop
def run_ddpg(max_t_step=500, n_episodes=1000):
    """Train the module-level DDPG ``agent`` on the module-level ``env``.

    Every episode resets both ``env`` and ``agent`` and then repeats
    ``agent.act`` -> ``env.step`` -> ``agent.step`` until the environment
    reports ``done`` or ``max_t_step`` steps have been taken. One summary line
    is printed per episode.

    Params Input
    ==========
        max_t_step (int): maximum number of environment steps per episode
        n_episodes (int): number of episodes to run

    Params Output
    ==========
        scores (list of floats): sum of the rewards collected in each episode
        last_energies (list): first component of the final state of each episode
        princip_dim (list): value of ``env.princp_dim`` at the end of each episode

    """
    scores, last_energies, princip_dim = [], [], []

    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        agent.reset()
        score = 0.0

        # Interaction loop of a single episode
        for _ in range(max_t_step):
            action = agent.act(state)
            next_state, reward, done, info = env.step(action)
            # Hand the full transition to the agent before advancing the state
            agent.step(state, action, reward, next_state, done)
            score += reward
            state = next_state
            if done:
                break

        # Per-episode bookkeeping, taken from the state the episode ended in
        scores.append(score)
        last_energies.append(state[0])
        princip_dim.append(env.princp_dim)

        print('Episode {} ... Reward: {:.3f}'.format(i_episode, score))

    return scores, last_energies, princip_dim

In [None]:
# Train with run_ddpg's default limits and keep the three per-episode lists.
scores, energies, princip_dim = run_ddpg()
# Save the weights of the agent's local actor and critic networks so they can
# be reloaded later without retraining.
torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')

In [None]:
# A bare expression is only displayed when it is the last line of a cell, so
# the length has to be printed explicitly to show up next to the file name.
print(len(env.actions_taken))
print(env.file_sigmas)

In [None]:
import matplotlib.pyplot as plt
# Load previously saved results from disk. These assignments replace the
# `scores` and `energies` returned by run_ddpg() in the cell above.
# NOTE(review): nothing in the visible notebook writes these .out files -
# presumably they come from a separate training run; confirm their origin.
scores = np.loadtxt('scores_RL_tri_0.out')
energies = np.loadtxt('energies_RL_tri_0.out')
dim = np.loadtxt('princip_dim_tri_0.out')

# Score against its index in the loaded array (one entry per episode,
# according to the x label).
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')

In [None]:
# Energy against its index in the loaded array (one entry per episode,
# according to the x label).
plt.plot(np.arange(len(energies)), energies)
plt.ylabel('Energies (mK)')
plt.xlabel('Episode #')

In [None]:
# Basis dimension against its index in the loaded array. The dimension is a
# count, so the label carries no energy unit.
plt.plot(np.arange(len(dim)), dim)
plt.ylabel('Principal dimension')
plt.xlabel('Episode #')

## Random search as in the original SVM

In [None]:
# Baseline without any learning: sample actions uniformly from the action
# space until the environment signals the end of the episode.
state = env.reset()
scores = []            # cumulative reward after each step
step, score = 0, 0.0
done = False

while not done:
    print(".....STEP.....", step)
    action = env.action_space.sample()
    next_state, reward, done, info = env.step(action)
    step += 1
    score += reward
    scores.append(score)
    state = next_state

In [None]:
# Print 0..9, leaving out 2.
for i in range(10):
    if i != 2:
        print(i)