# Stochastic Variational Method with RL algorithms

In [1]:
import subprocess

import gym
import numpy as np
import torch

import svm_env as svm

## Exploring the environment

In [2]:
# Create the SVM environment; `file_sigmas` is the path where the sigmas will be stored.
env = gym.make('svm_env:svmEnv-v0', file_sigmas ="./svmCodeSVD/sigmas.dat" )

# --- Observation space and the size of its last axis ---
obs_space = env.observation_space
print('###### Observation space ####### \n', obs_space)
state_size = obs_space.shape[-1]
print('###### Size of observation space ####### \n', state_size)

# --- Action space and the size of its last axis ---
act_space = env.action_space
print('###### Action space ####### \n', act_space)
act_size = act_space.shape[-1]
print('###### Number of actions ####### \n', act_size)

# --- State returned by a fresh reset, and the sigmas file in use ---
state = env.reset()
print('##### State after reset ###### \n', state)
print('##### File where will be stored sigmas \n', env.file_sigmas)

# Draw one random action; the value is discarded because only the last
# expression of a cell is displayed.
act_space.sample()

# Displayed as the cell output.
env.princp_dim

###### Observation space ####### 
 Box(-inf, inf, (1,), float32)
###### Size of observation space ####### 
 1
###### Action space ####### 
 Box(-1.0, 1.0, (3,), float32)
###### Number of actions ####### 
 3
*****CALL RESET******
Action chosen at reset:  [0.]
Actions taken at reset:  []
Energies got at reset:  [0.0]
##### State after reset ###### 
 [0.]
##### File where will be stored sigmas 
 ./svmCodeSVD/sigmas.dat


0

In [None]:
def _write_actions_taken(environment):
    """Overwrite ``environment.file_sigmas`` with all of ``environment.actions_taken``.

    The open handle is stored on ``environment.sigmas`` exactly as the inline
    code did, but it is now closed even when ``np.savetxt`` raises, so a
    failed write no longer leaks the file descriptor.
    """
    environment.sigmas = open(environment.file_sigmas, 'w')
    try:
        np.savetxt(environment.sigmas, environment.actions_taken, fmt="%f")
    finally:
        environment.sigmas.close()


# First random action: reset the environment, record the action, dump it to file.
a1 = act_space.sample()
print(a1)

env.reset()
print(env.sigmas)

env.actions_taken.append(a1)
_write_actions_taken(env)

# Second random action: the file is rewritten with both actions.
a2 = act_space.sample()
print(a2)

env.actions_taken.append(a2)
_write_actions_taken(env)

print(env.file_sigmas)

# Run the external SVM executable on the sigmas file and collect its stdout lines.
result = subprocess.check_output(['./svmCodeSVD/svmThree', './svmCodeSVD/remmy.input', env.file_sigmas]).splitlines()

print(result)

# Each stdout line is parsed as a float.
# NOTE(review): presumably the lines are energy, principal dim, full dim in
# that order -- confirm against the svmThree output format.
result = np.array(result,dtype=float)
result_en = result[0]

princp_dim = int(result[1])
full_dim = int(result[2])
print(princp_dim, full_dim, len(env.actions_taken))

## DDPG from `stable_baselines3`

In [None]:
from stable_baselines3 import DDPG
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

# Exploration noise for DDPG: one zero-mean Gaussian component per action
# dimension, with sigma = 0.1.
noise_mean = np.zeros(act_size)
noise_sigma = 0.1 * np.ones(act_size)
action_noise = NormalActionNoise(mean=noise_mean, sigma=noise_sigma)

# DDPG constructor signature, kept for reference:
# (policy, env, learning_rate=0.001, buffer_size=1000000,learning_starts=100, batch_size=100,
# tau=0.005, gamma=0.99, train_freq=(1, 'episode'),  gradient_steps=- 1, action_noise=None,
# replay_buffer_class=None, replay_buffer_kwargs=None,  optimize_memory_usage=False,
# tensorboard_log=None, create_eval_env=False, policy_kwargs=None,  verbose=0, seed=None,
# device='auto', _init_setup_model=True)
model = DDPG(
    "MlpPolicy",
    env,
    action_noise=action_noise,
    batch_size=128,
    gamma=1.0,
    verbose=1,
)

# learn() signature, kept for reference:
# learn(total_timesteps, callback=None, log_interval=4, eval_env=None, eval_freq=- 1,
# n_eval_episodes=5, tb_log_name='DDPG', eval_log_path=None, reset_num_timesteps=True)
model.learn(total_timesteps=1000, log_interval=5)

## From my `ddpg_agent.py` code

In [3]:
from ddpg_agent import Agent

In [4]:
env = gym.make('svm_env:svmEnv-v0')
# Instance of the ddpg agent. The two sizes are read from the environment
# (last axis of the observation and action spaces) instead of being
# hard-coded as 1 and 3, so the agent stays in sync if the spaces change.
# NOTE(review): assumes Agent's first two positional arguments are
# (state_size, action_size) -- confirm against ddpg_agent.py.
agent = Agent(env.observation_space.shape[-1], env.action_space.shape[-1], random_seed=2)

### Training loop
def run_ddpg(max_t_step = 300, n_episodes = 3, environment = None, ddpg_agent = None):
    """Run DDPG training episodes of an agent against an environment.

    Params Input
    ==========
        max_t_step (int): maximum number of environment steps per episode
        n_episodes (int): number of episodes to run
        environment: object providing ``reset()``, ``step(action)`` and the
            attribute ``princp_dim``; when None, the notebook-level ``env`` is used
        ddpg_agent: object providing ``reset()``, ``act(state)`` and
            ``step(state, action, reward, next_state, done)``; when None, the
            notebook-level ``agent`` is used

    Params Output
    ==========
        scores (list of floats): sum of the rewards collected in each episode
        last_energies (list): first component of the final state of each episode
            (presumably the last energy -- confirm against the environment)
        princip_dim (list): value of ``environment.princp_dim`` at the end of each episode

    """

    # Fall back to the notebook globals so that existing `run_ddpg()` calls
    # behave exactly as before.
    if environment is None:
        environment = env
    if ddpg_agent is None:
        ddpg_agent = agent

    ## Initialization
    scores = []
    last_energies = []
    princip_dim = []
    for i_episode in range(1, n_episodes+1):
        state = environment.reset()
        ddpg_agent.reset()
        score = 0.0

        ## Interaction loop of each episode: act, step the environment,
        ## hand the transition to the agent, and stop early on `done`.
        for t_step in range(max_t_step):
            action = ddpg_agent.act(state)
            next_state, reward, done, info = environment.step(action)
            ddpg_agent.step(state, action, reward, next_state, done)
            score += reward
            state = next_state
            if done:
                break

        scores.append(score)
        last_energies.append(state[0])
        princip_dim.append(environment.princp_dim)

        print('Episode {} ... Reward: {:.3f}'.format(i_episode, score))

    return scores, last_energies, princip_dim

In [5]:
# Train with the default settings, then checkpoint the actor and critic weights.
scores, energies, princip_dim = run_ddpg()
for network, checkpoint_path in (
    (agent.actor_local, 'checkpoint_actor.pth'),
    (agent.critic_local, 'checkpoint_critic.pth'),
):
    torch.save(network.state_dict(), checkpoint_path)

*****CALL RESET******
Action chosen at reset:  [0.]
Actions taken at reset:  []
Energies got at reset:  [0.0]
****CALL STEP****
Action chosen at step:  [47.826847 58.96609  49.90885 ]
****CALL STEP****
Action chosen at step:  [  0.      107.97255   0.     ]
****CALL STEP****
Action chosen at step:  [  0. 110.   0.]
****CALL STEP****
Action chosen at step:  [  0. 110.   0.]
****CALL STEP****
Action chosen at step:  [  0. 110.   0.]
****CALL STEP****
Action chosen at step:  [  0. 110.   0.]
****CALL STEP****
Action chosen at step:  [  0.34788895 110.           0.        ]
****CALL STEP****
Action chosen at step:  [ 0.      94.45726  0.     ]
****CALL STEP****
Action chosen at step:  [  0.      105.98485   0.     ]
****CALL STEP****
Action chosen at step:  [  0.14922714 110.           0.        ]
****CALL STEP****
Action chosen at step:  [  1.5977974 103.846634   26.985865 ]
## Basis size (it should be the same of full dim) =   1
With this action the energy is:  24.1239
With this action t

With this action the energy is:  -0.0758396
With this action the full dim is:  12  and princip dim is:  12
The new action:  [110.         19.612698    4.8099365]  makes the energy positive: False
The new action:  [110.         19.612698    4.8099365]  makes the energy greater than:  -0.0749561  the previous one:  False
Store the energy got!
Reward is positive! 12.10602
Calculate the diff between dim: 
Diff 2:  0
****CALL STEP****
Action chosen at step:  [78.913956 50.000767 67.77232 ]
****CALL STEP****
Action chosen at step:  [110.   0.   0.]
****CALL STEP****
Action chosen at step:  [95.28731   0.       14.225101]
****CALL STEP****
Action chosen at step:  [97.19005   9.686184  0.      ]
****CALL STEP****
Action chosen at step:  [87.66123  0.       0.     ]
****CALL STEP****
Action chosen at step:  [104.92968   0.        0.     ]
****CALL STEP****
Action chosen at step:  [110.   0.   0.]
****CALL STEP****
Action chosen at step:  [110.          4.9133034   0.       ]
****CALL STEP****
A

With this action the energy is:  -0.136099
With this action the full dim is:  23  and princip dim is:  23
The new action:  [110.         59.217617    3.5834885]  makes the energy positive: False
The new action:  [110.         59.217617    3.5834885]  makes the energy greater than:  -0.136096  the previous one:  False
Store the energy got!
Reward is positive! 23.00069
Calculate the diff between dim: 
Diff 2:  0
****CALL STEP****
Action chosen at step:  [110.        79.93565   59.318104]
****CALL STEP****
Action chosen at step:  [110.        38.71402   41.726883]
## Basis size (it should be the same of full dim) =   24
With this action the energy is:  -0.136224
With this action the full dim is:  24  and princip dim is:  24
The new action:  [110.        38.71402   41.726883]  makes the energy positive: False
The new action:  [110.        38.71402   41.726883]  makes the energy greater than:  -0.136099  the previous one:  False
Store the energy got!
Reward is positive! 24.030000000000005
C

With this action the energy is:  -0.143099
With this action the full dim is:  36  and princip dim is:  36
The new action:  [110.          7.3686867   4.418476 ]  makes the energy positive: False
The new action:  [110.          7.3686867   4.418476 ]  makes the energy greater than:  -0.137783  the previous one:  False
Store the energy got!
Reward is positive! 37.91376
Calculate the diff between dim: 
Diff 2:  0
****CALL STEP****
Action chosen at step:  [66.66481  47.853046 38.606995]
****CALL STEP****
Action chosen at step:  [109.30146   14.402859   0.      ]
****CALL STEP****
Action chosen at step:  [110.        16.068817   0.      ]
****CALL STEP****
Action chosen at step:  [110.          5.2387924   0.       ]
****CALL STEP****
Action chosen at step:  [110.        10.746021   0.      ]
****CALL STEP****
Action chosen at step:  [110.        53.820946   0.      ]
****CALL STEP****
Action chosen at step:  [42.841446 61.66693   0.      ]
****CALL STEP****
Action chosen at step:  [46.7838

With this action the energy is:  -0.1487
With this action the full dim is:  46  and princip dim is:  46
The new action:  [  9.941891  25.971691 110.      ]  makes the energy positive: False
The new action:  [  9.941891  25.971691 110.      ]  makes the energy greater than:  -0.148411  the previous one:  False
Store the energy got!
Reward is positive! 46.132940000000005
Calculate the diff between dim: 
Diff 2:  0
****CALL STEP****
Action chosen at step:  [12.285591 16.99459  84.04632 ]
## Basis size (it should be the same of full dim) =   47
With this action the energy is:  -0.148759
With this action the full dim is:  47  and princip dim is:  47
The new action:  [12.285591 16.99459  84.04632 ]  makes the energy positive: False
The new action:  [12.285591 16.99459  84.04632 ]  makes the energy greater than:  -0.1487  the previous one:  False
Store the energy got!
Reward is positive! 47.027730000000005
Calculate the diff between dim: 
Diff 2:  0
****CALL STEP****
Action chosen at step:  [

With this action the energy is:  -0.149213
With this action the full dim is:  56  and princip dim is:  56
The new action:  [ 15.235062    3.7774544 110.       ]  makes the energy positive: False
The new action:  [ 15.235062    3.7774544 110.       ]  makes the energy greater than:  -0.149162  the previous one:  False
Store the energy got!
Reward is positive! 56.02856000000001
Calculate the diff between dim: 
Diff 2:  0
****CALL STEP****
Action chosen at step:  [19.797035  8.292805 86.74638 ]
## Basis size (it should be the same of full dim) =   57
With this action the energy is:  -0.149228
With this action the full dim is:  57  and princip dim is:  57
The new action:  [19.797035  8.292805 86.74638 ]  makes the energy positive: False
The new action:  [19.797035  8.292805 86.74638 ]  makes the energy greater than:  -0.149213  the previous one:  False
Store the energy got!
Reward is positive! 57.00854999999999
Calculate the diff between dim: 
Diff 2:  0
****CALL STEP****
Action chosen at 

With this action the energy is:  -0.00836943
With this action the full dim is:  2  and princip dim is:  2
The new action:  [54.156376 30.50951  88.278   ]  makes the energy positive: False
The new action:  [54.156376 30.50951  88.278   ]  makes the energy greater than:  -0.00628616  the previous one:  False
Store the energy got!
Reward is positive! 2.0416654
Calculate the diff between dim: 
Diff 2:  0
****CALL STEP****
Action chosen at step:  [61.49713  28.514767 81.13431 ]
****CALL STEP****
Action chosen at step:  [42.844433 46.23662  70.334305]
****CALL STEP****
Action chosen at step:  [15.003979 29.687037 76.4176  ]
## Basis size (it should be the same of full dim) =   3
With this action the energy is:  -0.0465355
With this action the full dim is:  3  and princip dim is:  3
The new action:  [15.003979 29.687037 76.4176  ]  makes the energy positive: False
The new action:  [15.003979 29.687037 76.4176  ]  makes the energy greater than:  -0.00836943  the previous one:  False
Store the

With this action the energy is:  -0.129245
With this action the full dim is:  9  and princip dim is:  9
The new action:  [ 19.080135  48.154255 110.      ]  makes the energy positive: False
The new action:  [ 19.080135  48.154255 110.      ]  makes the energy greater than:  -0.129153  the previous one:  False
Store the energy got!
Reward is positive! 9.008280000000001
Calculate the diff between dim: 
Diff 2:  0
****CALL STEP****
Action chosen at step:  [  9.38636   37.793617 110.      ]
## Basis size (it should be the same of full dim) =   10
With this action the energy is:  -0.130629
With this action the full dim is:  10  and princip dim is:  10
The new action:  [  9.38636   37.793617 110.      ]  makes the energy positive: False
The new action:  [  9.38636   37.793617 110.      ]  makes the energy greater than:  -0.129245  the previous one:  False
Store the energy got!
Reward is positive! 10.1384
Calculate the diff between dim: 
Diff 2:  0
****CALL STEP****
Action chosen at step:  [ 

With this action the energy is:  -0.138043
With this action the full dim is:  20  and princip dim is:  20
The new action:  [ 8.090267 60.92928  93.24785 ]  makes the energy positive: False
The new action:  [ 8.090267 60.92928  93.24785 ]  makes the energy greater than:  -0.138007  the previous one:  False
Store the energy got!
Reward is positive! 20.007200000000005
Calculate the diff between dim: 
Diff 2:  0
****CALL STEP****
Action chosen at step:  [18.87703 33.73704 98.7754 ]
## Basis size (it should be the same of full dim) =   21
With this action the energy is:  -0.138172
With this action the full dim is:  21  and princip dim is:  21
The new action:  [18.87703 33.73704 98.7754 ]  makes the energy positive: False
The new action:  [18.87703 33.73704 98.7754 ]  makes the energy greater than:  -0.138043  the previous one:  False
Store the energy got!
Reward is positive! 21.02709
Calculate the diff between dim: 
Diff 2:  0
****CALL STEP****
Action chosen at step:  [  0.        22.394775

With this action the energy is:  -0.146614
With this action the full dim is:  29  and princip dim is:  29
The new action:  [ 33.89521     3.2747993 110.       ]  makes the energy positive: False
The new action:  [ 33.89521     3.2747993 110.       ]  makes the energy greater than:  -0.146612  the previous one:  False
Store the energy got!
Reward is positive! 29.000580000000003
Calculate the diff between dim: 
Diff 2:  0
****CALL STEP****
Action chosen at step:  [ 29.616493   0.       110.      ]
****CALL STEP****
Action chosen at step:  [ 18.946278   0.       110.      ]
****CALL STEP****
Action chosen at step:  [  2.575512   0.       110.      ]
****CALL STEP****
Action chosen at step:  [  0.        0.      103.70999]
****CALL STEP****
Action chosen at step:  [  0.         0.       108.941795]
****CALL STEP****
Action chosen at step:  [  0.        15.049015 106.37453 ]
****CALL STEP****
Action chosen at step:  [  4.7558556   0.        110.       ]
****CALL STEP****
Action chosen at st

With this action the energy is:  -0.147745
With this action the full dim is:  41  and princip dim is:  41
The new action:  [ 49.0718     9.815765 110.      ]  makes the energy positive: False
The new action:  [ 49.0718     9.815765 110.      ]  makes the energy greater than:  -0.147737  the previous one:  False
Store the energy got!
Reward is positive! 41.00328
Calculate the diff between dim: 
Diff 2:  0
****CALL STEP****
Action chosen at step:  [51.96994  20.389626 91.01665 ]
## Basis size (it should be the same of full dim) =   42
With this action the energy is:  -0.147758
With this action the full dim is:  42  and princip dim is:  42
The new action:  [51.96994  20.389626 91.01665 ]  makes the energy positive: False
The new action:  [51.96994  20.389626 91.01665 ]  makes the energy greater than:  -0.147745  the previous one:  False
Store the energy got!
Reward is positive! 42.00546000000001
Calculate the diff between dim: 
Diff 2:  0
****CALL STEP****
Action chosen at step:  [ 60.761

With this action the energy is:  -0.0801868
With this action the full dim is:  4  and princip dim is:  4
The new action:  [  0.3246193  29.633385  110.       ]  makes the energy positive: False
The new action:  [  0.3246193  29.633385  110.       ]  makes the energy greater than: -0.0801868  the previous one:  True
The new action:  [  0.3246193  29.633385  110.       ]  makes the energy less than: -0.151 False
The new action:  [  0.3246193  29.633385  110.       ]  makes the energy nan:  False
The energy is greater than previous energy --> Set reward:  -1.0
Store the energy got!
****CALL STEP****
Action chosen at step:  [  0.        20.847618 110.      ]
****CALL STEP****
Action chosen at step:  [  0.         8.037491 110.      ]
****CALL STEP****
Action chosen at step:  [  0.       29.08093 110.     ]
****CALL STEP****
Action chosen at step:  [  0.        31.438065 110.      ]
****CALL STEP****
Action chosen at step:  [  0.        25.946173 110.      ]
****CALL STEP****
Action chosen 

With this action the energy is:  -0.103304
With this action the full dim is:  16  and princip dim is:  16
The new action:  [28.922977 29.767239 86.18495 ]  makes the energy positive: False
The new action:  [28.922977 29.767239 86.18495 ]  makes the energy greater than:  -0.100673  the previous one:  False
Store the energy got!
Reward is positive! 16.42096
Calculate the diff between dim: 
Diff 2:  0
****CALL STEP****
Action chosen at step:  [35.825886 27.573195 96.65918 ]
## Basis size (it should be the same of full dim) =   17
With this action the energy is:  -0.10433
With this action the full dim is:  17  and princip dim is:  17
The new action:  [35.825886 27.573195 96.65918 ]  makes the energy positive: False
The new action:  [35.825886 27.573195 96.65918 ]  makes the energy greater than:  -0.103304  the previous one:  False
Store the energy got!
Reward is positive! 17.174419999999998
Calculate the diff between dim: 
Diff 2:  0
****CALL STEP****
Action chosen at step:  [53.14607 10.7

With this action the energy is:  -0.141066
With this action the full dim is:  31  and princip dim is:  31
The new action:  [ 11.958813  72.826935 109.50528 ]  makes the energy positive: False
The new action:  [ 11.958813  72.826935 109.50528 ]  makes the energy greater than:  -0.141038  the previous one:  False
Store the energy got!
Reward is positive! 31.008680000000002
Calculate the diff between dim: 
Diff 2:  0
****CALL STEP****
Action chosen at step:  [ 22.2872    46.910583 110.      ]
## Basis size (it should be the same of full dim) =   32
With this action the energy is:  -0.141066
With this action the full dim is:  32  and princip dim is:  32
The new action:  [ 22.2872    46.910583 110.      ]  makes the energy positive: False
The new action:  [ 22.2872    46.910583 110.      ]  makes the energy greater than: -0.141066  the previous one:  True
The new action:  [ 22.2872    46.910583 110.      ]  makes the energy less than: -0.151 False
The new action:  [ 22.2872    46.910583 110

With this action the energy is:  -0.142479
With this action the full dim is:  42  and princip dim is:  42
The new action:  [  9.260887  99.5022   110.      ]  makes the energy positive: False
The new action:  [  9.260887  99.5022   110.      ]  makes the energy greater than:  -0.142427  the previous one:  False
Store the energy got!
Reward is positive! 42.02184
Calculate the diff between dim: 
Diff 2:  0
****CALL STEP****
Action chosen at step:  [  0.       57.15188 110.     ]
****CALL STEP****
Action chosen at step:  [  0.       97.83846 110.     ]
****CALL STEP****
Action chosen at step:  [ 22.841007  89.85907  110.      ]
****CALL STEP****
Action chosen at step:  [  0.       76.53464 110.     ]
****CALL STEP****
Action chosen at step:  [  2.2312088  76.44863   110.       ]
## Basis size (it should be the same of full dim) =   43
With this action the energy is:  -0.14248
With this action the full dim is:  43  and princip dim is:  43
The new action:  [  2.2312088  76.44863   110.     

With this action the energy is:  -0.145039
With this action the full dim is:  50  and princip dim is:  50
The new action:  [ 27.547485 110.        58.095695]  makes the energy positive: False
The new action:  [ 27.547485 110.        58.095695]  makes the energy greater than:  -0.145006  the previous one:  False
Store the energy got!
Reward is positive! 50.01649999999999
Calculate the diff between dim: 
Diff 2:  0
****CALL STEP****
Action chosen at step:  [  0.       105.59588   59.142986]
****CALL STEP****
Action chosen at step:  [  0.      110.       79.10851]
****CALL STEP****
Action chosen at step:  [  0.        99.66289  105.091446]
****CALL STEP****
Action chosen at step:  [  0.       94.73685 103.09339]
****CALL STEP****
Action chosen at step:  [  4.602833  85.82007  104.00715 ]
## Basis size (it should be the same of full dim) =   51
With this action the energy is:  -0.145042
With this action the full dim is:  51  and princip dim is:  51
The new action:  [  4.602833  85.82007  1

With this action the energy is:  -0.145117
With this action the full dim is:  56  and princip dim is:  56
The new action:  [  6.8019753  59.843643  110.       ]  makes the energy positive: False
The new action:  [  6.8019753  59.843643  110.       ]  makes the energy greater than:  -0.145111  the previous one:  False
Store the energy got!
Reward is positive! 56.00336
Calculate the diff between dim: 
Diff 2:  0
****CALL STEP****
Action chosen at step:  [ 24.507969  85.73978  110.      ]
****CALL STEP****
Action chosen at step:  [ 32.7567    98.603485 110.      ]
****CALL STEP****
Action chosen at step:  [ 12.149368 110.       110.      ]
****CALL STEP****
Action chosen at step:  [ 23.468555 110.       110.      ]
****CALL STEP****
Action chosen at step:  [  1.9498253 110.        110.       ]
****CALL STEP****
Action chosen at step:  [ 12.709827 110.       110.      ]
****CALL STEP****
Action chosen at step:  [  0.       91.77052 110.     ]
****CALL STEP****
Action chosen at step:  [  0.

In [7]:
# A bare expression is only displayed when it is the last line of a cell, so
# the length was silently discarded; print it explicitly.
print(len(env.actions_taken))
print(env.file_sigmas)

./svmCodeSVD/sigmas.dat


## Random search as in the original SVM

In [None]:
# Baseline: sample uniformly random actions until the environment reports `done`.
state = env.reset()
scores, score, step = [], 0.0, 0
done = False

while not done:
    print(".....STEP.....", step)
    action = env.action_space.sample()
    next_state, reward, done, info = env.step(action)
    step += 1
    # `scores` tracks the running cumulative reward after every step.
    score += reward
    scores.append(score)
    state = next_state