# Stochastic Variational Method with RL algorithms

In [1]:
import numpy as np
import gym
import torch
import subprocess
import os
import pickle

## Exploring the environment

In [2]:
# Build the SVM environment (n_pairs=3, n_basis=50) and tell it which file
# it should use to store the sigmas.
env = gym.make(
    'svm_env:svmEnv-v2',
    n_pairs=3,
    n_basis=50,
    file_sigmas="./svmCodeSVD/sigmas.dat",
)

print('### Env Name ######', env.unwrapped.spec.id)

obs_space = env.observation_space

print('###### Observation space ####### \n', obs_space)

# Size of the last observation axis; this is what the agents receive as state_size.
state_size = obs_space.shape[-1]

print('###### Size of observation space ####### \n', state_size)

# NOTE: despite its name, `act_space` holds the *shape* of the action space.
act_space = env.action_space.shape

print('###### Action space ####### \n', act_space)

# The agent works with a flat action vector that is reshaped to
# (n_basis, n_pairs) before being passed to env.step(), so the flat size is the
# product of every action dimension. (Multiplying only the first and last
# dimension is correct for 2-D shapes only; a 1-D shape would give n**2.)
act_size = int(np.prod(act_space))

print('###### Number of actions ####### \n', act_size)

state = env.reset()

print('##### State after reset ###### \n', state)

print('##### File where will be stored sigmas \n', env.file_sigmas)



### Env Name ###### svmEnv-v2
###### Observation space ####### 
 Box(-inf, inf, (1,), float32)
###### Size of observation space ####### 
 1
###### Action space ####### 
 (50, 3)
###### Number of actions ####### 
 150
*****CALL RESET******
Action chosen at reset:  [0.]
##### State after reset ###### 
 [0.]
##### File where will be stored sigmas 
 ./svmCodeSVD/sigmas.dat


# Your codes `DDPG` and `PPO`

## Functions for saving and cleanup

In [3]:
## Save all rewards, energies and princip dims in files during training
def create_info_h5(agent, env):
    """Create a new ``run_<i>.hdf5`` file and fill it with the run metadata.

    The smallest index ``i`` for which ``run_<i>.hdf5`` does not exist yet (in
    the current working directory) is used. The file receives:

    * an empty ``info`` dataset whose attributes hold the algorithm name and
      the environment id;
    * an empty ``hyperparams`` dataset whose attributes hold the agent
      hyperparameters;
    * empty groups ``sigmas``, ``rewards``, ``energies``, ``princip_dims``,
      ``full_dims``, ``actor_models`` and ``critic_models``.

    Args:
        agent: object exposing ``name``, ``batch_size``, ``bootstrap_size``,
            ``gamma``, ``tau``, ``lr_critic``, ``lr_actor``, ``update_every``,
            ``transfer_every``, ``num_update`` and ``add_noise_every``.
        env: environment exposing ``unwrapped.spec.id``.

    Returns:
        The file name of the created HDF5 file, as reported by h5py.
    """
    # h5py is not part of the notebook's import cell; import it where it is
    # needed so this function does not fail with a NameError.
    import h5py

    # Pick the first run index that is still free on disk.
    i = 0
    while os.path.exists(f'run_{i}.hdf5'):
        i += 1

    # Collect the metadata first, so a missing agent attribute is reported
    # before any file is created.
    info = {'alg': agent.name, 'env': env.unwrapped.spec.id}
    hyperparams = {
        'batch_size': agent.batch_size,
        'bootstrap_size': agent.bootstrap_size,
        'gamma': agent.gamma,
        'tau': agent.tau,
        'lr_critic': agent.lr_critic,
        'lr_actor': agent.lr_actor,
        'update_every': agent.update_every,
        'transfer_every': agent.transfer_every,
        'num_update': agent.num_update,
        'add_noise_every': agent.add_noise_every,
    }

    # The context manager closes the file even when a write below raises.
    with h5py.File(f'run_{i}.hdf5', 'a') as dataFile:
        # Empty dataset used only as a carrier for the info attributes.
        st = h5py.string_dtype(encoding='utf-8')
        info_ds = dataFile.create_dataset('info', dtype=st)
        for k, v in info.items():
            info_ds.attrs[k] = v

        # Empty dataset used only as a carrier for the hyperparameter attributes.
        hyper_ds = dataFile.create_dataset('hyperparams', dtype='f')
        for k, v in hyperparams.items():
            hyper_ds.attrs[k] = v

        # Groups that save_all() fills in, one dataset per episode.
        for group_name in ('sigmas', 'rewards', 'energies', 'princip_dims',
                           'full_dims', 'actor_models', 'critic_models'):
            dataFile.create_group(group_name)

        dataFile_name = dataFile.filename

    return dataFile_name

def save_all(dat_file_name, i_ep, sigmas_i_ep, rew_i_ep, en_i_ep, pri_dim_i_ep
             , full_dim_i_ep, act_model_i_ep, cr_model_i_ep):
    """Append the data of episode ``i_ep`` to an existing HDF5 run file.

    NOTE(review): a later cell defines another ``save_all`` (pickle based) with
    a different first parameter name; once that cell has run, it replaces this
    function.

    Args:
        dat_file_name: path of an HDF5 file that already contains the groups
            ``sigmas``, ``rewards``, ``energies``, ``princip_dims``,
            ``full_dims``, ``actor_models`` and ``critic_models``.
        i_ep: episode index, used in the dataset names.
        sigmas_i_ep, rew_i_ep, en_i_ep: per-step data stored with dtype ``'f'``.
        pri_dim_i_ep, full_dim_i_ep: per-step data stored with dtype ``'i'``.
        act_model_i_ep: path of the actor checkpoint, read with ``torch.load``.
        cr_model_i_ep: path of the critic checkpoint, read with ``torch.load``.
    """
    # h5py is not part of the notebook's import cell; import it where it is
    # needed so this function does not fail with a NameError.
    import h5py

    # The context manager closes the file even when a write below raises.
    with h5py.File(dat_file_name, 'a') as dat_file:
        # One dataset per episode in each of the per-step groups.
        dat_file['sigmas'].create_dataset(f'sigmas_ep_{i_ep}', dtype='f', data=sigmas_i_ep)
        dat_file['rewards'].create_dataset(f'rew_ep_{i_ep}', dtype='f', data=rew_i_ep)
        dat_file['energies'].create_dataset(f'en_ep_{i_ep}', dtype='f', data=en_i_ep)
        dat_file['princip_dims'].create_dataset(f'pri_dim_ep_{i_ep}', dtype='i', data=pri_dim_i_ep)
        dat_file['full_dims'].create_dataset(f'full_dim_ep_{i_ep}', dtype='i', data=full_dim_i_ep)

        # Store the network parameters of the actor, then of the critic: each
        # model gets an empty dataset whose attributes hold one array per
        # state-dict entry.
        # NOTE(review): the weights are written as HDF5 attributes; attributes
        # are presumably meant for small payloads, so confirm large layers fit.
        for group_name, prefix, checkpoint in (
            ('actor_models', 'act_mod', act_model_i_ep),
            ('critic_models', 'cri_mod', cr_model_i_ep),
        ):
            state_dict = torch.load(checkpoint)
            holder = dat_file[group_name].create_dataset(f'{prefix}_{i_ep}', dtype='f')
            for k, tensor in state_dict.items():
                holder.attrs.create(name=k, data=tensor.cpu().data.numpy())
    
def rm_useless_file(actor_model_file, critic_model_file, file_sigmas):
    """Delete the temporary actor checkpoint, critic checkpoint and sigmas file.

    The files are removed in that order; ``os.remove`` raises if one of them
    does not exist.
    """
    for leftover in (actor_model_file, critic_model_file, file_sigmas):
        os.remove(leftover)

In [4]:
## Save all rewards, energies and princip dims in files during episode training
def create_run_fold_and_info(agent, env):
    """Create a fresh ``runs/run_<i>/`` folder and write ``info.p`` into it.

    The smallest index ``i`` for which ``runs/run_<i>/`` does not exist yet
    (relative to the current working directory) is used. ``info.p`` is a pickle
    of a dict holding the algorithm name, the environment id and the agent
    hyperparameters.

    Args:
        agent: object exposing ``name``, ``batch_size``, ``bootstrap_size``,
            ``gamma``, ``tau``, ``lr_critic``, ``lr_actor``, ``update_every``,
            ``transfer_every``, ``num_update`` and ``add_noise_every``.
        env: environment exposing ``unwrapped.spec.id``.

    Returns:
        The created folder name, with a trailing slash (e.g. ``'runs/run_0/'``).
    """
    # Pick the first run index whose folder is still free.
    i = 0
    while os.path.exists(f'runs/run_{i}/'):
        i += 1
    name_dir = f'runs/run_{i}/'
    os.makedirs(name_dir)

    # Everything needed to identify the run and its hyperparameters.
    info = {
        'alg': agent.name,
        'env': env.unwrapped.spec.id,
        'batch_size': agent.batch_size,
        'bootstrap_size': agent.bootstrap_size,
        'gamma': agent.gamma,
        'tau': agent.tau,
        'lr_critic': agent.lr_critic,
        'lr_actor': agent.lr_actor,
        'update_every': agent.update_every,
        'transfer_every': agent.transfer_every,
        'num_update': agent.num_update,
        'add_noise_every': agent.add_noise_every,
    }

    # Use a context manager so the file is flushed and closed deterministically.
    with open(name_dir+'info.p', 'wb') as info_file:
        pickle.dump(info, info_file)
    return name_dir
    
def save_all(name_run_dir, i_ep, sigmas_i_ep, rew_i_ep, en_i_ep, pri_dim_i_ep
             , full_dim_i_ep, act_model_i_ep, cr_model_i_ep):
    """Pickle the data of episode ``i_ep`` into the run folder.

    One file per quantity is written: ``sigmas_<i_ep>.p``, ``rew_<i_ep>.p``,
    ``en_<i_ep>.p``, ``pri_dim_<i_ep>.p``, ``full_dim_<i_ep>.p``,
    ``act_model_<i_ep>.p`` and ``cr_model_<i_ep>.p``.

    Args:
        name_run_dir: run folder, including the trailing path separator (it is
            concatenated directly with the file names).
        i_ep: episode index, used in the file names.
        sigmas_i_ep, rew_i_ep, en_i_ep, pri_dim_i_ep, full_dim_i_ep: per-step
            data of the episode; pickled as given.
        act_model_i_ep, cr_model_i_ep: actor / critic model. When this is the
            path of an existing checkpoint file, the state dict stored in that
            file is pickled (loaded on CPU); any other object is pickled as is.
    """
    def _resolve_model(model):
        # The training loop passes the path of a checkpoint that is overwritten
        # on every step and deleted at the end of the run, so pickling the path
        # itself would keep no weights at all. Store the content instead.
        if isinstance(model, (str, os.PathLike)) and os.path.isfile(model):
            return torch.load(model, map_location='cpu')
        return model

    payloads = (
        ('sigmas', sigmas_i_ep),
        ('rew', rew_i_ep),
        ('en', en_i_ep),  # energies were accepted but never written before
        ('pri_dim', pri_dim_i_ep),
        ('full_dim', full_dim_i_ep),
        ('act_model', _resolve_model(act_model_i_ep)),
        ('cr_model', _resolve_model(cr_model_i_ep)),
    )
    for prefix, payload in payloads:
        # Context manager: every file is flushed and closed before the next one.
        with open(name_run_dir+f'{prefix}_{i_ep}.p', 'wb') as out_file:
            pickle.dump(payload, out_file)
    
def rm_useless_file(actor_model_file, critic_model_file, file_sigmas):
    """Remove the scratch files left behind by a training run.

    Deletes, in order, the actor checkpoint, the critic checkpoint and the
    sigmas file. ``os.remove`` raises if any of them is missing.
    """
    scratch_files = [actor_model_file, critic_model_file, file_sigmas]
    for scratch_file in scratch_files:
        os.remove(scratch_file)

## From my `ddpg_agent.py` code

In [5]:
# The DDPG agent lives in the local ddpg_agent.py module.
from ddpg_agent import DDPG_agent

# Build the agent from the state/action sizes measured on the environment;
# the result is bound to the global `agent` that run_ddpg() uses.
agent = DDPG_agent(state_size, act_size, seed=0)

In [6]:
## Run ddpg algs   
def run_ddpg(max_t_step=10, n_episodes=10):
    """Train the global ``agent`` on the global ``env`` and save every episode.

    A new run folder is created first. Each episode resets the environment and
    the agent, then runs at most ``max_t_step`` steps (stopping early when the
    environment reports ``done``). After every step the actor and critic
    weights are written to ``checkpoint_actor.pth`` / ``checkpoint_critic.pth``
    (overwriting the previous step), and at the end of every episode the
    collected data is handed to ``save_all``. Once all episodes are done the
    two checkpoint files and ``env.file_sigmas`` are deleted.

    Args:
        max_t_step: maximum number of environment steps per episode.
        n_episodes: number of episodes to run.

    Returns:
        The name of the run folder the data was saved into.
    """
    actor_ckpt = 'checkpoint_actor.pth'
    critic_ckpt = 'checkpoint_critic.pth'

    # Create the run folder and store the algorithm info / hyperparameters in it
    name_run_dir = create_run_fold_and_info(agent, env)

    for i_ep in range(n_episodes):
        state = env.reset()
        agent.reset()

        # Per-step history of the current episode
        sigmas_hist, reward_hist, energy_hist = [], [], []
        princip_dim_hist, full_dim_hist = [], []

        ## Training loop of each episode
        for _ in range(max_t_step):
            action = agent.act(state)
            # The agent emits a flat vector; the environment expects (n_basis, n_pairs)
            shaped_action = action.reshape((env.n_basis, env.n_pairs))
            next_state, reward, done, info = env.step(shaped_action)
            agent.step(state, action, reward, next_state, done)
            state = next_state

            # Record action, reward, energy (first state entry) and dimensions
            sigmas_hist.append(shaped_action)
            reward_hist.append(reward)
            energy_hist.append(state[0])
            princip_dim_hist.append(env.princp_dim)
            full_dim_hist.append(env.full_dim)

            # Overwrite the on-disk checkpoints with the current networks
            torch.save(agent.actor_local.state_dict(), actor_ckpt)
            torch.save(agent.critic_local.state_dict(), critic_ckpt)

            if done:
                break

        ## Save data during training (to not lose the work done)
        save_all(
            name_run_dir=name_run_dir,
            i_ep=int(i_ep),
            sigmas_i_ep=sigmas_hist,
            rew_i_ep=reward_hist,
            en_i_ep=energy_hist,
            pri_dim_i_ep=princip_dim_hist,
            full_dim_i_ep=full_dim_hist,
            act_model_i_ep=actor_ckpt,
            cr_model_i_ep=critic_ckpt,
        )

        episode_score = np.sum(reward_hist)
        print('Episode {} ... Score: {:.3f}'.format(i_ep, episode_score))

    # Drop the scratch checkpoints and the sigmas file used by the environment
    rm_useless_file(actor_ckpt, critic_ckpt, env.file_sigmas)
    return name_run_dir

In [7]:
# Run 10 episodes of at most 10 steps each.
# NOTE: despite its name, `all_data` receives the run folder name returned by run_ddpg().
all_data = run_ddpg(max_t_step=10, n_episodes=10)

*****CALL RESET******
Action chosen at reset:  [0.]
****CALL STEP****
Action chosen at step:  [[56.62079  74.54903  71.81472 ]
 [46.366604 60.00624  46.208206]
 [72.19338  64.05096  43.73308 ]
 [48.05089  35.115444 51.17658 ]
 [63.10283  58.69724  66.3265  ]
 [57.2352   49.77198  36.248352]
 [40.77845  61.163216 59.994396]
 [37.145775 61.90211  34.994244]
 [78.00186  53.567673 73.89515 ]
 [53.84555  77.4702   62.86515 ]
 [46.758446 37.703266 50.87429 ]
 [55.498627 79.23291  59.081318]
 [42.57069  51.847466 74.4142  ]
 [55.218266 67.95764  75.30478 ]
 [27.697033 78.35595  31.315296]
 [43.471325 50.295387 64.188805]
 [84.21481  42.8381   65.45694 ]
 [47.931084 58.694885 59.48698 ]
 [72.98885  71.01546  61.818535]
 [79.39163  59.21627  52.20235 ]
 [51.3312   41.98659  58.39671 ]
 [68.37305  51.81308  64.949615]
 [52.055717 37.52563  51.46722 ]
 [35.880962 65.45378  74.24716 ]
 [41.80757  60.92245  65.95514 ]
 [73.28565  72.23555  56.23385 ]
 [51.718163 54.84013  46.400253]
 [66.0576   59.

With this action the energy is:  -0.0934616
With this action the full dim is:  50  and princip dim is:  47
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0934616
Set reward :  6.038733133529069
****CALL STEP****
Action chosen at step:  [[ 67.748825  79.58971   76.53144 ]
 [ 58.600243  75.84079   23.372826]
 [ 64.83693   57.99868   69.07976 ]
 [ 27.490805  27.792875  58.336563]
 [ 63.67158   52.627853  46.873528]
 [ 85.33781   23.590485  17.437279]
 [ 53.91177   45.675686  93.0444  ]
 [ 81.25935   89.3019    48.32096 ]
 [ 57.63809   49.745388  54.90922 ]
 [ 38.853973  49.514736  81.68819 ]
 [ 91.19113   67.06339   54.13704 ]
 [ 76.388626  65.701546  43.723   ]
 [ 63.982716  60.337086  84.41444 ]
 [ 73.44702   33.68642   81.78111 ]
 [ 49.323273  97.33129   40.453896]
 [ 70.22476   80.106735  85.906075]
 [ 81.39213   51.599445  71.04626 ]
 [ 66.56798   18.52155   27.139282]
 [ 57.469883  67.38969   48.699802]
 [ 56.261368  59.334637  48.910206]
 [ 51.706547 

With this action the energy is:  -0.127954
With this action the full dim is:  50  and princip dim is:  48
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.127954
Set reward :  8.43731612787925
****CALL STEP****
Action chosen at step:  [[ 47.801735   58.25182    76.5767   ]
 [ 68.39002    42.02959    28.365473 ]
 [ 51.855286   72.768394  105.16343  ]
 [ 37.13025    57.911922   72.10819  ]
 [ 68.4095     51.407017   52.15643  ]
 [ 97.549934   45.65104    25.673428 ]
 [ 51.584854   34.301987   96.03725  ]
 [ 34.56968    63.662743   47.811516 ]
 [ 44.81353    43.88756    71.5338   ]
 [  4.544899   74.16994    56.412064 ]
 [ 80.400604   68.26841    23.061874 ]
 [ 43.394653   56.33668    64.51883  ]
 [ 62.716232   63.26685    48.7362   ]
 [ 57.99383    50.376087   53.634327 ]
 [ 77.82807    76.59271    50.915146 ]
 [ 85.87834    40.995403   45.91805  ]
 [ 85.648315   56.48757    48.693493 ]
 [ 79.04066     1.         54.679176 ]
 [ 24.941689   41.570786   32.2909

With this action the energy is:  -0.0725703
With this action the full dim is:  50  and princip dim is:  47
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0725703
Set reward :  4.585963337577848
****CALL STEP****
Action chosen at step:  [[ 55.111458  57.6152    60.326103]
 [ 33.0036    23.151745  39.776257]
 [ 35.67621   64.06865   50.972458]
 [ 53.03271   80.629105  53.218174]
 [ 57.78268   40.144096  67.46254 ]
 [ 37.65551   53.353916  49.11226 ]
 [ 83.08518   20.782162  60.735073]
 [ 32.79574   98.48084   55.367588]
 [ 54.156094  67.37548   96.26492 ]
 [ 29.565126  87.35093   79.96447 ]
 [ 48.413322  18.154594  75.94587 ]
 [ 59.06898   35.892754  60.703087]
 [ 28.65165   50.770226  27.80414 ]
 [ 19.752056  76.23792   59.34504 ]
 [ 30.630772  68.11322   55.052864]
 [ 37.902004  50.087505  75.08625 ]
 [ 48.306786  30.30736   64.62451 ]
 [ 54.668823  58.917618  63.0123  ]
 [ 54.15004   51.82509   74.53355 ]
 [ 64.78549   60.051414  57.709553]
 [ 45.187668 

With this action the energy is:  -0.0863902
With this action the full dim is:  50  and princip dim is:  49
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0863902
Set reward :  5.546991770612952
****CALL STEP****
Action chosen at step:  [[18.921642  32.608566  41.315266 ]
 [21.857368  16.14795   47.83481  ]
 [15.033482  46.122223  50.85808  ]
 [62.663956  63.24831   71.51278  ]
 [54.31363   84.57776   75.653336 ]
 [40.145634  64.05784   38.98402  ]
 [55.442207   1.6973457 59.6579   ]
 [12.364811  83.92728   64.64378  ]
 [83.401825  74.69096   60.13481  ]
 [26.2741    87.9771    51.951782 ]
 [89.649475  54.87307   80.07233  ]
 [64.35091   39.020554  57.50876  ]
 [46.675705  66.62122   32.262947 ]
 [38.637016  83.21312   45.639107 ]
 [50.74434   35.872284  89.25559  ]
 [49.487774  56.140522  60.308357 ]
 [32.52081    1.        78.92904  ]
 [27.105957  54.061302  63.29855  ]
 [48.18659   88.28337   90.602905 ]
 [22.966576  66.185074  97.19718  ]
 [33.3488    

With this action the energy is:  -0.0244081
With this action the full dim is:  50  and princip dim is:  34
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0244081
Set reward :  1.2367897836792423
****CALL STEP****
Action chosen at step:  [[25.659794 56.04486  70.82524 ]
 [65.20405  47.59795  26.846691]
 [58.84083  66.11244  31.467379]
 [60.980995 70.48227  27.584213]
 [46.157654 40.07371  37.917404]
 [38.657722 57.03557  59.315968]
 [54.029427 77.820335 56.75276 ]
 [28.192213 59.567528 67.14253 ]
 [77.4292   62.348064 59.57092 ]
 [53.10104  66.38924  53.50121 ]
 [70.55391  50.382828 71.535164]
 [47.001305 63.61888  80.99994 ]
 [45.254578 29.344015 62.591003]
 [82.210266 40.478493 65.89771 ]
 [35.426872 51.466652 62.39249 ]
 [56.976326 76.41151  36.64791 ]
 [63.73505  44.41475  61.14997 ]
 [50.678875 24.13367  54.336014]
 [45.840794 42.563423 46.03108 ]
 [61.135933 29.536877 31.197433]
 [47.873596 69.55058  37.45482 ]
 [56.353443 69.05928  79.87462 ]
 [62.1

With this action the energy is:  -0.106189
With this action the full dim is:  50  and princip dim is:  50
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.106189
Set reward :  6.923789702578845
****CALL STEP****
Action chosen at step:  [[ 40.25115   26.595898  51.410477]
 [ 37.154823  63.53043   48.853886]
 [ 48.617672  79.57693    4.043068]
 [ 46.18681   44.74233   27.271053]
 [ 44.73722   72.31223   32.75614 ]
 [ 52.682785  29.415848  48.751987]
 [ 66.61014   93.98048   53.46925 ]
 [ 39.84952   50.68076   65.08051 ]
 [ 91.51077   78.74286   61.440308]
 [ 58.56106   74.523254 105.34286 ]
 [ 73.98857   56.548126  42.81214 ]
 [ 45.908436  63.1509    57.11466 ]
 [ 24.657682  47.34511   45.18466 ]
 [110.        10.171989  26.47017 ]
 [ 32.973705  52.239323  67.45065 ]
 [ 59.818066  62.35955   46.130165]
 [ 25.075153  67.02082   43.16306 ]
 [ 48.56579   20.172043  70.21002 ]
 [ 79.61903   72.57869   48.794834]
 [ 51.816746  27.934029  64.06769 ]
 [ 41.24921   4

With this action the energy is:  -0.0134059
With this action the full dim is:  50  and princip dim is:  34
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0134059
Set reward :  0.471702697235866
****CALL STEP****
Action chosen at step:  [[41.535187 68.06214  42.85551 ]
 [58.673904 32.21553  43.872498]
 [61.057426 48.28897  51.589302]
 [38.058456 62.62334  77.65999 ]
 [44.252983 62.395786 75.91816 ]
 [59.41414  32.29924  43.26313 ]
 [37.151085 65.890366 56.41993 ]
 [47.372    81.05307  32.997875]
 [49.231144 53.61007  51.980396]
 [52.21598  59.595924 45.541027]
 [64.21447  64.48512  62.953606]
 [63.85892  36.007767 38.09262 ]
 [70.20762  67.98838  41.51597 ]
 [42.85254  53.017735 58.80828 ]
 [51.211105 58.37331  45.26106 ]
 [66.94324  72.681274 56.647636]
 [66.56339  72.06758  51.31406 ]
 [56.382843 61.21346  90.67554 ]
 [23.211823 78.08719  49.039772]
 [40.249897 54.634624 48.089314]
 [64.99132  56.248653 39.77056 ]
 [45.55297  39.22372  69.721016]
 [64.70

With this action the energy is:  -0.0825585
With this action the full dim is:  50  and princip dim is:  48
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0825585
Set reward :  5.280537407928663
****CALL STEP****
Action chosen at step:  [[ 42.379143  54.86335   47.803375]
 [ 27.858616  41.01458   46.1803  ]
 [ 58.249214  43.782536  27.23973 ]
 [ 60.82771   87.72331   69.51535 ]
 [ 45.41075   66.68567   88.80269 ]
 [ 60.662422  28.761158  57.07873 ]
 [ 24.657038  62.487137  89.143585]
 [ 35.324806  73.470345  47.551258]
 [ 33.693596  54.261593  34.039146]
 [ 61.673264  93.03471   46.450005]
 [ 58.432045  60.85939   71.25004 ]
 [ 60.201     15.296112  47.568596]
 [ 93.87216   64.69229   27.810484]
 [ 11.124172  69.63901   82.784386]
 [ 57.17581   50.6216    29.242521]
 [ 46.0662    63.53391   45.187687]
 [ 50.5199    54.174717  41.62173 ]
 [ 72.21233   77.7129    91.31508 ]
 [ 51.136856  78.500244  41.872463]
 [ 27.623331  25.296747  47.749977]
 [ 65.443245 

With this action the energy is:  -0.0813959
With this action the full dim is:  50  and princip dim is:  49
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0813959
Set reward :  5.199690827544823
****CALL STEP****
Action chosen at step:  [[ 69.7652    55.197186  49.99138 ]
 [ 27.942476  69.816284  51.643303]
 [ 91.09096   50.061028  54.104813]
 [ 52.4656    95.08758   70.76849 ]
 [ 38.406742  38.661255  77.78866 ]
 [ 59.079754  34.456135  87.37827 ]
 [ 37.45179   63.682674  91.10707 ]
 [ 52.091686  29.555302  33.114723]
 [ 72.2206    73.48828   55.935333]
 [ 25.321049  75.2995    68.02457 ]
 [ 46.232815  56.126144  92.58078 ]
 [100.425385  26.217325  39.9799  ]
 [ 89.4196    30.9055    53.17613 ]
 [ 44.684967  50.533783  89.92445 ]
 [ 65.116165  74.86631   60.6495  ]
 [ 65.265884  57.689285  33.49571 ]
 [ 69.030945  47.011192  28.6044  ]
 [ 54.675682  53.058884  65.68216 ]
 [ 55.54887   48.06065   42.015137]
 [ 35.225864  72.76817   52.208942]
 [ 26.124771 

With this action the energy is:  -0.0717235
With this action the full dim is:  50  and princip dim is:  47
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0717235
Set reward :  4.527077319799562
****CALL STEP****
Action chosen at step:  [[ 49.885704  58.54366   58.170563]
 [ 67.89543   70.62174   40.24399 ]
 [ 91.607315  89.18826   45.03142 ]
 [ 68.9997    84.17115   45.63354 ]
 [ 17.10794   57.395733  44.1267  ]
 [ 36.555733  66.081535   6.55785 ]
 [ 84.05461   76.24163   54.40861 ]
 [ 68.22813   72.202     70.51245 ]
 [ 87.996376  77.72815  100.96469 ]
 [ 43.33893   57.896343  66.69681 ]
 [ 45.5074    56.9505    61.7183  ]
 [ 82.99688   52.882156  40.652134]
 [ 30.23963   78.32522   83.15511 ]
 [ 73.85022   76.148834  30.592224]
 [ 84.996605  50.07596   67.230446]
 [ 28.42859   54.303844  56.625977]
 [ 73.88643   77.09375   56.441517]
 [ 41.569244  34.49172   47.160294]
 [ 79.27809   78.99994   64.17589 ]
 [ 49.411385  41.941963  30.214506]
 [ 41.633327 

With this action the energy is:  -0.0900497
With this action the full dim is:  50  and princip dim is:  49
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0900497
Set reward :  5.801471437664954
****CALL STEP****
Action chosen at step:  [[ 55.777462  81.51813   62.968475]
 [ 76.475586  58.77365   10.471222]
 [ 79.71804   82.67661   69.44502 ]
 [ 47.26612   71.07008   16.466278]
 [ 25.443342  36.567505  30.050276]
 [ 23.513258  52.726444   8.183868]
 [ 69.38975   68.271576  89.38251 ]
 [ 52.575207  65.75194   58.32188 ]
 [ 53.278595  65.05376   94.39564 ]
 [ 58.657375 105.54605   62.84308 ]
 [ 40.817017  67.48903   61.065037]
 [ 89.371216  50.569622  21.703163]
 [ 45.62728   33.66402   81.57378 ]
 [ 62.488655  71.283905  66.80667 ]
 [ 54.243954  42.901455  29.327976]
 [ 47.780487  56.694016  54.857086]
 [ 74.13698   15.147366  37.77281 ]
 [ 57.15049   55.279057  68.83004 ]
 [ 64.778305  59.85941   24.316399]
 [ 71.34769   57.28756    9.743519]
 [ 33.821823 

With this action the energy is:  -0.0316754
With this action the full dim is:  50  and princip dim is:  43
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0316754
Set reward :  1.7421539275150177
****CALL STEP****
Action chosen at step:  [[25.284506 68.60718  56.4267  ]
 [91.70038  84.61867  54.795403]
 [66.02859  66.351006 42.65988 ]
 [39.73063  58.085117 45.551353]
 [70.84462  50.572475 56.965805]
 [35.47273  20.888966 72.39997 ]
 [32.712765 73.025536 64.92597 ]
 [69.50182  57.626465 34.713875]
 [86.17593  39.965454 47.43292 ]
 [58.82888  31.075218 27.617634]
 [86.189804 55.34619  57.698433]
 [72.04847  63.553238 58.47888 ]
 [52.18279  28.725563 33.446667]
 [57.484966 57.684376 83.67816 ]
 [35.184372 34.887386 44.011402]
 [19.536888 50.065437 60.622555]
 [33.566154 58.29966  71.68505 ]
 [56.085358 55.83811  85.40139 ]
 [54.68927  72.909706 51.980484]
 [60.580956 64.7484   79.82162 ]
 [58.50972  49.068104 52.999706]
 [72.8634   61.635544 37.328777]
 [57.6

With this action the energy is:  -0.0571621
With this action the full dim is:  50  and princip dim is:  48
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0571621
Set reward :  3.514485374503476
****CALL STEP****
Action chosen at step:  [[  1.        81.5267    80.205986]
 [ 64.603676  72.82541   60.624428]
 [ 31.96045   59.597496  40.75999 ]
 [ 48.826523  52.542236  53.88399 ]
 [ 90.867256  45.664642  59.257416]
 [ 28.219425   5.061981  65.18683 ]
 [ 63.492874  90.748245  41.03042 ]
 [ 68.22468   61.52524   46.519684]
 [ 55.527714  41.628826  51.864468]
 [ 47.36901   53.533817  56.24032 ]
 [ 43.752953  55.131157  32.20427 ]
 [ 74.448456  51.155087  61.830822]
 [ 54.990253  66.92194   31.49361 ]
 [ 80.90316   55.098045  88.38439 ]
 [ 59.410336  74.015076  53.47183 ]
 [ 58.224342  42.64304   58.058907]
 [ 86.20484   73.70773   42.554344]
 [ 51.99306   59.50764   64.73963 ]
 [ 59.5397    71.38779   89.41529 ]
 [ 58.50498   62.633648  60.285473]
 [ 59.47765  

With this action the energy is:  -0.0270916
With this action the full dim is:  50  and princip dim is:  39
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0270916
Set reward :  1.4233989342555642
****CALL STEP****
Action chosen at step:  [[41.234188 45.720085 73.88545 ]
 [62.83762  75.058365 58.67099 ]
 [61.748127 76.792656 26.347036]
 [52.5199   46.34224  44.69616 ]
 [71.6893   31.483606 52.72114 ]
 [30.063395 56.93397  54.35887 ]
 [72.73206  63.415268 73.605835]
 [37.74318  37.541374 48.31645 ]
 [93.746574 44.663773 66.58155 ]
 [54.254494 57.801174 58.01063 ]
 [53.94709  67.44771  58.66543 ]
 [94.158325 53.18449  60.187943]
 [42.473225 43.786324 51.31929 ]
 [47.213898 40.653786 44.585114]
 [13.802723 38.308144 72.22223 ]
 [71.52837  49.698795 66.287796]
 [54.739536 50.883236 73.209724]
 [60.960823 54.487747 58.06919 ]
 [62.406994 83.08647  80.5295  ]
 [49.455006 52.033623 51.514526]
 [59.803764 47.3013   45.282433]
 [66.26637  43.15917  71.23542 ]
 [85.6

With this action the energy is:  -0.0712596
With this action the full dim is:  50  and princip dim is:  50
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0712596
Set reward :  4.494817962634533
****CALL STEP****
Action chosen at step:  [[ 64.89595   55.115112  75.64411 ]
 [ 94.27901   83.81601   71.923706]
 [ 63.88606   67.24791   25.7438  ]
 [ 51.184185  38.986176  36.698387]
 [ 76.798515  37.88105   27.526634]
 [ 33.145752  40.40566   38.10322 ]
 [ 77.61398   70.93945   72.6089  ]
 [ 36.350266  37.91517   87.55839 ]
 [110.        28.366718  75.97909 ]
 [ 20.694431  50.423748  81.972694]
 [ 22.287304  85.24818   62.409264]
 [110.        85.41922   49.894447]
 [ 22.754883  58.193314  56.05017 ]
 [ 64.231094  52.86288   45.859894]
 [ 32.630707  42.089085  62.30024 ]
 [ 46.964615  30.217176  56.235077]
 [ 80.527115  27.68493   52.01825 ]
 [ 94.12836   73.983315  68.067566]
 [ 77.05405  103.6017    62.27801 ]
 [ 77.324066  52.0166    57.63571 ]
 [ 57.554585 

With this action the energy is:  -0.101495
With this action the full dim is:  50  and princip dim is:  49
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.101495
Set reward :  6.597371460430437
****CALL STEP****
Action chosen at step:  [[ 40.8105     84.642624   68.58249  ]
 [ 71.94679    67.97402    68.60211  ]
 [ 65.20149    81.09798    49.68096  ]
 [ 54.103188   40.123924   55.897842 ]
 [ 68.60678    66.460434   51.32521  ]
 [ 43.622047   55.325687   36.25473  ]
 [ 35.786407   37.507584   53.5957   ]
 [ 41.213913   72.36612    94.09128  ]
 [ 91.190796   37.13324    48.23498  ]
 [ 14.2234955  11.034008   52.52308  ]
 [ 22.371593   61.791763   33.31174  ]
 [ 91.659485   51.8569     28.990757 ]
 [ 22.830952   28.426659   44.597153 ]
 [ 41.124866   57.366062   39.513252 ]
 [ 41.71554    61.373264   47.861984 ]
 [ 12.68663    38.746246   83.32723  ]
 [ 61.913357   21.325672   62.216217 ]
 [ 62.807716   73.29581   105.49975  ]
 [ 50.77424   103.38568    54.153

With this action the energy is:  -0.0810641
With this action the full dim is:  50  and princip dim is:  46
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0810641
Set reward :  5.176617633521636
****CALL STEP****
Action chosen at step:  [[56.015827  67.89509   73.09103  ]
 [53.934673  44.175987  25.241446 ]
 [23.563103  85.8533    29.864061 ]
 [35.612625  15.993511  56.25297  ]
 [50.4617    30.199903  79.23718  ]
 [44.673866  44.259315  56.71517  ]
 [47.34787   32.032074  48.694153 ]
 [73.123825  95.16348   47.39882  ]
 [41.352043  41.976894  44.262905 ]
 [60.524036  34.476208  77.25546  ]
 [57.084915  80.20939   63.806965 ]
 [42.62937   50.59956   85.560135 ]
 [31.038227  52.960022  48.366802 ]
 [55.984196  73.43837   76.23974  ]
 [40.08119   35.18061   49.210873 ]
 [ 3.6562958 88.23341   65.12681  ]
 [54.727222  48.845505  71.447014 ]
 [40.06564   69.57385   63.5841   ]
 [61.882587  78.80005   50.22487  ]
 [46.701923  66.900276  48.426823 ]
 [40.893158  

With this action the energy is:  -0.0697966
With this action the full dim is:  50  and princip dim is:  48
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0697966
Set reward :  4.393081727384614
****CALL STEP****
Action chosen at step:  [[ 70.83731    62.93222    56.70222  ]
 [ 66.75116    32.80919    37.556664 ]
 [ 40.272896  100.65333    24.108639 ]
 [  6.510895   20.369953   74.57193  ]
 [ 48.540443   39.240353   63.795444 ]
 [ 86.60385    31.546646   28.558695 ]
 [ 17.022781   43.14093    71.77747  ]
 [ 59.150944   90.59195    62.635784 ]
 [ 30.9286     52.139687   49.168755 ]
 [ 89.82849    45.217842   54.57486  ]
 [ 57.678673   48.762398   73.08955  ]
 [ 68.610214   58.245895   28.27089  ]
 [ 37.374672   31.22684    19.517838 ]
 [ 68.60531    63.819294   73.93269  ]
 [ 22.07698    30.181318   87.181114 ]
 [ 19.88869   106.17636    59.559452 ]
 [ 52.785183   53.533043   46.927177 ]
 [ 37.57773    61.076847   41.638943 ]
 [ 70.57419    75.91063    42.8

With this action the energy is:  -0.0354111
With this action the full dim is:  50  and princip dim is:  42
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0354111
Set reward :  2.001932501693503
****CALL STEP****
Action chosen at step:  [[54.86395  66.96789  67.72156 ]
 [55.090843 56.86407  24.38909 ]
 [54.741135 48.173786 41.809074]
 [63.704277 68.34616  96.26047 ]
 [58.60193  49.091362 68.49833 ]
 [31.219677 42.734642 20.887665]
 [38.9843   40.21889  53.785553]
 [64.81041  50.736244 59.754433]
 [58.9299   52.973225 50.39812 ]
 [62.815144 64.275826 53.312687]
 [48.243042 90.559715 76.79494 ]
 [77.51802  67.322754 47.04638 ]
 [46.715733 73.9226   72.72354 ]
 [47.232536 32.981674 77.51976 ]
 [ 7.033104 75.997925 23.78005 ]
 [63.01934  87.206345 66.17937 ]
 [68.4085   39.923317 53.83554 ]
 [81.34435  72.56828  48.93481 ]
 [28.934284 81.47498  51.862186]
 [33.533363 48.596992 47.617798]
 [56.82255  71.806305 65.44697 ]
 [60.90691  41.853146 66.76821 ]
 [77.42

With this action the energy is:  -0.0588025
With this action the full dim is:  50  and princip dim is:  49
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0588025
Set reward :  3.628557910596414
****CALL STEP****
Action chosen at step:  [[ 63.240643   61.68389    79.283356 ]
 [ 31.142334   91.173706   50.80047  ]
 [ 54.37181    52.619247   39.784447 ]
 [ 45.566154   53.149208   87.19956  ]
 [  1.         56.09098    64.87116  ]
 [ 32.00168    57.366398   31.99883  ]
 [ 49.309628   63.804226   83.08723  ]
 [ 53.41204    80.46756    84.25429  ]
 [ 46.2168     57.648174   34.55806  ]
 [109.81134    31.807705   69.43685  ]
 [ 37.694237   90.32414    90.08254  ]
 [ 82.02277    49.127056   69.44205  ]
 [ 28.243864   64.920494   85.749115 ]
 [ 39.28627    69.3832     62.769047 ]
 [ 14.034622   43.29528    35.251163 ]
 [ 39.180367   56.79632    46.038017 ]
 [ 91.63393    39.25923    14.613132 ]
 [ 72.96652    52.10573    50.77485  ]
 [ 50.155563   89.54036    21.0

With this action the energy is:  -0.0826126
With this action the full dim is:  50  and princip dim is:  50
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0826126
Set reward :  5.284299492909538
Episode 8 ... Score: 46.213
*****CALL RESET******
Action chosen at reset:  [0.]
****CALL STEP****
Action chosen at step:  [[31.240824 59.935417 48.8548  ]
 [63.42421  38.227295 55.313545]
 [77.09405  72.68236  42.310722]
 [76.842705 55.721817 51.920025]
 [56.929226 33.61811  61.731754]
 [30.745642 41.47526  31.725473]
 [49.416897 59.704914 49.30264 ]
 [23.559704 60.945908 66.86305 ]
 [65.7872   69.709    30.432596]
 [39.46923  44.627266 52.554585]
 [54.86277  40.773296 80.75219 ]
 [83.41913  40.652508 57.3116  ]
 [52.14695  41.963264 78.52382 ]
 [85.8647   53.884666 55.78762 ]
 [62.922066 64.433495 61.964592]
 [43.202557 71.839294 43.757465]
 [67.48628  47.942253 72.907005]
 [63.37357  66.97316  50.979084]
 [57.22541  60.86277  45.590195]
 [41.900635 43.66946  40.0

With this action the energy is:  -0.0822789
With this action the full dim is:  50  and princip dim is:  49
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0822789
Set reward :  5.261094173905507
****CALL STEP****
Action chosen at step:  [[ 62.214813  34.434113  11.584568]
 [ 51.268253  59.227818  47.97581 ]
 [ 82.98835  110.        35.6594  ]
 [ 96.943375  43.68708   56.017406]
 [ 69.93837   56.183823  62.09902 ]
 [ 86.85576   33.33281   58.46516 ]
 [ 62.135494  41.55043   77.247826]
 [ 24.918013  43.18961   61.602276]
 [ 57.55326   85.60373   50.43951 ]
 [ 83.026436  24.29991   71.16204 ]
 [ 46.546894  32.4787    71.78644 ]
 [ 68.35512   47.47901   28.155354]
 [ 69.559044  47.201637  76.55533 ]
 [ 82.7007    38.499878  56.277523]
 [ 28.049593  85.61439   62.774147]
 [ 22.64835   79.95729   67.2451  ]
 [ 71.21335   29.428913  89.03184 ]
 [ 54.31028   80.3522    61.60347 ]
 [ 66.539566  29.029589  40.5934  ]
 [ 66.90608   54.53421   50.00857 ]
 [ 48.39028  

With this action the energy is:  -0.0703178
With this action the full dim is:  50  and princip dim is:  47
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0703178
Set reward :  4.429325695814045
****CALL STEP****
Action chosen at step:  [[ 59.933907   67.993065   23.370197 ]
 [ 68.59425    59.474743   46.780914 ]
 [ 37.246777   76.05448    22.617554 ]
 [106.6116     40.932747   48.036194 ]
 [ 76.27219    84.36238    46.98077  ]
 [ 71.79177    23.54012    13.7167015]
 [ 69.502205   88.2005     58.76285  ]
 [ 60.66497    38.52665    57.160633 ]
 [ 74.78988   101.21445    66.68398  ]
 [ 61.776512   36.824642   43.955925 ]
 [ 35.358414   87.81642    83.82697  ]
 [ 88.338135   46.81646    74.97836  ]
 [ 72.99154   100.89432    36.399776 ]
 [ 71.87889    36.727997   55.960022 ]
 [ 44.701004   78.30735    71.026566 ]
 [ 38.394077   72.34662    51.67237  ]
 [ 49.80279     2.3840294  75.72258  ]
 [ 69.52249    53.932858   74.1877   ]
 [ 51.113792   44.170414   61.6

## From my `ppo_agent.py` code

In [None]:
# PPO_agent is not imported in any of the cells shown above, so this cell
# would fail with a NameError on a fresh kernel. Import it from ppo_agent.py
# (module name taken from this section's heading), the same way the DDPG agent
# is imported from ddpg_agent.py.
from ppo_agent import PPO_agent

# NOTE: this rebinds the global `agent`, which run_ddpg() also reads; re-run
# the DDPG agent cell before calling run_ddpg() again.
agent = PPO_agent(env, state_size, act_size, seed = 0)

In [None]:
def run_ppo(max_t_step):
    """Call ``collect_traj()`` on the global ``agent`` and unpack its five results.

    NOTE(review): ``max_t_step`` is not used and nothing is returned, so the
    unpacked values are discarded when the function exits. This looks like an
    unfinished stub; confirm before relying on it.
    """
    (observations, actions, future_rewards,
     log_policies, n_steps) = agent.collect_traj()

## Random search as in original SVM

In [None]:
# Roll out one episode with actions drawn from env.action_space.sample().
# `scores` collects the cumulative reward after every step.
# NOTE(review): the loop only ends when the environment reports `done`;
# there is no cap on the number of steps.
state = env.reset()
scores = []
step = 0
score = 0.0

done = False
while not done:
    print(".....STEP.....", step)
    action = env.action_space.sample()
    next_state, reward, done, info = env.step(action)
    step += 1
    score += reward
    scores.append(score)
    state = next_state