# Stochastic Variational Method with RL algorithms

In [1]:
import numpy as np
import gym
import torch
import subprocess
import os
import pickle

## Exploring the environment

In [2]:
# Build the SVM environment and report its identifier
env = gym.make(
    'svm_env:svmEnv-v2',
    n_pairs=3,
    n_basis=50,
    file_sigmas="./svmCodeSVD/sigmas.dat",
)
print('### Env Name ######', env.unwrapped.spec.id)

# Observation space and the length of its last axis (used as the state size)
obs_space = env.observation_space
print('###### Observation space ####### \n', obs_space)
state_size = obs_space.shape[-1]
print('###### Size of observation space ####### \n', state_size)

# Action space shape and the flattened number of action components
act_space = env.action_space.shape
print('###### Action space ####### \n', act_space)
act_size = act_space[0] * act_space[-1]
print('###### Number of actions ####### \n', act_size)

# Reset once to look at the initial state (the environment prints its own log lines here)
state = env.reset()
print('##### State after reset ###### \n', state)

print('##### File where will be stored sigmas \n', env.file_sigmas)



### Env Name ###### svmEnv-v2
###### Observation space ####### 
 Box(-inf, inf, (1,), float32)
###### Size of observation space ####### 
 1
###### Action space ####### 
 (50, 3)
###### Number of actions ####### 
 150
*****CALL RESET******
Action chosen at reset:  [0.]
##### State after reset ###### 
 [0.]
##### File where will be stored sigmas 
 ./svmCodeSVD/sigmas.dat


# Your codes `DDPG` and `PPO`

## Functions for saving and cleanup

In [3]:
## Save all rewards, energies and princip dims in files during episode training
def create_run_fold_and_info(agent, env):
    """Create a fresh run directory and pickle the run's hyperparameters into it.

    The directory is the first ``runs_optim_envs/run_<i>/`` (``i`` counted
    from 0) that does not exist yet. A dictionary with the algorithm name,
    the environment id and the agent's hyperparameters is written to
    ``info.p`` inside that directory.

    Args:
        agent: object exposing the attributes ``name``, ``batch_size``,
            ``bootstrap_size``, ``gamma``, ``tau``, ``lr_critic``,
            ``lr_actor``, ``update_every``, ``transfer_every``,
            ``num_update`` and ``add_noise_every``.
        env: environment whose ``env.unwrapped.spec.id`` is recorded.

    Returns:
        str: the created directory path, including the trailing slash.

    Raises:
        AttributeError: if ``agent`` lacks one of the attributes above.
    """
    # Find the first unused run index and create its folder
    i = 0
    while os.path.exists(f'runs_optim_envs/run_{i}/'):
        i += 1
    name_dir = f'runs_optim_envs/run_{i}/'
    os.makedirs(name_dir)

    # Collect the run description that is stored in info.p
    info = {
        'alg': agent.name,
        'env': env.unwrapped.spec.id,
        'batch_size': agent.batch_size,
        'bootstrap_size': agent.bootstrap_size,
        'gamma': agent.gamma,
        'tau': agent.tau,
        'lr_critic': agent.lr_critic,
        'lr_actor': agent.lr_actor,
        'update_every': agent.update_every,
        'transfer_every': agent.transfer_every,
        'num_update': agent.num_update,
        'add_noise_every': agent.add_noise_every,
    }

    # Use a context manager so the file is flushed and closed deterministically
    with open(name_dir + 'info.p', 'wb') as info_file:
        pickle.dump(info, info_file)
    return name_dir
    
def save_all(name_run_dir, i_ep, sigmas_i_ep, rew_i_ep, en_i_ep, pri_dim_i_ep \
             , full_dim_i_ep, act_model_i_ep, cr_model_i_ep):
    """Pickle the data collected during one episode into the run directory.

    Each object is written to its own file named ``<prefix>_<i_ep>.p``, where
    the prefix is one of ``sigmas``, ``rew``, ``en``, ``pri_dim``,
    ``full_dim``, ``act_model`` and ``cr_model``. The file name is appended
    directly to ``name_run_dir``, so that path must end with a separator.

    Args:
        name_run_dir: directory prefix (with trailing separator) to write into.
        i_ep: episode index used in the file names.
        sigmas_i_ep, rew_i_ep, en_i_ep, pri_dim_i_ep, full_dim_i_ep,
        act_model_i_ep, cr_model_i_ep: picklable objects to store.
    """
    per_episode_data = (
        ('sigmas', sigmas_i_ep),
        ('rew', rew_i_ep),
        ('en', en_i_ep),
        ('pri_dim', pri_dim_i_ep),
        ('full_dim', full_dim_i_ep),
        ('act_model', act_model_i_ep),
        ('cr_model', cr_model_i_ep),
    )
    for prefix, payload in per_episode_data:
        # Context manager closes every file right away instead of leaking the handle
        with open(name_run_dir + f'{prefix}_{i_ep}.p', 'wb') as out_file:
            pickle.dump(payload, out_file)
    
def rm_useless_file(actor_model_file, critic_model_file, file_sigmas):
    """Delete the actor model file, the critic model file and the sigmas file, in that order.

    Any error raised by ``os.remove`` (e.g. a missing file) propagates to the caller.
    """
    for stale_path in (actor_model_file, critic_model_file, file_sigmas):
        os.remove(stale_path)

## From my `ddpg_agent.py` code

In [4]:
from ddpg_agent import DDPG_agent

# Build the DDPG agent from the state/action sizes with a fixed seed
agent = DDPG_agent(state_size, act_size, seed=0)

In [5]:
## Run ddpg algs   
def run_ddpg(max_t_step = 10, n_episodes=10):
    """Train the module-level ``agent`` on the module-level ``env`` and log each episode.

    For every episode the environment and the agent are reset, then up to
    ``max_t_step`` interactions are performed. After each episode the chosen
    actions, rewards, ``state[0]`` values, principal/full dimensions and the
    actor/critic network weights are pickled into the run directory via
    ``save_all``. The temporary checkpoint files and ``env.file_sigmas`` are
    removed once all episodes are done.

    Args:
        max_t_step: maximum number of environment steps per episode.
        n_episodes: number of episodes to run.

    Returns:
        str: the run directory created by ``create_run_fold_and_info``.
    """
    # Create the run directory and pickle the algorithm info / hyperparameters
    name_run_dir = create_run_fold_and_info(agent, env)
    
    for i_ep in range(n_episodes):
        state = env.reset()
        agent.reset()
        rew_i_ep = []
        en_i_ep = []
        pri_dim_i_ep = []
        full_dim_i_ep = []
        action_i_episode = []

        ## Training loop of each episode
        for t_step in range(max_t_step):
            action = agent.act(state)
            # The agent emits a flat action; the environment is fed an (n_basis, n_pairs) array
            shaped_action = action.reshape((env.n_basis,env.n_pairs))
            next_state, reward, done, info = env.step(shaped_action)
            agent.step(state, action, reward, next_state, done)
            state = next_state

            # Record action, reward, first state component and basis dimensions
            action_i_episode.append(shaped_action)
            rew_i_ep.append(reward)
            en_i_ep.append(state[0])
            pri_dim_i_ep.append(env.princp_dim)
            full_dim_i_ep.append(env.full_dim)
            # Keep on-disk checkpoints of the latest weights while training is running
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            if done:
                break
                
        ## Save data during training (to not lose the work done)
        # Store the actual network weights: pickling the checkpoint file names
        # would leave nothing usable once the checkpoint files are deleted below.
        save_all(name_run_dir=name_run_dir, i_ep=int(i_ep), sigmas_i_ep=action_i_episode \
                 , rew_i_ep=rew_i_ep, en_i_ep=en_i_ep, pri_dim_i_ep=pri_dim_i_ep \
                 , full_dim_i_ep=full_dim_i_ep \
                 , act_model_i_ep=agent.actor_local.state_dict() \
                 , cr_model_i_ep=agent.critic_local.state_dict())
        
        print('Episode {} ... Score: {:.3f}'.format(i_ep, np.sum(rew_i_ep)))

    rm_useless_file('checkpoint_actor.pth', 'checkpoint_critic.pth', env.file_sigmas)
    return name_run_dir

In [6]:
# run_ddpg returns the path of the run directory holding the pickled results
all_data = run_ddpg(max_t_step=10, n_episodes=10)

*****CALL RESET******
Action chosen at reset:  [0.]
****CALL STEP****
Action chosen at step:  [[41.326897 68.98927  55.587734]
 [48.889088 59.22312  35.692894]
 [51.29618  57.28906  48.881516]
 [41.43883  50.13721  62.284298]
 [89.4814   59.430107 53.87242 ]
 [56.14401  50.503357 44.48779 ]
 [51.86254  58.857254 57.10333 ]
 [46.87075  50.75939  40.84858 ]
 [58.652058 39.463802 53.874794]
 [54.72371  58.052185 58.337666]
 [54.806976 64.17156  84.03984 ]
 [70.155266 52.255135 66.979485]
 [54.24046  59.21776  57.1173  ]
 [48.28026  55.15761  36.95992 ]
 [61.50089  56.796158 36.627068]
 [37.003384 73.59915  59.882336]
 [67.41905  59.715607 85.764145]
 [49.90651  71.252785 59.29526 ]
 [57.004475 64.53133  69.82104 ]
 [58.951782 28.688417 52.747124]
 [53.03532  72.84357  58.07345 ]
 [62.052658 45.58359  69.56333 ]
 [61.38929  56.885548 51.68843 ]
 [18.923637 63.8274   54.944267]
 [57.244347 38.59173  36.41712 ]
 [43.869205 46.660786 65.255905]
 [49.579334 50.732285 73.27075 ]
 [68.653984 58.

With this action the energy is:  -0.134066
With this action the full dim is:  50  and princip dim is:  49
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.134066
Set reward :  8.862341329415479
****CALL STEP****
Action chosen at step:  [[ 33.679382   66.66064    59.12148  ]
 [ 24.500326   58.053566   32.26056  ]
 [ 53.85709    65.19148    44.77108  ]
 [ 37.144707   45.222237   33.08874  ]
 [ 73.137474   64.54739    59.945244 ]
 [ 63.31408    64.900665   39.681656 ]
 [ 48.133488   37.39016    92.2818   ]
 [ 72.07161    52.177254   87.973236 ]
 [ 20.943668   21.730762   53.030342 ]
 [ 68.62994    41.77978    17.557968 ]
 [ 85.63463    92.864456   55.463245 ]
 [ 38.66725    90.210266   64.20435  ]
 [ 66.1895     15.745331   66.806305 ]
 [ 72.292786   64.701355   54.247196 ]
 [ 37.671204   57.336967   56.462574 ]
 [ 31.525873   42.355854   84.927574 ]
 [ 71.28471    27.636143   59.134293 ]
 [ 61.9612     53.3092     74.703674 ]
 [ 38.277763   60.66115    25.126

With this action the energy is:  -0.100199
With this action the full dim is:  50  and princip dim is:  49
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.100199
Set reward :  6.507248315602073
****CALL STEP****
Action chosen at step:  [[ 34.004272   79.83817    50.273636 ]
 [ 46.65849    41.534424   73.62589  ]
 [ 63.144726  102.232185   27.460356 ]
 [ 61.95534    56.23026     6.0188675]
 [ 75.10234    39.630314   76.04285  ]
 [ 57.90514    87.71609    46.07244  ]
 [ 60.965282   57.806057   63.88155  ]
 [ 54.27757    25.75589    84.58429  ]
 [ 32.966843   33.09056    56.529137 ]
 [ 86.04549     7.004551   15.423931 ]
 [ 97.09453    86.08309    65.730675 ]
 [ 33.882755   59.887028   25.408464 ]
 [ 80.72923    26.941484   87.34805  ]
 [ 49.87752    57.458027   53.895905 ]
 [ 30.886972   82.24356    60.090816 ]
 [ 40.50428    39.98011    36.003975 ]
 [ 54.641026    3.6561623  51.06532  ]
 [ 73.66351    66.42173    60.41052  ]
 [ 40.29087    61.323986   28.005

With this action the energy is:  -0.0840137
With this action the full dim is:  50  and princip dim is:  48
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0840137
Set reward :  5.381731235362485
****CALL STEP****
Action chosen at step:  [[ 39.69971    57.616013   49.34289  ]
 [ 32.384903   47.775513   36.855553 ]
 [ 72.2301     78.84552     5.522255 ]
 [ 43.749275   60.321194    8.468388 ]
 [ 35.363777   57.9575     77.98147  ]
 [ 81.40776    84.036705   39.92944  ]
 [ 20.804089   63.591843   47.40631  ]
 [ 68.83162    58.178932   47.751305 ]
 [ 57.702953    2.9481812  71.34408  ]
 [ 60.250572   60.62394    47.847675 ]
 [ 86.67345    31.022291   67.62132  ]
 [ 64.652374   45.480156   87.08812  ]
 [ 41.488976   38.680733   12.469711 ]
 [ 43.5062     66.22968    64.37441  ]
 [ 26.655561   41.766624   50.520508 ]
 [ 66.98982    44.10478    44.120834 ]
 [ 63.678253   52.97131    38.988277 ]
 [ 76.71607    29.568844   77.94588  ]
 [ 52.583126   43.04497    48.1

With this action the energy is:  -0.111585
With this action the full dim is:  50  and princip dim is:  48
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.111585
Set reward :  7.2990246481759495
****CALL STEP****
Action chosen at step:  [[48.763897  58.981964  88.441284 ]
 [54.08146   62.254696  38.532295 ]
 [52.29311   58.33161   30.01917  ]
 [25.805685  78.927605   1.       ]
 [40.89472   51.360126  31.9734   ]
 [93.01083   68.14388   38.111908 ]
 [24.629925  53.27687   51.367466 ]
 [66.39863   59.777245  45.32629  ]
 [55.234936  33.303078  38.973167 ]
 [50.78676   81.24705   61.95667  ]
 [57.59647   36.133976  57.18177  ]
 [72.62708   53.92046   88.86922  ]
 [69.34933   41.505478  38.351624 ]
 [48.891014  54.337753  75.82645  ]
 [42.336357  41.82577   68.16199  ]
 [49.533257  42.445145  24.452116 ]
 [72.126465  26.804136  57.788    ]
 [73.13686   36.875427  63.203224 ]
 [45.60701    4.398304  29.712269 ]
 [38.543854  39.79442   29.34102  ]
 [19.055534  5

With this action the energy is:  -0.0714293
With this action the full dim is:  50  and princip dim is:  50
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0714293
Set reward :  4.506618809607816
****CALL STEP****
Action chosen at step:  [[ 57.107685  52.40256   43.194386]
 [ 41.281654   6.941696  68.67253 ]
 [ 56.516212  69.35978   25.661978]
 [ 66.90805   45.261627  43.80683 ]
 [ 61.35343   34.774605  62.51364 ]
 [ 79.72972   37.59263   44.36296 ]
 [ 47.73793   50.326668  40.030827]
 [ 47.970814  80.31221   53.443554]
 [ 32.60403   67.75193   47.969086]
 [ 54.176735  64.76239   68.13528 ]
 [ 35.604744  53.64796   60.02034 ]
 [ 86.82482   36.77836   52.48804 ]
 [ 19.980827  60.54895   68.57879 ]
 [ 90.6565    47.219486  56.725616]
 [ 46.64745   62.421513  97.26954 ]
 [ 42.888084  64.266106  42.368977]
 [ 45.483288  52.02878   40.04326 ]
 [ 57.997414  58.31473   74.340294]
 [ 31.353989  55.17022   80.70428 ]
 [100.49308   56.677616  76.02368 ]
 [ 84.176544 

With this action the energy is:  -0.0838368
With this action the full dim is:  50  and princip dim is:  49
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0838368
Set reward :  5.369429704251268
****CALL STEP****
Action chosen at step:  [[ 18.579124   74.36392    27.153984 ]
 [ 78.19717    32.840256   65.33959  ]
 [ 54.467125   71.33044    26.423141 ]
 [ 70.34227    46.487186   44.845978 ]
 [ 47.366352   43.46896    68.94696  ]
 [ 66.02426    66.28494    58.465816 ]
 [ 72.04421    39.264755   40.28352  ]
 [ 63.07762    77.71658    57.748417 ]
 [ 51.877995   88.03491    68.8048   ]
 [ 62.71047    37.834854   97.69786  ]
 [ 59.991264   66.2448     50.045578 ]
 [ 64.55028    61.61191    69.73695  ]
 [ 42.64122    37.283974   36.218094 ]
 [ 76.23246    69.15563    42.51806  ]
 [ 22.691212   35.20343    93.25485  ]
 [ 30.613537   56.998146   41.75195  ]
 [ 54.936253   58.635773   73.09491  ]
 [ 66.3374     56.322037   87.89458  ]
 [ 49.71744    58.153008   51.3

With this action the energy is:  -0.0381015
With this action the full dim is:  50  and princip dim is:  36
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0381015
Set reward :  2.189021474568679
****CALL STEP****
Action chosen at step:  [[ 52.080196  44.45817   72.546684]
 [ 69.8898    63.361183  33.026062]
 [ 57.5962    73.054565  40.60838 ]
 [ 66.63002   94.45846   47.92963 ]
 [ 88.361946  26.865637  64.50724 ]
 [ 42.07475   55.494938  62.526443]
 [ 73.63429   41.47878  103.201065]
 [ 60.51027   75.44095   34.654892]
 [ 39.356148  67.88374   42.93528 ]
 [ 51.26699   34.98574   54.114994]
 [ 66.807205  46.116966  59.76157 ]
 [ 39.523712  50.6485    30.902555]
 [ 42.95726   57.55382   67.73786 ]
 [ 65.814514  46.418736  43.18467 ]
 [ 51.63621   71.88298   27.850807]
 [ 50.95092   50.360626  89.03267 ]
 [ 73.99339   47.79036   49.486385]
 [ 81.79511   63.88449   78.35558 ]
 [ 70.26692   70.02778   62.84702 ]
 [ 62.422455  64.99241   71.34172 ]
 [ 75.11577  

With this action the energy is:  -0.0668245
With this action the full dim is:  50  and princip dim is:  49
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0668245
Set reward :  4.186403487612718
****CALL STEP****
Action chosen at step:  [[ 29.725435   41.45393    77.32985  ]
 [ 77.094666   14.897942   42.829624 ]
 [ 58.433716   63.547413   53.52611  ]
 [ 70.56283    85.66536    47.235085 ]
 [ 72.628586   63.317383   84.48217  ]
 [ 15.01376    69.22806    62.37656  ]
 [ 74.478134   29.09521    68.45481  ]
 [ 58.429707   62.109528   31.625494 ]
 [ 57.111294   66.14459    41.2611   ]
 [ 45.001976   14.582508   68.573685 ]
 [ 75.13007    54.172188   58.29545  ]
 [ 19.838337   41.85489    46.626217 ]
 [ 29.071346   54.02232   107.00227  ]
 [ 57.874367   53.881824   66.93529  ]
 [ 38.230354   44.418106   22.94712  ]
 [ 55.577045   11.206093   74.23561  ]
 [ 74.9805     34.26955    69.4605   ]
 [ 37.794487   94.564896   79.604706 ]
 [ 63.93718    87.614      76.7

With this action the energy is:  -0.0785695
With this action the full dim is:  50  and princip dim is:  50
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0785695
Set reward :  5.003144487619759
****CALL STEP****
Action chosen at step:  [[ 41.246284   75.84107    68.79222  ]
 [ 55.038036   51.08429    31.48864  ]
 [ 30.301437   45.43915    31.28769  ]
 [ 60.672096   67.80366    59.134426 ]
 [ 46.516624   36.813354   54.16254  ]
 [ 12.584721   55.173218   66.10613  ]
 [ 89.53853    62.437626   43.34204  ]
 [104.83679    47.47244    46.1845   ]
 [ 25.654776   67.74126    47.76027  ]
 [ 42.650383   29.69754    45.171196 ]
 [ 80.928024   62.419197   39.259583 ]
 [ 56.398926   68.69897    32.97708  ]
 [ 52.574852   63.67965   110.       ]
 [ 73.49641    38.982838   55.69472  ]
 [ 74.92399    23.843645   67.72527  ]
 [ 62.994392   36.42672    90.86129  ]
 [ 80.478195   24.063427   32.979134 ]
 [ 47.548508   95.95393    45.91234  ]
 [ 45.960968  102.04239    67.5

With this action the energy is:  -0.0802415
With this action the full dim is:  50  and princip dim is:  46
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0802415
Set reward :  5.119414470762522
****CALL STEP****
Action chosen at step:  [[ 47.581787  58.35205  105.54518 ]
 [ 73.4025    41.846237  61.32961 ]
 [ 22.541225  26.24242   26.789547]
 [ 56.699642  72.72434   61.71862 ]
 [ 62.43721   47.7975    51.346455]
 [ 26.673409  26.603022  61.303223]
 [ 64.38478   54.526085  44.936844]
 [ 26.306606  99.04056   31.884033]
 [ 27.382748  31.138243  46.230522]
 [ 49.007423  66.22322   52.0941  ]
 [ 18.585896  31.778635  18.675095]
 [ 49.16871   72.70569   55.70643 ]
 [ 56.674034  39.240776  40.83414 ]
 [ 38.16838   37.101273  49.065193]
 [ 27.069376  66.72125   29.866629]
 [  1.        50.12714   58.46264 ]
 [ 69.625206  42.485695  29.613972]
 [ 65.292175  35.516235  51.83668 ]
 [ 73.59362   62.54293   62.337265]
 [ 51.725044  56.2078    86.09738 ]
 [ 55.288906 

With this action the energy is:  -0.0932056
With this action the full dim is:  50  and princip dim is:  50
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0932056
Set reward :  6.020931030846923
****CALL STEP****
Action chosen at step:  [[43.870667  23.305576  77.762985 ]
 [85.95182   54.485218  60.644928 ]
 [46.09104   68.25168   51.99307  ]
 [56.73217   63.467434  68.7013   ]
 [53.191288  52.072906  50.576824 ]
 [47.07637   24.510508  73.54857  ]
 [55.864307  51.905163  71.296036 ]
 [31.753056  94.064095  32.965515 ]
 [31.996439  61.535965  57.554466 ]
 [21.094746  70.918594  71.65324  ]
 [56.985588  23.411797  35.616997 ]
 [71.12394   65.02771   72.48671  ]
 [56.513187  43.899734  74.96736  ]
 [35.669106  55.627815  46.39973  ]
 [ 1.        72.89755   38.630547 ]
 [18.199604  54.27187   51.631332 ]
 [63.167274  28.218765  35.504097 ]
 [96.397736  54.990257  68.486084 ]
 [81.693954  70.46518   50.757214 ]
 [47.600628  75.85893   60.241478 ]
 [39.67743   

With this action the energy is:  -0.022109
With this action the full dim is:  50  and princip dim is:  34
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.022109
Set reward :  1.076911602911581
****CALL STEP****
Action chosen at step:  [[25.494158 48.470036 64.46061 ]
 [24.46113  49.28727  51.781292]
 [65.962715 64.2224   52.65325 ]
 [55.01367  47.594913 61.356163]
 [54.498737 49.041016 58.1151  ]
 [58.531315 55.058624 61.242363]
 [32.918846 46.564972 41.504932]
 [56.20183  64.82523  57.210594]
 [61.338566 59.131958 38.30639 ]
 [76.99191  25.468782 48.61373 ]
 [45.42331  72.377914 65.959946]
 [35.010586 55.473225 64.37335 ]
 [58.365814 33.20282  65.02051 ]
 [53.061375 56.132095 57.285088]
 [54.516655 52.322987 40.295944]
 [47.486282 82.4431   50.989735]
 [37.34906  53.389805 38.16193 ]
 [53.879883 53.90904  73.69208 ]
 [56.72882  49.491955 52.07475 ]
 [41.076275 58.16483  42.400074]
 [46.129456 75.81733  46.45575 ]
 [44.01994  63.004894 52.686836]
 [41.0292

With this action the energy is:  -0.124631
With this action the full dim is:  50  and princip dim is:  48
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.124631
Set reward :  8.206236490329363
****CALL STEP****
Action chosen at step:  [[ 23.303017   33.53383    47.404697 ]
 [ 29.453009   51.50333    31.628815 ]
 [ 67.20837    59.531975   77.750046 ]
 [ 61.838955   71.67239    45.001884 ]
 [ 74.574455   41.755295   50.19646  ]
 [ 73.10698    33.96895    58.728935 ]
 [ 28.143162   46.265076   68.41379  ]
 [  1.2492332  91.028175   51.394295 ]
 [ 72.325005   20.136295   51.149086 ]
 [ 80.16915    53.91223     6.944294 ]
 [ 33.413597  100.47374    76.69409  ]
 [ 47.334522   31.60047    74.116104 ]
 [ 56.274178   54.33265    48.217663 ]
 [ 72.00257    51.376465   56.66145  ]
 [ 50.665295   14.582382   43.82134  ]
 [ 30.872446  110.         50.032894 ]
 [ 47.089188   42.041985   12.903263 ]
 [ 59.475822   33.587563   84.87161  ]
 [ 40.34365    11.690437   76.109

With this action the energy is:  -0.0935731
With this action the full dim is:  50  and princip dim is:  50
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0935731
Set reward :  6.046486783720708
Episode 5 ... Score: 48.775
*****CALL RESET******
Action chosen at reset:  [0.]
****CALL STEP****
Action chosen at step:  [[69.19752  68.18523  54.933178]
 [59.36713  48.74951  62.860218]
 [73.79993  97.03079  25.730772]
 [41.7588   51.22553  52.34214 ]
 [55.288734 45.15496  80.69043 ]
 [29.031923 57.549183 52.78958 ]
 [54.992626 55.240788 51.56438 ]
 [58.041786 66.20702  67.5167  ]
 [46.529507 55.58025  35.615356]
 [62.964336 77.32462  51.797653]
 [64.19492  47.267838 85.69655 ]
 [70.76572  47.837036 62.04382 ]
 [51.758934 48.448635 77.94098 ]
 [52.003822 61.80645  68.916214]
 [26.437744 80.53151  37.67604 ]
 [44.85301  46.251297 35.766693]
 [56.421474 56.179424 69.92139 ]
 [42.71144  45.87433  57.119247]
 [33.524002 70.49895  45.705135]
 [40.170757 73.75015  55.5

With this action the energy is:  -0.120744
With this action the full dim is:  50  and princip dim is:  48
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.120744
Set reward :  7.9359365953078775
****CALL STEP****
Action chosen at step:  [[ 61.271587   47.08286    53.944702 ]
 [ 72.84954    71.23529    45.737087 ]
 [ 79.97425    46.912354   19.177235 ]
 [ 30.450302   93.414795   70.656456 ]
 [ 58.579445   29.534351   88.308914 ]
 [ 60.0807     70.14274    92.688705 ]
 [ 17.437866   89.77091    34.015167 ]
 [ 32.455933   66.57182    51.515488 ]
 [110.         92.1492     48.6589   ]
 [ 56.2075     68.056366   16.216496 ]
 [ 63.09425    71.533325  105.22552  ]
 [ 37.27061    50.37412    50.830345 ]
 [ 50.161682   42.094913   80.47847  ]
 [ 30.483435   73.79855    72.78485  ]
 [ 44.50448    88.00664    80.09495  ]
 [ 53.652798   64.74983    48.58649  ]
 [ 41.948574   49.747013   53.08621  ]
 [ 82.31447    73.119095   38.001663 ]
 [ 46.653812   66.27212    41.17

With this action the energy is:  -0.109674
With this action the full dim is:  50  and princip dim is:  49
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.109674
Set reward :  7.1661347332322745
****CALL STEP****
Action chosen at step:  [[ 55.089287   78.42052    44.660282 ]
 [ 77.32504    66.61787    42.989273 ]
 [ 59.36238    72.0437     60.236515 ]
 [ 40.040295   35.506363   63.987354 ]
 [ 57.957      40.62771    66.34398  ]
 [ 42.710995   82.496      61.10853  ]
 [ 54.168976   58.631985   26.486519 ]
 [ 36.2284     54.018215   23.088135 ]
 [ 95.83802    78.25814    39.012142 ]
 [ 54.446915   79.103615   24.393265 ]
 [ 52.842438   52.557255   96.31499  ]
 [ 59.10633    23.772442   44.19564  ]
 [ 72.231865   49.531036   63.542072 ]
 [ 43.413223   48.410892   77.18905  ]
 [ 52.35347   107.249374   68.84498  ]
 [ 58.267105   78.30425    29.47288  ]
 [ 62.49084    43.31444    28.777203 ]
 [ 45.732475   56.017025   55.306892 ]
 [ 56.424522   55.09383    55.53

With this action the energy is:  -0.0666327
With this action the full dim is:  50  and princip dim is:  50
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0666327
Set reward :  4.1730658184938285
****CALL STEP****
Action chosen at step:  [[ 29.60773    74.175064   53.698593 ]
 [ 89.38251    11.2740555  59.945713 ]
 [ 53.345913   62.87452    48.537193 ]
 [ 39.22516    36.341194   45.114815 ]
 [ 76.71646    50.01754    80.000885 ]
 [ 54.370926   31.448652   24.367388 ]
 [ 46.428005   53.66537    38.83757  ]
 [ 34.12171    59.532906   64.58267  ]
 [ 45.485264   31.561916   31.496796 ]
 [ 90.21092    72.649536   41.41802  ]
 [ 18.357033   67.174515   78.04474  ]
 [ 57.713783   83.46522    87.34491  ]
 [ 47.87115    59.43983    48.975502 ]
 [ 54.176292   31.94286    55.49586  ]
 [ 65.11794    66.95649    13.691689 ]
 [ 17.809418   85.94942    49.113132 ]
 [ 51.134068   29.856358   54.354633 ]
 [ 70.94128    87.09114    41.64353  ]
 [ 59.695168   65.565254   61.

With this action the energy is:  -0.12139
With this action the full dim is:  50  and princip dim is:  50
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.12139
Set reward :  7.980859088794854
****CALL STEP****
Action chosen at step:  [[ 25.137062  64.581955  44.7276  ]
 [100.2085    47.510254  38.66073 ]
 [ 46.172016  55.348667  40.735107]
 [ 61.312904  12.784229  53.615692]
 [ 67.33411   33.697067  67.0498  ]
 [ 15.46809   82.495544  43.010437]
 [ 67.68227   61.20543   27.170279]
 [ 62.11711   38.033127  63.059677]
 [ 42.620865  43.07466   47.977856]
 [ 80.82714   29.023684  16.301796]
 [ 29.582033  61.55899   88.26082 ]
 [ 30.55659   71.69238   59.114677]
 [ 40.03588   32.899723  57.255413]
 [ 56.039345  30.858232  96.604065]
 [ 34.68004   65.44729   55.932495]
 [ 23.319347  61.90336   34.176205]
 [ 71.145035  52.543167  51.999474]
 [ 89.38516   71.05404   46.982437]
 [ 61.72812   94.1021    12.952232]
 [ 67.57063   34.662514  57.910084]
 [ 27.559261  83.

With this action the energy is:  -0.0488859
With this action the full dim is:  50  and princip dim is:  44
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0488859
Set reward :  2.938962865839512
****CALL STEP****
Action chosen at step:  [[28.190466 62.47266  47.523197]
 [68.93037  40.07841  32.003468]
 [63.832737 64.68102  40.436283]
 [46.418266 66.32342  46.67482 ]
 [54.49462  26.426254 77.6977  ]
 [57.220818 28.18444  51.225056]
 [42.051678 63.858547 69.995926]
 [22.234211 76.54282  61.19825 ]
 [39.15943  53.059692 49.47113 ]
 [59.384068 47.582172 64.60004 ]
 [71.23276  70.4725   83.11413 ]
 [76.9083   56.039505 53.75403 ]
 [60.384293 29.222479 76.46565 ]
 [64.14649  39.772446 55.622032]
 [48.819565 66.73039  59.07714 ]
 [61.49665  25.289162 42.104366]
 [54.178932 25.516556 57.14176 ]
 [74.39986  55.280575 71.56169 ]
 [89.42616  68.4733   57.93313 ]
 [36.392353 64.71904  54.677895]
 [38.989635 68.02301  66.23338 ]
 [47.243675 38.831577 98.62596 ]
 [78.02

With this action the energy is:  -0.073912
With this action the full dim is:  50  and princip dim is:  50
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.073912
Set reward :  4.679264435892828
****CALL STEP****
Action chosen at step:  [[ 28.119684   41.485157   50.154907 ]
 [ 66.24278    49.350956   23.917515 ]
 [ 86.778854   70.97293    20.642303 ]
 [ 34.303814   39.053642   54.733143 ]
 [ 66.06994    32.619278   67.33439  ]
 [ 49.629215   59.06801    22.422718 ]
 [ 50.13341    64.03717    67.23717  ]
 [ 24.20792    70.98085    74.08486  ]
 [ 52.67643    34.5045     35.097153 ]
 [ 63.94199    56.533855   48.935482 ]
 [ 26.621037   57.724022   99.02901  ]
 [ 90.97135    54.58726    56.238388 ]
 [ 36.274567   43.403305   87.07086  ]
 [ 56.18124    45.94909    66.79956  ]
 [ 55.447746   60.200863   12.817383 ]
 [ 69.70001    46.705284   44.606335 ]
 [ 72.13331    43.945797   58.33441  ]
 [ 55.16808    78.137184   71.71872  ]
 [ 68.848564   96.024704   63.405

With this action the energy is:  -0.0844021
With this action the full dim is:  50  and princip dim is:  50
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0844021
Set reward :  5.4087403630255535
****CALL STEP****
Action chosen at step:  [[ 59.112854   53.0694     71.276344 ]
 [ 63.88912    56.510498   27.030624 ]
 [ 57.99853    86.80476     1.       ]
 [ 50.216732   34.10731    72.05094  ]
 [ 29.296837   34.41693    41.1608   ]
 [ 17.531223   82.73757    56.9132   ]
 [ 60.89554    54.594177   72.43455  ]
 [ 20.394035   85.00153    74.50687  ]
 [ 25.984968   10.907394   22.214165 ]
 [ 73.23358    47.037575   63.66427  ]
 [ 14.228958   66.58948    92.89601  ]
 [104.83606    76.47728    74.95509  ]
 [ 38.576294   75.61326    94.82745  ]
 [ 36.970066   27.121014   57.618233 ]
 [ 28.837124   49.406708   22.396587 ]
 [ 21.827682   56.704567   45.76152  ]
 [ 82.39631    53.736633   74.565895 ]
 [ 49.626884   79.19259    76.36945  ]
 [ 38.780945   53.673183   78.

With this action the energy is:  -0.0839498
With this action the full dim is:  50  and princip dim is:  47
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0839498
Set reward :  5.37728766363831
****CALL STEP****
Action chosen at step:  [[ 66.67254    50.548824   38.54698  ]
 [ 48.934826   24.02734    61.800735 ]
 [ 67.14026    51.646347   44.644524 ]
 [ 46.982754   26.090578   63.565567 ]
 [ 25.531065   39.19717    52.651066 ]
 [ 25.81262    59.266      59.347088 ]
 [ 61.431202   51.959976   17.203224 ]
 [  1.         66.266426   39.838997 ]
 [ 85.29955    50.08593    50.349937 ]
 [ 38.447956   58.904274   44.28389  ]
 [ 32.069557   61.380356   71.59207  ]
 [ 53.908222   71.74536    38.176666 ]
 [ 69.530945   11.292469   60.719246 ]
 [ 52.11249    48.171455   69.485794 ]
 [ 72.413185   85.029816   41.759323 ]
 [ 37.358124   92.148254   66.551    ]
 [ 45.51373    66.10988    54.986423 ]
 [ 61.70317    50.348087   82.87063  ]
 [ 73.08905    68.1559     32.96

With this action the energy is:  -0.0841987
With this action the full dim is:  50  and princip dim is:  50
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0841987
Set reward :  5.39459603612888
****CALL STEP****
Action chosen at step:  [[ 50.840775   72.5003     41.917343 ]
 [ 52.172417    1.         52.17785  ]
 [ 69.91957    44.282898   14.129799 ]
 [ 42.38736     9.743992   70.932816 ]
 [ 46.68638    35.228832   27.207285 ]
 [ 19.142715   62.77353    71.55912  ]
 [ 67.1468     61.591835   23.63118  ]
 [ 27.909546  104.07356    69.252426 ]
 [ 94.60664    47.648815   57.897594 ]
 [ 27.008299   67.437126   36.592384 ]
 [ 29.572912   60.409035   86.41126  ]
 [ 68.69773    61.543037   61.319595 ]
 [ 68.83096     3.7768364  81.77573  ]
 [ 56.75248    51.252026   74.07371  ]
 [ 23.69475    81.63413    28.046515 ]
 [  1.         90.325485   67.41424  ]
 [ 59.850834   45.135452   79.78562  ]
 [ 62.56062    73.78535    80.76652  ]
 [ 43.175587   52.466747   18.51

## Random search as in original SVM

In [None]:
# Baseline: sample random actions until the environment signals the end of the episode
state = env.reset()
scores = []   # cumulative reward after each step
step = 0
score = 0.0
done = False

while not done:
    print(".....STEP.....", step)
    action = env.action_space.sample()
    next_state, reward, done, info = env.step(action)
    step += 1
    score += reward
    scores.append(score)
    state = next_state