# Stochastic Variational Method with RL algorithms

In [1]:
import numpy as np
import gym
import torch
import subprocess
import os
import pickle

## Exploring the environment

In [2]:
# Build the SVM environment: 3 pairs, 50 basis states, sigmas stored in the given file.
env = gym.make('svm_env:svmEnv-v2', n_pairs = 3, n_basis = 50, file_sigmas ="./svmCodeSVD/sigmas.dat" )

print('### Env Name ######', env.unwrapped.spec.id)

obs_space = env.observation_space

print('###### Observation space ####### \n', obs_space)

# Size of the last axis of the observation space (input size given to the agent)
state_size = env.observation_space.shape[-1]

print('###### Size of observation space ####### \n', state_size)

# NOTE: despite its name, `act_space` holds the *shape* of the action space
act_space = env.action_space.shape

print('###### Action space ####### \n', act_space)

# Total number of scalar actions = product over every axis of the action shape.
# (The previous `shape[0]*shape[-1]` was only right for a 2-D action space: it
# squared a 1-D shape and ignored the middle axes of anything with 3+ axes.)
act_size = int(np.prod(env.action_space.shape))

print('###### Number of actions ####### \n', act_size)

state = env.reset()

print('##### State after reset ###### \n', state)

print('##### File where will be stored sigmas \n', env.file_sigmas)



### Env Name ###### svmEnv-v2
###### Observation space ####### 
 Box(-inf, inf, (1,), float32)
###### Size of observation space ####### 
 1
###### Action space ####### 
 (50, 3)
###### Number of actions ####### 
 150
*****CALL RESET******
Action chosen at reset:  [0.]
##### State after reset ###### 
 [0.]
##### File where will be stored sigmas 
 ./svmCodeSVD/sigmas.dat


# Your codes `DDPG` and `PPO`

## Functions for saving and cleanup

In [3]:
## Save all rewards, energies and princip dims in files during episode training
def create_run_fold_and_info(agent, env):
    
    # Check if folder exist and creat it
    i = 0
    while os.path.exists(f'runs_optim_envs/run_{i}/'):
        i += 1
    name_dir = f'runs_optim_envs/run_{i}/'
    os.makedirs(name_dir)
    
    # Create info.p to store info in pickle file
    info = {'alg':agent.name, 'env':env.unwrapped.spec.id , 'basis_size':env.n_basis \
            , 'batch_size':agent.batch_size, 'bootstrap_size':agent.bootstrap_size \
            , 'gamma':agent.gamma, 'tau':agent.tau,'lr_critic':agent.lr_critic \
            , 'lr_actor':agent.lr_actor, 'update_every':agent.update_every \
            , 'transfer_every':agent.transfer_every, 'num_update':agent.num_update \
            , 'add_noise_every':agent.add_noise_every}
    
    pickle.dump(info, open(name_dir+'info.p', 'wb'))
    return name_dir
    
def save_all(name_run_dir, i_ep, sigmas_i_ep, rew_i_ep, en_i_ep, pri_dim_i_ep \
             , full_dim_i_ep, act_model_i_ep, cr_model_i_ep):
    """Pickle every per-episode record into ``name_run_dir``.

    One file named ``<prefix>_<i_ep>.p`` is written per record, with prefixes
    ``sigmas``, ``rew``, ``en``, ``pri_dim``, ``full_dim``, ``act_model`` and
    ``cr_model``.

    Args:
        name_run_dir: destination folder path; it is concatenated directly with
            the file name, so it must end with a path separator.
        i_ep: episode index used in the file names.
        sigmas_i_ep, rew_i_ep, en_i_ep, pri_dim_i_ep, full_dim_i_ep,
        act_model_i_ep, cr_model_i_ep: picklable objects to store.
    """
    records = (
        ('sigmas', sigmas_i_ep),
        ('rew', rew_i_ep),
        ('en', en_i_ep),
        ('pri_dim', pri_dim_i_ep),
        ('full_dim', full_dim_i_ep),
        ('act_model', act_model_i_ep),
        ('cr_model', cr_model_i_ep),
    )
    for prefix, payload in records:
        # Context manager closes each file promptly instead of leaking the handle
        with open(name_run_dir+f'{prefix}_{i_ep}.p', 'wb') as out_file:
            pickle.dump(payload, out_file)
    
def rm_useless_file(actor_model_file, critic_model_file, file_sigmas):
    """Delete the temporary actor/critic checkpoint files and the sigmas file.

    A path that does not exist is skipped, so one missing file no longer
    prevents the remaining ones from being removed. Other OS errors (for
    example permission problems) still propagate.

    Args:
        actor_model_file: path of the actor checkpoint to delete.
        critic_model_file: path of the critic checkpoint to delete.
        file_sigmas: path of the sigmas file to delete.
    """
    for path in (actor_model_file, critic_model_file, file_sigmas):
        try:
            os.remove(path)
        except FileNotFoundError:
            # Already absent: the goal of the cleanup is met for this path
            pass

## From my `ddpg_agent.py` code

In [4]:
# DDPG agent implementation from the local `ddpg_agent` module
from ddpg_agent import DDPG_agent
# Sized with the observation size and flattened action count computed from the
# environment above; seed fixed to 0 (presumably for reproducibility — confirm
# how `DDPG_agent` uses it).
agent = DDPG_agent(state_size, act_size, seed = 0)

In [5]:
## Run ddpg algs   
def run_ddpg(max_t_step = 10, n_episodes=10):
    """Train the module-level DDPG ``agent`` on the module-level ``env``.

    For each episode the environment and agent are reset, then up to
    ``max_t_step`` interaction steps are performed. After every episode the
    chosen actions, rewards, first state components, principal/full dimensions
    and the actor/critic weights are pickled into the run folder via
    ``save_all``.

    Args:
        max_t_step: maximum number of environment steps per episode.
        n_episodes: number of episodes to run.

    Returns:
        str: path of the run folder created by ``create_run_fold_and_info``.

    Side effects:
        Writes ``checkpoint_actor.pth`` / ``checkpoint_critic.pth`` in the
        current directory at every step and removes them, together with
        ``env.file_sigmas``, once all episodes are finished.
    """
    actor_ckpt = 'checkpoint_actor.pth'
    critic_ckpt = 'checkpoint_critic.pth'

    # Create the run folder and pickle the algorithm / hyper-parameter info in it
    name_run_dir = create_run_fold_and_info(agent, env)

    for i_ep in range(n_episodes):
        state = env.reset()
        agent.reset()
        rew_i_ep = []
        en_i_ep = []
        pri_dim_i_ep = []
        full_dim_i_ep = []
        action_i_episode = []

        ## Training loop of each episode
        for t_step in range(max_t_step):
            action = agent.act(state)
            # The environment takes the action as an (n_basis, n_pairs) array
            sigmas = action.reshape((env.n_basis,env.n_pairs))
            next_state, reward, done, info = env.step(sigmas)
            # The agent itself is stepped with the un-reshaped action
            agent.step(state, action, reward, next_state, done)
            state = next_state

            # Save rew, energies, princip dims, act and crit models
            action_i_episode.append(sigmas)
            rew_i_ep.append(reward)
            # First component of the new state (the env logs it as the energy)
            en_i_ep.append(state[0])
            pri_dim_i_ep.append(env.princp_dim)
            full_dim_i_ep.append(env.full_dim)
            # Rolling checkpoints so the latest weights survive a crash mid-episode
            torch.save(agent.actor_local.state_dict(), actor_ckpt)
            torch.save(agent.critic_local.state_dict(), critic_ckpt)
            if done:
                break

        ## Save data during training (to not lose the work done)
        # Pickle the actual network weights. Previously only the checkpoint file
        # *names* were pickled, and those files are deleted below, so the
        # trained models were lost.
        save_all(name_run_dir=name_run_dir, i_ep=i_ep, sigmas_i_ep=action_i_episode \
                 , rew_i_ep=rew_i_ep, en_i_ep=en_i_ep, pri_dim_i_ep=pri_dim_i_ep \
                 , full_dim_i_ep=full_dim_i_ep \
                 , act_model_i_ep=agent.actor_local.state_dict() \
                 , cr_model_i_ep=agent.critic_local.state_dict())

        print('Episode {} ... Score: {:.3f}'.format(i_ep, np.sum(rew_i_ep)))

    # Temporary checkpoints and the sigmas file are no longer needed
    rm_useless_file(actor_ckpt, critic_ckpt, env.file_sigmas)
    return name_run_dir

In [6]:
# Launch training; note that `all_data` receives the path of the run folder
# returned by `run_ddpg`, not the recorded data itself.
all_data = run_ddpg(max_t_step=10, n_episodes=10)

*****CALL RESET******
Action chosen at reset:  [0.]
****CALL STEP****
Action chosen at step:  [[50.072845 56.929455 59.501633]
 [52.250034 34.559227 43.501587]
 [69.3533   58.621216 42.75607 ]
 [63.978046 59.135555 66.0863  ]
 [41.53401  73.24413  40.14331 ]
 [43.737583 64.73474  60.997383]
 [80.35115  49.306587 55.009674]
 [66.11502  55.713226 58.963757]
 [68.96707  71.19537  55.571434]
 [67.822685 38.9048   57.794865]
 [64.07574  53.109283 77.59395 ]
 [43.269043 50.85245  67.81748 ]
 [45.936638 47.437347 53.816032]
 [45.433517 55.54785  73.11075 ]
 [66.68302  61.524677 69.0898  ]
 [55.760532 59.582672 50.89537 ]
 [62.075413 70.3105   62.022034]
 [45.539757 47.263195 46.332912]
 [90.82975  66.02377  69.61709 ]
 [43.456844 66.45541  73.74588 ]
 [61.10525  40.024307 72.54977 ]
 [52.919365 63.2393   46.964615]
 [46.273148 59.848316 64.95668 ]
 [76.48852  70.03212  56.907513]
 [76.57767  53.520897 52.050827]
 [40.087807 70.93245  45.784874]
 [60.35365  81.0475   54.985733]
 [61.4239   42.

With this action the energy is:  -0.0714407
With this action the full dim is:  50  and princip dim is:  46
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0714407
Set reward :  4.50741155949288
****CALL STEP****
Action chosen at step:  [[ 72.74019    61.207417   80.454956 ]
 [ 90.270004   73.301186   41.565544 ]
 [ 37.735565   60.333717   48.091087 ]
 [ 22.391567   30.91507    65.15376  ]
 [ 36.83303    43.273476   55.788685 ]
 [ 87.93541    75.86522    57.55922  ]
 [ 57.307823   63.56945    69.92623  ]
 [ 15.967724  109.310135   81.05629  ]
 [ 91.52809    30.287537   56.93226  ]
 [ 61.022594   70.27894    54.385574 ]
 [ 74.43622    58.13622    55.60167  ]
 [ 65.449554   74.58558    28.42765  ]
 [ 55.73499    30.658587   64.657425 ]
 [ 68.60836    29.391832   97.702515 ]
 [ 66.507515   55.601875   40.08684  ]
 [ 79.650375   49.15297    53.49122  ]
 [ 40.819607   54.998478   64.34801  ]
 [ 42.785694   51.633114   10.899338 ]
 [ 71.848404   31.013517   51.82

With this action the energy is:  -0.0773305
With this action the full dim is:  50  and princip dim is:  50
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0773305
Set reward :  4.916985092216716
****CALL STEP****
Action chosen at step:  [[ 69.848495   82.43274    88.639946 ]
 [ 86.2413     77.82727    45.393547 ]
 [ 29.938496   44.43492    52.01001  ]
 [ 20.780567   67.04061    79.619354 ]
 [ 25.320467    1.         66.63853  ]
 [ 87.00914   106.14501    58.142666 ]
 [ 64.239944   70.6122     73.95979  ]
 [ 22.239788   70.07058    67.2175   ]
 [ 71.88642    73.969635   22.486969 ]
 [ 73.43607    57.412712   41.543953 ]
 [ 56.94631    42.890015   42.30543  ]
 [ 46.84287    33.264465   65.79669  ]
 [ 36.615925   19.87009    58.945473 ]
 [ 20.036938   16.218582   83.735435 ]
 [ 67.02592    48.747456   25.760368 ]
 [ 79.480484   58.355648   73.838776 ]
 [ 64.27967    24.71337    94.06642  ]
 [ 68.353424   52.793495   20.92744  ]
 [105.9727     64.79233    46.5

With this action the energy is:  -0.101608
With this action the full dim is:  50  and princip dim is:  48
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.101608
Set reward :  6.605229419817478
****CALL STEP****
Action chosen at step:  [[47.910282  65.91817   52.869972 ]
 [38.01597   70.43113   33.705574 ]
 [19.018047  40.813667  61.06211  ]
 [55.706947  92.465706  61.047997 ]
 [69.16368   64.7321    46.046814 ]
 [64.92832   52.98606   58.735188 ]
 [65.27815   54.863953  31.864119 ]
 [52.56764   51.051975  29.429878 ]
 [38.407463  44.287327  35.053123 ]
 [60.441982  78.77763   15.07066  ]
 [58.392765  52.504944  53.09008  ]
 [85.28138   57.29036   57.91792  ]
 [69.89972   28.834358  42.38278  ]
 [60.789753  36.943756  73.0849   ]
 [60.073402  60.290108  50.473232 ]
 [35.24772   81.34396   51.887585 ]
 [60.46126   57.252064  73.61586  ]
 [ 6.6646423 42.71991   39.22461  ]
 [69.434685  37.99357   79.87404  ]
 [42.944702  81.15651   32.737103 ]
 [52.812614  59

With this action the energy is:  -0.075075
With this action the full dim is:  50  and princip dim is:  48
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.075075
Set reward :  4.7601388320621085
****CALL STEP****
Action chosen at step:  [[ 15.61969   80.71357   57.212833]
 [ 35.136936  95.66864   24.288115]
 [ 27.172476  65.403915  57.731422]
 [ 75.94577   99.70474   51.63765 ]
 [ 89.056915  63.963657  44.217133]
 [ 67.55074   89.236435  86.52193 ]
 [ 67.861374  35.583794  39.418705]
 [ 54.207508  95.372604  68.11985 ]
 [ 49.899967  70.84878   55.82368 ]
 [ 79.92824   93.87468   36.978733]
 [ 84.72542   88.41527   44.80714 ]
 [ 64.16576   96.82272   85.396385]
 [ 71.81859   18.81131   35.668213]
 [ 53.02034   66.10429   48.68408 ]
 [ 60.475224 110.        62.256935]
 [ 48.85979   67.18933   90.73338 ]
 [ 45.552635  65.71806   53.967186]
 [ 44.772953  20.018223  76.69427 ]
 [ 69.29628   77.025604  58.59526 ]
 [ 64.45555   57.918575  37.5013  ]
 [ 44.873905  

With this action the energy is:  -0.0180571
With this action the full dim is:  50  and princip dim is:  37
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0180571
Set reward :  0.7951446503421007
****CALL STEP****
Action chosen at step:  [[20.992783 78.44256  78.59615 ]
 [51.80768  64.77025  81.09013 ]
 [67.30664  74.052444 55.92447 ]
 [58.265533 61.16427  56.479107]
 [50.36339  36.226204 69.77409 ]
 [40.62158  45.025726 72.09288 ]
 [49.741547 44.087208 84.668236]
 [55.046856 60.5913   46.8948  ]
 [57.489033 18.373867 81.74139 ]
 [50.187263 75.94972  41.295498]
 [67.62977  53.46878  70.15514 ]
 [55.768948 29.24283  62.16985 ]
 [41.890213 73.93755  29.8315  ]
 [78.71513  40.580032 44.002846]
 [31.302975 41.35173  38.72    ]
 [76.138214 30.260025 69.24471 ]
 [57.962486 53.081005 63.228607]
 [44.38462  64.0611   51.948196]
 [67.25438  54.55919  37.700733]
 [25.344994 79.42023  63.63482 ]
 [58.080704 64.62561  32.86452 ]
 [61.12751  35.812366 73.212845]
 [49.1

With this action the energy is:  -0.0878133
With this action the full dim is:  50  and princip dim is:  45
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0878133
Set reward :  5.645953381265146
****CALL STEP****
Action chosen at step:  [[ 27.303524   70.16206    99.52907  ]
 [ 62.28585    69.021034   61.478107 ]
 [ 76.0237     72.78197    88.96741  ]
 [ 81.551315   51.46771    32.9349   ]
 [ 66.39257    45.551426   28.21194  ]
 [ 44.335682   46.950348   77.75009  ]
 [ 78.58806    27.778221   59.631866 ]
 [ 73.38089    55.365894   75.162476 ]
 [ 80.20644    32.340996   89.28889  ]
 [ 71.565125   70.91533    34.50335  ]
 [ 65.87747    43.04914    34.068848 ]
 [ 84.69197    13.995144   40.562252 ]
 [ 65.15288    36.362408   38.4019   ]
 [ 67.80929    65.10017    47.87964  ]
 [ 55.51904    39.898777   68.90865  ]
 [ 87.81599    12.5949745  64.56172  ]
 [ 50.861588   41.62982    67.35172  ]
 [ 33.872604   48.694065   58.267597 ]
 [ 44.903744   47.02062    37.3

With this action the energy is:  -0.114917
With this action the full dim is:  50  and princip dim is:  48
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.114917
Set reward :  7.530730140898253
****CALL STEP****
Action chosen at step:  [[ 16.056335  38.426987  49.68719 ]
 [ 57.1328    58.62496   88.55473 ]
 [ 67.14616   42.87083   50.47529 ]
 [ 57.36338   62.735077   1.      ]
 [ 99.39589   19.816853  17.589615]
 [ 44.550762  47.780117  44.05873 ]
 [ 77.5734    44.35186   62.534863]
 [ 34.587597  50.10704   60.090504]
 [ 57.0404    32.83376   78.101326]
 [ 97.105255  64.901024  33.709816]
 [ 62.748802  72.91452   43.353848]
 [ 61.396595  32.665928  74.86796 ]
 [ 41.210167  42.336617  48.767143]
 [ 68.68459   86.580605  62.508904]
 [ 68.078094  71.739265  25.618483]
 [ 60.83779   61.425972  63.756813]
 [ 68.71433   25.853561  60.27902 ]
 [ 30.721867  42.849762  63.538258]
 [ 45.989326  55.149944  59.088673]
 [ 22.276466  45.350315  58.1923  ]
 [ 86.850044  4

With this action the energy is:  -0.118233
With this action the full dim is:  50  and princip dim is:  50
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.118233
Set reward :  7.761323002202923
****CALL STEP****
Action chosen at step:  [[ 17.332489   74.2291     29.345284 ]
 [ 62.123055   67.37998    46.964226 ]
 [ 56.302727   58.58745    67.878174 ]
 [ 30.412794   66.68169    75.12334  ]
 [ 35.44966    35.54499    45.497562 ]
 [ 37.89778    46.14193    68.37188  ]
 [ 35.537422   61.589226   61.556587 ]
 [ 31.225225   40.253506   27.411762 ]
 [ 51.846634   74.21947    28.765205 ]
 [ 86.90165    65.06938    54.91796  ]
 [ 30.193544   46.608814   50.706764 ]
 [ 33.72137    91.12549    50.19951  ]
 [ 56.566055   57.365765   77.05146  ]
 [ 40.393852   93.77403   110.       ]
 [ 47.3722     75.18051    63.959045 ]
 [ 47.70407    30.731651    4.2761154]
 [ 81.8695     23.255562   63.787415 ]
 [ 39.960384   62.620506   40.78875  ]
 [ 61.89573    95.843155   63.887

With this action the energy is:  -0.11306
With this action the full dim is:  50  and princip dim is:  49
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.11306
Set reward :  7.401595356989093
****CALL STEP****
Action chosen at step:  [[  2.2026863  61.848045   40.0209   ]
 [ 72.08954    58.243694   70.932755 ]
 [ 81.0885     85.107544   46.850372 ]
 [ 40.517982   52.51181    49.150955 ]
 [ 12.157299   28.152752   59.70372  ]
 [ 37.690094   39.450233   79.33539  ]
 [ 70.650345   65.41341    51.350895 ]
 [ 19.05545    64.96241    36.189484 ]
 [ 54.25074    55.53051    70.129944 ]
 [ 69.16579    45.704327   69.44194  ]
 [ 22.998043   89.0882     79.28713  ]
 [ 21.485138   55.13374    40.65336  ]
 [ 31.961199   55.566135   58.90017  ]
 [ 47.48217    74.64775    34.4835   ]
 [ 53.62851    61.619884   56.39761  ]
 [ 39.820564   56.602924   26.704977 ]
 [ 95.78371    42.97498    71.19353  ]
 [ 53.030483   38.530415   46.620472 ]
 [ 69.808205   60.810047   50.95828

With this action the energy is:  -0.0396197
With this action the full dim is:  50  and princip dim is:  43
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0396197
Set reward :  2.294596288209436
****CALL STEP****
Action chosen at step:  [[ 45.77464   85.27327   58.122307]
 [ 44.27574   72.04227   57.82903 ]
 [ 45.969894  73.20416   60.45958 ]
 [ 65.01494   26.55652   79.56193 ]
 [ 49.30137   53.563374  46.478493]
 [ 62.75265   46.102715  50.296608]
 [ 82.4046    77.055695  73.03488 ]
 [ 78.79349   51.232254  44.105892]
 [ 56.994934 103.66077   51.969078]
 [ 78.54452   72.53029   46.48191 ]
 [ 71.214714  56.592342 102.893906]
 [ 51.48192   42.99917   74.257   ]
 [ 23.106792  51.796352  48.0178  ]
 [ 73.355804  31.53064   44.410088]
 [ 67.32661   75.120575  49.799263]
 [ 89.38994   45.04992   57.838547]
 [ 68.44546   43.893158  62.9057  ]
 [ 52.554146  53.217308  46.795403]
 [ 71.78032   77.05659   31.879316]
 [ 76.5987    52.82066   21.189556]
 [ 44.553383 

With this action the energy is:  -0.0702215
With this action the full dim is:  50  and princip dim is:  47
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0702215
Set reward :  4.422629045469161
****CALL STEP****
Action chosen at step:  [[ 46.247154  51.986282  62.955124]
 [ 55.94912  110.        49.803352]
 [ 64.37976   41.296055  27.51328 ]
 [ 78.68317   72.51102  101.312   ]
 [ 52.231087  57.815136  15.439339]
 [ 54.21859   46.680008  51.20625 ]
 [ 57.9498    75.20123   67.72286 ]
 [ 62.661514  38.14126   32.945236]
 [ 22.4371    96.93266   59.924168]
 [ 65.06052   66.01659   43.77334 ]
 [ 77.865524  81.979195  89.25893 ]
 [ 49.36729   45.775204  74.76061 ]
 [ 40.85084   57.15759   74.076584]
 [ 94.01547   28.005142  56.95895 ]
 [ 66.18031   50.609     38.987022]
 [ 61.043346  55.792934  38.861374]
 [ 42.595894  50.276405  74.670685]
 [ 62.09044   67.025246  45.861946]
 [ 42.437256  88.05231   34.94645 ]
 [ 59.474174  56.212746  24.049845]
 [ 59.04688  

With this action the energy is:  -0.0561059
With this action the full dim is:  50  and princip dim is:  38
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0561059
Set reward :  3.4410377930469043
****CALL STEP****
Action chosen at step:  [[47.42683  50.02324  67.85418 ]
 [57.75377  76.66686  45.049232]
 [43.625336 54.26181  21.266071]
 [49.88846  63.001873 57.179   ]
 [53.600754 53.672604 71.28042 ]
 [71.16524  55.687363 72.946335]
 [70.19331  53.488483 67.70409 ]
 [56.08513  41.22137  28.929344]
 [55.589127 66.960655 44.15433 ]
 [30.847214 76.114845 66.1125  ]
 [56.62727  38.198288 79.78675 ]
 [60.678566 79.31768  61.460716]
 [57.249096 18.347713 47.822475]
 [52.73028  44.938267 58.257465]
 [60.201176 55.82227  45.868732]
 [43.160156 58.9696   51.4728  ]
 [60.636658 47.889614 94.72384 ]
 [54.310707 41.81595  72.46135 ]
 [48.819824 47.617607 64.16568 ]
 [78.213905 58.823498 77.24002 ]
 [36.517746 31.874683 41.50035 ]
 [62.905453 65.54685  18.952919]
 [30.6

With this action the energy is:  -0.0892454
With this action the full dim is:  50  and princip dim is:  48
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0892454
Set reward :  5.74554084708976
****CALL STEP****
Action chosen at step:  [[ 35.34579    77.04826    77.71465  ]
 [ 94.57201    73.76883    89.91095  ]
 [ 47.541664   70.95756     7.9523697]
 [ 65.53087    74.82608    65.27841  ]
 [ 43.998016   69.895874   85.1136   ]
 [ 50.334045   30.691278   35.013832 ]
 [ 27.153273   33.074333   68.50562  ]
 [ 59.252213   46.98806    34.132126 ]
 [ 41.63136    60.552612   66.02846  ]
 [ 39.5708     84.3603     94.27421  ]
 [ 59.94709    36.948074   77.68582  ]
 [ 52.523754   52.13815    56.624496 ]
 [ 64.642105   42.56427    26.936333 ]
 [ 63.68023    65.94727    71.99799  ]
 [ 20.900845   37.907425   60.27242  ]
 [ 81.15692    73.138916   52.23241  ]
 [ 63.87976    54.16146   109.234856 ]
 [ 82.63294    67.65931    65.55826  ]
 [ 45.267944   39.99969    67.49

With this action the energy is:  -0.103711
With this action the full dim is:  50  and princip dim is:  47
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.103711
Set reward :  6.751470911772762
Episode 5 ... Score: 57.247
*****CALL RESET******
Action chosen at reset:  [0.]
****CALL STEP****
Action chosen at step:  [[28.140089 73.27046  67.75143 ]
 [50.526012 69.15374  59.761517]
 [54.360878 67.16387  45.56038 ]
 [39.23943  52.13904  47.006042]
 [62.36241  45.589325 63.248493]
 [58.649536 49.04778  48.4026  ]
 [37.222073 59.033035 48.427727]
 [56.42794  63.95592  58.54399 ]
 [79.95308  56.38743  48.425896]
 [66.51749  74.66918  51.515583]
 [55.603798 74.46088  43.331573]
 [54.842247 81.70694  70.68673 ]
 [39.57959  44.55781  68.72674 ]
 [65.53902  57.69315  72.95575 ]
 [54.893913 50.25094  63.729877]
 [85.38974  39.35441  51.109562]
 [92.91     45.08041  60.97798 ]
 [49.674797 51.126743 48.44167 ]
 [48.55724  45.369274 46.252632]
 [42.629486 70.03603  77.411

With this action the energy is:  -0.0714204
With this action the full dim is:  50  and princip dim is:  50
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0714204
Set reward :  4.505999908381756
****CALL STEP****
Action chosen at step:  [[ 23.59202    86.66342    72.997665 ]
 [ 23.933697   55.08296    35.81888  ]
 [ 44.447136   77.60487    50.535492 ]
 [ 22.187836   70.288246   44.41737  ]
 [ 30.456675   36.412468   35.475296 ]
 [ 54.269176   24.596334   72.46361  ]
 [ 42.54218    50.89944    49.81328  ]
 [ 16.139229   76.12677    52.29614  ]
 [ 94.69997    59.0392     64.25237  ]
 [ 83.55198    93.75946    46.833237 ]
 [ 48.540752   51.81854    94.91573  ]
 [ 66.912865   68.21658    80.12556  ]
 [ 53.908165   45.376495   52.010025 ]
 [ 48.559967   30.683586   76.675186 ]
 [ 44.968807   69.109276   43.787994 ]
 [105.67534    46.33055    41.4205   ]
 [ 54.154778    1.         42.356102 ]
 [ 12.697548   64.7495     11.403973 ]
 [ 60.304585   49.572643   57.8

With this action the energy is:  -0.131845
With this action the full dim is:  50  and princip dim is:  48
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.131845
Set reward :  8.707894180755144
****CALL STEP****
Action chosen at step:  [[  6.0508003  73.046104   77.84151  ]
 [ 86.844955   39.07952    46.289524 ]
 [ 42.372288   87.988594   22.557114 ]
 [ 36.342346   54.764687   33.595894 ]
 [  1.         64.82914    41.933002 ]
 [ 59.987335   52.7349     75.91823  ]
 [ 25.60148    52.601654   50.744675 ]
 [ 14.854649   59.353046   29.05082  ]
 [ 90.825645   53.53559    29.38487  ]
 [ 48.86145    58.571434   60.602722 ]
 [ 71.129486   63.110382  105.482315 ]
 [ 43.34409    72.22865    66.28515  ]
 [ 45.85852    52.270966   38.87253  ]
 [ 64.85597    41.26127    67.65553  ]
 [ 37.634605   74.62972    20.131367 ]
 [ 87.70145    29.651459   57.08627  ]
 [ 66.16134    32.448204   52.020775 ]
 [  9.2709465  50.76139    18.918182 ]
 [ 58.609913   47.311897   82.944

With this action the energy is:  -0.0627326
With this action the full dim is:  50  and princip dim is:  47
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0627326
Set reward :  3.901854956499154
****CALL STEP****
Action chosen at step:  [[ 38.094917  52.247414  70.83754 ]
 [ 38.61866   76.199615  48.626404]
 [ 60.470978  65.8865    50.96228 ]
 [ 49.269375  46.34501   67.090355]
 [ 67.43738   26.898117  63.71873 ]
 [ 26.214262  62.619854  19.898422]
 [ 47.68953   50.10518   53.23657 ]
 [ 40.917328  46.53991   20.98687 ]
 [ 55.380188  70.95715   79.531265]
 [ 91.45314   43.73536   61.188267]
 [ 66.02844   35.021873  51.925266]
 [ 40.113766  58.591072  69.632645]
 [ 78.584496  52.280155  78.98946 ]
 [ 95.04576   53.970055  61.918503]
 [ 35.412247  45.555103  43.71839 ]
 [ 42.27081   27.528662  49.804565]
 [ 72.61566   28.454994  50.719788]
 [ 55.324726  60.61552   20.02269 ]
 [ 92.40458   50.47179   80.278496]
 [ 28.875721  90.42136   78.87851 ]
 [ 56.19875  

With this action the energy is:  -0.0831642
With this action the full dim is:  50  and princip dim is:  49
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0831642
Set reward :  5.322657461032475
****CALL STEP****
Action chosen at step:  [[ 15.854046   56.65345    50.23858  ]
 [ 73.74382    66.17758    79.838554 ]
 [ 69.13476    50.698463   54.85597  ]
 [ 67.94118    61.37014    60.92004  ]
 [ 30.172718   45.569046   48.08433  ]
 [ 36.02253    82.6145     47.0219   ]
 [ 53.84356    30.776426   55.1204   ]
 [ 58.22171    62.13626    51.37891  ]
 [ 57.54416    58.128414   51.97753  ]
 [110.         77.21268    55.101173 ]
 [ 78.62054    60.344208   36.257454 ]
 [ 54.680084   53.384163   66.54803  ]
 [ 77.86142    74.39475    29.323923 ]
 [ 76.26199    62.00355    65.0444   ]
 [ 28.302471  100.38871    36.398155 ]
 [ 50.289097   39.24777    41.74059  ]
 [110.          9.685875   54.578815 ]
 [ 46.072113   40.84681    11.041183 ]
 [ 47.85792    27.6406     69.8

With this action the energy is:  -0.105066
With this action the full dim is:  50  and princip dim is:  50
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.105066
Set reward :  6.84569688495365
Episode 7 ... Score: 53.010
*****CALL RESET******
Action chosen at reset:  [0.]
****CALL STEP****
Action chosen at step:  [[ 54.4631    75.16336   71.71941 ]
 [ 53.575043  70.13186   55.47815 ]
 [ 37.53122   53.9954    61.62355 ]
 [ 41.331947  73.03727   70.95475 ]
 [ 53.56985   63.365074  56.316547]
 [ 62.406395  53.77362   26.053854]
 [ 43.559994  62.713764  43.24096 ]
 [ 55.170692  15.55933   58.886063]
 [ 60.798256  65.91675   70.897095]
 [ 63.076283  51.690796  48.013245]
 [ 57.367085  50.49729   60.08862 ]
 [ 41.18273   84.26447   70.43283 ]
 [ 55.531334  41.61033   47.29569 ]
 [ 55.44496   67.89206   66.33828 ]
 [ 56.566074  50.462914  45.70279 ]
 [ 74.910904  32.916245  45.319508]
 [ 62.082367  39.11413   49.662964]
 [ 38.216145  55.956726  37.36086 ]
 [ 48.96

With this action the energy is:  -0.093389
With this action the full dim is:  50  and princip dim is:  50
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.093389
Set reward :  6.033684568471554
****CALL STEP****
Action chosen at step:  [[ 39.109013   53.976864   51.72052  ]
 [ 60.463585   67.03827    45.601757 ]
 [ 33.925926   45.634804   46.147926 ]
 [ 40.983986   78.46034    73.51695  ]
 [ 30.785715    6.4982452  59.766426 ]
 [ 75.72879    51.039913   18.193214 ]
 [ 42.06144    63.27906    23.641085 ]
 [ 60.993332   18.736485   28.465555 ]
 [ 51.326385   93.58705    51.855804 ]
 [ 36.33253    61.954704   80.45743  ]
 [ 49.208954   62.060104   67.65469  ]
 [ 46.823708   55.780453   49.05209  ]
 [ 53.979813   24.813702   54.12896  ]
 [ 58.76335    71.9299     61.532104 ]
 [ 82.29196    82.44205    61.13601  ]
 [ 53.679787   25.43415    47.3871   ]
 [ 62.24784    52.68949    24.507132 ]
 [ 47.798576   22.868404   47.688744 ]
 [ 68.68362    66.81532    61.056

With this action the energy is:  -0.127659
With this action the full dim is:  50  and princip dim is:  50
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.127659
Set reward :  8.416801986116619
****CALL STEP****
Action chosen at step:  [[ 66.630974  64.313705  87.56597 ]
 [ 32.693703  58.728058  66.94182 ]
 [ 28.547619  26.815441  47.175076]
 [ 23.899454  46.926125  77.12244 ]
 [ 20.450916  32.529922  39.61525 ]
 [ 55.4041    43.975746  39.828262]
 [ 36.790466  64.02343   41.501568]
 [ 43.623173  41.28737   28.375393]
 [ 52.100292  74.03333   51.941654]
 [ 31.807579  55.981476  91.424576]
 [101.741745  25.616884  93.73393 ]
 [ 37.450085  56.339252  78.44215 ]
 [ 64.41364   21.26841   67.806946]
 [ 60.780655  91.403305  58.84205 ]
 [ 70.57737   83.62318   29.937164]
 [ 57.244003  17.67643   31.57415 ]
 [ 93.73443   54.309593  12.768738]
 [ 34.12267   39.501827  68.77689 ]
 [ 81.504105  35.91239   75.16913 ]
 [ 31.069307  63.867027  49.61257 ]
 [ 24.917725  7

With this action the energy is:  -0.076749
With this action the full dim is:  50  and princip dim is:  48
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.076749
Set reward :  4.876547894132077
****CALL STEP****
Action chosen at step:  [[35.010094  86.19047   94.933716 ]
 [51.14501   92.28937   46.418373 ]
 [50.65117   81.17784   70.91638  ]
 [75.75331   52.656036  69.93029  ]
 [83.11556   12.238152  40.29245  ]
 [73.41055   49.59772   58.110146 ]
 [51.573437  61.714584  50.42079  ]
 [19.916782  33.493313  75.67859  ]
 [67.40957   37.41964   29.220104 ]
 [25.294058  66.70994   74.10732  ]
 [50.350925  44.634087  92.17368  ]
 [66.08438   68.71976   66.69591  ]
 [74.203354  45.64144   41.0497   ]
 [81.44327   96.1172    87.3165   ]
 [86.47322   70.174255  28.319584 ]
 [47.64692   36.29316   15.2960205]
 [85.040405  62.611984  41.77728  ]
 [26.24085   78.425446  36.01673  ]
 [58.366432  73.69209   67.74269  ]
 [37.29825   64.01653   53.475803 ]
 [37.357708  57

With this action the energy is:  -0.0773378
With this action the full dim is:  50  and princip dim is:  50
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0773378
Set reward :  4.917492730301013
****CALL STEP****
Action chosen at step:  [[ 25.574934  98.494705  90.11548 ]
 [ 42.714184  66.33938   67.634926]
 [ 26.546963  65.84837   99.54618 ]
 [ 72.38805   52.500546  51.832405]
 [ 68.492966  23.794128  13.990364]
 [ 55.5278    50.38293   77.95918 ]
 [ 54.391     50.155766  62.379986]
 [ 23.3373     9.202427  65.900444]
 [110.        40.545444  38.948326]
 [ 33.98884   84.97124   52.738598]
 [ 44.445595  34.37535  110.      ]
 [ 84.67518   69.008446  47.740486]
 [109.02443   44.491646   9.716888]
 [ 85.994385  65.37996   80.81267 ]
 [ 96.37155   73.97499   18.176441]
 [ 51.021652  17.467255   1.      ]
 [ 99.125656  50.454494  29.426916]
 [ 40.779724  54.30432   34.910423]
 [ 66.10927   52.996338  40.1663  ]
 [ 29.643993  61.827736  75.87224 ]
 [  6.250538 

With this action the energy is:  -0.136924
With this action the full dim is:  50  and princip dim is:  50
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.136924
Set reward :  9.061085116390373
Episode 9 ... Score: 65.008


## Random search as in original SVM

In [None]:
# Random-search baseline: sample random actions until the environment reports
# the episode is done, recording the cumulative reward after every step.
state = env.reset()
scores = []      # running total of the reward after each step
step = 0
score = 0.0
done = False

while not done:
    print(".....STEP.....", step)
    action = env.action_space.sample()
    next_state, reward, done, info = env.step(action)
    step += 1
    score += reward
    scores.append(score)
    state = next_state