# Stochastic Variational Method with RL algorithms

In [7]:
import numpy as np
import gym
import torch
import subprocess
import os
import pickle

## Exploring the environment

In [8]:
# Build the SVM environment and print its basic characteristics.
env = gym.make(
    'svm_env:svmEnv-v2',
    n_pairs=3,
    n_basis=50,
    file_sigmas="./svmCodeSVD/sigmas.dat",
)
print('### Env Name ######', env.unwrapped.spec.id)

# Observation space and the size of its last axis (used as the agents' state size).
obs_space = env.observation_space
print('###### Observation space ####### \n', obs_space)

state_size = obs_space.shape[-1]
print('###### Size of observation space ####### \n', state_size)

# Action space shape and the flattened number of action components
# (first axis times last axis), used as the agents' action size.
act_space = env.action_space.shape
print('###### Action space ####### \n', act_space)

act_size = act_space[0] * act_space[-1]
print('###### Number of actions ####### \n', act_size)

# Reset once to look at the initial state returned by the environment.
state = env.reset()
print('##### State after reset ###### \n', state)

print('##### File where will be stored sigmas \n', env.file_sigmas)



### Env Name ###### svmEnv-v2
###### Observation space ####### 
 Box(-inf, inf, (1,), float32)
###### Size of observation space ####### 
 1
###### Action space ####### 
 (50, 3)
###### Number of actions ####### 
 150
*****CALL RESET******
Action chosen at reset:  [0.]
##### State after reset ###### 
 [0.]
##### File where will be stored sigmas 
 ./svmCodeSVD/sigmas.dat


# Your codes `DDPG` and `PPO`

## Functions for saving results and cleaning up (`ddpg` algorithm)

In [9]:
## Save all rewards, energies and princip dims in files during episode training
def create_run_fold_and_info_ddpg(agent, env):
    """Create a new run folder and store the DDPG hyper-parameters in it.

    The folder is the first ``runs_optim_envs/run_{i}/`` (i = 0, 1, 2, ...)
    that does not exist yet. A dictionary describing the algorithm, the
    environment id, the basis size and the agent's hyper-parameters is pickled
    to ``info.p`` inside that folder.

    Parameters
    ----------
    agent : object
        Must expose ``name``, ``batch_size``, ``bootstrap_size``, ``gamma``,
        ``tau``, ``lr_critic``, ``lr_actor``, ``update_every``,
        ``transfer_every``, ``num_update`` and ``add_noise_every``.
    env : object
        Must expose ``n_basis`` and ``unwrapped.spec.id``.

    Returns
    -------
    str
        Path of the created folder, including the trailing ``/`` (callers
        build file names by plain string concatenation).

    Raises
    ------
    OSError
        If the folder or ``info.p`` cannot be created.
    """
    # Find the first unused run index and create its folder
    i = 0
    while os.path.exists(f'runs_optim_envs/run_{i}/'):
        i += 1
    name_dir = f'runs_optim_envs/run_{i}/'
    os.makedirs(name_dir)

    # Collect everything needed to identify this run later
    info = {'alg':agent.name, 'env':env.unwrapped.spec.id , 'basis_size':env.n_basis \
            , 'batch_size':agent.batch_size, 'bootstrap_size':agent.bootstrap_size \
            , 'gamma':agent.gamma, 'tau':agent.tau,'lr_critic':agent.lr_critic \
            , 'lr_actor':agent.lr_actor, 'update_every':agent.update_every \
            , 'transfer_every':agent.transfer_every, 'num_update':agent.num_update \
            , 'add_noise_every':agent.add_noise_every}

    # Context manager guarantees the file is flushed and closed
    with open(name_dir + 'info.p', 'wb') as info_file:
        pickle.dump(info, info_file)
    return name_dir
    
def save_all(name_run_dir, i_ep, sigmas_i_ep, rew_i_ep, en_i_ep, pri_dim_i_ep \
             , full_dim_i_ep, act_model_i_ep, cr_model_i_ep):
    """Pickle the data collected during one episode into the run folder.

    Each argument is written to its own file named ``<prefix>_<i_ep>.p``, with
    prefixes ``sigmas``, ``rew``, ``en``, ``pri_dim``, ``full_dim``,
    ``act_model`` and ``cr_model``.

    Parameters
    ----------
    name_run_dir : str
        Folder path used as a plain string prefix, so it must end with a
        path separator.
    i_ep : int
        Episode index used in the file names.
    sigmas_i_ep, rew_i_ep, en_i_ep, pri_dim_i_ep, full_dim_i_ep : picklable
        Objects to store.
    act_model_i_ep, cr_model_i_ep : picklable
        Stored as given. NOTE(review): the training loops in this notebook pass
        the checkpoint file *paths* here, so only the path strings are pickled,
        not the network weights. Confirm whether that is intended.

    Raises
    ------
    OSError
        If a file cannot be written.
    """
    # Insertion order keeps the files written in the same order as before
    payloads = {
        'sigmas': sigmas_i_ep,
        'rew': rew_i_ep,
        'en': en_i_ep,
        'pri_dim': pri_dim_i_ep,
        'full_dim': full_dim_i_ep,
        'act_model': act_model_i_ep,
        'cr_model': cr_model_i_ep,
    }
    for prefix, payload in payloads.items():
        # Close every file deterministically instead of leaking the handle
        with open(name_run_dir + f'{prefix}_{i_ep}.p', 'wb') as out_file:
            pickle.dump(payload, out_file)
    
def rm_useless_file(actor_model_file, critic_model_file, file_sigmas):
    """Delete the actor checkpoint, the critic checkpoint and the sigmas file, in that order.

    Any error raised by ``os.remove`` (e.g. a missing file) propagates to the caller.
    """
    for stale_path in (actor_model_file, critic_model_file, file_sigmas):
        os.remove(stale_path)

## From my `ddpg_agent.py` code

In [None]:
# Build the DDPG agent from the sizes computed in the environment-exploration cell
# (`state_size` and `act_size`).
# NOTE(review): the PPO section further down rebinds the global name `agent`, and
# run_ddpg() reads that global. Re-run this cell before calling run_ddpg() if the
# PPO cell has been executed in the same kernel.
from ddpg_agent import DDPG_agent
agent = DDPG_agent(state_size, act_size, seed = 0)

In [None]:
## Run ddpg algs   
def run_ddpg(max_t_step = 10, n_episodes=10):
    """Train the global DDPG ``agent`` on the global ``env`` and store per-episode data.

    For every episode the environment and the agent are reset, then up to
    ``max_t_step`` interactions are performed (stopping early when the
    environment reports ``done``). After each episode the collected actions,
    rewards, energies (first state component), principal dimensions and full
    dimensions are pickled into the run folder via ``save_all``.

    Parameters
    ----------
    max_t_step : int
        Maximum number of environment steps per episode.
    n_episodes : int
        Number of episodes to run.

    Returns
    -------
    str
        Path of the run folder created for this training run.

    Notes
    -----
    Reads the module-level names ``agent`` and ``env``.
    NOTE(review): ``save_all`` receives the checkpoint file *paths*, so only the
    path strings are pickled and the checkpoint files themselves are deleted
    at the end. Confirm whether the model weights should be stored instead.
    """
    # Create the run folder and its info.p (pickle) describing alg and hyper-parameters.
    # The DDPG-specific helper is the one defined in this notebook; the
    # un-suffixed name used before does not exist and raised a NameError.
    name_run_dir = create_run_fold_and_info_ddpg(agent, env)

    for i_ep in range(n_episodes):
        state = env.reset()
        agent.reset()
        rew_i_ep = []
        en_i_ep = []
        pri_dim_i_ep = []
        full_dim_i_ep = []
        action_i_episode = []

        ## Training loop of each episode
        for t_step in range(max_t_step):
            action = agent.act(state)
            # The environment receives the action as an (n_basis, n_pairs) array
            next_state, reward, done, info = env.step(action.reshape((env.n_basis,env.n_pairs)))
            agent.step(state, action, reward, next_state, done)
            state = next_state

            # Save rew, energies, princip dims, act and crit models
            action_i_episode.append(action.reshape((env.n_basis,env.n_pairs)))
            rew_i_ep.append(reward)
            en_i_ep.append(state[0])
            pri_dim_i_ep.append(env.princp_dim)
            full_dim_i_ep.append(env.full_dim)
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            if done:
                break

        ## Save data during training (to not lose the work done)
        save_all(name_run_dir=name_run_dir, i_ep=int(i_ep), sigmas_i_ep=action_i_episode \
                 , rew_i_ep=rew_i_ep, en_i_ep=en_i_ep, pri_dim_i_ep=pri_dim_i_ep \
                 , full_dim_i_ep=full_dim_i_ep, act_model_i_ep='checkpoint_actor.pth' \
                 , cr_model_i_ep='checkpoint_critic.pth')

        print('Episode {} ... Score: {:.3f}'.format(i_ep, np.sum(rew_i_ep)))

    # Temporary checkpoint files and the env's sigmas file are removed once training ends
    rm_useless_file('checkpoint_actor.pth', 'checkpoint_critic.pth', env.file_sigmas)
    return name_run_dir

In [None]:
# Runs 10 episodes of at most 10 steps each. run_ddpg returns the path of the
# run folder it created, so `all_data` holds a directory name rather than the
# collected data itself.
all_data = run_ddpg(10, 10)

## Functions for saving results and cleaning up (`ppo` algorithm)

In [10]:
## Save all rewards, energies and princip dims in files during episode training
def create_run_fold_and_info_ppo(agent, env):
    """Create a new run folder and store the PPO hyper-parameters in it.

    The folder is the first ``runs_optim_envs/run_{i}/`` (i = 0, 1, 2, ...)
    that does not exist yet. A dictionary describing the algorithm, the
    environment id, the basis size and the agent's hyper-parameters is pickled
    to ``info.p`` inside that folder.

    Parameters
    ----------
    agent : object
        Must expose ``name``, ``lambda_gae``, ``gamma``, ``clip``,
        ``lr_critic``, ``lr_actor``, ``num_update`` and ``add_noise_every``.
    env : object
        Must expose ``n_basis`` and ``unwrapped.spec.id``.

    Returns
    -------
    str
        Path of the created folder, including the trailing ``/`` (callers
        build file names by plain string concatenation).

    Raises
    ------
    OSError
        If the folder or ``info.p`` cannot be created.
    """
    # Find the first unused run index and create its folder
    i = 0
    while os.path.exists(f'runs_optim_envs/run_{i}/'):
        i += 1
    name_dir = f'runs_optim_envs/run_{i}/'
    os.makedirs(name_dir)

    # Collect everything needed to identify this run later
    info = {'alg':agent.name, 'env':env.unwrapped.spec.id , 'basis_size':env.n_basis \
            , 'lambda_gae':agent.lambda_gae ,'gamma':agent.gamma \
            , 'clip':agent.clip,'lr_critic':agent.lr_critic \
            , 'lr_actor':agent.lr_actor, 'num_update':agent.num_update \
            , 'add_noise_every':agent.add_noise_every}

    # Context manager guarantees the file is flushed and closed
    with open(name_dir + 'info.p', 'wb') as info_file:
        pickle.dump(info, info_file)
    return name_dir

## From my `ppo_agent.py` code

In [11]:
# Build the PPO agent from the sizes computed in the environment-exploration cell
# (`state_size` and `act_size`).
# NOTE(review): this rebinds the global name `agent`, which the DDPG section
# above also uses; run_ppo() reads whatever `agent` is bound when it is called.
from ppo_agent import PPO_agent
agent = PPO_agent(state_size, act_size, seed = 0)

In [12]:
def _split_per_traj(flat_seq, len_trajs):
    """Cut a trajectory-concatenated sequence back into one chunk per trajectory."""
    chunks, start = [], 0
    for n_steps in len_trajs:
        chunks.append(flat_seq[start:start + n_steps])
        start += n_steps
    return chunks


def run_ppo(num_episodes = 1, num_trajs = 2, length_traj = 3):
    """Train the global PPO ``agent`` on the global ``env`` and store per-iteration data.

    Each of the ``num_episodes`` iterations collects ``num_trajs`` trajectories
    of at most ``length_traj`` steps (a trajectory stops early when the
    environment reports ``done``), performs one ``agent.step`` learning update
    on the collected batch, and pickles the batch into the run folder via
    ``save_all``.

    Parameters
    ----------
    num_episodes : int
        Number of collect-and-update iterations.
    num_trajs : int
        Number of trajectories collected per iteration.
    length_traj : int
        Maximum number of environment steps per trajectory.

    Returns
    -------
    str
        Path of the run folder created for this training run.

    Notes
    -----
    Reads the module-level names ``agent`` and ``env``.

    Saved layout: when every trajectory reaches ``length_traj`` steps the data
    is stored with a leading ``(num_trajs, length_traj)`` shape. If at least
    one trajectory ended early, that rectangular layout is impossible and the
    data is stored as a list with one entry per trajectory instead.

    NOTE(review): ``save_all`` receives the checkpoint file *paths*, so only the
    path strings are pickled and the checkpoint files themselves are deleted
    right afterwards. Confirm whether the model weights should be stored instead.
    """
    name_run_dir = create_run_fold_and_info_ppo(agent, env)
    for k in range(num_episodes):
        ## Data for trajectories
        trajs_states = []
        trajs_acts = []
        all_rews = []
        trajs_pri_dim = []
        trajs_full_dim = []
        trajs_log_pol = []
        len_trajs = []

        ## Run to collect trajs for a maximum of length_traj
        for i in range(num_trajs):
            ## Episodic data. Keeps track of rewards per traj
            print(f'##### {i}th Traj #####')
            ep_rews = []
            state = env.reset()

            for _ in range(length_traj):
                # Track observations in this batch
                trajs_states.append(state)

                # Calculate action and log policy and perform a step of the env
                action, log_policy = agent.act(state)
                state, reward, done, info = env.step(action.reshape((env.n_basis,env.n_pairs)))

                # Track recent reward, action, and action log policy, pri dim, full dim
                ep_rews.append(reward)
                trajs_acts.append(action)
                trajs_log_pol.append(log_policy)
                trajs_pri_dim.append(env.princp_dim)
                trajs_full_dim.append(env.full_dim)

                if done:
                    break

            # One reward is recorded per executed step, so this is the trajectory
            # length; unlike the loop index it is also defined when length_traj == 0.
            len_trajs.append(len(ep_rews))
            all_rews.append(ep_rews)

        # Reshape data as tensors. Stacking the per-step arrays with NumPy first
        # avoids torch's slow list-of-ndarrays path (and the warning it emits).
        trajs_states = torch.tensor(np.array(trajs_states), dtype=torch.float)
        trajs_acts = torch.tensor(np.array(trajs_acts), dtype=torch.float)
        trajs_log_pol = torch.tensor(trajs_log_pol, dtype=torch.float)

        # Run step for learning
        agent.step(trajs_states, trajs_acts, trajs_log_pol, all_rews, len_trajs)
        torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
        torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')

        # Arrange the batch per trajectory for saving
        if all(n_steps == length_traj for n_steps in len_trajs):
            # Every trajectory is complete: keep the rectangular layout
            sigmas_to_save = trajs_acts.reshape((num_trajs,length_traj,env.n_basis,env.n_pairs))
            en_to_save = trajs_states.reshape((num_trajs,length_traj))
            pri_dim_to_save = np.reshape(trajs_pri_dim, (num_trajs,length_traj))
            full_dim_to_save = np.reshape(trajs_full_dim, (num_trajs,length_traj))
        else:
            # Some trajectory stopped early: a fixed-shape reshape would raise,
            # so store one (variable-length) entry per trajectory instead
            sigmas_to_save = [chunk.reshape((-1, env.n_basis, env.n_pairs))
                              for chunk in _split_per_traj(trajs_acts, len_trajs)]
            en_to_save = [chunk.reshape(-1)
                          for chunk in _split_per_traj(trajs_states, len_trajs)]
            pri_dim_to_save = [np.asarray(chunk)
                               for chunk in _split_per_traj(trajs_pri_dim, len_trajs)]
            full_dim_to_save = [np.asarray(chunk)
                                for chunk in _split_per_traj(trajs_full_dim, len_trajs)]

        # Save energies (states), sigmas (actions), rew, pri dim, full dim
        # actor, critic models
        save_all(name_run_dir=name_run_dir, i_ep=int(k), \
                sigmas_i_ep=sigmas_to_save, \
                rew_i_ep=all_rews, \
                en_i_ep=en_to_save,\
                pri_dim_i_ep=pri_dim_to_save, \
                full_dim_i_ep=full_dim_to_save, \
                act_model_i_ep='checkpoint_actor.pth', \
                cr_model_i_ep='checkpoint_critic.pth')

        rm_useless_file('checkpoint_actor.pth', 'checkpoint_critic.pth', env.file_sigmas)

        # Calculate metrics to print
        avg_iter_lens = np.mean(len_trajs)
        avg_iter_retur = np.mean([np.sum(ep_rews) for ep_rews in all_rews])

        # Print logging statements
        print(flush=True)
        print(f"-------------------- Iteration #{agent.k_step} --------------------", flush=True)
        print(f"Average Episodic Length: {avg_iter_lens}", flush=True)
        print(f"Average Episodic Return: {avg_iter_retur}", flush=True)
        print(f"Timesteps So Far: {agent.t_step}", flush=True)
        print(f"------------------------------------------------------", flush=True)
        print(flush=True)
    return name_run_dir

In [13]:
# Run PPO with the default settings (1 iteration, 2 trajectories of at most 3 steps)
# and keep the path of the run folder that run_ppo returns.
name_dir_ppo = run_ppo()

##### 0th Traj #####
*****CALL RESET******
Action chosen at reset:  [0.]
****CALL STEP****
Action chosen at step:  [[110.         78.13669    37.84396  ]
 [  0.         77.65255    63.174236 ]
 [ 55.025597  110.          7.86932  ]
 [  0.         88.853485   63.706417 ]
 [  2.183628   56.061157   96.90524  ]
 [ 52.28917    74.58578    50.459785 ]
 [ 44.00922     1.8432693  47.56466  ]
 [  8.763382   32.694794    9.682529 ]
 [ 19.278801   82.059006  106.47586  ]
 [  0.         33.766212   65.775894 ]
 [ 77.65105    64.08187    50.381027 ]
 [  6.21056    29.013102   60.22737  ]
 [ 62.6778     59.867085   12.85672  ]
 [ 67.54057   109.179       0.       ]
 [  0.308918  110.        110.       ]
 [  1.4039307  73.65468    28.494665 ]
 [  5.132839   57.072334   38.386787 ]
 [ 32.023056   42.32032    58.1405   ]
 [ 75.91939    39.13339    57.515976 ]
 [105.44864    96.49936    83.87012  ]
 [  3.453949   31.464834   27.507957 ]
 [ 68.860565  110.         60.41141  ]
 [ 64.84862    91.875786   

With this action the energy is:  -0.0963499
With this action the full dim is:  33  and princip dim is:  33
#### THE ACTION IS A GOOD ONE ####
**** THE AGENT STATE IS THE ENERGY **** -0.0963499
Set reward :  6.239583966251107
****CALL STEP****
Action chosen at step:  [[ 14.684887   85.512215   80.397736 ]
 [104.491745   73.52156    28.623278 ]
 [ 20.096931  107.465805   88.56851  ]
 [ 16.764183   92.847534   72.75238  ]
 [ 52.70705   110.         50.467773 ]
 [  0.        110.          4.530472 ]
 [ 48.089573   69.38127    85.051636 ]
 [ 52.680943  110.         82.208824 ]
 [ 73.14836    76.33143    51.7959   ]
 [ 37.324566  110.        110.       ]
 [ 26.625008   37.212273   95.63509  ]
 [ 36.842407   10.44368    99.31581  ]
 [ 72.121895   22.835625   45.79837  ]
 [  9.002365    0.          0.       ]
 [ 85.323265   82.15439    46.759193 ]
 [ 34.275597   46.449585  102.62042  ]
 [ 71.15596     0.8058586  94.61369  ]
 [ 85.93828    83.107574   74.05152  ]
 [110.         31.343977   39.2

  trajs_states = torch.tensor(trajs_states, dtype=torch.float)


In [14]:
# Show the folder in which the PPO run above stored its files.
print(name_dir_ppo)

runs_optim_envs/run_6/


## Random search as in original SVM

In [None]:
# Baseline: sample random actions from the action space until the environment
# reports `done`, recording the cumulative reward after every step.
state = env.reset()
scores = []
step, score = 0, 0.0
done = False

while not done:
    print(".....STEP.....", step)
    action = env.action_space.sample()
    next_state, reward, done, info = env.step(action)
    step += 1
    score += reward
    # `scores` holds the running total, not the per-step reward
    scores.append(score)
    state = next_state