# Modifying and Loading Parameters of Policies

## Importing Libraries

In [1]:
from typing import Dict

import gym
import numpy as np
import cma
from collections import OrderedDict
import torch as th

from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy

## Function to Convert Params Dict to Flattened N-D array

In [2]:
def flatten(params):
    """
    :param params: (dict)
    :return: (np.ndarray)
    """
    params_ = []
    for key in params.keys():
        params_.append(params[key].flatten())
    return np.concatenate(params_)

## Convert to Flattened Params to Dictionary

In [19]:
def to_dict(flat_vec, params):
    """
    :param flat_vec: (np.ndarray)
    :param params: (OrderedDict)
    :return: (OrderedDict)
    """
    params_ = OrderedDict()
    start_idx = 0
    for key in params.keys():
        n_elem = list(params[key].size())
        # print(n_elem, params[key].nelement())
        params_[key] = th.from_numpy(flat_vec[start_idx:start_idx + params[key].nelement()].reshape(params[key].shape))
        start_idx += params[key].nelement()
    return params_

## Create Model and Find Initial Parameters

In [4]:
env = gym.make('LunarLander-v2')
model = A2C(
    "MlpPolicy",
    env,
    verbose=1
)

# Use traditional actor-critic policy gradient updates to
# find good initial parameters
model.learn(total_timesteps=10_000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 109      |
|    ep_rew_mean        | -235     |
| time/                 |          |
|    fps                | 635      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.552   |
|    explained_variance | 0.0209   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -1.61    |
|    value_loss         | 6.86     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 143      |
|    ep_rew_mean        | -343     |
| time/                 |          |
|    fps                | 657      |
|    iterations         | 200      |
|    time_elapsed 

<stable_baselines3.a2c.a2c.A2C at 0x7f1b0e7b5da0>

## Retrieve Policy Parameters

In [5]:
# Include only variables with "policy", "action" (policy) or "shared_net" (shared layers)
# in their name: only these ones affect the action.
# NOTE: you can retrieve those parameters using model.get_parameters() too
mean_params = dict(
    (key, value)
    for key, value in model.policy.state_dict().items()
    if ("policy" in key or "shared_net" in key or "action" in key)
)

mean_params

{'mlp_extractor.policy_net.0.weight': tensor([[ 3.4069e-01,  1.8471e-01,  1.4974e-01,  3.1768e-01, -2.1731e-01,
          -1.6110e-02, -1.6159e-01,  2.2111e-02],
         [ 1.0475e-01, -5.8903e-02,  2.9604e-02,  3.9440e-01, -2.3462e-01,
           1.9133e-01, -4.5873e-02,  2.6314e-01],
         [-1.5947e-01, -1.1737e-01, -1.9051e-01, -1.9990e-03,  1.5501e-01,
          -8.9028e-02,  1.7459e-01, -1.6864e-01],
         [-2.0997e-01, -1.2236e-01, -6.4666e-01,  1.3344e-01,  3.6844e-01,
           9.3169e-02,  1.4598e-01,  1.6314e-01],
         [ 1.4613e-01,  1.4389e-01,  1.6092e-01, -1.0492e-01,  4.1506e-01,
           4.6695e-02, -2.0175e-01, -3.2073e-02],
         [ 6.7694e-03, -6.4617e-02,  4.3404e-01,  2.7436e-01, -1.9169e-01,
          -7.2924e-02,  1.6485e-01,  6.3676e-02],
         [ 3.3669e-01, -1.8311e-01,  2.9578e-01,  2.4164e-01, -4.5920e-02,
           6.8801e-03,  1.2487e-01, -2.8257e-01],
         [-2.8457e-02, -1.7506e-01, -7.5848e-02, -6.4995e-01, -3.5214e-03,
          -2.

## Create Evolution Strategy Object

In [6]:
es = cma.CMAEvolutionStrategy(flatten(mean_params), sigma0=1)

(14_w,29)-aCMA-ES (mu_w=8.4,w_1=21%) in dimension 4996 (seed=91831, Sat Oct 23 10:53:46 2021)


## Iterate through Mutated Policy Params

In [20]:
for iteration in range(10):
    # Create population of candidates and evaluate them
    candidates, fitnesses = es.ask(), []
    for candidate in candidates:
        # Load new policy parameters to agent.
        # print(candidate.shape)
        model.policy.load_state_dict(to_dict(candidate, mean_params), strict=False)
        # Evaluate the agent using stable-baselines predict function
        fitness, _ = evaluate_policy(model, env)
        fitnesses.append(fitness)
    # CMA-ES update
    es.tell(candidates, fitnesses)
    # Display some training infos
    mean_fitness = np.mean(sorted(fitnesses)[:int(0.1 * len(candidates))])
    print("Iteration {:<3} Mean top 10% reward: {:.2f}".format(iteration, -mean_fitness))



Iteration 0   Mean top 10% reward: 924.50
Iteration 1   Mean top 10% reward: 1773.43
Iteration 2   Mean top 10% reward: 1087.66
Iteration 3   Mean top 10% reward: 768.52
Iteration 4   Mean top 10% reward: 1079.23
Iteration 5   Mean top 10% reward: 1614.32
Iteration 6   Mean top 10% reward: 1089.86
Iteration 7   Mean top 10% reward: 1164.56
Iteration 8   Mean top 10% reward: 1112.89
Iteration 9   Mean top 10% reward: 1182.44


# Exporting Params as JSON

## Function to Convert Params Dict to Flattened List

In [26]:
def flatten_list(params):
    """
    :param params: (dict)
    :return: (np.ndarray)
    """
    params_ = {}
    for key in params.keys():
        params_[key] = params[key].flatten().tolist()
    return params_

## Write Parameters to JSON File

In [30]:
import json

with open('data.json', 'w') as f:
    json.dump(flatten_list(mean_params), f, indent='\t')