# Modifying and Loading Parameters of Policies

## Importing Libraries

In [1]:
from typing import Dict

import gym
import numpy as np
import cma
from collections import OrderedDict
import torch as th

from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy

## Function to Convert Params Dict to Flattened N-D array

In [2]:
def flatten(params):
    """
    :param params: (dict)
    :return: (np.ndarray)
    """
    params_ = []
    for key in params.keys():
        params_.append(params[key].flatten())
    return np.concatenate(params_)

## Convert to Flattened Params to Dictionary

In [3]:
def to_dict(flat_vec, params):
    """
    :param flat_vec: (np.ndarray)
    :param params: (OrderedDict)
    :return: (OrderedDict)
    """
    params_ = OrderedDict()
    start_idx = 0
    for key in params.keys():
        n_elem = list(params[key].size())
        # print(n_elem, params[key].nelement())
        params_[key] = th.from_numpy(flat_vec[start_idx:start_idx + params[key].nelement()].reshape(params[key].shape))
        start_idx += params[key].nelement()
    return params_

## Create Model and Find Initial Parameters

In [4]:
env = gym.make('LunarLander-v2')
model = A2C(
    "MlpPolicy",
    env
)

# Use traditional actor-critic policy gradient updates to
# find good initial parameters
model.learn(total_timesteps=10_000)

fitnesses = []
iterations = 10
for i in range(iterations):
    fitness, _ = evaluate_policy(model, env)
    print(fitness, end=" ")
    fitnesses.append(fitness)

mean_fitness = np.mean(sorted(fitnesses)[:int(0.1 * iterations)])
print("\nIterations {:<3} Mean top 10% reward: {:.2f}".format(iterations, -mean_fitness))

## Retrieve Policy Parameters

In [None]:
# Include only variables with "policy", "action" (policy) or "shared_net" (shared layers)
# in their name: only these ones affect the action.
# NOTE: you can retrieve those parameters using model.get_parameters() too
mean_params = dict(
    (key, value)
    for key, value in model.policy.state_dict().items()
    if ("policy" in key or "shared_net" in key or "action" in key)
)

mean_params

{'mlp_extractor.policy_net.0.weight': tensor([[ 1.8901e-01,  1.6658e-01,  3.2475e-01,  1.8143e-01, -1.2517e-01,
           7.8223e-02,  4.7420e-02, -1.8982e-01],
         [ 1.5637e-02, -2.7493e-01, -2.1620e-01,  2.4034e-01,  6.9712e-02,
           6.2896e-02,  2.0219e-02,  1.2486e-01],
         [ 2.2521e-01,  6.1729e-02,  2.4893e-01, -2.9939e-01, -4.7456e-01,
           1.4684e-01, -1.1493e-01, -1.1568e-01],
         [ 4.9491e-02, -1.5579e-01,  2.1615e-01, -3.1867e-02, -3.7086e-01,
          -1.2873e-01,  2.3219e-01,  1.1578e-01],
         [ 6.9435e-02, -5.5763e-02, -7.3520e-02,  3.5442e-01,  2.4250e-01,
          -1.2105e-01,  1.1685e-01,  2.9595e-01],
         [-6.0801e-02, -7.7410e-02, -4.0017e-01, -2.0015e-01,  4.3800e-01,
           1.5279e-01,  2.3266e-01, -1.2463e-01],
         [ 4.8204e-02, -1.2625e-01, -2.2877e-02,  3.0992e-01, -1.6442e-01,
          -5.2426e-02, -5.6300e-02,  2.7125e-01],
         [ 5.6869e-02,  1.5014e-01,  1.6561e-01, -1.9453e-01, -1.8167e-02,
          -4.

## Create Evolution Strategy Object

In [None]:
es = cma.CMAEvolutionStrategy(flatten(mean_params), sigma0=1)

(14_w,29)-aCMA-ES (mu_w=8.4,w_1=21%) in dimension 4996 (seed=122836, Sat Oct 23 23:56:38 2021)


## Iterate through Mutated Policy Params

In [None]:
for iteration in range(10):
    # Create population of candidates and evaluate them
    candidates, fitnesses = es.ask(), []
    for candidate in candidates:
        # Load new policy parameters to agent.
        # print(candidate.shape)
        model.policy.load_state_dict(to_dict(candidate, mean_params), strict=False)
        # Evaluate the agent using stable-baselines predict function
        fitness, _ = evaluate_policy(model, env)
        fitnesses.append(fitness)
    # CMA-ES update
    es.tell(candidates, fitnesses)
    # Display some training infos
    mean_fitness = np.mean(sorted(fitnesses)[:int(0.1 * len(candidates))])
    print("Iteration {:<3} Mean top 10% reward: {:.2f}".format(iteration, -mean_fitness))

Iteration 0   Mean top 10% reward: 1176.26
Iteration 1   Mean top 10% reward: 954.47
Iteration 2   Mean top 10% reward: 829.68
Iteration 3   Mean top 10% reward: 702.57
Iteration 4   Mean top 10% reward: 805.67
Iteration 5   Mean top 10% reward: 798.87
Iteration 6   Mean top 10% reward: 1408.48
Iteration 7   Mean top 10% reward: 1089.03
Iteration 8   Mean top 10% reward: 1009.76
Iteration 9   Mean top 10% reward: 1077.32


## Re-learn the Model and Evaluate

In [None]:
model.learn(total_timesteps=10_000)

fitnesses
iterations = 10
for i in range(iterations):
    fitness, _ = evaluate_policy(model, env)
    print(fitness, end=" ")
    fitnesses.append(fitness)

mean_fitness = np.mean(sorted(fitnesses)[:int(0.1 * iterations)])
print("\nIterations {:<3} Mean top 10% reward: {:.2f}".format(iterations, -mean_fitness))

-143.89861566210166 -630.4719898006879 -305.69003718341844 -378.62984872718516 -179.02497340796327 -290.96745057394145 -269.34067771626576 -174.80621866821312 -237.411879973975 -335.8981815130916 
Iterations 9   Mean top 10% reward: 1110.12


# Exporting Params as JSON

## Function to Convert Params Dict to Flattened List

In [None]:
def flatten_list(params):
    """
    :param params: (dict)
    :return: (np.ndarray)
    """
    params_ = {}
    for key in params.keys():
        params_[key] = params[key].flatten().tolist()
    return params_

## Write Parameters to JSON File

In [None]:
import json

with open('data.json', 'w') as f:
    json.dump(flatten_list(mean_params), f, indent='\t')