# Modifying and Loading Parameters of Policies

## Importing Libraries

In [1]:
from typing import Dict

import gym
import numpy as np
import cma
from collections import OrderedDict
import torch as th

from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy

## Function to Convert Params Dict to Flattened N-D array

In [2]:
def flatten(params):
    """
    :param params: (dict)
    :return: (np.ndarray)
    """
    params_ = []
    for key in params.keys():
        params_.append(params[key].flatten())
    return np.concatenate(params_)

## Convert to Flattened Params to Dictionary

In [3]:
def to_dict(flat_vec, params):
    """
    :param flat_vec: (np.ndarray)
    :param params: (OrderedDict)
    :return: (OrderedDict)
    """
    params_ = OrderedDict()
    start_idx = 0
    for key in params.keys():
        n_elem = list(params[key].size())
        # print(n_elem, params[key].nelement())
        params_[key] = th.from_numpy(flat_vec[start_idx:start_idx + params[key].nelement()].reshape(params[key].shape))
        start_idx += params[key].nelement()
    return params_

## Create Model and Find Initial Parameters

In [4]:
env = gym.make('LunarLander-v2')
model = A2C(
    "MlpPolicy",
    env
)

# Use traditional actor-critic policy gradient updates to
# find good initial parameters
model.learn(total_timesteps=10_000)

fitnesses = []
iterations = 10
for i in range(iterations):
    fitness, _ = evaluate_policy(model, env)
    print(fitness, end=" ")
    fitnesses.append(fitness)

mean_fitness = np.mean(sorted(fitnesses)[:int(0.1 * iterations)])
print("\nIterations {:<3} Mean top 10% reward: {:.2f}".format(iterations, -mean_fitness))



-450.35259926389006 -397.8973459230212 -347.656095596691 -297.56590001436416 -478.7106381719117 -468.4744760384259 -426.8542408260473 -345.32462144556047 -419.89931562927376 -432.9895065597972 
Iterations 10  Mean top 10% reward: 478.71


## Retrieve Policy Parameters

In [5]:
# Include only variables with "policy", "action" (policy) or "shared_net" (shared layers)
# in their name: only these ones affect the action.
# NOTE: you can retrieve those parameters using model.get_parameters() too
mean_params = dict(
    (key, value)
    for key, value in model.policy.state_dict().items()
    if ("policy" in key or "shared_net" in key or "action" in key)
)

mean_params

{'mlp_extractor.policy_net.0.weight': tensor([[-3.9069e-02, -2.9751e-01,  5.7135e-02,  1.1001e-01,  4.9415e-01,
           4.8763e-01, -2.5367e-01,  2.0103e-01],
         [ 2.7658e-01,  1.7731e-01, -1.0760e-01,  5.2904e-01,  1.5778e-01,
          -5.6275e-02,  1.8452e-01,  3.7666e-02],
         [-2.5497e-01,  6.1471e-02,  6.4641e-02, -4.5071e-01, -2.0454e-01,
          -7.9017e-02,  5.4192e-03,  3.7865e-02],
         [-4.1247e-01, -8.6146e-02, -2.1238e-01,  8.0171e-02,  5.9006e-02,
           3.5128e-02, -6.1475e-02, -1.8640e-01],
         [ 3.2047e-02,  2.8532e-01, -4.5607e-01, -3.0664e-01,  1.6310e-01,
          -1.3408e-01, -2.4096e-01,  6.1581e-02],
         [-2.9457e-03, -1.9655e-01, -2.2810e-01,  9.4642e-02, -5.6707e-02,
          -2.1351e-01,  1.0577e-01,  2.3724e-01],
         [ 1.1425e-01, -2.0929e-01, -2.6077e-01,  3.2334e-01,  3.5190e-01,
           5.6999e-01, -2.3503e-02, -8.3208e-02],
         [-2.4273e-01, -2.3636e-02,  5.0647e-01,  3.7233e-01, -2.3459e-01,
           3.

## Create Evolution Strategy Object

In [6]:
es = cma.CMAEvolutionStrategy(flatten(mean_params), sigma0=1)

(14_w,29)-aCMA-ES (mu_w=8.4,w_1=21%) in dimension 4996 (seed=155161, Sun Oct 24 00:16:32 2021)


## Iterate through Mutated Policy Params

In [7]:
for iteration in range(10):
    # Create population of candidates and evaluate them
    candidates, fitnesses = es.ask(), []
    for candidate in candidates:
        # Load new policy parameters to agent.
        # print(candidate.shape)
        model.policy.load_state_dict(to_dict(candidate, mean_params), strict=False)
        # Evaluate the agent using stable-baselines predict function
        fitness, _ = evaluate_policy(model, env)
        fitnesses.append(fitness)
    # CMA-ES update
    es.tell(candidates, fitnesses)
    # Display some training infos
    mean_fitness = np.mean(sorted(fitnesses)[:int(0.1 * len(candidates))])
    print("Iteration {:<3} Mean top 10% reward: {:.2f}".format(iteration, -mean_fitness))

Iteration 0   Mean top 10% reward: 1060.47
Iteration 1   Mean top 10% reward: 697.28
Iteration 2   Mean top 10% reward: 696.71
Iteration 3   Mean top 10% reward: 826.30
Iteration 4   Mean top 10% reward: 1029.41
Iteration 5   Mean top 10% reward: 1003.31
Iteration 6   Mean top 10% reward: 933.45
Iteration 7   Mean top 10% reward: 1088.30
Iteration 8   Mean top 10% reward: 1170.51
Iteration 9   Mean top 10% reward: 1089.46


## Re-learn the Model and Evaluate

In [8]:
model.learn(total_timesteps=10_000)

fitnesses
iterations = 10
for i in range(iterations):
    fitness, _ = evaluate_policy(model, env)
    print(fitness, end=" ")
    fitnesses.append(fitness)

mean_fitness = np.mean(sorted(fitnesses)[:int(0.1 * iterations)])
print("\nIterations {:<3} Mean top 10% reward: {:.2f}".format(iterations, -mean_fitness))

-621.5978957628831 -583.5865632622969 -626.0510313031962 -645.9844730104669 -722.7712184350937 -546.0729721915326 -642.8980144155881 -565.4102624164894 -569.0454332590103 -702.1407415207475 
Iterations 10  Mean top 10% reward: 1125.86


## Save The Model

In [9]:
model.save("cmaes_lunar")

# Exporting Params as JSON

## Function to Convert Params Dict to Flattened List

In [10]:
def flatten_list(params):
    """
    :param params: (dict)
    :return: (np.ndarray)
    """
    params_ = {}
    for key in params.keys():
        params_[key] = params[key].flatten().tolist()
    return params_

## Write Parameters to JSON File

In [11]:
import json

with open('data.json', 'w') as f:
    json.dump(flatten_list(mean_params), f, indent='\t')