# Modifying and Loading Parameters of Policies

## Importing Libraries

In [12]:
from typing import Dict

import gym
import numpy as np
import cma
from collections import OrderedDict
import torch as th

from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy

## Function to Convert Params Dict to Flattened N-D array

In [13]:
def flatten(params):
    """
    :param params: (dict)
    :return: (np.ndarray)
    """
    params_ = []
    for key in params.keys():
        params_.append(params[key].flatten())
    return np.concatenate(params_)

## Convert to Flattened Params to Dictionary

In [14]:
def to_dict(flat_vec, params):
    """
    :param flat_vec: (np.ndarray)
    :param params: (OrderedDict)
    :return: (OrderedDict)
    """
    params_ = OrderedDict()
    start_idx = 0
    for key in params.keys():
        n_elem = list(params[key].size())
        # print(n_elem, params[key].nelement())
        params_[key] = th.from_numpy(flat_vec[start_idx:start_idx + params[key].nelement()].reshape(params[key].shape))
        start_idx += params[key].nelement()
    return params_

## Create Model and Find Initial Parameters

In [15]:
env = gym.make('CartPole-v1')
model = A2C(
    "MlpPolicy",
    env
)

# Use traditional actor-critic policy gradient updates to
# find good initial parameters
model.learn(total_timesteps=10_000)

fitnesses = []
iterations = 10
for i in range(iterations):
    fitness, _ = evaluate_policy(model, env)
    print(fitness, end=" ")
    fitnesses.append(fitness)

mean_fitness = np.mean(sorted(fitnesses)[:int(0.1 * iterations)])
print("\nIterations {:<3} Mean top 10% reward: {:.2f}".format(iterations, -mean_fitness))

160.7 174.7 158.4 165.7 160.1 155.0 160.7 157.5 163.4 170.5 
Iterations 10  Mean top 10% reward: -155.00


## Retrieve Policy Parameters

In [16]:
# Include only variables with "policy", "action" (policy) or "shared_net" (shared layers)
# in their name: only these ones affect the action.
# NOTE: you can retrieve those parameters using model.get_parameters() too
mean_params = dict(
    (key, value)
    for key, value in model.policy.state_dict().items()
    # if ("policy" in key or "shared_net" in key or "action" in key)
)

mean_params

{'mlp_extractor.policy_net.0.weight': tensor([[-0.2133, -0.0659, -0.2679, -0.2098],
         [ 0.0783,  0.0727,  0.0269, -0.1212],
         [ 0.1607,  0.4384, -0.0209, -0.0929],
         [-0.0373, -0.0996, -0.1436, -0.2268],
         [-0.1674,  0.1741, -0.2317, -0.2255],
         [-0.1217, -0.0444,  0.0895, -0.1604],
         [ 0.2357, -0.0377,  0.0425, -0.4285],
         [-0.1429, -0.0406, -0.0664, -0.1667],
         [-0.2128, -0.1823,  0.3424,  0.0853],
         [ 0.0107, -0.2913, -0.0465, -0.0077],
         [ 0.1249, -0.0828,  0.0830, -0.0194],
         [ 0.1758,  0.1716,  0.4806,  0.1782],
         [-0.0174,  0.0600,  0.1147, -0.0959],
         [ 0.1657,  0.0301, -0.2597, -0.2295],
         [ 0.0511,  0.1210,  0.0369,  0.2135],
         [ 0.4613, -0.0224, -0.2029, -0.2504],
         [ 0.1216,  0.0475,  0.5653,  0.2119],
         [-0.0249, -0.0597,  0.0067, -0.1585],
         [ 0.0817, -0.1309,  0.3817,  0.3043],
         [ 0.1671, -0.0944,  0.1569, -0.6188],
         [ 0.1402,  0.1

## Create Evolution Strategy Object

In [17]:
es = cma.CMAEvolutionStrategy(flatten(mean_params), sigma0=1)

(15_w,31)-aCMA-ES (mu_w=8.9,w_1=20%) in dimension 9155 (seed=406734, Wed Oct 27 01:04:04 2021)


## Iterate through Mutated Policy Params

In [18]:
for iteration in range(10):
    # Create population of candidates and evaluate them
    candidates, fitnesses = es.ask(), []
    for candidate in candidates:
        # Load new policy parameters to agent.
        # print(candidate.shape)
        model.policy.load_state_dict(to_dict(candidate, mean_params), strict=False)
        # Evaluate the agent using stable-baselines predict function
        fitness, _ = evaluate_policy(model, env)
        fitnesses.append(fitness)
    # CMA-ES update
    es.tell(candidates, fitnesses)
    # Display some training infos
    mean_fitness = np.mean(sorted(fitnesses)[:int(0.1 * len(candidates))])
    print("Iteration {:<3} Mean top 10% reward: {:.2f}".format(iteration, -mean_fitness))

Iteration 0   Mean top 10% reward: -8.80
Iteration 1   Mean top 10% reward: -9.17
Iteration 2   Mean top 10% reward: -9.27
Iteration 3   Mean top 10% reward: -9.27
Iteration 4   Mean top 10% reward: -9.07
Iteration 5   Mean top 10% reward: -8.93
Iteration 6   Mean top 10% reward: -9.03
Iteration 7   Mean top 10% reward: -8.87
Iteration 8   Mean top 10% reward: -9.17
Iteration 9   Mean top 10% reward: -9.13


## Re-learn the Model and Evaluate

In [19]:
model.learn(total_timesteps=10_000)

fitnesses
iterations = 10
for i in range(iterations):
    fitness, _ = evaluate_policy(model, env)
    print(fitness, end=" ")
    fitnesses.append(fitness)

mean_fitness = np.mean(sorted(fitnesses)[:int(0.1 * iterations)])
print("\nIterations {:<3} Mean top 10% reward: {:.2f}".format(iterations, -mean_fitness))

64.9 71.4 88.1 85.7 95.3 92.7 75.1 72.0 64.2 73.9 
Iterations 10  Mean top 10% reward: -9.00


## Save The Model

In [20]:
model.save("cmaes_lunar")

# Exporting Params as JSON

## Function to Convert Params Dict to Flattened List

In [21]:
def flatten_list(params):
    """
    :param params: (dict)
    :return: (np.ndarray)
    """
    params_ = {}
    for key in params.keys():
        params_[key] = params[key].flatten().tolist()
    return params_

## Write Parameters to JSON File

In [22]:
import json

with open('data.json', 'w') as f:
    json.dump(flatten_list(mean_params), f, indent='\t')