# Modifying and Loading Parameters of Policies

## Importing Libraries

In [1]:
from typing import Dict

import gym
import numpy as np
import torch as th

from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy

## Mutate Function to modify Parameters

In [2]:
def mutate(params: Dict[str, th.Tensor], std: float = 1.0) -> Dict[str, th.Tensor]:
    """Return a mutated copy of ``params`` with Gaussian noise added to each tensor.

    :param params: mapping from parameter name to tensor. The input tensors
        are not modified; a new tensor is created for every entry.
    :param std: standard deviation of the zero-mean normal noise. The default
        of ``1.0`` gives the same unit-variance mutation as before this
        parameter existed.
    :return: a new dict with the same keys (in the same order), each value
        being ``param + std * noise`` where ``noise`` has the shape of ``param``.
    """
    # randn_like draws standard-normal noise shaped like each parameter tensor.
    return {name: param + std * th.randn_like(param) for name, param in params.items()}

## Create the A2C Model with an MLP Policy

In [3]:
# Seed handed to A2C for its pseudo-random generators, so that a fresh
# "Restart & Run All" gives repeatable results instead of a different
# training run every time.
SEED = 0

# CartPole-v1 environment the agent is trained and evaluated on
env = gym.make('CartPole-v1')
# A2C agent with the library's default "MlpPolicy" network;
# verbose=1 prints setup information and training logs
model = A2C(
    "MlpPolicy",
    env,
    seed=SEED,
    verbose=1,
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


## Find Initial Parameters

In [4]:
# Pre-train with ordinary actor-critic policy-gradient updates so that the
# parameter search below starts from good initial parameters.
N_PRETRAIN_TIMESTEPS = 10_000
model.learn(total_timesteps=N_PRETRAIN_TIMESTEPS)

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 59.6     |
|    ep_rew_mean        | 59.6     |
| time/                 |          |
|    fps                | 686      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.644   |
|    explained_variance | 0.104    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 1.4      |
|    value_loss         | 8.78     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 58.2     |
|    ep_rew_mean        | 58.2     |
| time/                 |          |
|    fps                | 716      |
|    iterations         | 200      |
|    time_elapsed       | 1        |
|    total_timesteps    | 1000     |
| train/                |          |
|

<stable_baselines3.a2c.a2c.A2C at 0x7f6ec86e2cf8>

## Retrieve Policy Parameters

In [5]:
# Include only variables with "policy", "action" (policy) or "shared_net" (shared layers)
# in their name: only these ones affect the action.
# NOTE: you can retrieve those parameters using model.get_parameters() too
#
# state_dict() hands back tensors that share storage with the live policy
# parameters, and load_state_dict() copies new values into those parameters
# in place. Without a copy, loading any other weights into the policy would
# silently overwrite this snapshot as well, so clone every tensor.
mean_params = {
    key: value.clone()
    for key, value in model.policy.state_dict().items()
    if ("policy" in key or "shared_net" in key or "action" in key)
}

mean_params

{'mlp_extractor.policy_net.0.weight': tensor([[-1.2999e-01,  2.4489e-01, -2.4168e-01, -1.9392e-01],
         [ 2.1343e-01, -1.4928e-01, -2.1906e-01, -2.2806e-01],
         [-1.9922e-01, -7.2524e-02,  2.7773e-01,  5.9612e-02],
         [ 1.2960e-03,  3.0906e-01, -1.7327e-01,  7.5899e-02],
         [-1.5672e-02, -2.5386e-01,  1.5331e-01, -8.1522e-03],
         [ 2.6331e-01,  1.9324e-01, -1.2731e-01, -8.8238e-02],
         [ 6.9915e-02,  1.4019e-01,  4.9944e-02,  3.4992e-01],
         [-8.7018e-02, -1.6019e-02,  4.3477e-02,  3.1152e-01],
         [ 1.3564e-01,  2.2845e-01,  2.3310e-01,  9.6888e-02],
         [ 9.0050e-02,  1.4930e-01,  3.0171e-01,  2.6908e-01],
         [ 2.2106e-01,  2.9067e-01,  6.4909e-02, -3.8618e-01],
         [ 1.0708e-01, -2.2509e-01, -2.4501e-01, -8.5982e-02],
         [ 1.3579e-01,  2.9970e-01,  2.8183e-02, -3.9338e-01],
         [-7.4302e-02,  1.4286e-01, -1.0769e-01, -3.3475e-01],
         [ 3.5629e-01,  2.6837e-01, -2.3631e-02, -2.8512e-01],
         [-1.0405e

## Set Population Size and Retrieve Environment 

In [6]:
# Population size of 50 individuals
pop_size = 50
# Keep the top 10%, but never fewer than one individual: with a population
# smaller than 10, `pop_size // 10` alone would yield an empty elite set
n_elite = max(1, pop_size // 10)
# Retrieve the environment held by the model (the wrapped version the model
# created around the original env)
env = model.get_env()

## Iterate through Mutated Policy Params

In [7]:
# Simple evolution strategy: sample a population around the current mean
# parameters, keep the elite, and use the elite's average as the next mean.
for iteration in range(10):
    # Create population of candidates and evaluate them
    population = []
    for population_i in range(pop_size):
        candidate = mutate(mean_params)
        # Load new policy parameters to agent.
        # strict=False tells the function that it should only update the
        # parameters we give it (policy parameters) and not complain about
        # the ones that are absent from `candidate`.
        model.policy.load_state_dict(candidate, strict=False)
        # Evaluate the candidate; the first returned value is used as fitness
        fitness, _ = evaluate_policy(model, env)
        population.append((candidate, fitness))
    # Take top 10% and use average over their parameters as next mean parameter
    top_candidates = sorted(population, key=lambda x: x[1], reverse=True)[:n_elite]
    mean_params = {
        # Stack the elite tensors along a new leading axis and average it away
        name: th.stack([candidate[0][name] for candidate in top_candidates]).mean(dim=0)
        for name in mean_params
    }
    mean_fitness = sum(top_candidate[1] for top_candidate in top_candidates) / n_elite
    print(f"Iteration {iteration + 1:<3} Mean top fitness: {mean_fitness:.2f}")
    print(f"Best fitness: {top_candidates[0][1]:.2f}")

# At this point the policy still carries the weights of the last candidate
# that happened to be evaluated, i.e. an arbitrary mutation. Load the final
# mean parameters so that `model` actually holds the result of the search.
# (Assigned to `_` to keep the returned key report out of the cell output.)
_ = model.policy.load_state_dict(mean_params, strict=False)

Iteration 1   Mean top fitness: 59.66
Best fitness: 88.30
Iteration 2   Mean top fitness: 206.38
Best fitness: 298.90
Iteration 3   Mean top fitness: 137.26
Best fitness: 217.20
Iteration 4   Mean top fitness: 368.84
Best fitness: 455.80
Iteration 5   Mean top fitness: 278.04
Best fitness: 384.50
Iteration 6   Mean top fitness: 346.00
Best fitness: 425.00
Iteration 7   Mean top fitness: 485.16
Best fitness: 500.00
Iteration 8   Mean top fitness: 496.04
Best fitness: 500.00
Iteration 9   Mean top fitness: 486.30
Best fitness: 500.00
Iteration 10  Mean top fitness: 492.06
Best fitness: 500.00
