# Modifying and Loading Parameters of Policies

## Importing Libraries

In [8]:
from typing import Dict

import gym
import numpy as np
import torch as th

from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy

## Mutate Function to Modify Parameters

In [9]:
def mutate(params: Dict[str, th.Tensor], noise_std: float = 1.0) -> Dict[str, th.Tensor]:
    """Return a noisy copy of ``params``.

    Independent zero-mean normal noise, scaled by ``noise_std``, is added to
    every tensor. Neither the input dictionary nor its tensors are modified:
    each value of the result is a newly created tensor.

    :param params: Mapping from parameter name to tensor.
    :param noise_std: Standard deviation of the added noise. The default of
        1.0 gives unit-variance noise; 0.0 yields an unperturbed copy.
    :return: New dictionary with the same keys, holding the perturbed tensors.
    """
    return {
        name: param + noise_std * th.randn_like(param)
        for name, param in params.items()
    }

## Create Policy with a Small Network

In [10]:
# Build the LunarLander environment and an A2C agent using the "MlpPolicy"
# policy; verbose=1 turns on the agent's console logging.
env = gym.make("LunarLander-v2")
model = A2C(policy="MlpPolicy", env=env, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


## Find Initial Parameters

In [11]:
# Warm start: run ordinary actor-critic policy gradient updates for a short
# budget so that the search below begins from good initial parameters
pretrain_timesteps = 10_000
model.learn(total_timesteps=pretrain_timesteps)

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 99.5     |
|    ep_rew_mean        | -188     |
| time/                 |          |
|    fps                | 702      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.26    |
|    explained_variance | 0.0233   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.318    |
|    value_loss         | 0.357    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 117      |
|    ep_rew_mean        | -274     |
| time/                 |          |
|    fps                | 738      |
|    iterations         | 200      |
|    time_elapsed       | 1        |
|    total_timesteps    | 1000     |
| train/                |          |
|

<stable_baselines3.a2c.a2c.A2C at 0x7f84192df4e0>

## Retrieve Policy Parameters

In [12]:
# Include only variables with "policy", "action" (policy) or "shared_net" (shared layers)
# in their name: only these ones affect the action.
# state_dict() hands back tensors that reference the live policy parameters,
# and load_state_dict() later overwrites those parameters in place. Clone each
# tensor so that mean_params is an independent snapshot which does not change
# when candidate parameters are loaded into the model.
# NOTE: you can retrieve those parameters using model.get_parameters() too
mean_params = {
    key: value.detach().clone()
    for key, value in model.policy.state_dict().items()
    if ("policy" in key or "shared_net" in key or "action" in key)
}

mean_params

{'mlp_extractor.policy_net.0.weight': tensor([[ 0.0523,  0.0923, -0.2493,  0.1845,  0.1131,  0.0717,  0.1366,  0.2136],
         [ 0.0480, -0.1824, -0.3128, -0.1512,  0.0041, -0.1035,  0.2259,  0.4073],
         [-0.1503, -0.0196, -0.2226,  0.2811,  0.3213,  0.1049,  0.0091, -0.3590],
         [ 0.0396, -0.0116,  0.2437,  0.0271, -0.2210,  0.2227,  0.1610,  0.3043],
         [ 0.2401, -0.0132, -0.1946, -0.5740, -0.1654, -0.2685, -0.0142, -0.0328],
         [-0.0884,  0.0479,  0.0621, -0.0735, -0.2888, -0.0338, -0.0490,  0.0546],
         [-0.1125, -0.1088,  0.1204,  0.3141, -0.1330, -0.1819, -0.1169,  0.1096],
         [-0.2690, -0.2706, -0.1742,  0.0074,  0.2683,  0.1308, -0.0188,  0.0504],
         [-0.1101,  0.2377,  0.6249, -0.2528, -0.2431, -0.1072,  0.1510,  0.4252],
         [ 0.0110,  0.1031, -0.1026,  0.2955,  0.2761, -0.0483, -0.0125,  0.1004],
         [ 0.0055, -0.2950,  0.1107,  0.3956, -0.1347, -0.0932,  0.0628,  0.0114],
         [ 0.2168,  0.0573,  0.1756,  0.2874, -0.3

## Set Population Size and Retrieve Environment 

In [13]:
# Population size of 50 individuals
pop_size = 50
# Keep top 10%, but always at least one candidate so that the elite set is
# never empty if pop_size is lowered below 10
n_elite = max(1, pop_size // 10)
# Retrieve the environment
env = model.get_env()

## Iterate through Mutated Policy Params

In [14]:
# Number of generations of the mutate / evaluate / select cycle
n_iterations = 10

for iteration in range(n_iterations):
    # Create population of candidates and evaluate them
    population = []
    for _ in range(pop_size):
        candidate = mutate(mean_params)
        # Load new policy parameters to agent.
        # strict=False tells load_state_dict that it should only update the
        # parameters we give it (policy parameters)
        model.policy.load_state_dict(candidate, strict=False)
        # Evaluate the candidate; the first returned value is used as fitness
        fitness, _fitness_std = evaluate_policy(model, env)
        population.append((candidate, fitness))
    # Take top 10% and use average over their parameters as next mean parameter
    top_candidates = sorted(population, key=lambda x: x[1], reverse=True)[:n_elite]
    mean_params = {
        name: th.stack([params[name] for params, _ in top_candidates]).mean(dim=0)
        for name in mean_params
    }
    # Average over the candidates actually kept
    mean_fitness = sum(fitness for _, fitness in top_candidates) / len(top_candidates)
    print(f"Iteration {iteration + 1:<3} Mean top fitness: {mean_fitness:.2f}")
    print(f"Best fitness: {top_candidates[0][1]:.2f}")

# Without this, the agent would keep the parameters of the last candidate that
# happened to be evaluated. Load the final mean so the model reflects the
# result of the search (trailing ';' suppresses the returned value's display).
model.policy.load_state_dict(mean_params, strict=False);

Iteration 1   Mean top fitness: -139.48
Best fitness: -121.85
Iteration 2   Mean top fitness: -136.02
Best fitness: -106.02
Iteration 3   Mean top fitness: -119.73
Best fitness: -103.64
Iteration 4   Mean top fitness: -127.08
Best fitness: -124.50
Iteration 5   Mean top fitness: -114.11
Best fitness: -106.69
Iteration 6   Mean top fitness: -98.15
Best fitness: -83.05
Iteration 7   Mean top fitness: -95.81
Best fitness: -87.04
Iteration 8   Mean top fitness: -77.52
Best fitness: -68.93
Iteration 9   Mean top fitness: -75.12
Best fitness: -65.89
Iteration 10  Mean top fitness: -85.35
Best fitness: -79.72
