# Modifying and Loading Parameters of Policies

## Importing Libraries

In [2]:
from typing import Dict

import gym
import numpy as np
import torch as th

from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy

## Mutate Function to Modify Parameters

In [4]:
def mutate(params: Dict[str, th.Tensor], std: float = 1.0) -> Dict[str, th.Tensor]:
    """Mutate parameters by adding normal noise to them.

    The input mapping and its tensors are left untouched: every entry of the
    result is a newly allocated tensor.

    :param params: Mapping from parameter name to (floating point) tensor.
    :param std: Standard deviation of the zero-mean Gaussian noise added to
        every element. The default of 1.0 adds unit-variance noise.
    :return: New mapping with the same keys, each tensor perturbed by noise
        of the same shape, dtype and device as the original tensor.
    """
    # randn_like draws noise matching each tensor's shape/dtype/device, so no
    # per-parameter bookkeeping is needed.
    return {name: param + th.randn_like(param) * std for name, param in params.items()}

## Create Policy with a small Network

In [5]:
# A2C agent on CartPole with a deliberately small network (net_arch=[32]),
# a fixed seed, and no entropy bonus.
model = A2C(
    "MlpPolicy",
    "CartPole-v1",
    learning_rate=0.05,
    seed=0,
    ent_coef=0.0,
    policy_kwargs=dict(net_arch=[32]),
)

## Find Initial Parameters

In [6]:
# Use traditional actor-critic policy gradient updates to
# find good initial parameters before any mutation is applied.
# NOTE: learn() returns the model itself, which is why the notebook echoes
# the A2C object as this cell's output.
model.learn(total_timesteps=10_000)

<stable_baselines3.a2c.a2c.A2C at 0x7fc9926a5be0>

## Retrieve Policy Parameters

In [8]:
# Keep only the entries with "policy", "action" (policy) or "shared_net"
# (shared layers) in their name: only these ones affect the action.
# NOTE: you can retrieve those parameters using model.get_parameters() too
#
# The tensors returned by state_dict() share memory with the live model
# parameters, and load_state_dict() copies new values into those parameters
# in place. Clone every tensor so that loading a candidate into the model
# later cannot silently overwrite the mean parameters stored here.
mean_params = {
    key: value.clone()
    for key, value in model.policy.state_dict().items()
    if any(marker in key for marker in ("policy", "shared_net", "action"))
}

mean_params

{'mlp_extractor.shared_net.0.weight': tensor([[ 1.2296e+00, -1.4302e+00, -1.7079e+00, -8.3761e-01],
         [-1.0285e-01,  1.9513e+00,  4.9012e+00,  1.8903e+00],
         [-1.3991e-01, -6.8936e-01, -3.2703e+00, -8.2682e-01],
         [-2.4834e-01,  3.5618e-01, -1.8914e+00,  3.3280e-01],
         [-1.1993e+00, -6.7687e-01,  6.6309e-01, -6.3695e-01],
         [ 4.1149e-02, -1.8906e+00, -7.4173e+00, -1.8766e+00],
         [-9.8579e-01,  1.0869e+00,  1.6673e+00,  5.7690e-01],
         [ 9.5711e-01,  1.6833e-01,  5.6043e-01,  3.7882e-01],
         [-1.5834e+00, -8.9660e-01,  7.4962e+00,  2.6119e+00],
         [ 1.0654e-02,  1.1045e+00,  3.2932e+00,  1.0957e+00],
         [-8.7981e-01, -1.5747e+00,  6.1326e+00,  3.8232e+00],
         [ 6.5508e-01, -1.7383e+00, -7.5128e+00, -1.8467e+00],
         [-1.4266e+00,  2.6727e-01,  1.1462e+01,  2.7812e+00],
         [-2.0198e-01, -1.9209e+00, -7.2255e+00, -1.8872e+00],
         [-3.2611e-02, -2.8132e+00, -6.2025e+00, -2.0255e+00],
         [ 5.7684e

## Set Population Size and Retrieve Environment 

In [9]:
# Population size of 50 individuals
pop_size = 50
# Keep top 10% (integer division: 5 elites for a population of 50)
n_elite = pop_size // 10
# Retrieve the environment from the model so candidates can be evaluated on it
env = model.get_env()

## Iterate through Mutated Policy Params

In [10]:
for iteration in range(10):
    # Sample pop_size noisy copies of the current mean parameters and score each one
    population = []
    for population_i in range(pop_size):
        candidate = mutate(mean_params)
        # strict=False: the candidate only contains the policy-related entries,
        # so only the parameters we provide are updated in the agent.
        model.policy.load_state_dict(candidate, strict=False)
        # Only the first value returned by evaluate_policy is used as the fitness
        fitness, _ = evaluate_policy(model, env)
        population.append((candidate, fitness))
    # Rank by fitness (best first) and keep the n_elite best candidates
    top_candidates = sorted(population, key=lambda entry: entry[1], reverse=True)[:n_elite]
    # The next mean is the element-wise average of the elite parameters
    mean_params = {
        name: th.stack([params[name] for params, _score in top_candidates]).mean(dim=0)
        for name in mean_params
    }
    mean_fitness = sum(score for _params, score in top_candidates) / n_elite
    print(f"Iteration {iteration + 1:<3} Mean top fitness: {mean_fitness:.2f}")
    print(f"Best fitness: {top_candidates[0][1]:.2f}")

Iteration 1   Mean top fitness: 221.44
Best fitness: 305.20
Iteration 2   Mean top fitness: 494.48
Best fitness: 500.00
Iteration 3   Mean top fitness: 500.00
Best fitness: 500.00
Iteration 4   Mean top fitness: 500.00
Best fitness: 500.00
Iteration 5   Mean top fitness: 500.00
Best fitness: 500.00
Iteration 6   Mean top fitness: 500.00
Best fitness: 500.00
