# Modifying and Loading Parameters of Policies

## Importing Libraries

In [1]:
from typing import Dict

import gym
import numpy as np
import torch as th

from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy

## Mutate Function to modify Parameters

In [2]:
def mutate(params: Dict[str, th.Tensor]) -> Dict[str, th.Tensor]:
    """Mutate parameters by adding normal noise to them"""
    return dict((name, param + th.randn_like(param)) for name, param in params.items())

## Create Policy with a small Network

In [3]:
env = gym.make('LunarLander-v2')
model = DQN(
    "MlpPolicy",
    env,
    verbose=1
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


## Find Initial Parameters

In [4]:
# Use traditional actor-critic policy gradient updates to
# find good initial parameters
model.learn(total_timesteps=10_000)

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 79.2     |
|    ep_rew_mean      | -220     |
|    exploration rate | 0.699    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 2274     |
|    time_elapsed     | 0        |
|    total timesteps  | 317      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 82.5     |
|    ep_rew_mean      | -189     |
|    exploration rate | 0.373    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 2300     |
|    time_elapsed     | 0        |
|    total timesteps  | 660      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 79.4     |
|    ep_rew_mean      | -170     |
|    exploration rate | 0.0947   |
| time/               |          |
|    episodes       

<stable_baselines3.dqn.dqn.DQN at 0x7f841f261a90>

## Retrieve Policy Parameters

In [5]:
# Include only variables with "policy", "action" (policy) or "shared_net" (shared layers)
# in their name: only these ones affect the action.
# NOTE: you can retrieve those parameters using model.get_parameters() too
mean_params = dict(
    (key, value)
    for key, value in model.policy.state_dict().items()
    if ("policy" in key or "shared_net" in key or "action" in key)
)

mean_params

{}

## Set Population Size and Retrieve Environment 

In [6]:
# population size of 50 invdiduals
pop_size = 50
# Keep top 10%
n_elite = pop_size // 10
# Retrieve the environment
env = model.get_env()

## Iterate through Mutated Policy Params

In [7]:
for iteration in range(10):
    # Create population of candidates and evaluate them
    population = []
    for population_i in range(pop_size):
        candidate = mutate(mean_params)
        # Load new policy parameters to agent.
        # Tell function that it should only update parameters
        # we give it (policy parameters)
        model.policy.load_state_dict(candidate, strict=False)
        # Evaluate the candidate
        fitness, _ = evaluate_policy(model, env)
        population.append((candidate, fitness))
    # Take top 10% and use average over their parameters as next mean parameter
    top_candidates = sorted(population, key=lambda x: x[1], reverse=True)[:n_elite]
    mean_params = dict(
        (
            name,
            th.stack([candidate[0][name] for candidate in top_candidates]).mean(dim=0),
        )
        for name in mean_params.keys()
    )
    mean_fitness = sum(top_candidate[1] for top_candidate in top_candidates) / n_elite
    print(f"Iteration {iteration + 1:<3} Mean top fitness: {mean_fitness:.2f}")
    print(f"Best fitness: {top_candidates[0][1]:.2f}")

Iteration 1   Mean top fitness: -478.50
Best fitness: -454.47
Iteration 2   Mean top fitness: -495.36
Best fitness: -475.02
Iteration 3   Mean top fitness: -492.54
Best fitness: -475.36
Iteration 4   Mean top fitness: -479.80
Best fitness: -465.07
Iteration 5   Mean top fitness: -478.58
Best fitness: -440.46
Iteration 6   Mean top fitness: -484.92
Best fitness: -470.12
Iteration 7   Mean top fitness: -482.87
Best fitness: -453.33
Iteration 8   Mean top fitness: -471.44
Best fitness: -447.52
Iteration 9   Mean top fitness: -480.96
Best fitness: -437.54
Iteration 10  Mean top fitness: -488.49
Best fitness: -478.64
