# Modifying and Loading Parameters of Policies (Gradient Update)

## Importing Libraries

In [1]:
from typing import Dict

import gym
import numpy as np
import torch as th

from stable_baselines3 import A2C as ALGO
from stable_baselines3.common.evaluation import evaluate_policy

## Initialize Environment and Models

In [2]:
# One CartPole environment instance, shared by both agents below.
env = gym.make("CartPole-v1")

# Two separately constructed agents with the same policy type and environment.
# `model_trained` is the one that later gets `.learn()` called on it.
model = ALGO("MlpPolicy", env)
model_trained = ALGO("MlpPolicy", env)

## Function to Evaluate Model 

In [3]:
def evaluate(model, env, iterations=20):
    """Run several evaluation rounds and print every score plus their mean.

    Each round calls ``evaluate_policy(model, env)`` and keeps the first
    element of the returned pair (the round's mean reward). The summary line
    reports the plain average over all rounds; no top-k filtering is applied.

    Args:
        model: Agent to evaluate; forwarded unchanged to ``evaluate_policy``.
        env: Environment to evaluate on; forwarded unchanged as well.
        iterations: Number of evaluation rounds to run. Defaults to 20.

    Returns:
        None. Results are only printed.

    Raises:
        ValueError: If ``iterations`` is smaller than 1 (the mean of zero
            rounds is undefined).
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    fitnesses = []
    for _ in range(iterations):
        mean_reward, _std_reward = evaluate_policy(model, env)
        # Stream the per-round scores on a single line as they arrive.
        print(mean_reward, end=" ")
        fitnesses.append(mean_reward)

    mean_fitness = np.mean(fitnesses)
    print("\nIterations {:<3} Mean reward: {:.2f}".format(iterations, mean_fitness))

## Initial Evaluation

In [4]:
# Score both freshly initialised agents before any training happens.
for agent in (model, model_trained):
    evaluate(agent, env)



76.9 72.4 53.7 62.2 67.2 62.4 68.3 60.6 77.5 67.0 69.2 67.9 61.9 74.7 55.0 67.2 71.3 66.1 64.7 60.3 
Iterations 20  Mean top 10% reward: 66.33
18.0 21.6 16.8 15.9 20.5 19.8 17.7 19.5 17.5 16.4 21.1 19.5 20.6 15.6 17.7 20.6 18.6 17.8 17.2 16.4 
Iterations 20  Mean top 10% reward: 18.44


## Train for 1K Steps and Evaluate

In [5]:
# Train only `model_trained` for 1,000 timesteps, then score both agents
# (the trained one first, followed by `model`).
model_trained.learn(total_timesteps=1_000)
for agent in (model_trained, model):
    evaluate(agent, env)

70.6 47.7 46.4 54.8 53.7 55.8 65.1 67.4 57.8 59.4 54.6 62.5 42.0 51.8 43.8 50.9 38.6 50.7 64.9 53.1 
Iterations 20  Mean top 10% reward: 54.58
56.1 59.5 56.8 58.8 70.6 66.1 54.0 68.0 58.8 63.5 63.9 58.2 58.3 61.9 59.2 70.4 62.4 66.5 61.2 52.6 
Iterations 20  Mean top 10% reward: 61.34


## Apply Gradient and Evaluate

In [9]:
# Add the scaled tensors taken from `model_trained`'s optimizer onto `model`'s
# policy weights, load the result back into `model`, and evaluate it.
#
# NOTE(review): `param_groups[0]['params']` holds the trained policy's
# parameter tensors themselves, not their `.grad` fields, and `'alpha'` is an
# optimizer hyper-parameter (for torch.optim.RMSprop it is the smoothing
# constant, not the learning rate). The arithmetic is kept exactly as before
# (w <- w + alpha * w_trained); confirm this is the intended "gradient" step.
# NOTE: re-running this cell applies the update again on top of the last one.
state_dict = model.policy.state_dict()
optim_dict = model_trained.policy.optimizer.param_groups[0]['params']
optim_alpha = model.policy.optimizer.param_groups[0]['alpha']

# Entries are paired purely by position, so both sequences must line up
# one-to-one; fail loudly instead of silently skipping or over-indexing.
if len(state_dict) != len(optim_dict):
    raise ValueError(
        "state_dict has {} entries but the optimizer group has {} params".format(
            len(state_dict), len(optim_dict)
        )
    )

# Build the new weights out of place and without autograd tracking, using the
# keyword form of `alpha` (the positional `add_(alpha, tensor)` overload is
# deprecated in PyTorch).
with th.no_grad():
    updated_state_dict = {
        key: th.add(value, trained_param, alpha=optim_alpha)
        for (key, value), trained_param in zip(state_dict.items(), optim_dict)
    }

model.policy.load_state_dict(updated_state_dict)

evaluate(model, env)




52.6 47.7 54.3 44.8 54.0 63.3 68.8 49.0 52.7 50.8 46.2 52.6 50.7 45.8 63.0 66.2 50.2 56.4 41.9 50.5 
Iterations 20  Mean top 10% reward: 53.08


In [8]:
# Display the optimizer's parameter groups for `model` (each group carries its
# hyper-parameters together with the 'params' tensors it optimises).
policy_optimizer = model.policy.optimizer
policy_optimizer.param_groups

[{'params': [Parameter containing:
   tensor([[-0.0662,  0.2436, -0.2726,  0.2409],
           [-0.0160, -0.1464,  0.0706,  0.1378],
           [-0.3642, -0.3893, -0.0434, -0.1240],
           [ 0.1849, -0.0154, -0.0914,  0.0161],
           [-0.1704,  0.1508,  0.1334, -0.1542],
           [ 0.0119, -0.1691,  0.4588, -0.0237],
           [ 0.1301, -0.1125,  0.2761,  0.4953],
           [-0.1367, -0.1416,  0.0893, -0.1832],
           [-0.2614,  0.1326, -0.2186,  0.2425],
           [ 0.2556,  0.0435,  0.2706,  0.2892],
           [-0.1693,  0.0969,  0.0251,  0.1559],
           [-0.1669, -0.0730, -0.0029,  0.4009],
           [ 0.1399, -0.1016,  0.0024,  0.4099],
           [ 0.0764,  0.5135, -0.1068, -0.0185],
           [-0.5447,  0.0806, -0.0023, -0.2094],
           [ 0.0122, -0.2296, -0.3017, -0.2517],
           [-0.2891, -0.4243,  0.0970,  0.0708],
           [ 0.0940,  0.5145,  0.4349,  0.6060],
           [-0.6229,  0.0075,  0.3811, -0.0007],
           [-0.0556, -0.1333,  0.3