# Modifying and Loading Parameters of Policies (Gradient Update)

## Importing Libraries

In [1]:
from typing import Dict

import gym
import numpy as np
import torch as th

from stable_baselines3 import A2C as ALGO
from stable_baselines3.common.evaluation import evaluate_policy

## Init. ENV and Model

In [2]:
env = gym.make('LunarLander-v2')
model = ALGO(
    "MlpPolicy",
    env
)

model_trained = ALGO(
    "MlpPolicy",
    env
)

## Function to Evaluate Model 

In [21]:
def evaluate(model, env, message = '', verbose = 0):
    fitnesses = []
    iterations = 20
    for i in range(iterations):
        fitness, _ = evaluate_policy(model, env)
        if verbose == 1:
            print(i, fitness, end=" ")
        fitnesses.append(fitness)

    mean_fitness = np.mean(sorted(fitnesses))
    print(f'Type {message} Mean reward: {mean_fitness}')

## Initial Evaluation

In [4]:
evaluate(model, env)
evaluate(model_trained, env)



Type  Mean reward: -272.2401136589282
Type  Mean reward: -252.64175468897625


## Train for 1K Steps and Evaluate

In [5]:
# Train MT Model
model_trained.learn(total_timesteps=10_00)
evaluate(model_trained, env)
evaluate(model, env)

Type  Mean reward: -706.4746398813907
Type  Mean reward: -277.8907781250286


## Apply Gradient and Evaluate

In [6]:
state_dict = model.policy.state_dict()
optim_dict = model_trained.policy.optimizer.param_groups[0]['params']
optim_alpha = model.policy.optimizer.param_groups[0]['alpha']

# print(state_dict['q_net.q_net.0.weight'])

optim_index = 0
for key, value in state_dict.items():
    # print(key)
    state_dict[key].add_(optim_alpha, optim_dict[optim_index])
    optim_index += 1

model.policy.load_state_dict(state_dict)

evaluate(model, env)

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ../torch/csrc/utils/python_arg_parser.cpp:1025.)
  # Remove the CWD from sys.path while we load stuff.


Type  Mean reward: -978.7938613491103


In [7]:
for i in range(10):
    print('Train Iter: ', i)

    model_trained.learn(total_timesteps=10_000)

    evaluate(model_trained, env, 'Trained Model')
    evaluate(model, env, 'Initial Model')

    state_dict = model.policy.state_dict()
    optim_dict = model_trained.policy.optimizer.param_groups[0]['params']
    optim_alpha = model.policy.optimizer.param_groups[0]['alpha']

    # print(state_dict['q_net.q_net.0.weight'])

    optim_index = 0
    for key, value in state_dict.items():
        # print(key)
        state_dict[key].add_(optim_dict[optim_index], alpha=optim_alpha)
        optim_index += 1

    model.policy.load_state_dict(state_dict)

    evaluate(model, env, 'Updated Model')

Train Iter:  0
Type Trained Model Mean reward: -1453.6915133199575
Type Initial Model Mean reward: -1011.1044653150327
Type Updated Model Mean reward: -760.1082116512814
Train Iter:  1
Type Trained Model Mean reward: -27.671912100319627
Type Initial Model Mean reward: -761.5660160733275
Type Updated Model Mean reward: -534.1140503168245
Train Iter:  2
Type Trained Model Mean reward: 26.69997985396887
Type Initial Model Mean reward: -556.2356059185594
Type Updated Model Mean reward: -138.67133830219532
Train Iter:  3
Type Trained Model Mean reward: -6.155266910339929
Type Initial Model Mean reward: -172.87448681252627
Type Updated Model Mean reward: -25.85987597381905
Train Iter:  4
Type Trained Model Mean reward: -4.520823695287798
Type Initial Model Mean reward: -25.229681720148385
Type Updated Model Mean reward: 15.581000853319836
Train Iter:  5
Type Trained Model Mean reward: -10.401366978729815
Type Initial Model Mean reward: 10.039106263396338
Type Updated Model Mean reward: -23.5

In [10]:
model.get_parameters()

{'policy': OrderedDict([('mlp_extractor.policy_net.0.weight',
               tensor([[-1.8779,  1.5491, -3.7494,  5.0083, -0.9677, -1.0457,  0.8717, -1.0524],
                       [ 0.4266, -2.6537,  4.0678, -0.0258, -0.5537, -0.7158,  0.6689, -0.0716],
                       [ 0.8158, -2.8155, -3.8363,  4.4576,  1.2133, -0.4617, -0.9739,  1.2427],
                       [-2.4640, -2.7749, -1.4225,  3.0765,  3.4104,  0.4835, -0.2691, -0.6075],
                       [-0.8617, -0.9607, -1.0689, -2.0681,  4.2006,  1.5362,  2.8239, -1.1258],
                       [ 0.5737,  2.4529, -4.3265,  0.5875,  5.5520,  1.6899,  3.6649, -1.1985],
                       [-0.3089,  2.0027,  1.9865, -1.8170, -2.1288, -1.7477,  0.7953,  3.4615],
                       [-0.1910, -0.6935, -1.3071,  1.4611,  0.7284,  4.6786, -1.9645, -2.0884],
                       [-1.1749, -1.7641,  6.1844, -0.2913, -3.7597,  0.9986,  0.6388,  0.6540],
                       [-1.4631,  1.0618, -5.3877, -6.7274,  3.71

In [9]:
model.save('a2c_lunar_gradient')

In [15]:
# Exporting Params as JSON
## Function to Convert Params Dict to Flattened List
def flatten_list(params):
    """
    :param params: (dict)
    :return: (np.ndarray)
    """
    params_ = {}
    for key in params.keys():
        params_[key] = params[key].tolist()
    return params_
## Write Parameters to JSON File
import json

all_params = model.get_parameters()
pol_params = flatten_list(all_params['policy'])

all_params['policy'] = pol_params

with open('data.json', 'w') as f:
    json.dump(all_params, f, indent='\t')

In [16]:
model_loaded = ALGO(
    "MlpPolicy",
    env
)

new_params = all_params
loaded_pol_params = new_params['policy']
for key in loaded_pol_params.keys():
    loaded_pol_params[key] = th.tensor(loaded_pol_params[key])

new_params['policy'] = loaded_pol_params

model_loaded.set_parameters(new_params)

In [22]:
env.reset()
evaluate(model_loaded,env, verbose=1)

0 161.6014876950986 1 203.2397071879934 2 148.35718966187608 3 125.14956299816347 4 109.67958389759902 5 131.98694310187787 6 166.97263956990872 7 145.46603745286853 8 146.78814117423636 9 148.18033857159213 10 125.7655503214105 11 140.68488312936944 12 168.4807900254519 13 147.33811435937878 14 157.6695637067544 15 159.8696544707172 16 160.02716369411777 17 148.80049522413339 18 139.77425339664643 19 135.06464364894128 Type  Mean reward: 148.54483716440674
