# Modifying and Loading Parameters of Policies (Gradient Update)

## Importing Libraries

In [1]:
from typing import Dict

import gym
import numpy as np
import torch as th

from stable_baselines3 import A2C as ALGO
from stable_baselines3.dqn.policies import DQNPolicy
from stable_baselines3.common.evaluation import evaluate_policy

In [4]:
th.optim.lr_scheduler.ReduceLROnPlateau()

torch.optim.lr_scheduler.ReduceLROnPlateau

## Init. ENV and Model

In [2]:
env = gym.make('CartPole-v1')

# policy = DQNPolicy(env.observation_space, env.action_space, )

policy_kwargs = dict(optimizer_class=th.optim.SGD, optimizer_kwargs = dict(momentum=0.9))

model = ALGO(
    "MlpPolicy",
    env,
    learning_rate=0.1,
    policy_kwargs = policy_kwargs,
    # optimizer_kwargs = optimizer_kwargs,
)

model_trained = ALGO(
    "MlpPolicy",
    env,
    learning_rate=0.1,
    policy_kwargs = policy_kwargs,
    # optimizer_kwargs = optimizer_kwargs,
)

## Function to Evaluate Model 

In [3]:
def evaluate(model, env, message = '', verbose = 0):
    fitnesses = []
    iterations = 10
    for i in range(iterations):
        fitness, _ = evaluate_policy(model, env)
        if verbose == 1:
            print(i, fitness, end=" ")
        fitnesses.append(fitness)

    mean_fitness = np.mean(sorted(fitnesses))
    print(f'Type {message} Mean reward: {mean_fitness}')

## Initial Evaluation

In [4]:
evaluate(model, env)
evaluate(model_trained, env)



Type  Mean reward: 9.07
Type  Mean reward: 63.290000000000006


## Train for 1K Steps and Evaluate

In [5]:
# Train MT Model
model_trained.learn(total_timesteps=10_000)
evaluate(model_trained, env)
evaluate(model, env)

Type  Mean reward: 147.42000000000002
Type  Mean reward: 8.94


In [20]:
model.get_parameters()['policy.optimizer']

{'state': {},
 'param_groups': [{'lr': 0.0007,
   'momentum': 0.9,
   'dampening': 0,
   'weight_decay': 0,
   'nesterov': False,
   'params': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]}]}

## Apply Gradient and Evaluate

In [12]:
model_params = model.get_parameters()
optim_grads = []
for para in model_trained.policy.parameters():
    optim_grads.append(para.grad)

optim_index = 0
for key, value in model_params['policy'].items():
    model_params['policy'][key].add_(optim_grads[optim_index])#, alpha=-model_params['policy.optimizer']['param_groups'][0]['lr'])
    optim_index += 1

model.set_parameters(model_params)

evaluate(model, env)

Type  Mean reward: 9.360000000000001


In [7]:
# optim_params[1]['square_avg'].sum()

In [8]:
for i in range(10):
    print('Train Iter: ', i)

    model_trained.learn(total_timesteps=10_00)

    evaluate(model_trained, env, 'Trained Model')
    evaluate(model, env, 'Initial Model')

    model_params = model.get_parameters()
    optim_params = model_trained.get_parameters()#['policy.optimizer']['state']

    # model_params['policy.optimizer']['state'] = optim_params

    optim_index = 0
    for key, value in model_params['policy'].items():
        # print(key)
        # if len(optim_params[optim_index]) != 0:
        model_params['policy'][key].add_(optim_params['policy'][key], alpha=-optim_params['policy.optimizer'][optim_index]['square_avg'].sum())
        optim_index += 1

    model.set_parameters(model_params)

    model.policy.optimizer.step()

    evaluate(model, env, 'Updated Model')

Train Iter:  0
Type Trained Model Mean reward: 85.3
Type Initial Model Mean reward: 9.170000000000002


KeyError: 0

In [None]:
model.get_parameters()['policy']['mlp_extractor.policy_net.0.weight'].grad

In [None]:
model.save('a2c_lunar_gradient')

In [None]:
# Exporting Params as JSON
## Function to Convert Params Dict to Flattened List
def flatten_list(params):
    """
    :param params: (dict)
    :return: (np.ndarray)
    """
    params_ = {}
    for key in params.keys():
        params_[key] = params[key].tolist()
    return params_
## Write Parameters to JSON File
import json

all_params = model.get_parameters()
pol_params = flatten_list(all_params['policy'])

all_params['policy'] = pol_params

with open('data.json', 'w') as f:
    json.dump(all_params, f, indent='\t')

In [None]:
model_loaded = ALGO(
    "MlpPolicy",
    env
)

new_params = all_params
loaded_pol_params = new_params['policy']
for key in loaded_pol_params.keys():
    loaded_pol_params[key] = th.tensor(loaded_pol_params[key])

new_params['policy'] = loaded_pol_params

model_loaded.set_parameters(new_params)

In [None]:
env.reset()
evaluate(model_loaded,env, verbose=1)