# Multi Train Gradient Update

## Importing Libraries

In [1]:
import json
import threading
from typing import Dict

import gym
import numpy as np
import torch as th

from stable_baselines3 import A2C as ALGO
from stable_baselines3.common.evaluation import evaluate_policy

## Initialize Environment and Models

In [2]:
# Environment id shared by every environment instance created below.
ENV_ID = 'LunarLander-v2'

# `env` backs the base model and is the environment passed to every
# evaluate(...) call in this notebook.
env = gym.make(ENV_ID)
model = ALGO(
    "MlpPolicy",
    env
)

# model_trained_1 and model_trained_2 are trained at the same time in separate
# threads further down. Each one therefore gets its own environment instance:
# handing both the same `env` object would let two threads step and reset a
# single simulation concurrently and interleave their episodes.
model_trained_1 = ALGO(
    "MlpPolicy",
    gym.make(ENV_ID)
)

model_trained_2 = ALGO(
    "MlpPolicy",
    gym.make(ENV_ID)
)

## Functions to Evaluate Model and Train Model within Thread

In [3]:
def evaluate(model, env, message='', verbose=0, iterations=10):
    """
    Call ``evaluate_policy`` repeatedly and print the mean of the scores.

    Each call's first returned element is taken as that run's score; the
    second element is ignored.

    :param model: model passed unchanged to ``evaluate_policy``
    :param env: environment passed unchanged to ``evaluate_policy``
    :param message: (str) label inserted into the printed summary line
    :param verbose: (int) when equal to 1, each run's index and score are
        printed on one line, ahead of the summary
    :param iterations: (int) number of ``evaluate_policy`` calls to average;
        defaults to 10
    :raises ValueError: if ``iterations`` is smaller than 1
    :return: None (the summary is printed, not returned)
    """
    if iterations < 1:
        # An empty score list would make np.mean emit a warning and print nan.
        raise ValueError(f'iterations must be at least 1, got {iterations}')

    fitnesses = []
    for i in range(iterations):
        fitness, _ = evaluate_policy(model, env)
        if verbose == 1:
            # end=" " keeps all per-run scores and the summary on one line
            print(i, fitness, end=" ")
        fitnesses.append(fitness)

    # The mean does not depend on ordering, so the scores are not sorted first.
    mean_fitness = np.mean(fitnesses)
    print(f'Type {message} Mean reward: {mean_fitness}')

In [4]:
def train(model, timesteps):
    """
    Run ``model.learn`` for the given number of timesteps, printing a line
    before it starts and another after it returns.

    :param model: object exposing ``learn(total_timesteps=...)``
    :param timesteps: value forwarded as ``total_timesteps``
    :return: None
    """
    print('Starting Training')
    run_learning = model.learn
    run_learning(total_timesteps=timesteps)
    print('Completed Training')

## Initial Evaluation

In [5]:
# Score the base model and both workers before any training has been run
for candidate in (model, model_trained_1, model_trained_2):
    evaluate(candidate, env)



Type  Mean reward: -431.4619080990739
Type  Mean reward: -445.2882938029006
Type  Mean reward: -494.61938431913507


## Train for 1K Steps and Evaluate

In [6]:
# One training thread per worker model, 1,000 timesteps each
t1, t2 = (
    threading.Thread(target=train, args=(worker, 1_000))
    for worker in (model_trained_1, model_trained_2)
)

# Start both threads before joining either one so they run at the same time
for thread in (t1, t2):
    thread.start()

# Block until both training runs have finished
for thread in (t1, t2):
    thread.join()

# Score the two trained workers first, then the base model for comparison
for candidate in (model_trained_1, model_trained_2, model):
    evaluate(candidate, env)

Starting Training
Starting Training
Completed Training
Completed Training
Type  Mean reward: -962.1624753702479
Type  Mean reward: -817.9836377596282
Type  Mean reward: -445.4719514189521


## Apply Gradient and Evaluate

In [7]:
# Fold each worker's optimizer-tracked tensors into the base model, one worker
# after the other: base_tensor += alpha * worker_tensor.
#
# NOTE(review): `param_groups[0]['params']` is the list of tensors the worker's
# optimizer updates (the weights themselves, not their `.grad`), and `'alpha'`
# is presumably RMSprop's smoothing constant rather than a learning rate --
# confirm this is the intended "gradient" update. The arithmetic below is kept
# exactly as it was; only the API usage is corrected.
for worker in (model_trained_1, model_trained_2):
    state_dict = model.policy.state_dict()
    optim_dict = worker.policy.optimizer.param_groups[0]['params']
    optim_alpha = model.policy.optimizer.param_groups[0]['alpha']

    # The two collections are paired purely by position, so they must line up.
    if len(state_dict) != len(optim_dict):
        raise ValueError(
            f'state_dict has {len(state_dict)} entries but the optimizer '
            f'tracks {len(optim_dict)} tensors'
        )

    # no_grad keeps autograd from recording the in-place additions.
    with th.no_grad():
        for target, source in zip(state_dict.values(), optim_dict):
            # Keyword `alpha=` replaces the deprecated add_(Number, Tensor) overload.
            target.add_(source, alpha=optim_alpha)

    model.policy.load_state_dict(state_dict)


evaluate(model, env)

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ../torch/csrc/utils/python_arg_parser.cpp:1025.)
  if __name__ == '__main__':


Type  Mean reward: -1013.8730055048061


In [8]:
for i in range(10):
    print('Train Iter: ', i)

    # Train both workers at the same time, one thread each (10,000 timesteps)
    t1 = threading.Thread(target=train, args=(model_trained_1, 10_000))
    t2 = threading.Thread(target=train, args=(model_trained_2, 10_000))

    # Start both before joining either so the two runs overlap
    t1.start()
    t2.start()

    # Wait until both training runs have finished
    t1.join()
    t2.join()

    evaluate(model_trained_1, env, 'Trained Model 1', verbose=1)
    evaluate(model_trained_2, env, 'Trained Model 2', verbose=1)
    evaluate(model, env, 'Initial Model', verbose=1)

    # Fold each worker's optimizer-tracked tensors into the base model:
    # base_tensor += alpha * worker_tensor.
    #
    # NOTE(review): `param_groups[0]['params']` is the list of tensors the
    # worker's optimizer updates (the weights themselves, not their `.grad`),
    # and `'alpha'` is presumably RMSprop's smoothing constant rather than a
    # learning rate -- confirm this is the intended "gradient" update. The
    # arithmetic is kept exactly as it was; only the API usage is corrected.
    for worker in (model_trained_1, model_trained_2):
        state_dict = model.policy.state_dict()
        optim_group = worker.policy.optimizer.param_groups[0]
        optim_dict = optim_group['params']
        optim_alpha = optim_group['alpha']

        # The two collections are paired purely by position, so they must line up.
        if len(state_dict) != len(optim_dict):
            raise ValueError(
                f'state_dict has {len(state_dict)} entries but the optimizer '
                f'tracks {len(optim_dict)} tensors'
            )

        # no_grad keeps autograd from recording the in-place additions.
        with th.no_grad():
            for target, source in zip(state_dict.values(), optim_dict):
                # Keyword `alpha=` replaces the deprecated add_(Number, Tensor) overload.
                target.add_(source, alpha=optim_alpha)

        model.policy.load_state_dict(state_dict)

    # Copy the combined state into both workers so the next iteration starts
    # every model from the same parameters.
    model_trained_1.policy.load_state_dict(state_dict)
    model_trained_2.policy.load_state_dict(state_dict)


    evaluate(model, env, 'Updated Model', verbose=1)

Train Iter:  0
Starting Training
Starting Training
Completed Training
Completed Training
0 -2660.270971801644 1 -2783.74361298015 2 -2662.3348019203054 3 -2427.2091898242943 4 -2229.277876516305 5 -2523.3956706062395 6 -2359.949037573987 7 -2421.1822055381954 8 -2294.029686663032 9 -2506.892941570854 Type Trained Model 1 Mean reward: -2486.8285994995003
0 -8.624941579909114 1 -9.18273109467118 2 -23.07342647242058 3 24.531086154386866 4 26.956538421118456 5 -32.537770424538756 6 30.356180476475856 7 9.369339340706938 8 -65.73432079327176 9 36.801928885414235 Type Trained Model 2 Mean reward: -1.1138117086709065
0 -1084.7570516026578 1 -1220.731389884831 2 -934.773939095065 3 -875.3112892461941 4 -766.3548823645339 5 -987.0417992171831 6 -1141.3576850225218 7 -1298.498894039169 8 -1168.9746140966192 9 -1045.324966353603 Type Initial Model Mean reward: -1052.3126510922377
0 -369.8592925339064 1 -381.9445559002459 2 -339.2384945921076 3 -312.22853385728087 4 -276.76520256088986 5 -359.246

In [9]:
# Display the base model's current parameters as returned by get_parameters()
model.get_parameters()

{'policy': OrderedDict([('mlp_extractor.policy_net.0.weight',
               tensor([[-22362.0977,  -3081.3545,    801.2779,   6870.6895, -10768.9414,
                          -276.7640,   4536.3721,   5394.2510],
                       [ -6643.8032,   2509.6279,  -6237.9946,  -2613.2415,   -615.9299,
                          3988.4851, -12677.0488,  -9107.0986],
                       [  -524.6043, -16094.4893,  -5325.1270, -15252.5635,   6950.4927,
                         10159.3105, -16265.6152, -13284.9775],
                       [ 23559.7441, -10320.5479,  -5160.8179,   -944.0709,   7477.3643,
                         -1309.9873,   3617.0942,  -9305.7334],
                       [   379.7978,  13770.9600,  -4300.2817, -38598.3164,   -639.1262,
                         -3667.5242,   1559.7115,   8702.4121],
                       [-14291.5508,  -2079.0923,  -4034.8157,   1173.3798,  17696.5879,
                         10375.6104,  -1698.8304,   7390.4360],
                    

In [10]:
# Persist the base model through its save() method under the name 'a2c_lunar_multiproc'
model.save('a2c_lunar_multiproc')

In [11]:
# Exporting Params as JSON
## Function to Convert the Values of a Params Dict to Nested Lists
def flatten_list(params):
    """
    Convert every value of a parameter mapping with its ``tolist()`` method.

    :param params: (dict) mapping whose values expose ``tolist()``
    :return: (dict) new dict with the same keys, in the same order, mapped to
        the ``tolist()`` result of each value; ``params`` itself is not modified
    """
    return {name: values.tolist() for name, values in params.items()}
## Write Parameters to JSON File
all_params = model.get_parameters()

# Swap the 'policy' entry for its list form so json.dump can serialize it
pol_params = flatten_list(all_params['policy'])
all_params['policy'] = pol_params

with open('a2c_lunar_multiproc.json', 'w') as f:
    json.dump(all_params, f, indent='\t')

In [12]:
model_loaded = ALGO(
    "MlpPolicy",
    env
)

# Baseline: score the freshly created model before any parameters are set
evaluate(model_loaded, env, verbose=1)

# Build a separate parameter dict instead of aliasing `all_params`, so that
# `all_params` keeps its list-valued 'policy' entry and this cell can be re-run
# without feeding tensors back into th.tensor().
# NOTE(review): the values come from the in-memory `all_params`, not from the
# JSON file written earlier -- confirm whether a file round-trip was intended.
new_params = dict(all_params)
loaded_pol_params = {
    key: th.tensor(values)
    for key, values in all_params['policy'].items()
}
new_params['policy'] = loaded_pol_params

model_loaded.set_parameters(new_params)

0 -125.8918386331643 1 -130.12257823236286 2 -133.80995555415285 3 -155.3977794014383 4 -135.67041762243025 5 -116.3916652476124 6 -138.49260917463397 7 -173.49132147997153 8 -123.89375966929947 9 -134.10918745084783 Type  Mean reward: -136.7271112465914


In [13]:
# Reset the environment, then score model_loaded again; by this point
# set_parameters(new_params) has been applied to it.
env.reset()
evaluate(model_loaded,env, verbose=1)

0 -189.31433846539804 1 -122.5401758715976 2 -149.45707392856565 3 -111.5959788093649 4 -106.72923593113664 5 -132.80579088629239 6 -147.23850057052914 7 -203.0111683366471 8 -160.72532241535373 9 -90.73251939904512 Type  Mean reward: -141.41501046139302
