# Multi Train Gradient Update

## Importing Libraries

In [1]:
from typing import Dict
import threading

import gym
import numpy as np
import torch as th

from stable_baselines3 import A2C as ALGO
from stable_baselines3.common.evaluation import evaluate_policy

## Init. ENV and Model

In [2]:
env = gym.make('CartPole-v1')
model = ALGO(
    "MlpPolicy",
    env
)

model_trained_1 = ALGO(
    "MlpPolicy",
    env
)

model_trained_2 = ALGO(
    "MlpPolicy",
    env
)

## Functions to Evaluate Model and Train Model within Thread

In [3]:
def evaluate(model, env, message = '', verbose = 0):
    fitnesses = []
    iterations = 10
    for i in range(iterations):
        fitness, _ = evaluate_policy(model, env)
        if verbose == 1:
            print(i, fitness, end=" ")
        fitnesses.append(fitness)

    mean_fitness = np.mean(sorted(fitnesses))
    print(f'Type {message} Mean reward: {mean_fitness}')

In [4]:
def train(model, timesteps):
    print('Starting Training')
    model.learn(total_timesteps=timesteps)
    print('Completed Training')

## Initial Evaluation

In [5]:
model_trained_1.set_parameters(model.get_parameters())
model_trained_2.set_parameters(model.get_parameters())

evaluate(model, env)
evaluate(model_trained_1, env)
evaluate(model_trained_2, env)



Type  Mean reward: 9.27
Type  Mean reward: 9.400000000000002
Type  Mean reward: 9.370000000000001


## Train for 1K Steps and Evaluate

In [6]:
# Train MT Model 1
t1 = threading.Thread(target=train, args=(model_trained_1, 10_00))

# Train MT Model 2
t2 = threading.Thread(target=train, args=(model_trained_2, 10_00))

# starting thread
t1.start()
t2.start()

# wait until thread is completely executed
t1.join()
t2.join()


# model_trained.learn(total_timesteps=10_00)
evaluate(model_trained_1, env)
evaluate(model_trained_2, env)
evaluate(model, env)

Starting Training
Starting Training
Completed Training
Completed Training
Type  Mean reward: 61.3
Type  Mean reward: 46.589999999999996
Type  Mean reward: 9.34


## Apply Gradient and Evaluate

In [7]:
# For Trained Model 1
state_dict = model.policy.state_dict()
optim_dict = model_trained_1.policy.optimizer.param_groups[0]['params']
optim_alpha = model.policy.optimizer.param_groups[0]['alpha']

optim_index = 0
for key, value in state_dict.items():
    # print(key)
    state_dict[key].add_(optim_alpha, optim_dict[optim_index])
    optim_index += 1

model.policy.load_state_dict(state_dict)

# For Trained Model 2
state_dict = model.policy.state_dict()
optim_dict = model_trained_2.policy.optimizer.param_groups[0]['params']
optim_alpha = model.policy.optimizer.param_groups[0]['alpha']

optim_index = 0
for key, value in state_dict.items():
    # print(key)
    state_dict[key].add_(optim_alpha, optim_dict[optim_index])
    optim_index += 1

model.policy.load_state_dict(state_dict)
model_trained_1.policy.load_state_dict(state_dict)
model_trained_2.policy.load_state_dict(state_dict)


evaluate(model, env)

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ../torch/csrc/utils/python_arg_parser.cpp:1025.)
  if __name__ == '__main__':


Type  Mean reward: 57.23


In [8]:
for i in range(10):
    print('Train Iter: ', i)

    # Train MT Model 1
    t1 = threading.Thread(target=train, args=(model_trained_1, 10_0))

    # Train MT Model 2
    t2 = threading.Thread(target=train, args=(model_trained_2, 10_0))

    # starting thread
    t1.start()
    t2.start()

    # wait until thread is completely executed
    t1.join()
    t2.join()

    evaluate(model_trained_1, env, 'Trained Model 1', verbose=1)
    evaluate(model_trained_2, env, 'Trained Model 2', verbose=1)
    evaluate(model, env, 'Initial Model', verbose=1)

    # For Trained Model 1
    state_dict = model.policy.state_dict()
    optim_dict = model_trained_1.policy.optimizer.param_groups[0]['params']
    optim_alpha = model.policy.optimizer.param_groups[0]['alpha']

    optim_index = 0
    for key, value in state_dict.items():
        # print(key)
        state_dict[key].add_(optim_alpha, optim_dict[optim_index])
        optim_index += 1

    model.policy.load_state_dict(state_dict)

    # For Trained Model 2
    state_dict = model.policy.state_dict()
    optim_dict = model_trained_2.policy.optimizer.param_groups[0]['params']
    optim_alpha = model.policy.optimizer.param_groups[0]['alpha']

    optim_index = 0
    for key, value in state_dict.items():
        # print(key)
        state_dict[key].add_(optim_alpha, optim_dict[optim_index])
        optim_index += 1

    model.policy.load_state_dict(state_dict)


    evaluate(model, env, 'Updated Model', verbose=1)

Train Iter:  0
Starting Training
Starting Training
Completed Training
Completed Training
0 63.9 1 58.5 2 62.6 3 53.5 4 72.9 5 54.9 6 62.8 7 72.9 8 63.4 9 63.0 Type Trained Model 1 Mean reward: 62.839999999999996
0 61.2 1 53.5 2 72.5 3 54.7 4 67.7 5 50.8 6 62.7 7 55.9 8 80.7 9 67.9 Type Trained Model 2 Mean reward: 62.760000000000005
0 57.3 1 64.6 2 60.9 3 58.4 4 58.8 5 54.1 6 75.5 7 55.6 8 54.6 9 61.7 Type Initial Model Mean reward: 60.15
0 59.3 1 60.2 2 67.0 3 84.3 4 66.3 5 79.1 6 79.3 7 54.8 8 96.1 9 80.4 Type Updated Model Mean reward: 72.67999999999999
Train Iter:  1
Starting TrainingStarting Training

Completed Training
Completed Training
0 49.4 1 50.5 2 69.4 3 72.1 4 73.2 5 58.9 6 56.1 7 56.7 8 65.2 9 42.3 Type Trained Model 1 Mean reward: 59.38000000000001
0 70.2 1 61.6 2 64.8 3 66.6 4 65.8 5 56.1 6 52.5 7 69.2 8 69.8 9 68.3 Type Trained Model 2 Mean reward: 64.49
0 55.5 1 57.2 2 51.9 3 51.3 4 62.1 5 93.5 6 68.7 7 65.0 8 57.6 9 70.5 Type Initial Model Mean reward: 63.33
0 67.8 1

In [9]:
model.get_parameters()

{'policy': OrderedDict([('mlp_extractor.policy_net.0.weight',
               tensor([[-13.6661,  -1.6162,   5.2286,  -3.7724],
                       [ 15.5470,  -8.9814,  10.4736, -11.1178],
                       [  2.0127,  -2.3448,  -1.4312,   1.2207],
                       [ -0.9364,   6.6811,  -3.2110,   2.2838],
                       [ -1.8378, -16.7624,  -4.6042,  16.0163],
                       [-15.0264,   1.9382,   8.2095,  12.7957],
                       [ 22.8689,   1.7694,  11.2260,   0.9689],
                       [  7.0476,  16.5845,  -8.6652,  -7.1861],
                       [-13.0648,  11.9390,   8.2493,   2.3556],
                       [  8.0456,   5.6632,  -3.0767,  -3.3188],
                       [  4.1010,  10.4970,   5.0210, -30.8385],
                       [  5.4770,  -2.7635,  -9.2190,  -7.0158],
                       [ -0.5031,  -8.9173,  14.9004,  15.6148],
                       [-14.1457,  20.2595,  -4.5194,   0.0946],
                       [  3.

In [10]:
model.save('a2c_lunar_multiproc')

In [11]:
# Exporting Params as JSON
## Function to Convert Params Dict to Flattened List
def flatten_list(params):
    """
    :param params: (dict)
    :return: (np.ndarray)
    """
    params_ = {}
    for key in params.keys():
        params_[key] = params[key].tolist()
    return params_
## Write Parameters to JSON File
import json

all_params = model.get_parameters()
pol_params = flatten_list(all_params['policy'])

all_params['policy'] = pol_params

with open('a2c_lunar_multiproc.json', 'w') as f:
    json.dump(all_params, f, indent='\t')

In [12]:
model_loaded = ALGO(
    "MlpPolicy",
    env
)

evaluate(model_loaded,env, verbose=1)

new_params = all_params
loaded_pol_params = new_params['policy']
for key in loaded_pol_params.keys():
    loaded_pol_params[key] = th.tensor(loaded_pol_params[key])

new_params['policy'] = loaded_pol_params

model_loaded.set_parameters(new_params)

0 9.5 1 9.3 2 8.9 3 9.7 4 9.3 5 8.9 6 9.1 7 8.9 8 9.8 9 9.3 Type  Mean reward: 9.27


In [13]:
env.reset()
evaluate(model_loaded,env, verbose=1)

0 79.3 1 89.0 2 75.4 3 61.1 4 78.8 5 66.9 6 63.8 7 58.4 8 66.0 9 78.9 Type  Mean reward: 71.75999999999999
