# Multi Train Gradient Update

## Importing Libraries

In [1]:
from typing import Dict
import threading

import gym
import numpy as np
import torch as th

from stable_baselines3 import A2C as ALGO
from stable_baselines3.common.evaluation import evaluate_policy

## Init. ENV and Model

In [2]:
env = gym.make('CartPole-v1')
model = ALGO(
    "MlpPolicy",
    env
)

model_trained_1 = ALGO(
    "MlpPolicy",
    env
)

model_trained_2 = ALGO(
    "MlpPolicy",
    env
)

## Functions to Evaluate Model and Train Model within Thread

In [3]:
def evaluate(model, env, message = '', verbose = 0):
    fitnesses = []
    iterations = 10
    for i in range(iterations):
        fitness, _ = evaluate_policy(model, env)
        if verbose == 1:
            print(i, fitness, end=" ")
        fitnesses.append(fitness)

    mean_fitness = np.mean(sorted(fitnesses))
    print(f'Type {message} Mean reward: {mean_fitness}')

In [4]:
def train(model, timesteps):
    print('Starting Training')
    model.learn(total_timesteps=timesteps)
    print('Completed Training')

## Initial Evaluation

In [5]:
evaluate(model, env)
evaluate(model_trained_1, env)
evaluate(model_trained_2, env)



Type  Mean reward: 9.110000000000001
Type  Mean reward: 9.73
Type  Mean reward: 86.55


## Train for 1K Steps and Evaluate

In [6]:
# Train MT Model 1
t1 = threading.Thread(target=train, args=(model_trained_1, 10_00))

# Train MT Model 2
t2 = threading.Thread(target=train, args=(model_trained_2, 10_00))

# starting thread
t1.start()
t2.start()

# wait until thread is completely executed
t1.join()
t2.join()


# model_trained.learn(total_timesteps=10_00)
evaluate(model_trained_1, env)
evaluate(model_trained_2, env)
evaluate(model, env)

Starting Training
Starting Training
Completed Training
Completed Training
Type  Mean reward: 50.940000000000005
Type  Mean reward: 9.309999999999999
Type  Mean reward: 9.24


## Apply Gradient and Evaluate

In [7]:
# For Trained Model 1
state_dict = model.policy.state_dict()
optim_dict = model_trained_1.policy.optimizer.param_groups[0]['params']
optim_alpha = model.policy.optimizer.param_groups[0]['alpha']

optim_index = 0
for key, value in state_dict.items():
    # print(key)
    state_dict[key].add_(optim_alpha, optim_dict[optim_index])
    optim_index += 1

model.policy.load_state_dict(state_dict)

# For Trained Model 2
state_dict = model.policy.state_dict()
optim_dict = model_trained_2.policy.optimizer.param_groups[0]['params']
optim_alpha = model.policy.optimizer.param_groups[0]['alpha']

optim_index = 0
for key, value in state_dict.items():
    # print(key)
    state_dict[key].add_(optim_alpha, optim_dict[optim_index])
    optim_index += 1

model.policy.load_state_dict(state_dict)
model_trained_1.policy.load_state_dict(state_dict)
model_trained_2.policy.load_state_dict(state_dict)


evaluate(model, env)

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ../torch/csrc/utils/python_arg_parser.cpp:1025.)
  if __name__ == '__main__':


Type  Mean reward: 101.23


In [8]:
for i in range(10):
    print('Train Iter: ', i)

    # Train MT Model 1
    t1 = threading.Thread(target=train, args=(model_trained_1, 10_00))

    # Train MT Model 2
    t2 = threading.Thread(target=train, args=(model_trained_2, 10_00))

    # starting thread
    t1.start()
    t2.start()

    # wait until thread is completely executed
    t1.join()
    t2.join()

    evaluate(model_trained_1, env, 'Trained Model 1', verbose=1)
    evaluate(model_trained_2, env, 'Trained Model 2', verbose=1)
    evaluate(model, env, 'Initial Model', verbose=1)

    # For Trained Model 1
    state_dict = model.policy.state_dict()
    optim_dict = model_trained_1.policy.optimizer.param_groups[0]['params']
    optim_alpha = model.policy.optimizer.param_groups[0]['alpha']

    optim_index = 0
    for key, value in state_dict.items():
        # print(key)
        state_dict[key].add_(optim_alpha, optim_dict[optim_index])
        optim_index += 1

    model.policy.load_state_dict(state_dict)

    # For Trained Model 2
    state_dict = model.policy.state_dict()
    optim_dict = model_trained_2.policy.optimizer.param_groups[0]['params']
    optim_alpha = model.policy.optimizer.param_groups[0]['alpha']

    optim_index = 0
    for key, value in state_dict.items():
        # print(key)
        state_dict[key].add_(optim_alpha, optim_dict[optim_index])
        optim_index += 1

    model.policy.load_state_dict(state_dict)


    evaluate(model, env, 'Updated Model', verbose=1)

Train Iter:  0
Starting Training
Starting Training
Completed Training
Completed Training
0 126.3 1 164.4 2 151.6 3 186.8 4 101.8 5 137.1 6 102.7 7 160.3 8 162.3 9 136.6 Type Trained Model 1 Mean reward: 142.98999999999998
0 51.6 1 65.1 2 69.1 3 75.1 4 67.5 5 65.1 6 69.7 7 56.0 8 59.0 9 64.8 Type Trained Model 2 Mean reward: 64.3
0 68.3 1 105.4 2 93.3 3 63.1 4 60.5 5 108.2 6 61.7 7 111.2 8 73.3 9 107.5 Type Initial Model Mean reward: 85.25000000000001
0 101.7 1 84.8 2 107.0 3 151.5 4 131.2 5 145.4 6 118.4 7 109.2 8 184.5 9 117.0 Type Updated Model Mean reward: 125.07000000000001
Train Iter:  1
Starting Training
Starting Training
Completed Training
Completed Training
0 181.3 1 186.0 2 224.1 3 147.6 4 237.3 5 145.3 6 194.1 7 200.3 8 158.6 9 225.8 Type Trained Model 1 Mean reward: 190.04
0 155.0 1 149.2 2 165.2 3 156.5 4 166.4 5 161.9 6 176.8 7 155.2 8 147.2 9 159.7 Type Trained Model 2 Mean reward: 159.31
0 128.1 1 179.7 2 88.6 3 96.8 4 134.1 5 79.2 6 147.6 7 143.8 8 78.1 9 117.2 Type Ini

In [9]:
model.get_parameters()

{'policy': OrderedDict([('mlp_extractor.policy_net.0.weight',
               tensor([[-4.7994,  2.7285,  1.8746,  4.0797],
                       [ 0.7623, -7.0949,  2.1134, -1.0655],
                       [ 1.9194,  3.3921, -0.3741,  3.4626],
                       [-3.6278, -1.0268, -2.4102,  3.2928],
                       [ 1.1961, -0.9835, -2.6474,  4.2740],
                       [-3.1046,  3.6106, -5.5946,  0.2108],
                       [ 0.1898, -4.1656, -1.2745,  1.0092],
                       [-3.5225,  0.8290,  4.2771,  2.9213],
                       [-0.1257, -0.5160, -1.3684, -3.4648],
                       [-1.7615,  0.8610,  1.5810,  3.8147],
                       [-3.9144, -0.5135,  0.4511,  1.2394],
                       [ 4.6794,  1.8337, -0.4528, -0.6097],
                       [ 3.7276,  0.2791,  3.8379,  4.3595],
                       [ 1.5737, -0.0986, -0.5604, -1.5846],
                       [ 0.0509,  2.3838, -1.1136,  2.0262],
                       

In [10]:
model.save('a2c_lunar_multiproc')

In [11]:
# Exporting Params as JSON
## Function to Convert Params Dict to Flattened List
def flatten_list(params):
    """
    :param params: (dict)
    :return: (np.ndarray)
    """
    params_ = {}
    for key in params.keys():
        params_[key] = params[key].tolist()
    return params_
## Write Parameters to JSON File
import json

all_params = model.get_parameters()
pol_params = flatten_list(all_params['policy'])

all_params['policy'] = pol_params

with open('a2c_lunar_multiproc.json', 'w') as f:
    json.dump(all_params, f, indent='\t')

In [12]:
model_loaded = ALGO(
    "MlpPolicy",
    env
)

evaluate(model_loaded,env, verbose=1)

new_params = all_params
loaded_pol_params = new_params['policy']
for key in loaded_pol_params.keys():
    loaded_pol_params[key] = th.tensor(loaded_pol_params[key])

new_params['policy'] = loaded_pol_params

model_loaded.set_parameters(new_params)

0 285.4 1 150.9 2 149.6 3 153.2 4 146.2 5 235.6 6 233.5 7 151.5 8 243.4 9 287.2 Type  Mean reward: 203.65000000000003


In [13]:
env.reset()
evaluate(model_loaded,env, verbose=1)

0 153.9 1 128.9 2 182.4 3 145.9 4 161.4 5 159.1 6 152.4 7 185.1 8 177.8 9 145.6 Type  Mean reward: 159.25
