# Multi Train Gradient Update

## Importing Libraries

In [1]:
from typing import Dict
import threading

import gym
import numpy as np
import torch

from stable_baselines3 import A2C as ALGO
from stable_baselines3.common.evaluation import evaluate_policy

In [2]:
# Hyper-Parameters
# Number of client models created, and of threads used per training round.
NUM_CLIENT_MODELS = 2
# Timesteps handed to each client's `learn` call in one training round.
NUM_TRAINING_STEPS = 10_000
# Number of train-then-average rounds run by the iterated training loop.
NUM_ITERATIONS = 10

## Initialize Environment and Models

In [3]:
# Environment and model used on the coordinating ("global") side.
env = gym.make('LunarLander-v2')
global_model = ALGO("MlpPolicy", env)

# Every client model is built on its own, separately created environment.
client_models = [
    ALGO("MlpPolicy", gym.make('LunarLander-v2'))
    for _ in range(NUM_CLIENT_MODELS)
]

## Functions to Evaluate Model and Train Model within Thread

In [4]:
def evaluate(model, env, message = '', verbose = 0, iterations = 2):
    """
    Run ``evaluate_policy`` several times and print the mean of the rewards.

    Nothing is returned; the summary line is the only result.

    :param model: model handed to ``evaluate_policy``
    :param env: environment handed to ``evaluate_policy``
    :param message: (str) label inserted into the printed summary line
    :param verbose: (int) when 1, also print each round's index and reward
    :param iterations: (int) number of ``evaluate_policy`` calls to average
        over; must be at least 1
    :raises ValueError: if ``iterations`` is smaller than 1
    """
    if iterations < 1:
        # An empty reward list would produce a NaN mean.
        raise ValueError(f'iterations must be >= 1, got {iterations}')

    fitnesses = []
    for i in range(iterations):
        # evaluate_policy returns a pair; only its first element is used here.
        fitness, _ = evaluate_policy(model, env)
        if verbose == 1:
            print(i, fitness, end=" ")
        fitnesses.append(fitness)

    # Sorting the rewards before averaging was redundant, so it is omitted.
    mean_fitness = np.mean(fitnesses)
    print(f'Type {message} Mean reward: {mean_fitness}')

In [5]:
def train(model, timesteps):
    """
    Call ``model.learn`` for the requested number of timesteps.

    :param model: object exposing ``learn(total_timesteps=...)``
    :param timesteps: value forwarded as ``total_timesteps``
    :return: None
    """
    learn_kwargs = {'total_timesteps': timesteps}
    model.learn(**learn_kwargs)

In [6]:
def multithread_eval(client_models):
    """
    Evaluate every model in ``client_models``, one thread per model.

    Each model is evaluated with ``evaluate`` on its own freshly created
    'LunarLander-v2' environment and labelled ``Trained Model <index>``.
    The call returns once all threads have finished; the environments
    created here are closed before returning.

    :param client_models: (list) models to evaluate; may have any length
    """
    # One environment per model so that no two threads share an environment.
    eval_envs = [gym.make('LunarLander-v2') for _ in client_models]
    try:
        # Create Threads (driven by the argument, not by a global count)
        client_threads = [
            threading.Thread(target=evaluate, args=(model, eval_env, f'Trained Model {ci}'))
            for ci, (model, eval_env) in enumerate(zip(client_models, eval_envs))
        ]

        # Start Threads
        for thread in client_threads:
            thread.start()

        # Join Threads (wait until thread is completely executed)
        for thread in client_threads:
            thread.join()
    finally:
        # Release the environments created for this call.
        for eval_env in eval_envs:
            eval_env.close()

## Initial Evaluation

In [7]:
# Start every client from the global model's current parameters.
initial_params = global_model.get_parameters()
for model in client_models:
    model.set_parameters(initial_params)

# Baseline scores before any training has happened.
evaluate(global_model, env)
multithread_eval(client_models)



Type  Mean reward: -223.39775323503417
Type Trained Model 0 Mean reward: -276.58416300378155
Type Trained Model 1 Mean reward: -253.46882942284282


## Train for 10K Steps and Evaluate

In [8]:
# One training thread per client model
client_threads = [
    threading.Thread(target=train, args=(client_models[idx], NUM_TRAINING_STEPS))
    for idx in range(NUM_CLIENT_MODELS)
]

# Start all threads before joining any, so the clients run side by side
for thread in client_threads:
    thread.start()

# Block until every client has finished its training call
for thread in client_threads:
    thread.join()

# The global model is untouched so far; score it next to the trained clients
evaluate(global_model, env)
multithread_eval(client_models)

Type  Mean reward: -296.88150507626125
Type Trained Model 1 Mean reward: -296.35482030900585
Type Trained Model 0 Mean reward: -1733.7059948234921


## Average Client Parameters and Evaluate

In [9]:
# Take each client's policy state dict once, rather than rebuilding it for
# every parameter key inside the averaging loop.
client_dicts = [client.policy.state_dict() for client in client_models]
global_dict = global_model.policy.state_dict()

# Accumulate Client Parameters / Weights: element-wise mean over the clients
for k in global_dict.keys():
    global_dict[k] = torch.stack([client_dict[k].float() for client_dict in client_dicts], 0).mean(0)

# Load New Parameters to Global Model
global_model.policy.load_state_dict(global_dict)

# Load New Parameters to clients
averaged_state = global_model.policy.state_dict()
for model in client_models:
    model.policy.load_state_dict(averaged_state)

evaluate(global_model, env)

Type  Mean reward: -1990.1841961945347


In [10]:
# Evaluation Before Iterated Training
evaluate(global_model, env, "Global Initial Model")

# Each round: train every client in its own thread, evaluate the clients,
# average their policy weights into the global model, push the average back
# to the clients, then evaluate the global model.
for i in range(NUM_ITERATIONS):
    print('Train Iter: ', i)

    # Create Threads
    client_threads = []
    for ci in range(NUM_CLIENT_MODELS):
        thread = threading.Thread(target=train, args=(client_models[ci], NUM_TRAINING_STEPS))
        client_threads.append(thread)

    # Start Threads
    for thread in client_threads:
        thread.start()

    # Join Threads (wait until thread is completely executed)
    for thread in client_threads:
        thread.join()

    # Evaluation after Training
    multithread_eval(client_models)

    # Accumulate Client Parameters / Weights.
    # The client state dicts are taken once per round instead of once per key.
    client_dicts = [client.policy.state_dict() for client in client_models]
    global_dict = global_model.policy.state_dict()
    for k in global_dict.keys():
        global_dict[k] = torch.stack([client_dict[k].float() for client_dict in client_dicts], 0).mean(0)

    # Load New Parameters to Global Model
    global_model.policy.load_state_dict(global_dict)

    # Load New Parameters to clients
    averaged_state = global_model.policy.state_dict()
    for model in client_models:
        model.policy.load_state_dict(averaged_state)

    # Evaluate Updated Global Model (the global model itself, not the loop
    # variable left over from the client update above)
    evaluate(global_model, env, 'Global Updated Model', verbose=0)

Type Global Initial Model Mean reward: -1770.0712671525544
Train Iter:  0
Type Trained Model 1 Mean reward: -3.6711915621364577
Type Trained Model 0 Mean reward: -59.250647715966466
Type Global Updated Model Mean reward: 3.0320234181764576
Train Iter:  1
Type Trained Model 1 Mean reward: -20.633241837677932
Type Trained Model 0 Mean reward: 7.087361509526905
Type Global Updated Model Mean reward: -5.288396007391956
Train Iter:  2
Type Trained Model 0 Mean reward: 37.9332150891186
Type Trained Model 1 Mean reward: -50.09303499287578
Type Global Updated Model Mean reward: -4.3426747398871965
Train Iter:  3
Type Trained Model 0 Mean reward: -16.676327334890093
Type Trained Model 1 Mean reward: 31.439973291876456
Type Global Updated Model Mean reward: 120.63703586988098
Train Iter:  4
Type Trained Model 1 Mean reward: 71.98086525731351
Type Trained Model 0 Mean reward: 26.816471511568338
Type Global Updated Model Mean reward: 25.73925326213365
Train Iter:  5
Type Trained Model 1 Mean rewar

In [11]:
# Inspect the global model's parameters; as the last expression of the cell,
# the returned dict is rendered as the cell output.
global_model.get_parameters()

{'policy': OrderedDict([('mlp_extractor.policy_net.0.weight',
               tensor([[ 1.0718e-01, -1.3142e-01,  8.1531e-02, -6.2127e-01, -3.0137e-01,
                        -7.7027e-03,  1.3792e-01,  2.3970e-01],
                       [-6.8553e-02, -5.0693e-02,  2.5027e-01,  6.0018e-01, -1.7604e-01,
                        -2.1755e-01,  1.6782e-01,  3.9031e-01],
                       [-6.4633e-02, -1.4975e-02,  3.6865e-01,  4.1997e-01, -6.1053e-01,
                        -3.7117e-01,  1.4363e-01,  1.9571e-01],
                       [-1.0289e-01,  1.4346e-01,  5.5555e-01,  1.1888e-01, -4.0846e-01,
                        -3.1693e-01, -2.6111e-03, -1.6400e-01],
                       [-1.2851e-01,  1.7242e-01, -4.8431e-01,  6.7283e-01,  2.5170e-02,
                         4.5872e-02, -3.0708e-01,  5.8815e-02],
                       [ 1.5102e-01,  1.4502e-01, -4.5250e-01, -1.7336e-01,  2.6811e-01,
                         1.1459e-01, -4.4270e-01, -2.1618e-01],
                    

In [12]:
# Save the global model under the name 'a2c_lunar_multiproc'; the same name is
# passed to ALGO.load further down to restore it.
global_model.save('a2c_lunar_multiproc')

In [13]:
# Exporting Params as JSON
## Function to Convert Params Dict Values to Nested Python Lists
def flatten_list(params):
    """
    Convert every value of ``params`` to plain Python lists via ``.tolist()``.

    Despite the name, nothing is flattened: each value keeps its nesting.
    A new dict with the same keys is returned; ``params`` is not modified.

    :param params: (dict) mapping whose values expose ``.tolist()``
    :return: (dict) same keys, each value replaced by ``value.tolist()``
    """
    return {key: value.tolist() for key, value in params.items()}
## Write Parameters to JSON File
import json

all_params = global_model.get_parameters()

# Replace the 'policy' entry by its nested-list form so it can be serialised.
# NOTE(review): every other entry of all_params is dumped unchanged and must
# already be JSON-serialisable - confirm.
all_params['policy'] = pol_params = flatten_list(all_params['policy'])

with open('a2c_lunar_multiproc.json', 'w') as json_file:
    json.dump(all_params, json_file, indent='\t')

In [14]:
# Disabled alternative: rebuild a model from the exported JSON parameters
# instead of the saved model. Kept for reference only.
#
# model_loaded = ALGO(
#     "MlpPolicy",
#     env
# )

# evaluate(model_loaded,env, verbose=1)

# import json
# with open('a2c_lunar_multiproc.json', 'r') as f:  # read mode; 'w' would truncate the file
#     new_params = json.load(f)

# loaded_pol_params = new_params['policy']
# for key in loaded_pol_params.keys():
#     loaded_pol_params[key] = torch.tensor(loaded_pol_params[key])  # this notebook imports `torch`, not `th`

# new_params['policy'] = loaded_pol_params

# model_loaded.set_parameters(new_params)

# Restore the model saved above under 'a2c_lunar_multiproc', passing the
# existing `env` to the loader.
model_loaded = ALGO.load('a2c_lunar_multiproc', env)

In [15]:
# Reset the environment, then score the reloaded model; verbose=1 also prints
# the reward of each individual evaluation round.
env.reset()
evaluate(model_loaded,env, verbose=1)

0 158.35763339629293 1 156.59294999945695 Type  Mean reward: 157.47529169787492
