# Multi Train Gradient Update

## Importing Libraries

In [1]:
from typing import Dict
import threading

import gym
import numpy as np
import torch

from stable_baselines3 import A2C as ALGO
from stable_baselines3.common.evaluation import evaluate_policy

In [2]:
# Hyper-Parameters
# Number of client models: one model (and one training thread) is created per
# client, and the clients' policy weights are later averaged into the global model.
NUM_CLIENT_MODELS = 2

## Initialize the Environment and Models

In [3]:
# Environment id shared by every environment instance created below.
ENV_ID = 'CartPole-v1'

# `env` backs the global model and is the environment passed to every
# evaluate(...) call in this notebook.
env = gym.make(ENV_ID)
global_model = ALGO(
    "MlpPolicy",
    env
)

# Each client gets its OWN environment instance. The clients are trained
# concurrently in separate threads; if they all wrapped the same stateful
# environment object, their reset()/step() calls would interleave on a single
# episode and corrupt each other's rollouts.
client_models = [
    ALGO("MlpPolicy", gym.make(ENV_ID))
    for _ in range(NUM_CLIENT_MODELS)
]

## Functions to Evaluate Model and Train Model within Thread

In [4]:
def evaluate(model, env, message = '', verbose = 0):
    """Print the mean reward of ``model`` on ``env`` over ten evaluation rounds.

    Every round calls ``evaluate_policy(model, env)`` and keeps the first
    element of the returned pair. The function returns nothing; the mean is
    only printed.

    :param model: model handed to ``evaluate_policy``
    :param env: environment handed to ``evaluate_policy``
    :param message: label inserted into the printed summary line
    :param verbose: when equal to 1, additionally print the round index and
        the reward of each round, space separated on a single line
    """
    rewards = []
    for round_idx in range(10):
        reward, _unused = evaluate_policy(model, env)
        if verbose == 1:
            # No newline here: the summary line follows on the same line.
            print(round_idx, reward, end=" ")
        rewards.append(reward)

    # Average over the rewards in ascending order.
    rewards.sort()
    average = np.mean(rewards)
    print(f'Type {message} Mean reward: {average}')

In [5]:
def train(model, timesteps):
    """Run one learning session on ``model`` and announce its start and end.

    :param model: object exposing ``learn(total_timesteps=...)``
    :param timesteps: value forwarded to ``learn`` as ``total_timesteps``
    """
    start_banner, done_banner = 'Starting Training', 'Completed Training'

    print(start_banner)
    run_learning = model.learn
    run_learning(total_timesteps=timesteps)
    # Only reached when learning finished without raising.
    print(done_banner)

## Initial Evaluation

In [6]:
# Start every client from the global model's current parameters.
for model in client_models:
    model.set_parameters(global_model.get_parameters())

# Baseline scores: the global model first, then each client in list order.
for model in [global_model, *client_models]:
    evaluate(model, env)



Type  Mean reward: 9.16
Type  Mean reward: 9.14
Type  Mean reward: 9.34


## Train for 1K Steps and Evaluate

In [7]:
# One worker thread per client; each trains its own model for 1,000 timesteps.
client_threads = [
    threading.Thread(target=train, args=(client_models[idx], 1_000))
    for idx in range(NUM_CLIENT_MODELS)
]

# Launch all workers first so they run concurrently...
for thread in client_threads:
    thread.start()

# ...then block until every worker has finished.
for thread in client_threads:
    thread.join()

# Score the global model (not touched by the threads), then each client.
for model in [global_model, *client_models]:
    evaluate(model, env)

Starting Training
Starting Training
Completed Training
Completed Training
Type  Mean reward: 9.180000000000001
Type  Mean reward: 162.9
Type  Mean reward: 62.17


## Average Client Weights into the Global Model and Evaluate

In [8]:
global_dict = global_model.policy.state_dict()

# Snapshot each client's policy weights once, instead of once per key.
client_states = [client.policy.state_dict() for client in client_models]

# Replace every entry of the global state dict with the element-wise mean of
# the corresponding client tensors (cast to float before averaging).
for key in list(global_dict):
    per_client = [state[key].float() for state in client_states]
    global_dict[key] = torch.stack(per_client, 0).mean(0)

# Push the averaged weights into the global policy.
global_model.policy.load_state_dict(global_dict)

# Hand the averaged weights back to every client.
for model in client_models:
    model.policy.load_state_dict(global_model.policy.state_dict())

evaluate(global_model, env)

Type  Mean reward: 93.47


In [11]:
# Repeated rounds of: train every client in parallel -> evaluate -> average the
# clients' policy weights into the global model -> copy the average back to
# the clients -> evaluate the updated global model.
for i in range(100):
    print('Train Iter: ', i)

    # Create Threads: one per client, each training for 1,000 timesteps
    client_threads = [
        threading.Thread(target=train, args=(client_models[ci], 1_000))
        for ci in range(NUM_CLIENT_MODELS)
    ]

    # Start Threads
    for thread in client_threads:
        thread.start()

    # Join Threads (wait until thread is completely executed)
    for thread in client_threads:
        thread.join()

    # Evaluation After Training (the global model still holds the weights
    # from before this round)
    evaluate(global_model, env, "Global Initial Model")
    for ci in range(NUM_CLIENT_MODELS):
        evaluate(client_models[ci], env, f'Trained Model {ci}')

    # Accumulate Client Parameters / Weights: element-wise mean per key.
    # Client state dicts are fetched once rather than once per key.
    global_dict = global_model.policy.state_dict()
    client_states = [client.policy.state_dict() for client in client_models]
    for k in global_dict.keys():
        global_dict[k] = torch.stack([state[k].float() for state in client_states], 0).mean(0)

    # Load New Parameters to Global Model
    global_model.policy.load_state_dict(global_dict)

    # Load New Parameters to clients.
    # The loop variable is deliberately still called `model`: later cells in
    # this notebook read that name after the loop has finished.
    averaged_state = global_model.policy.state_dict()
    for model in client_models:
        model.policy.load_state_dict(averaged_state)

    # Evaluate Updated Global Model. This must be `global_model` itself, not
    # the leftover loop variable `model` (which is the last client).
    evaluate(global_model, env, 'Global Updated Model', verbose=0)

Train Iter:  0
Starting Training
Starting Training
Completed Training
Completed Training




Type Global Initial Model Mean reward: 94.95
Type Trained Model 0 Mean reward: 67.08000000000001
Type Trained Model 1 Mean reward: 139.52
Type Global Updated Model Mean reward: 84.61
Train Iter:  1
Starting Training
Starting Training
Completed Training
Completed Training
Type Global Initial Model Mean reward: 85.98
Type Trained Model 0 Mean reward: 207.7
Type Trained Model 1 Mean reward: 78.86999999999999
Type Global Updated Model Mean reward: 100.49999999999999
Train Iter:  2
Starting Training
Starting Training
Completed Training
Completed Training
Type Global Initial Model Mean reward: 98.96000000000001
Type Trained Model 0 Mean reward: 287.69999999999993
Type Trained Model 1 Mean reward: 91.03999999999999
Type Global Updated Model Mean reward: 148.88000000000002
Train Iter:  3
Starting Training
Starting Training
Completed Training
Completed Training
Type Global Initial Model Mean reward: 146.1
Type Trained Model 0 Mean reward: 249.4
Type Trained Model 1 Mean reward: 482.52
Type Glob

In [None]:
# NOTE(review): in the cells above, `model` is only ever bound as a `for` loop
# variable (e.g. `for model in client_models`), so this call inspects whichever
# object the most recently executed loop left behind — confirm that this is the
# model meant to be inspected (as opposed to `global_model`).
model.get_parameters()

In [None]:
# NOTE(review): `model` here is the leftover loop variable from an earlier
# `for model in client_models` loop — confirm this is the model that should be
# saved. The file name mentions "lunar" although the environment created above
# is 'CartPole-v1' — TODO confirm the intended name.
model.save('a2c_lunar_multiproc')

In [None]:
# Exporting Params as JSON
## Function to convert a params dict of tensors into a dict of plain (nested) lists
def flatten_list(params):
    """
    Build a copy of ``params`` in which every value is a plain Python list.

    :param params: (dict) mapping of names to objects exposing ``tolist()``
    :return: (dict) new dict with the same keys; each value is ``tolist()`` of the original
    """
    return {name: value.tolist() for name, value in params.items()}
## Write Parameters to JSON File
import json


def _to_jsonable(obj):
    """Recursively replace tensors/arrays (anything with ``tolist``) by plain lists."""
    if hasattr(obj, 'tolist'):
        return obj.tolist()
    if isinstance(obj, dict):
        return {key: _to_jsonable(value) for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [_to_jsonable(item) for item in obj]
    return obj


# NOTE(review): `model` is the leftover loop variable from an earlier
# `for model in client_models` loop — confirm it is the model to export.
all_params = model.get_parameters()
pol_params = flatten_list(all_params['policy'])

# Keep the in-memory dict as before: only the 'policy' entry becomes lists.
all_params['policy'] = pol_params

# Only 'policy' was converted above; any other entry that still contains
# tensors would make json.dump raise TypeError. Dump a fully converted copy
# and leave `all_params` itself untouched.
with open('a2c_lunar_multiproc.json', 'w') as f:
    json.dump(_to_jsonable(all_params), f, indent='\t')

In [None]:
# Fresh, untrained model that will receive the exported parameters.
model_loaded = ALGO(
    "MlpPolicy",
    env
)

# Score it before loading, for comparison with the evaluation after loading.
evaluate(model_loaded, env, verbose=1)

# Rebuild tensors from the list-valued policy parameters.
# - `torch` is the name imported at the top of the notebook (`th` was never
#   defined and raised NameError).
# - A new dict is built instead of mutating `all_params` through an alias, so
#   re-running this cell always starts from the same input.
new_params = dict(all_params)
new_params['policy'] = {
    key: torch.as_tensor(value)
    for key, value in all_params['policy'].items()
}

model_loaded.set_parameters(new_params)

In [None]:
# Reset the environment, then score the model that received the restored
# parameters (verbose=1 also prints every individual evaluation round).
env.reset()
evaluate(model_loaded,env, verbose=1)