# Multi Train Gradient Update

## Importing Libraries

In [1]:
from typing import Dict
import threading

import gym
import numpy as np
import torch

from stable_baselines3 import PPO as ALGO
from stable_baselines3.common.evaluation import evaluate_policy

  for external in metadata.entry_points().get(self.group, []):


In [2]:
# Hyper-Parameters
NUM_CLIENT_MODELS = 4
NUM_TRAINING_STEPS = 1000
NUM_ITERATIONS = 10
ENV_NAME = 'CartPole-v1'


## Init. ENV and Model

In [3]:
env = gym.make(ENV_NAME)
global_model = ALGO(
    "MlpPolicy",
    env
)

client_models = [ALGO("MlpPolicy", gym.make(ENV_NAME)) for i in range(NUM_CLIENT_MODELS)]

## Functions to Evaluate Model and Train Model within Thread

In [4]:
def evaluate(models, env, index, eval_results, message = '', verbose = 0):
    # print('Starting Eval')
    fitnesses = []
    iterations = 10
    for i in range(iterations):
        fitness, _ = evaluate_policy(models[index], env)
        if verbose == 1:
            print(i, fitness, end=" ")
        fitnesses.append(fitness)

    mean_fitness = np.mean(sorted(fitnesses))
    eval_results[index] = mean_fitness
    print(f'Type {message} Mean reward: {mean_fitness}')

In [5]:
def train(models, index, timesteps):
    # print('Starting Training')
    models[index] = models[index].learn(reset_num_timesteps=False, total_timesteps=timesteps)
    # print('Completed Training')


In [6]:
def multithread_eval(client_models):
    # Create Threads
    client_threads = []
    eval_results = [0 for i in range(len(client_models))] 
    for ci in range(NUM_CLIENT_MODELS):
        thread = threading.Thread(target=evaluate, args=(client_models, gym.make(ENV_NAME), ci, eval_results, f'Trained Model {ci}'))
        client_threads.append(thread)

    # Start Threads
    for thread in client_threads:
        thread.start()

    # Join Threads (wait until thread is completely executed)
    for thread in client_threads:
        thread.join()

    return eval_results


## Initial Evaluation

In [7]:
for model in client_models:
    model.set_parameters(global_model.get_parameters())

global_model.save('initial')

evaluate([global_model], env, 0, [0])

multithread_eval(client_models)



Type  Mean reward: 9.879999999999999
Type Trained Model 2 Mean reward: 9.879999999999999
Type Trained Model 0 Mean reward: 9.830000000000002
Type Trained Model 1 Mean reward: 9.84
Type Trained Model 3 Mean reward: 9.95


[9.830000000000002, 9.84, 9.879999999999999, 9.95]

# Train and Evaluate

In [8]:
# Evaluation Before Iterated Training
evaluate([global_model], env, 0, [0], "Global Initial Model")

for i in range(NUM_ITERATIONS):
    print('Train Iter: ', i)

    # Create Threads
    client_threads = [] 
    for ci in range(NUM_CLIENT_MODELS):
        thread = threading.Thread(target=train, args=(client_models, ci, NUM_TRAINING_STEPS))
        client_threads.append(thread)


    # Start Threads
    for thread in client_threads:
        thread.start()

    # Join Threads (wait until thread is completely executed)
    for thread in client_threads:
        thread.join()

    # Optimization Steps Check
    print('Optim Steps: ', client_models[0].get_parameters()['policy.optimizer']['state'][0]['step'])

    # Evaluation after Training
    results = multithread_eval(client_models)
    total = sum(results)

    # Accumulate Client Parameters / Weights
    global_dict = global_model.policy.state_dict()
    for k in global_dict.keys():
        global_dict[k] = torch.stack([client_models[i].policy.state_dict()[k].float()*(NUM_CLIENT_MODELS*results[i]/total) for i in range(len(client_models))], 0).mean(0)

    # Load New Parameters to Global Model
    global_model.policy.load_state_dict(global_dict)

    # Load New Parameters to clients
    for model in client_models:
        model.policy.load_state_dict(global_model.policy.state_dict())

    # Evaluate Updated Global Model
    evaluate([global_model], env, 0, [0], 'Global Updated Model', verbose = 0)


Type Global Initial Model Mean reward: 9.99
Train Iter:  0
Optim Steps:  320
Type Trained Model 1 Mean reward: 108.34
Type Trained Model 2 Mean reward: 176.39
Type Trained Model 3 Mean reward: 181.3
Type Trained Model 0 Mean reward: 312.37
Type Global Updated Model Mean reward: 245.08999999999997
Train Iter:  1
Optim Steps:  640
Type Trained Model 0 Mean reward: 222.47000000000003
Type Trained Model 3 Mean reward: 255.99
Type Trained Model 2 Mean reward: 271.81000000000006
Type Trained Model 1 Mean reward: 292.27000000000004
Type Global Updated Model Mean reward: 270.2
Train Iter:  2
Optim Steps:  960
Type Trained Model 3 Mean reward: 274.69999999999993
Type Trained Model 0 Mean reward: 325.22
Type Trained Model 2 Mean reward: 345.07
Type Trained Model 1 Mean reward: 371.90999999999997
Type Global Updated Model Mean reward: 331.61999999999995
Train Iter:  3
Optim Steps:  1280
Type Trained Model 2 Mean reward: 331.43
Type Trained Model 3 Mean reward: 374.92999999999995
Type Trained Mode

In [9]:
global_model.get_parameters()

{'policy': OrderedDict([('mlp_extractor.policy_net.0.weight',
               tensor([[-1.7479e-01, -1.4356e-01, -2.9067e-01, -3.3917e-01],
                       [-2.1077e-01, -1.2386e-02,  2.1131e-01,  7.6488e-02],
                       [ 1.2600e-02, -1.1981e-01,  9.8463e-02, -3.1738e-01],
                       [-1.5360e-01, -1.7929e-01, -2.2833e-01, -1.0829e-01],
                       [ 2.8456e-02,  1.0108e-01,  1.2718e-01, -6.6476e-02],
                       [ 4.4974e-02,  7.0620e-02,  4.7945e-01,  3.0896e-01],
                       [ 4.9151e-01,  1.3986e-01,  8.1873e-02,  1.8094e-01],
                       [-8.8696e-03, -2.3996e-02, -4.7761e-01, -6.6724e-02],
                       [ 1.6739e-01,  9.2729e-02, -5.1045e-02,  3.3629e-02],
                       [-5.4289e-02, -1.5929e-01, -2.8036e-01, -3.7247e-01],
                       [-1.2554e-01,  1.2663e-01,  2.0524e-01,  2.8253e-01],
                       [ 5.5599e-02,  8.9562e-02,  1.1913e-01,  1.1776e-01],
              

In [10]:
global_model.save('a2c_lunar_multiproc')

In [11]:
# Exporting Params as JSON
## Function to Convert Params Dict to Flattened List
def flatten_list(params):
    """
    :param params: (dict)
    :return: (np.ndarray)
    """
    params_ = {}
    for key in params.keys():
        params_[key] = params[key].tolist()
    return params_
## Write Parameters to JSON File
import json

all_params = global_model.get_parameters()
pol_params = flatten_list(all_params['policy'])

all_params['policy'] = pol_params

with open('a2c_lunar_multiproc.json', 'w') as f:
    json.dump(all_params, f, indent='\t')

In [12]:
# model_loaded = ALGO(
#     "MlpPolicy",
#     env
# )

# evaluate(model_loaded,env, verbose=1)

# import json
# with open('a2c_lunar_multiproc.json', 'w') as f:
#     new_params = json.load(f)

# loaded_pol_params = new_params['policy']
# for key in loaded_pol_params.keys():
#     loaded_pol_params[key] = th.tensor(loaded_pol_params[key])

# new_params['policy'] = loaded_pol_params

# model_loaded.set_parameters(new_params)

model_loaded = ALGO.load('a2c_lunar_multiproc', env)

In [13]:
env.reset()
evaluate(model_loaded,env, verbose=1)

TypeError: evaluate() missing 2 required positional arguments: 'index' and 'eval_results'