# Multi Train Gradient Update

## Importing Libraries

In [1]:
from typing import Dict
import threading

import gym
import numpy as np
import torch

from stable_baselines3 import PPO as ALGO
from stable_baselines3.common.evaluation import evaluate_policy

  for external in metadata.entry_points().get(self.group, []):


In [2]:
# Hyper-parameters: every tunable value of the experiment lives here.
ENV_NAME = "CartPole-v1"      # Gym environment id handed to gym.make
NUM_CLIENT_MODELS = 4         # how many client models are created
NUM_TRAINING_STEPS = 1_000    # timesteps given to each client per training iteration
NUM_ITERATIONS = 10           # how many training iterations are run


## Initialize Environment and Models

In [3]:
# Global model together with the environment it is built on.
env = gym.make(ENV_NAME)
global_model = ALGO("MlpPolicy", env)

# Client models: each one is built on its own freshly created environment.
client_models = []
for _ in range(NUM_CLIENT_MODELS):
    client_models.append(ALGO("MlpPolicy", gym.make(ENV_NAME)))

## Functions to Evaluate and Train Models within Threads

In [4]:
def evaluate(model, env, message = '', verbose = 0, n_rounds = 10):
    """Call ``evaluate_policy`` repeatedly, print the average score and return it.

    :param model: model handed unchanged to ``evaluate_policy``
    :param env: environment handed unchanged to ``evaluate_policy``
    :param message: (str) label inserted into the printed summary line
    :param verbose: (int) when 1, also print the index and score of every round
    :param n_rounds: (int) number of ``evaluate_policy`` calls to average over
    :return: (float) mean of the per-round scores
    :raises ValueError: if ``n_rounds`` is smaller than 1
    """
    if n_rounds < 1:
        # Averaging an empty list would only yield NaN plus a NumPy warning.
        raise ValueError(f'n_rounds must be at least 1, got {n_rounds}')

    fitnesses = []
    for i in range(n_rounds):
        # Only the first element of the returned pair is used as the score.
        fitness, _ = evaluate_policy(model, env)
        if verbose == 1:
            print(i, fitness, end=" ")
        fitnesses.append(fitness)

    # A mean does not depend on element order, so no sorting is required.
    mean_fitness = np.mean(fitnesses)
    print(f'Type {message} Mean reward: {mean_fitness}')
    # Hand the score back so callers can log or compare it; callers that
    # ignore the return value (e.g. thread targets) are unaffected.
    return float(mean_fitness)

In [5]:
def train(models, index, timesteps):
    """Train one model of a shared list and store the result back in place.

    Written as a thread target: nothing is returned, the outcome of
    ``learn`` is written to ``models[index]`` instead.

    :param models: (list) mutable sequence of models
    :param index: (int) position of the model to train
    :param timesteps: (int) value passed to ``learn`` as ``total_timesteps``
    """
    learner = models[index]
    # reset_num_timesteps=False is forwarded on every call.
    trained = learner.learn(total_timesteps=timesteps, reset_num_timesteps=False)
    models[index] = trained


In [6]:
def multithread_eval(client_models):
    """Evaluate every model of ``client_models`` concurrently, one thread each.

    Each thread runs ``evaluate`` with its own environment created through
    ``gym.make(ENV_NAME)`` and the label ``'Trained Model <index>'``. The
    function returns after all threads have finished.

    :param client_models: (list) models to evaluate; may have any length
    """
    # One private environment per model, so no two threads share one.
    eval_envs = [gym.make(ENV_NAME) for _ in client_models]

    # Create Threads -- sized by the argument itself rather than by the
    # NUM_CLIENT_MODELS constant, so every passed model is evaluated.
    client_threads = [
        threading.Thread(target=evaluate, args=(model, eval_env, f'Trained Model {ci}'))
        for ci, (model, eval_env) in enumerate(zip(client_models, eval_envs))
    ]

    # Start Threads
    for thread in client_threads:
        thread.start()

    # Join Threads (wait until thread is completely executed)
    for thread in client_threads:
        thread.join()

    # Release the temporary environments once no thread uses them anymore.
    for eval_env in eval_envs:
        eval_env.close()

## Initial Evaluation

In [7]:
# Give every client the same starting parameters as the global model.
initial_params = global_model.get_parameters()
for model in client_models:
    model.set_parameters(initial_params)

# Keep a checkpoint of the global model before any training.
global_model.save('initial')

# Baseline scores: the global model first, then all clients in threads.
evaluate(global_model, env)
multithread_eval(client_models)



Type  Mean reward: 9.27
Type Trained Model 1 Mean reward: 9.28
Type Trained Model 3 Mean reward: 9.459999999999999
Type Trained Model 0 Mean reward: 9.3
Type Trained Model 2 Mean reward: 9.4


## Train and Evaluate

In [8]:
# Evaluation Before Iterated Training
evaluate(global_model, env, "Global Initial Model")

# Each iteration: train all clients in parallel threads, evaluate them,
# average their policy weights into the global model, then copy the
# averaged weights back to every client.
for i in range(NUM_ITERATIONS):
    print('Train Iter: ', i)

    # Create Threads: one training thread per client model
    client_threads = [
        threading.Thread(target=train, args=(client_models, ci, NUM_TRAINING_STEPS))
        for ci in range(len(client_models))
    ]

    # Start Threads
    for thread in client_threads:
        thread.start()

    # Join Threads (wait until thread is completely executed)
    for thread in client_threads:
        thread.join()

    # Optimization Steps Check
    print('Optim Steps: ', client_models[0].get_parameters()['policy.optimizer']['state'][0]['step'])

    # Evaluation after Training
    multithread_eval(client_models)

    # Accumulate Client Parameters / Weights
    # Each client's state dict is fetched once, not once per parameter key.
    client_state_dicts = [client.policy.state_dict() for client in client_models]
    global_dict = global_model.policy.state_dict()
    for k in global_dict.keys():
        # Element-wise mean over the clients' tensors for this entry.
        global_dict[k] = torch.stack([state[k].float() for state in client_state_dicts], 0).mean(0)

    # Load New Parameters to Global Model
    global_model.policy.load_state_dict(global_dict)

    # Load New Parameters to clients
    averaged_state = global_model.policy.state_dict()
    for model in client_models:
        model.policy.load_state_dict(averaged_state)

    # Evaluate Updated Global Model
    # Score global_model itself; the loop variable `model` above is merely
    # the last client and must not stand in for the global model.
    evaluate(global_model, env, 'Global Updated Model', verbose=0)

Type Global Initial Model Mean reward: 9.360000000000001
Train Iter:  0
Optim Steps:  320
Type Trained Model 0 Mean reward: 86.98
Type Trained Model 3 Mean reward: 100.82
Type Trained Model 2 Mean reward: 192.35999999999999
Type Trained Model 1 Mean reward: 332.11
Type Global Updated Model Mean reward: 148.93
Train Iter:  1
Optim Steps:  640
Type Trained Model 1 Mean reward: 147.87
Type Trained Model 2 Mean reward: 173.61
Type Trained Model 3 Mean reward: 175.47
Type Trained Model 0 Mean reward: 277.65
Type Global Updated Model Mean reward: 174.96
Train Iter:  2
Optim Steps:  960
Type Trained Model 2 Mean reward: 305.57
Type Trained Model 1 Mean reward: 326.74
Type Trained Model 3 Mean reward: 329.68
Type Trained Model 0 Mean reward: 333.05
Type Global Updated Model Mean reward: 350.71
Train Iter:  3
Optim Steps:  1280
Type Trained Model 3 Mean reward: 375.85
Type Trained Model 0 Mean reward: 390.54
Type Trained Model 2 Mean reward: 395.01000000000005
Type Trained Model 1 Mean reward: 

In [9]:
# Display the parameter dictionary of the global model as the cell output.
global_model.get_parameters()

{'policy': OrderedDict([('mlp_extractor.policy_net.0.weight',
               tensor([[-0.1020, -0.0423, -0.1422,  0.2610],
                       [-0.0262,  0.1690, -0.1342, -0.2009],
                       [ 0.0452,  0.2361, -0.1432,  0.1706],
                       [ 0.1497, -0.2435, -0.3376, -0.0246],
                       [-0.0697,  0.0147,  0.4047,  0.2382],
                       [-0.0320,  0.1160, -0.3323, -0.4182],
                       [-0.3215,  0.2539,  0.1269,  0.1439],
                       [-0.1777,  0.2547,  0.2744, -0.0048],
                       [-0.2374,  0.0122, -0.1653,  0.2059],
                       [-0.0441,  0.1529, -0.2613, -0.3425],
                       [-0.1478, -0.0044, -0.4353, -0.2480],
                       [ 0.0151, -0.2379, -0.4968, -0.1327],
                       [-0.0747,  0.1341, -0.1716, -0.0738],
                       [-0.4569, -0.0599, -0.4047, -0.4025],
                       [ 0.1470, -0.2175, -0.3141, -0.3726],
                       

In [10]:
# Save the global model to disk.
# NOTE(review): the file name says "a2c_lunar", yet ALGO is PPO and ENV_NAME is
# 'CartPole-v1'. A rename must be applied together with the JSON export and the
# ALGO.load call further down, which use the same base name.
global_model.save('a2c_lunar_multiproc')

In [11]:
# Exporting Params as JSON
## Function to Convert Params Dict to Flattened List
def flatten_list(params):
    """
    Build a new dictionary in which every value is replaced by ``value.tolist()``.

    :param params: (dict) maps names to objects that provide ``tolist()``
    :return: (dict) same keys in the same order, values converted with ``tolist()``
    """
    return {name: values.tolist() for name, values in params.items()}
## Write Parameters to JSON File
import json


def _json_default(obj):
    """Fallback for ``json.dump``: convert objects exposing ``tolist()`` to lists.

    :param obj: value the JSON encoder could not serialize on its own
    :return: result of ``obj.tolist()``
    :raises TypeError: if ``obj`` has no ``tolist`` method
    """
    if hasattr(obj, 'tolist'):
        return obj.tolist()
    raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')


all_params = global_model.get_parameters()
pol_params = flatten_list(all_params['policy'])

all_params['policy'] = pol_params

# Only the 'policy' entry was converted above. Any other entry of all_params
# (such as the 'policy.optimizer' state read in the training cell) is passed
# through as-is, so a tensor inside it would make json.dump raise TypeError.
# The default hook converts such leftovers instead of aborting mid-file.
with open('a2c_lunar_multiproc.json', 'w') as f:
    json.dump(all_params, f, indent='\t', default=_json_default)

In [12]:
# Disabled alternative: rebuild a model from the exported JSON file instead of
# loading the saved model file.
# NOTE(review): the snippet originally opened the JSON file with mode 'w', which
# would truncate it before json.load could read it, and it used `th`, a name this
# notebook never defines (torch is imported as `torch`). Both are corrected in
# the commented lines below, but the snippet is untested -- verify before enabling.

# model_loaded = ALGO(
#     "MlpPolicy",
#     env
# )

# evaluate(model_loaded,env, verbose=1)

# import json
# with open('a2c_lunar_multiproc.json', 'r') as f:
#     new_params = json.load(f)

# loaded_pol_params = new_params['policy']
# for key in loaded_pol_params.keys():
#     loaded_pol_params[key] = torch.tensor(loaded_pol_params[key])

# new_params['policy'] = loaded_pol_params

# model_loaded.set_parameters(new_params)

# Active path: load the model saved under 'a2c_lunar_multiproc', passing the existing env.
model_loaded = ALGO.load('a2c_lunar_multiproc', env)

In [13]:
# Reset the shared environment, then score the reloaded model.
env.reset()
# verbose=1 prints the index and score of every evaluation round.
evaluate(model_loaded,env, verbose=1)

0 500.0 1 500.0 2 500.0 3 500.0 4 500.0 5 500.0 6 500.0 7 500.0 8 500.0 9 500.0 Type  Mean reward: 500.0
