# Single Train Gradient Update

## Importing Libraries

In [1]:
from typing import Dict
import threading

import gym
import numpy as np
import torch as th

from stable_baselines3 import PPO as ALGO
from stable_baselines3.common.evaluation import evaluate_policy

  for external in metadata.entry_points().get(self.group, []):


## Init. ENV and Model

In [2]:
env = gym.make('CartPole-v1')
model = ALGO.load(
    "initial",
    env
)

## Functions to Evaluate Model and Train Model within Thread

In [3]:
def evaluate(model, env, message = '', verbose = 0):
    fitnesses = []
    iterations = 20
    for i in range(iterations):
        fitness, _ = evaluate_policy(model, env)
        if verbose == 1:
            print(i, fitness, end=" ")
        fitnesses.append(fitness)

    mean_fitness = np.mean(sorted(fitnesses))
    print(f'Type {message} Mean reward: {mean_fitness}')

In [4]:
def train(model, timesteps):
    # print('Starting Training')
    model.learn(total_timesteps=timesteps)
    # print('Completed Training')

## Initial Evaluation

In [5]:
evaluate(model, gym.make('CartPole-v1'))



Type  Mean reward: 9.070000000000002


## Train for 1K Steps and Evaluate

In [6]:
# Train MT Model 1
t1 = threading.Thread(target=train, args=(model, 10))

# starting thread
t1.start()

# wait until thread is completely executed
t1.join()


# model_trained.learn(total_timesteps=10_00)
evaluate(model, gym.make('CartPole-v1'))


Type  Mean reward: 144.91000000000003


## Apply Gradient and Evaluate

In [7]:
for i in range(20):
    print('Train Iter: ', i, end="| ")

    # Train MT Model 1
    t1 = threading.Thread(target=train, args=(model, 1))

    # starting thread
    t1.start()
    
    # wait until thread is completely executed
    t1.join()
    
    evaluate(model, gym.make('CartPole-v1'), 'Updated Model', verbose=0)


Train Iter:  0| Type Updated Model Mean reward: 282.505
Train Iter:  1| Type Updated Model Mean reward: 352.86499999999995
Train Iter:  2| Type Updated Model Mean reward: 244.54000000000002
Train Iter:  3| Type Updated Model Mean reward: 350.06000000000006
Train Iter:  4| Type Updated Model Mean reward: 413.93500000000006
Train Iter:  5| Type Updated Model Mean reward: 424.44000000000005
Train Iter:  6| Type Updated Model Mean reward: 472.8549999999999
Train Iter:  7| Type Updated Model Mean reward: 474.08000000000004
Train Iter:  8| Type Updated Model Mean reward: 499.62
Train Iter:  9| Type Updated Model Mean reward: 500.0
Train Iter:  10| Type Updated Model Mean reward: 500.0
Train Iter:  11| Type Updated Model Mean reward: 500.0
Train Iter:  12| Type Updated Model Mean reward: 500.0
Train Iter:  13| Type Updated Model Mean reward: 500.0
Train Iter:  14| Type Updated Model Mean reward: 500.0
Train Iter:  15| Type Updated Model Mean reward: 500.0
Train Iter:  16| Type Updated Model M

In [8]:
model.get_parameters()

{'policy': OrderedDict([('mlp_extractor.policy_net.0.weight',
               tensor([[ 8.1545e-03,  3.9859e-02, -5.9752e-01, -1.6215e-01],
                       [ 3.6693e-01,  1.0810e-01, -4.7896e-01, -2.9211e-01],
                       [-1.5548e-01, -2.8461e-01, -1.6824e-01,  2.5530e-01],
                       [-1.2323e-01,  3.0495e-01,  5.3607e-01,  3.2447e-01],
                       [ 2.1222e-01,  1.1768e-01, -7.5320e-02, -3.1634e-02],
                       [-3.0526e-02,  7.5230e-02,  6.2863e-02,  7.9263e-02],
                       [-1.2118e-01, -1.2661e-01, -3.1060e-01,  9.3076e-02],
                       [-2.8579e-01, -3.7972e-01, -3.5494e-01,  1.2195e-02],
                       [-1.5230e-01,  1.2521e-01,  3.2009e-01,  3.6858e-01],
                       [-1.9243e-01,  1.9617e-02,  2.5028e-01,  1.2537e-01],
                       [ 1.8439e-01,  2.9693e-02,  7.2991e-02,  2.8980e-01],
                       [-2.6300e-01,  1.5612e-02,  5.3567e-01,  6.4870e-01],
              

In [9]:
model.save('a2c_lunar_singleproc')

In [10]:
# Exporting Params as JSON
## Function to Convert Params Dict to Flattened List
def flatten_list(params):
    """
    :param params: (dict)
    :return: (np.ndarray)
    """
    params_ = {}
    for key in params.keys():
        params_[key] = params[key].tolist()
    return params_
## Write Parameters to JSON File
import json

all_params = model.get_parameters()
pol_params = flatten_list(all_params['policy'])

all_params['policy'] = pol_params

with open('a2c_lunar_singleproc.json', 'w') as f:
    json.dump(all_params, f, indent='\t')

TypeError: Object of type Tensor is not JSON serializable

In [None]:
model_loaded = ALGO(
    "MlpPolicy",
    env
)

evaluate(model_loaded,env, verbose=1)

new_params = all_params
loaded_pol_params = new_params['policy']
for key in loaded_pol_params.keys():
    loaded_pol_params[key] = th.tensor(loaded_pol_params[key])

new_params['policy'] = loaded_pol_params

model_loaded.set_parameters(new_params)

In [None]:
env.reset()
evaluate(model_loaded,env, verbose=1)