In [1]:
import json

import gym
import numpy as np
import torch as th

from stable_baselines3 import A2C as ALGO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env

In [2]:
# Vectorized training environment made of 4 copies of LunarLander-v2.
env = make_vec_env(env_id="LunarLander-v2", n_envs=4)

# Agent under training: ALGO (A2C, see imports) with the "MlpPolicy" policy.
model = ALGO(policy="MlpPolicy", env=env)

In [3]:
def evaluate(model, env, message='', verbose=0, iterations=10):
    """
    Run several evaluation rounds of ``model`` on ``env`` and report the mean.

    Each round calls ``evaluate_policy(model, env)`` and keeps the first
    element of its return value as that round's fitness.

    :param model: model handed to ``evaluate_policy``
    :param env: environment handed to ``evaluate_policy``
    :param message: (str) label inserted into the printed summary line
    :param verbose: (int) when 1, print each round's index and fitness
    :param iterations: (int) number of evaluation rounds, must be >= 1
        (defaults to 10, the previously hard-coded value)
    :return: mean of the per-round fitness values
    :raises ValueError: if ``iterations`` is smaller than 1
    """
    if iterations < 1:
        # An empty sample would make np.mean return NaN with a warning.
        raise ValueError(f'iterations must be >= 1, got {iterations}')

    fitnesses = []
    for i in range(iterations):
        fitness, _ = evaluate_policy(model, env)
        if verbose == 1:
            # Per-round values stay on one line; the summary is appended to it.
            print(i, fitness, end=" ")
        fitnesses.append(fitness)

    # The mean is order-independent, so the values are not sorted first.
    mean_fitness = np.mean(fitnesses)
    print(f'Type {message} Mean reward: {mean_fitness}')
    return mean_fitness

In [4]:
# Alternate 10 training bursts of 10_000 timesteps with an evaluation pass.
# NOTE(review): the 'Initial Model' label is printed after every training
# round, so it tags already-trained snapshots as well.
for i in range(10):
    print('Train Iter: ', i)
    model.learn(total_timesteps=10_000)
    evaluate(model, env, message='Initial Model', verbose=1)

Train Iter:  0
0 -4117.226584099999 1 -4283.858195700001 2 -4240.2552548 3 -3710.4929604 4 -4346.0585857 5 -4230.4947559 6 -4124.7666957 7 -3842.8394132000003 8 -4162.8251006 9 -3952.3227629000003 Type Initial Model Mean reward: -4101.1140309
Train Iter:  1
0 -809.4678341000001 1 -843.4929935 2 -817.0406953 3 -822.3820531000001 4 -817.4074712999999 5 -954.3712075999999 6 -938.2350838 7 -917.7406267 8 -950.3904405 9 -869.4689628 Type Initial Model Mean reward: -873.99973687
Train Iter:  2
0 -361.3938038 1 -278.01693539999997 2 39.891035200000005 3 -182.8434498 4 -190.5989877 5 -199.23621400000002 6 -150.60252340000002 7 -176.3088339 8 -220.8032921 9 -109.65498410000001 Type Initial Model Mean reward: -182.95679890000002
Train Iter:  3
0 -52.4953658 1 -68.28856019999999 2 -19.305675500000003 3 -23.03039830000001 4 -87.2794239 5 -82.32711400000001 6 6.086364700000002 7 35.5143082 8 95.88155409999999 9 -87.2563535 Type Initial Model Mean reward: -28.250066419999996
Train Iter:  4
0 4.68376

In [5]:
# Persist the trained model to disk under the name 'a2c_lunar_vec'.
model.save('a2c_lunar_vec')

In [6]:
# Drop the in-memory model, then restore it from the saved file and attach
# the current `env` to it.
del model
model = ALGO.load("a2c_lunar_vec", env=env)

In [7]:
# Exporting Params as JSON
## Helpers that convert tensor-valued parameter dicts into JSON-serializable dicts of nested lists
def flatten_pol(params):
    """
    Convert every value of a parameter dict into plain (nested) Python lists.

    :param params: (dict) maps names to objects exposing ``tolist()``
    :return: (dict) new dict with the same keys and ``value.tolist()`` as values
    """
    return {name: values.tolist() for name, values in params.items()}

def flatten_opt(params):
    """
    Return a JSON-friendly copy of an optimizer ``state`` mapping.

    Every per-parameter entry is shallow-copied and its ``'square_avg'`` value
    is replaced by ``square_avg.tolist()``; all other entries are carried over
    unchanged.

    The input is deliberately left untouched. The previous version overwrote
    ``'square_avg'`` inside the caller's dicts; when those dicts are shared
    with a live optimizer (NOTE(review): PyTorch's ``state_dict()`` appears to
    hand out the optimizer's own per-parameter dicts -- confirm), that replaces
    the optimizer's tensors with lists.

    :param params: (dict) maps a parameter id to a dict holding at least a
        ``'square_avg'`` entry that exposes ``tolist()``
    :return: (dict) new mapping with the same keys and converted entries
    :raises KeyError: if an entry has no ``'square_avg'`` key
    """
    return {
        key: {**state, 'square_avg': state['square_avg'].tolist()}
        for key, state in params.items()
    }


## Write Parameters to JSON File
# `json` is imported in the notebook's top import cell rather than here.

# Fetch the model's parameters and convert the tensor-valued parts
# (policy weights and optimizer state) into plain lists.
all_params = model.get_parameters()
pol_params = flatten_pol(all_params['policy'])
opt_params = flatten_opt(all_params['policy.optimizer']['state'])

# Swap the converted parts into the parameter dict that gets serialized.
all_params['policy'] = pol_params
all_params['policy.optimizer']['state'] = opt_params

with open('a2c_lunar_vec.json', 'w') as f:
    json.dump(all_params, f, indent='\t')

In [8]:
# Read the exported parameters back from disk.
with open('a2c_lunar_vec.json', 'r') as f:
    new_params = json.load(f)

# Fresh model that will receive the stored parameters.
model_loaded = ALGO(
    "MlpPolicy",
    env
)

# Policy weights: nested lists -> tensors.
loaded_pol_params = {
    name: th.tensor(values)
    for name, values in new_params['policy'].items()
}

# Optimizer state: JSON object keys are always strings, so the parameter ids
# come back as "0", "1", ... Convert them back to ints so they line up with
# the integer ids listed under 'param_groups'; with string keys the optimizer
# would not associate the restored state with its parameters.
loaded_opt_params = {
    int(param_id): {**state, 'square_avg': th.tensor(state['square_avg'])}
    for param_id, state in new_params['policy.optimizer']['state'].items()
}

new_params['policy'] = loaded_pol_params
new_params['policy.optimizer']['state'] = loaded_opt_params

model_loaded.set_parameters(new_params)

In [9]:
# Evaluate the JSON-restored model on a single environment built with gym.make.
# Note: this rebinds `env`, replacing the vectorized environment created earlier.
env = gym.make('LunarLander-v2')
evaluate(model=model_loaded, env=env, verbose=1)



0 83.0815350393769 1 100.58356478604655 2 41.75563291757671 3 28.62173695012158 4 96.94347761442336 5 81.22091310120939 6 79.46620474235505 7 117.29206659107975 8 40.76013276213398 9 44.08097024690831 Type  Mean reward: 71.38062347512316


71.38062347512316