In [8]:
import gym
import numpy as np
import torch as th

from stable_baselines3 import A2C as ALGO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env

In [9]:
# Vectorized LunarLander environment (one copy) shared by training and evaluation.
env = make_vec_env(env_id="LunarLander-v2", n_envs=1)

# Agent built from the algorithm imported as ALGO, using the MLP policy and
# no further arguments.
model = ALGO(policy="MlpPolicy", env=env)

In [10]:
def evaluate(model, env, message = '', verbose = 0, iterations = 10):
    """Run several ``evaluate_policy`` rounds and report their average score.

    :param model: model handed unchanged to ``evaluate_policy``
    :param env: environment handed unchanged to ``evaluate_policy``
    :param message: (str) label inserted into the printed summary line
    :param verbose: (int) when 1, print each round's index and score inline
    :param iterations: (int) number of ``evaluate_policy`` rounds to average;
        defaults to 10, the value that used to be hard-coded
    :return: (float) mean of the per-round scores
    :raises ValueError: if ``iterations`` is smaller than 1
    """
    if iterations < 1:
        # Averaging zero rounds would silently yield NaN.
        raise ValueError(f'iterations must be >= 1, got {iterations}')

    fitnesses = []
    for i in range(iterations):
        # evaluate_policy yields a pair; only its first element is used here.
        fitness, _ = evaluate_policy(model, env)
        if verbose == 1:
            # end=" " keeps all rounds (and the summary below) on one line.
            print(i, fitness, end=" ")
        fitnesses.append(fitness)

    # A mean does not depend on element order, so no sorting is needed.
    mean_fitness = np.mean(fitnesses)
    print(f'Type {message} Mean reward: {mean_fitness}')
    return mean_fitness

In [11]:
# Alternate between a fixed training budget and an evaluation pass.
N_TRAIN_ROUNDS = 10
TIMESTEPS_PER_ROUND = 10_000

for i in range(N_TRAIN_ROUNDS):
    print('Train Iter: ', i)

    model.learn(total_timesteps=TIMESTEPS_PER_ROUND)

    # The model has just been trained, so label the evaluation with the
    # round it follows rather than calling it the initial model.
    evaluate(model, env, f'After Train Iter {i}', verbose=1)

Train Iter:  0


KeyboardInterrupt: 

In [None]:
# Persist the current model under the 'a2c_lunar_vec' path.
model.save(path='a2c_lunar_vec')

In [50]:
# Discard the in-memory model, then rebuild it from the saved file and
# attach the current environment to it.
del model
model = ALGO.load(path="a2c_lunar_vec", env=env)

In [51]:
# Exporting Params as JSON
## Helpers that convert tensor-valued parameter dicts into JSON-serializable nested lists
def flatten_pol(params):
    """
    Build a new dict whose values are the ``.tolist()`` form of the inputs.

    :param params: (dict) mapping of names to objects exposing ``.tolist()``
    :return: (dict) same keys, in the same order, with list-converted values
    """
    return {name: value.tolist() for name, value in params.items()}

def flatten_opt(params):
    """
    Return a copy of an optimizer ``state`` mapping with ``square_avg`` as a list.

    The input is left untouched: each per-parameter entry is shallow-copied
    before its ``square_avg`` value is replaced by ``square_avg.tolist()``.
    Writing into the caller's entries instead would swap a tensor for a plain
    list in whatever object owns them (NOTE(review): these entries may be
    shared with the live optimizer of the model -- confirm).

    :param params: (dict) mapping of keys to per-parameter state dicts, each
        holding a ``square_avg`` object that exposes ``.tolist()``
    :return: (dict) new mapping with the same keys; entries are copies whose
        ``square_avg`` is a (nested) Python list
    :raises KeyError: if an entry has no ``square_avg`` key
    """
    flattened = {}
    for key, entry in params.items():
        entry_copy = dict(entry)  # shallow copy so the caller's entry is not modified
        entry_copy['square_avg'] = entry_copy['square_avg'].tolist()
        flattened[key] = entry_copy
    return flattened


## Serialize the model parameters to a JSON file
import json

all_params = model.get_parameters()

# Swap the entries that the helpers convert for their list-based versions so
# the structure can be handed to json.dump.
pol_params = flatten_pol(all_params['policy'])
opt_params = flatten_opt(all_params['policy.optimizer']['state'])
all_params.update({'policy': pol_params})
all_params['policy.optimizer'].update({'state': opt_params})

with open('a2c_lunar_vec.json', mode='w') as f:
    json.dump(all_params, fp=f, indent='\t')

In [52]:
# Rebuild a model from the JSON export written earlier.
with open('a2c_lunar_vec.json', 'r') as f:
    new_params = json.load(f)

# Fresh model of the same kind; its parameters are overwritten below.
model_loaded = ALGO(
    "MlpPolicy",
    env
)

# JSON stored the policy values as nested lists; turn them back into tensors.
loaded_pol_params = {
    name: th.tensor(values) for name, values in new_params['policy'].items()
}

# JSON object keys are always strings, so integer keys written by json.dump
# come back as "0", "1", ... Optimizer state is indexed by integer parameter
# ids, so string keys would not be matched to any parameter; convert purely
# numeric keys back to int while rebuilding the square_avg tensors.
loaded_opt_params = {}
for key, entry in new_params['policy.optimizer']['state'].items():
    entry['square_avg'] = th.tensor(entry['square_avg'])
    restored_key = int(key) if isinstance(key, str) and key.isdigit() else key
    loaded_opt_params[restored_key] = entry

new_params['policy'] = loaded_pol_params
new_params['policy.optimizer']['state'] = loaded_opt_params

model_loaded.set_parameters(new_params)

In [55]:
# Score the JSON-restored model on an environment created directly with gym.make.
env = gym.make('LunarLander-v2')
evaluate(model=model_loaded, env=env, verbose=1)



0 -65.02528352546112 1 -33.91261352441421 2 -35.17712513930746 3 -2.992161099196852 4 -43.183701952241584 5 -25.783930780545354 6 -48.40815050309462 7 -67.80175702186708 8 -17.840527265653407 9 -0.5973991504482626 Type  Mean reward: -34.07226499622299


-34.07226499622299