In [None]:
import numpy as np
import matplotlib.pyplot as plt 
import torch
import os
import copy
import time

from agent import PPO
from env import Flowsheet

In [None]:
def plot_learning_curve(x, scores, window=100):
    """Plot per-episode scores together with their trailing running average.

    Draws onto the current matplotlib axes (pyplot state machine); it does not
    create a figure or call ``plt.show()``.

    Args:
        x: Sequence of x-axis positions (episode numbers), same length as
            ``scores``.
        scores: Sequence of per-episode cumulative rewards.
        window: Number of most recent episodes (including the current one)
            averaged for the running-average curve. Defaults to 100, matching
            the ``score_history[-100:]`` average printed during training.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    running_avg = np.zeros(len(scores))
    for i in range(len(running_avg)):
        # Trailing window of at most `window` entries ending at episode i.
        # (The slice start is i - window + 1 so the window is not one too long.)
        running_avg[i] = np.mean(scores[max(0, i - window + 1):(i + 1)])

    plt.plot(x, running_avg, label="Running average", color="red")
    plt.xlabel("Episodes")
    plt.ylabel("Cumulative rewards")
    # Raw scores are drawn faintly in the same colour behind the average.
    plt.plot(x, scores, alpha=0.2, color="red")
    plt.legend(loc="best")

In [None]:
def main():
    """Train a PPO agent on the Flowsheet environment and plot its learning curve.

    The loop runs whole episodes until at least ``Max_train_steps`` environment
    steps have been taken. Every ``N`` collected steps the agent is trained on
    the stored transitions, the learning rate of the actor and critic
    optimizers is linearly annealed, and a checkpoint is saved via
    ``model.save(update)``. Once ``best_interval`` steps have elapsed, any
    episode whose score beats the best score seen so far triggers
    ``model.best_save()``.

    The environment is always closed, even if training raises.
    """
    env_kwargs = {
        "conv": 0.975,
        "max_iteras": 10,
        "D_dims": np.array([5.5, 7.75]),
        "H_dims": np.array([5.5, 7.75])
    }

    env = Flowsheet(**env_kwargs)
    score_history = []

    try:
        random_seed = 1
        torch.manual_seed(random_seed)
        np.random.seed(random_seed)
        env.seed(random_seed)

        # Hyperparameters
        kwargs = {
            "state_dim": env.observation_space.shape[0],
            "actions": env.action_space,
            "env_with_Dead": True,
            "gamma": 0.99,
            "gae_lambda": 0.95,
            "policy_clip": 0.2,
            "n_epochs": 10,
            "net_width": 64,
            "lr": 2.5e-4,
            "l2_reg": 0.5,
            "batch_size": 64,
            "entropy_coef": 0.01,
            "adv_normalization": True,
            "entropy_coef_decay": 0.75
        }

        N = 2048  # number of environment steps collected between two updates
        Max_train_steps = int(75e3)
        best_interval = int(50e3)  # best-model tracking starts after this many steps

        total_steps = 0
        traj_length = 0
        episode = 1
        # -inf so the first episode after `best_interval` always sets the
        # baseline, whatever the reward scale is.
        best_score = float("-inf")

        model = PPO(**kwargs)

        # Presumably the directories model.save() / model.best_save() write to
        # -- TODO confirm against the PPO implementation.
        os.makedirs('model', exist_ok=True)
        os.makedirs('best_model', exist_ok=True)

        update = 0
        num_updates = Max_train_steps // N

        while total_steps < Max_train_steps:
            observation, done, steps, score = env.reset(), False, 0, 0
            mask_vec = env.action_masks()

            # Interact & train
            while not done:
                steps += 1
                traj_length += 1
                total_steps += 1

                action_d, probs_d, action_c, probs_c = model.select_action(observation, mask_vec)
                action = {
                    "discrete": action_d,
                    "continuous": action_c}
                observation_, reward, done, info = env.step(action)

                # dw ("dead or win"): the episode finished before the step
                # counter reached env.max_iteras -- presumably a genuine
                # terminal state rather than a step-limit cutoff (TODO confirm
                # against Flowsheet.step).
                dw = bool(done and steps != env.max_iteras)

                model.put_data((observation, action_d, action_c, reward, observation_, probs_d, probs_c, done, dw, mask_vec))
                observation = observation_
                mask_vec = env.action_masks()
                score += reward

                if traj_length % N == 0:
                    # Return values of train() are not used here.
                    model.train()
                    traj_length = 0
                    update += 1

                    # Linear learning-rate annealing; the value set here takes
                    # effect for the next call to model.train().
                    frac = 1.0 - (update - 1.0) / num_updates
                    lrnow = frac * kwargs["lr"]
                    model.actor.optimizer.param_groups[0]["lr"] = lrnow
                    model.critic.optimizer.param_groups[0]["lr"] = lrnow

                    model.save(update)

            score_history.append(score)
            avg_score = np.mean(score_history[-100:])
            print('Episode {} total steps {} avg score {:.4f}'.
                  format(episode, total_steps, avg_score))
            episode += 1

            # Best model: compared on the single latest episode score, not on
            # the running average.
            if total_steps >= best_interval:
                if score_history[-1] > best_score:
                    best_score = score_history[-1]
                    model.best_save()
    finally:
        # Release the environment even when training is interrupted or fails.
        env.close()

    x = [i+1 for i in range(len(score_history))]
    plot_learning_curve(x, score_history)
    

if __name__ == '__main__':
    # Time the whole training run. perf_counter() is a monotonic clock, so the
    # measured duration is not affected by system clock adjustments during a
    # long run (unlike time.time()).
    start_time = time.perf_counter()
    main()
    end_time = time.perf_counter()
    final_time = end_time - start_time  # elapsed seconds
    print(final_time)

In [None]:
# Evaluation cell: rebuild the environment and agent, load the saved best
# model, and roll out evaluation episodes with the policy's evaluate() action.

# Environment configuration (same values as used in main()).
env_kwargs = dict(
    conv=0.975,
    max_iteras=10,
    D_dims=np.array([5.5, 7.75]),
    H_dims=np.array([5.5, 7.75]),
)
env = Flowsheet(**env_kwargs)

# Agent hyperparameters. Same as in main() except that both entropy settings
# are 0 here.
kwargs = dict(
    state_dim=env.observation_space.shape[0],
    actions=env.action_space,
    env_with_Dead=True,
    gamma=0.99,
    gae_lambda=0.95,
    policy_clip=0.2,
    n_epochs=10,
    net_width=64,
    lr=2.5e-4,
    l2_reg=0.5,
    batch_size=64,
    entropy_coef=0,
    adv_normalization=True,
    entropy_coef_decay=0,
)

model = PPO(**kwargs)
# Presumably restores what model.best_save() wrote during training -- TODO
# confirm against the PPO implementation.
model.load_best()

scores = []
for i in range(1):
    obs, actions, score, done = env.reset(), [], 0, False

    # Step until the environment reports the episode as finished.
    while not done:
        mask_vec = env.action_masks()
        action_d, action_c = model.evaluate(obs, mask_vec)
        action = {"discrete": action_d, "continuous": action_c}
        obs, reward, done, info = env.step(action)
        score += reward
        actions.append(action)

    print(f"Done, points: {score}")
    env.render()
    scores.append(score)

print(f"Mean score: {np.mean(scores)}")