In [1]:
####  IMPORTS


import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from torch_geometric.data import Batch
import torch_geometric
from torch_geometric.nn import global_mean_pool
import time

from torch.optim.lr_scheduler import StepLR
import gymnasium as gym
from gymnasium.spaces import Discrete, Box, MultiDiscrete
import math
import random
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from collections import namedtuple, deque
from itertools import count
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import nbimporter
import autograd, autograd.core, autograd.extend, autograd.tracer  # for adjoints
import autograd.numpy as anp      
import optuna
import statistics
import matplotlib.pyplot as plt

# True when matplotlib is using an inline (notebook) backend; only then is IPython's
# `display` helper imported.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

# Turn on matplotlib's interactive mode.
plt.ion()

import os
# Ask CUDA to launch kernels synchronously. This is a debugging aid (errors are reported
# at the offending call) and it slows GPU execution, so drop it for real training runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#### FEM ANALYSIS

import time
import numpy as np                                                # for dense matrix ops
import matplotlib.pyplot as plt                                   # for plotting
import autograd, autograd.core, autograd.extend, autograd.tracer  # for adjoints
import autograd.numpy as anp      
import scipy, scipy.ndimage, scipy.sparse, scipy.sparse.linalg    # sparse matrices
import nlopt   


def causeway_bridge(width=128, height=128, density=0.08, deck_level=0.2):
  """Build the boundary conditions of the 'causeway bridge' problem.

  Returns a (normals, forces, density) tuple. Both arrays have shape
  (width + 1, height + 1, 2); the last axis selects the x (0) or y (1) component.
  Non-zero entries of `normals` mark constrained components; `forces` holds the loads.
  """
  grid_shape = (width + 1, height + 1, 2)
  axis_x, axis_y = 0, 1

  # A y-load of -1/width on every node of the deck row, which sits at the
  # fraction (1 - deck_level) of the way along axis 1.
  forces = np.zeros(grid_shape)
  deck_row = round(height * (1 - deck_level))
  forces[:, deck_row, axis_y] = -1 / width

  # x is constrained along the first and last index of axis 0; y only at the last node.
  normals = np.zeros(grid_shape)
  normals[[0, -1], :, axis_x] = 1
  normals[-1, -1, axis_y] = 1
  return normals, forces, density
  
class ObjectView(object):
    """Expose the entries of a dict as attributes (`view.key` instead of `d['key']`)."""

    def __init__(self, d):
        # The dict itself becomes the instance namespace; no copy is made, so
        # attribute writes are visible in `d` and vice versa.
        self.__dict__ = d
    
def get_args(normals, forces, density=0.4):
  """Bundle a problem definition into the attribute-style `args` object used by the FEM code.

  Args:
    normals: array of shape (width + 1, height + 1, 2); non-zero entries mark fixed DOFs.
    forces: array of the same shape holding the applied loads (stored flattened).
    density: target mean density, stored as `args.density`.

  Returns:
    An ObjectView exposing material properties, constraints, grid size
    (`nelx` = width, `nely` = height), free/fixed DOF index arrays, the flattened
    force vector and the optimisation settings.
  """
  width = normals.shape[0] - 1
  height = normals.shape[1] - 1
  fixdofs = np.flatnonzero(normals.ravel())
  alldofs = np.arange(2 * (width + 1) * (height + 1))
  # setdiff1d returns the sorted free indices directly. It avoids hashing every index
  # through a Python set and keeps an integer dtype even when no DOF is free.
  freedofs = np.setdiff1d(alldofs, fixdofs)
  params = {
      # material properties
      'young': 1, 'young_min': 1e-9, 'poisson': 0.3, 'g': 0,
      # constraints
      'density': density, 'xmin': 0.001, 'xmax': 1.0,
      # input parameters
      'nelx': width, 'nely': height, 'mask': 1, 'penal': 3.0, 'filter_width': 1,
      'freedofs': freedofs, 'fixdofs': fixdofs, 'forces': forces.ravel(),
      # optimization parameters
      'opt_steps': 5, 'print_every': 20}
  return ObjectView(params)

def mbb_beam(width=6, height=6, density=0.4, y=1, x=0):
    """Build the boundary conditions of the MBB-beam problem.

    Args:
        width, height: number of elements along axis 0 and axis 1 of the node grid.
        density: target density, returned unchanged.
        y, x: indices of the y and x components on the last array axis.

    Returns:
        (normals, forces, density). Both arrays have shape (width + 1, height + 1, 2);
        non-zero entries of `normals` mark constrained components.
    """
    normals = np.zeros((width + 1, height + 1, 2))
    normals[-1, -1, y] = 1
    normals[0, :, x] = 1
    forces = np.zeros((width + 1, height + 1, 2))
    # Axis 1 has height + 1 nodes, so its last node is `height`. Indexing it with
    # `width` only worked for square grids: it raised IndexError when width > height
    # and misplaced the load when width < height.
    forces[0, height, y] = -1
    return normals, forces, density
#'''
def mbb_beam2(width=6, height=6, density=0.4, y=1, x=0):
    """Build a beam variant pinned at three nodes of one edge and loaded on the opposite edge.

    Args:
        width, height: number of elements along axis 0 and axis 1 of the node grid.
        density: target density, returned unchanged.
        y: index of the y component on the last array axis.
        x: accepted for signature parity with `mbb_beam`; not used here.

    Returns:
        (normals, forces, density). Both arrays have shape (width + 1, height + 1, 2);
        non-zero entries of `normals` mark constrained components.
    """
    normals = np.zeros((width + 1, height + 1, 2))

    # Pin both components at the first, middle and last node of the axis-0 == 0 edge.
    # These positions run along axis 1, so they must be derived from `height`;
    # using `width` raised IndexError whenever width > height.
    normals[0, 0, :] = 1
    normals[0, height, :] = 1
    normals[0, height // 2, :] = 1

    forces = np.zeros((width + 1, height + 1, 2))

    # NOTE(review): the load is spread over the height + 1 nodes of the far edge but
    # scaled by 1 / width. Kept as written; confirm the intended total load.
    forces[-1, :, y] = -1 / width

    return normals, forces, density
#'''
def _get_solver(a_entries, a_indices, size, sym_pos):
  # a is (usu.) symmetric positive; could solve 2x faster w/sksparse.cholmod.cholesky(a).solve_A
  a = scipy.sparse.coo_matrix((a_entries, a_indices), shape=(size,)*2).tocsc()
  return scipy.sparse.linalg.splu(a).solve

@autograd.primitive
def solve_coo(a_entries, a_indices, b, sym_pos=False):
  """Solve A x = b where A is given by COO entries/indices; registered as an autograd primitive."""
  return _get_solver(a_entries, a_indices, b.size, sym_pos)(b)

def grad_solve_coo_entries(ans, a_entries, a_indices, b, sym_pos=False):
  """VJP maker for `solve_coo` with respect to the matrix entries `a_entries`.

  Returns a function mapping the gradient of the solution to the gradient of each
  COO entry, computed with one extra (adjoint) solve.
  """
  # The adjoint system uses the transpose of A: swap the two index rows unless A is
  # declared symmetric, in which case the same indices are reused.
  adjoint_indices = a_indices if sym_pos else a_indices[::-1]
  rows, cols = a_indices

  def vjp(grad_ans):
    adjoint = solve_coo(a_entries, adjoint_indices, grad_ans, sym_pos)
    return -adjoint[rows] * ans[cols]
  return vjp

def _solve_coo_no_gradient(message):
  """Build a VJP maker that fails with `message` for an argument solve_coo cannot differentiate.

  autograd calls a VJP maker with (ans, *args, **kwargs). The previous zero-argument
  lambdas therefore died with an unrelated arity TypeError before their message was
  ever printed.
  """
  def vjp_maker(*args, **kwargs):
    raise NotImplementedError(message)
  return vjp_maker

# Only the gradient with respect to the matrix entries (argument 0) is implemented.
autograd.extend.defvjp(solve_coo, grad_solve_coo_entries,
                       _solve_coo_no_gradient('err: gradient undefined'),
                       _solve_coo_no_gradient('err: gradient not implemented'))
@autograd.extend.primitive
def gaussian_filter(x, width):
  """Blur `x` with SciPy's Gaussian filter (sigma = `width`, reflecting boundaries).

  Registered as an autograd primitive, so autograd treats it as a single opaque operation.
  """
  blurred = scipy.ndimage.gaussian_filter(x, sigma=width, mode='reflect')
  return blurred

def _gaussian_filter_vjp(ans, x, width):
  """VJP maker for `gaussian_filter` with respect to `x`.

  Neither the forward result nor the input is needed: the incoming gradient is
  passed through the same blur.
  """
  def backward(g):
    return gaussian_filter(g, width)
  return backward
autograd.extend.defvjp(gaussian_filter, _gaussian_filter_vjp)
def young_modulus(x, e_0, e_min, p=3):
  """Interpolate stiffness between `e_min` (x = 0) and `e_0` (x = 1) with penalisation power `p`."""
  stiffness_range = e_0 - e_min
  penalised = x ** p
  return e_min + penalised * stiffness_range

def physical_density(x, args, volume_contraint=False, use_filter=False):
  """Turn a design vector into the masked (nely, nelx) density field, optionally blurred.

  `volume_contraint` is accepted but not used by this function.
  """
  masked = args.mask * x.reshape(args.nely, args.nelx)
  if not use_filter:
    return masked
  return gaussian_filter(masked, args.filter_width)

def mean_density(x, args, volume_contraint=False, use_filter=True):
  """Mean of the physical density field, normalised by the mean of `args.mask`."""
  density_field = physical_density(x, args, volume_contraint, use_filter)
  return anp.mean(density_field) / anp.mean(args.mask)
def objective_calc(x, args, volume_contraint=False, use_filter=True):
  """Compute the compliance of design `x`.

  Pipeline: physical density -> element stiffness matrix -> FEM displacements -> compliance.
  """
  material = dict(penal=args.penal, e_min=args.young_min, e_0=args.young)
  density_field = physical_density(x, args, volume_contraint=volume_contraint, use_filter=use_filter)
  element_ke = get_stiffness_matrix(args.young, args.poisson)
  displacement = displace(density_field, element_ke, args.forces, args.freedofs, args.fixdofs, **material)
  return compliance_calc(density_field, displacement, element_ke, **material)
def compliance_calc(x_phys, u, ke, *, penal=3, e_min=1e-9, e_0=1):
  """Sum over all elements of (penalised stiffness) * u_e^T ke u_e.

  Args:
    x_phys: (nely, nelx) density field.
    u: global displacement vector, indexed by DOF number.
    ke: element stiffness matrix.
    penal, e_min, e_0: forwarded to `young_modulus`.
  """
  nely, nelx = x_phys.shape
  ely, elx = anp.meshgrid(range(nely), range(nelx))  # element coordinates for the index map

  # Corner node numbers of each element; nodes advance by (nely + 1) per step in elx.
  stride = nely + 1
  n1 = stride * elx + ely
  n2 = stride * (elx + 1) + ely
  n3 = n2 + 1
  n4 = n1 + 1

  # Two DOFs (2n, 2n + 1) per corner node, in corner order n1..n4.
  dof_rows = []
  for node in (n1, n2, n3, n4):
    dof_rows.extend([2 * node, 2 * node + 1])
  all_ixs = anp.array(dof_rows)
  u_elem = u[all_ixs]  # per-element displacements

  ke_u = anp.einsum('ij,jkl->ikl', ke, u_elem)
  element_energy = anp.einsum('ijk,ijk->jk', u_elem, ke_u)  # u_e^T ke u_e per element
  weighted = young_modulus(x_phys, e_0, e_min, p=penal) * element_energy.T
  return anp.sum(weighted)

def get_stiffness_matrix(e, nu):  # e=young's modulus, nu=poisson coefficient
  """Return the 8x8 element stiffness matrix, scaled by e / (1 - nu**2).

  The matrix is assembled from eight coefficients `k[0..7]` that depend only on `nu`;
  every row is a fixed permutation of those coefficients. The 8 rows/columns
  correspond to the 8 DOFs per element used in `get_k` and `compliance_calc`.
  NOTE(review): this looks like the standard 4-node bilinear quad element, to be confirmed.
  """
  k = anp.array([1/2-nu/6, 1/8+nu/8, -1/4-nu/12, -1/8+3*nu/8,
                -1/4+nu/12, -1/8-nu/8, nu/6, 1/8-3*nu/8])
  # Hand-written coefficient layout; do not reorder.
  return e/(1-nu**2)*anp.array([[k[0], k[1], k[2], k[3], k[4], k[5], k[6], k[7]],
                               [k[1], k[0], k[7], k[6], k[5], k[4], k[3], k[2]],
                               [k[2], k[7], k[0], k[5], k[6], k[3], k[4], k[1]],
                               [k[3], k[6], k[5], k[0], k[7], k[2], k[1], k[4]],
                               [k[4], k[5], k[6], k[7], k[0], k[1], k[2], k[3]],
                               [k[5], k[4], k[3], k[2], k[1], k[0], k[7], k[6]],
                               [k[6], k[3], k[4], k[1], k[2], k[7], k[0], k[5]],
                               [k[7], k[2], k[1], k[4], k[3], k[6], k[5], k[0]]])
def get_k(stiffness, ke):
  """Assemble the global stiffness matrix in coordinate (COO) form.

  Args:
    stiffness: 2-D array of per-element stiffness values, unpacked as (nely, nelx).
    ke: element stiffness matrix, tiled once per element.

  Returns:
    (value_list, y_list, x_list): the flat entry values and the two flat DOF index
    lists that locate each value in the global matrix.
  """
  # Constructs sparse stiffness matrix k (used in the displace fn)
  # First, get position of the nodes of each element in the stiffness matrix
  nely, nelx = stiffness.shape
  ely, elx = anp.meshgrid(range(nely), range(nelx))  # x, y coords
  ely, elx = ely.reshape(-1, 1), elx.reshape(-1, 1)  # one row per element

  # Corner node numbers of every element; node numbers advance by (nely + 1) per step in elx.
  n1 = (nely+1)*(elx+0) + (ely+0)
  n2 = (nely+1)*(elx+1) + (ely+0)
  n3 = (nely+1)*(elx+1) + (ely+1)
  n4 = (nely+1)*(elx+0) + (ely+1)
  # Each node owns two consecutive DOFs (2n and 2n + 1), giving 8 DOFs per element.
  edof = anp.array([2*n1, 2*n1+1, 2*n2, 2*n2+1, 2*n3, 2*n3+1, 2*n4, 2*n4+1])
  edof = edof.T[0]  # drop the singleton axis: one row of 8 DOF indices per element
  x_list = anp.repeat(edof, 8)  # flat list pointer of each node in an element
  y_list = anp.tile(edof, 8).flatten()  # flat list pointer of each node in elem

  # make the global stiffness matrix K
  kd = stiffness.T.reshape(nelx*nely, 1, 1)  # one scalar per element, broadcastable over ke
  value_list = (kd * anp.tile(ke, kd.shape)).flatten()
  return value_list, y_list, x_list

def displace(x_phys, ke, forces, freedofs, fixdofs, *, penal=3, e_min=1e-9, e_0=1):
  """Solve the FEM system for the displacement of every DOF (the solve dominates the runtime).

  Fixed DOFs get a displacement of zero; free DOFs come from the sparse solve of the
  stiffness system restricted to the free DOFs.
  """
  element_stiffness = young_modulus(x_phys, e_0, e_min, p=penal)
  entries, ylist, xlist = get_k(element_stiffness, ke)
  index_map, keep, free_indices = _get_dof_indices(freedofs, fixdofs, ylist, xlist)

  u_free = solve_coo(entries[keep], free_indices, forces[freedofs], sym_pos=True)
  # Append zeros for the fixed DOFs, then permute back to the original DOF numbering.
  padded = anp.concatenate([u_free, anp.zeros(len(fixdofs))])
  return padded[index_map]
def _get_dof_indices(freedofs, fixdofs, k_xlist, k_ylist):
  """Map global DOF numbers to the free-first ordering and select the free-free entries.

  Returns:
    (index_map, keep, indices): `index_map` sends a DOF number to its position in the
    concatenation [freedofs, fixdofs]; `keep` is a mask over the COO entries whose two
    DOFs are both free; `indices` stacks the remapped index pairs of the kept entries.
  """
  index_map = inverse_permutation(anp.concatenate([freedofs, fixdofs]))
  x_is_free = anp.isin(k_xlist, freedofs)
  y_is_free = anp.isin(k_ylist, freedofs)
  keep = x_is_free & y_is_free
  # Remap the surviving entries into the compact free-DOF numbering.
  first = index_map[k_ylist][keep]
  second = index_map[k_xlist][keep]
  return index_map, keep, anp.stack([first, second])

def inverse_permutation(indices):
  """Return the inverse of the permutation `indices`.

  For a permutation `p` the result `q` satisfies `q[p[i]] == i`, so indexing with `q`
  undoes indexing with `p`. This is pure index bookkeeping, so it uses plain NumPy
  dtypes rather than autograd's wrapped `anp.int64`.
  """
  count = len(indices)
  inverse_perm = np.zeros(count, dtype=np.int64)
  inverse_perm[indices] = np.arange(count, dtype=np.int64)
  return inverse_perm
def fast_stopt(args, x=None, verbose=True):
  """Run nlopt's MMA optimiser on the compliance objective under a mean-density constraint.

  Args:
    args: problem description; this function reads nely, nelx, density, print_every,
      forces and opt_steps.
    x: optional initial design; defaults to a uniform (nely, nelx) field of args.density.
    verbose: when True, print progress every args.print_every recorded frames.

  Returns:
    (losses, final_frame, frames, final_constraint): the objective value of every
    evaluation, the last recorded design, all recorded designs, and
    mean_density(last design) - args.density.
  """
  if x is None:
    x = anp.ones((args.nely, args.nelx)) * args.density  # init mass

  reshape = lambda x: x.reshape(args.nely, args.nelx)
  objective_fn = lambda x: objective_calc(reshape(x), args) # don't enforce mass constraint here
  constraint = lambda params: mean_density(reshape(params), args) - args.density

  def wrap_autograd_func(func, losses=None, frames=None):
    # Adapt an autograd-differentiable function to nlopt's callback signature
    # f(x, grad): nlopt expects the gradient to be written into `grad` in place.
    def wrapper(x, grad):
      if grad.size > 0:
        value, grad[:] = autograd.value_and_grad(func)(x)
      else:
        value = func(x)
      if losses is not None:
        losses.append(value)
      if frames is not None:
        frames.append(reshape(x).copy())
        # `dt` is bound further down in the enclosing function, before the first call.
        if verbose and len(frames) % args.print_every == 0:
          print('step {}, loss {:.2e}, t={:.2f}s'.format(len(frames), value, time.time()-dt))
      return value
    return wrapper

  losses, frames = [], [] ; dt = time.time()
  # NOTE(review): len(args.forces) is the number of DOFs (two per node), not nodes.
  print('Optimizing a problem with {} nodes'.format(len(args.forces)))
  opt = nlopt.opt(nlopt.LD_MMA, x.size)
  opt.set_lower_bounds(0.0) ; opt.set_upper_bounds(1.0)
  # Only the objective records losses/frames; the constraint wrapper records nothing.
  opt.set_min_objective(wrap_autograd_func(objective_fn, losses, frames))

  opt.add_inequality_constraint(wrap_autograd_func(constraint), 1e-8)
  opt.set_maxeval(args.opt_steps + 1)
  opt.optimize(x.flatten())
  # frames[-1] raises IndexError if the objective was never evaluated.
  return np.array(losses), reshape(frames[-1]), np.array(frames), constraint(frames[-1])
  #return objective(x, args), constraint(x)  #losses[-1]

def optim(args, x=None, verbose=True):
    """Evaluate a design once, without running any optimisation.

    Args:
        args: problem description as produced by `get_args`.
        x: design densities with nely * nelx entries (any shape that reshapes to
            (nely, nelx)); defaults to a uniform field of `args.density`.
        verbose: accepted for signature compatibility; not used.

    Returns:
        (compliance, mean_density) of the design, both computed with the density filter on.

    NOTE(review): this name shadows the `torch.optim as optim` import made in the first
    cell. It is kept because other cells call `optim(...)`.
    """
    if x is None:
        x = anp.ones((args.nely, args.nelx)) * args.density  # init mass
    # physical_density() already reshapes to (nely, nelx), so no extra reshape is needed.
    return objective_calc(x, args), mean_density(x, args)

In [3]:
import os
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.callbacks import BaseCallback, CallbackList


class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Periodic checkpoint callback.

    Every `check_freq` calls it writes the model to `<save_path>/model.zip` and the
    `state` attribute of each training environment to `<save_path>/env_state.npy`.
    Despite the class name, saving is unconditional; no reward is compared.
    """
    def __init__(self, check_freq: int, save_path: str, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self) -> None:
        # Create the checkpoint directory up front.
        if self.save_path is None:
            return
        os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        # Nothing to do between checkpoints.
        if self.n_calls % self.check_freq != 0:
            return True

        model_path = os.path.join(self.save_path, 'model.zip')
        env_path = os.path.join(self.save_path, 'env_state.npy')

        self.model.save(model_path)

        # One state per environment, moved to host memory before saving.
        states = []
        for state in self.training_env.get_attr('state'):
            states.append(state.cpu().numpy())
        np.save(env_path, states)

        if self.verbose > 0:
            print(f"Saving model checkpoint to {model_path}")
            print(f"Saving environment state to {env_path}")
        return True


In [4]:
class RewardLoggingCallback(BaseCallback):
    """
    Callback that accumulates per-episode returns of the first environment.

    Completed episode returns are kept in `episode_rewards` and written to
    `save_path` (NumPy .npy format) when training ends.
    """
    def __init__(self, save_path, verbose=1):
        super().__init__(verbose)
        self.episode_rewards = []
        self.current_rewards = []
        self.save_path = save_path

    def _on_step(self) -> bool:
        # Record the reward before looking at `dones`: the reward delivered together
        # with done=True belongs to the episode that just ended. The previous version
        # only appended on non-terminal steps, so every episode lost its last reward.
        self.current_rewards.append(self.locals['rewards'][0])
        if self.locals['dones'][0]:
            episode_reward = np.sum(self.current_rewards)
            self.episode_rewards.append(episode_reward)
            self.current_rewards = []
            if self.verbose > 0:
                print(f"Episode reward: {episode_reward}")
        return True

    def _on_training_end(self) -> None:
        # Make sure the target directory exists; np.save does not create it.
        if isinstance(self.save_path, (str, os.PathLike)):
            target_dir = os.path.dirname(os.fspath(self.save_path))
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)
        # Save the rewards to a file
        np.save(self.save_path, np.array(self.episode_rewards))
        if self.verbose > 0:
            print("Training finished")
            print(f"Episode rewards: {self.episode_rewards}")


In [5]:
import gymnasium as gym
import torch
import numpy as np
import matplotlib.pyplot as plt
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.evaluation import evaluate_policy
import optuna

class BeamOptimizationEnv(gym.Env):
    """Gymnasium environment in which an agent edits a beam's density grid one cell per step.

    The observation is the flat density vector with height * width entries and an
    action is the flat index of the cell to edit. `current_mode` selects what an
    action does:

    * 'mode1': write VOID_DENSITY into the chosen cell (episode of MODE1_EPISODE_STEPS steps).
    * 'mode2': write SOLID_DENSITY into the chosen cell (episode of MODE2_EPISODE_STEPS steps).

    After every edit the design is scored by `calculate_reward`, which runs the FEM
    evaluation `optim` and adds connectivity bonuses and an isolation penalty.
    Grid helpers view the flat state as a (height, width) array.
    """

    # Episode lengths, in steps, of the two implemented modes.
    MODE1_EPISODE_STEPS = 9
    MODE2_EPISODE_STEPS = 6
    # Densities written into a cell by a 'mode1' / 'mode2' action.
    VOID_DENSITY = 0.001
    SOLID_DENSITY = 0.999

    def __init__(self, width=6, height=6, density=0.4):
        super().__init__()
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.width = width
        self.height = height
        self.density = density
        n_cells = self.width * self.height
        self.max_steps = n_cells
        self.action_space = gym.spaces.Discrete(n_cells)
        self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(n_cells,), dtype=np.float32)
        self.previous_constraint = 1.0
        self.significant_improvement = 10
        # Flat from the start, exactly as reset() builds it: step() indexes the state
        # with a scalar cell index, which would select a whole row of a 2-D state.
        self.state = torch.ones(n_cells, dtype=torch.float32, device=self.device)
        self.visited = torch.zeros(n_cells, dtype=torch.bool, device=self.device)
        self.visited_cells = set()
        normals, forces, _ = mbb_beam(width, height, density)
        self.normals = torch.tensor(normals, dtype=torch.float32, device=self.device)
        self.forces = torch.tensor(forces, dtype=torch.float32, device=self.device)
        self.args = get_args(self.normals.cpu().numpy(), self.forces.cpu().numpy(), density)
        self.current_compliance = float('inf')
        self.previous_compliance = 10
        self.current_step = 0
        self.reward = 0
        self.current_mode = 'mode1'

    def _observation(self):
        """Return the current state as a float32 NumPy vector that does not alias `self.state`."""
        # On CPU, `.cpu().numpy()` shares memory with the tensor that step() mutates in
        # place, so a copy is returned to keep earlier observations stable.
        return self.state.detach().cpu().numpy().copy()

    def _as_grid(self, state):
        """View a flat state (torch tensor or array-like) as a (height, width) NumPy grid."""
        if torch.is_tensor(state):
            state = state.detach().cpu().numpy()
        return np.asarray(state).reshape(self.height, self.width)

    def reset(self, seed=None, options=None):
        """Start a new episode from an all-ones design; returns (observation, info)."""
        # Let gymnasium seed its own RNG as well (standard Env.reset contract).
        super().reset(seed=seed)
        if seed is not None:
            np.random.seed(seed)
            torch.manual_seed(seed)
        n_cells = self.width * self.height
        self.state = torch.ones(n_cells, dtype=torch.float32, device=self.device)
        self.current_step = 0
        # Same device as in __init__ (it was previously created on the CPU here).
        self.visited = torch.zeros(n_cells, dtype=torch.bool, device=self.device)
        self.visited_cells = set()
        self.current_compliance = float('inf')
        self.previous_compliance = 10
        self.reward = 0
        return self._observation(), {}

    def is_strongly_connected(self, state, threshold=0.8):
        """Return False if any 2x2 window has two solid cells touching only diagonally."""
        grid = self._as_grid(state)
        solid = grid > threshold
        hollow = grid <= threshold
        for i in range(self.height - 1):
            for j in range(self.width - 1):
                main_diagonal = (solid[i, j] and solid[i + 1, j + 1]
                                 and hollow[i, j + 1] and hollow[i + 1, j])
                anti_diagonal = (solid[i + 1, j] and solid[i, j + 1]
                                 and hollow[i, j] and hollow[i + 1, j + 1])
                if main_diagonal or anti_diagonal:
                    return False
        return True

    def is_connected(self, state, threshold=0.8):
        """Return True if all cells above `threshold` form one 8-connected component.

        A design without any solid cell counts as connected.
        """
        grid = self._as_grid(state)
        solid = grid > threshold
        n_rows, n_cols = solid.shape
        solid_cells = np.argwhere(solid)  # row-major order, so [0] is the first solid cell
        if solid_cells.size == 0:
            return True

        # Depth-first flood fill from the first solid cell. Each axis is bounded by its
        # own size; the previous version used one bound for both, so non-square grids
        # were scanned incorrectly.
        visited = np.zeros(solid.shape, dtype=bool)
        stack = [tuple(solid_cells[0])]
        neighbours = [(-1, 0), (-1, 1), (-1, -1), (1, 0), (1, 1), (1, -1), (0, -1), (0, 1)]
        while stack:
            r, c = stack.pop()
            if visited[r, c]:
                continue
            visited[r, c] = True
            for dr, dc in neighbours:
                nr, nc = r + dr, c + dc
                if 0 <= nr < n_rows and 0 <= nc < n_cols and solid[nr, nc] and not visited[nr, nc]:
                    stack.append((nr, nc))
        return bool(np.all(visited[solid]))

    def find_isolated_cells_class(self, state, width, height, high_threshold=0.7, low_threshold=0.2):
        """Return flat indices of cells above `high_threshold` with no neighbour above `low_threshold`.

        `state` is viewed as a (width, height) grid, i.e. `width` is the number of rows
        and `height` the number of columns of the view. Neighbours are the up to eight
        surrounding cells.
        """
        if torch.is_tensor(state):
            state = state.detach().cpu().numpy()
        grid = np.asarray(state).reshape((width, height))
        isolated_cells = []
        for i in range(width):
            for j in range(height):
                if not grid[i, j] > high_threshold:
                    continue
                # The column range is bounded by `height`, the number of columns of the
                # view; the previous version bounded it by `width`.
                has_neighbour = any(
                    grid[ni, nj] > low_threshold
                    for ni in range(max(i - 1, 0), min(i + 2, width))
                    for nj in range(max(j - 1, 0), min(j + 2, height))
                    if (ni, nj) != (i, j)
                )
                if not has_neighbour:
                    # Row-major flat index of (i, j) in a grid with `height` columns.
                    isolated_cells.append(i * height + j)
        return isolated_cells

    def step(self, action):
        """Apply `action` according to `current_mode`; returns the gymnasium 5-tuple."""
        if self.current_mode == 'mode1':
            return self.step1(action)
        if self.current_mode == 'mode2':
            return self.step2(action)
        # Any other mode was routed to `step3`, which this class does not define.
        # A subclass that provides it is still honoured; otherwise fail with a clear message.
        handler = getattr(self, 'step3', None)
        if handler is None:
            raise NotImplementedError(
                f"Unsupported mode {self.current_mode!r}: expected 'mode1' or 'mode2'")
        return handler(action)

    def _apply_action(self, action, new_value, earns_bonus, episode_steps):
        """Shared body of step1/step2: edit one cell, score the design, build the step tuple."""
        cell = int(action // 1)
        self.previous_state = self.state.clone()
        reward = 0
        # +1 when the edit changes the cell in the direction the mode intends.
        if earns_bonus(self.state[cell]):
            reward += 1
        self.state[cell] = new_value
        self.current_step += 1
        done = self.current_step >= episode_steps

        # calculate_reward() runs the FEM evaluation on the edited design. The extra
        # evaluation that used to precede the edit was overwritten here before being read.
        reward = float(reward + self.calculate_reward())
        self.reward = reward
        truncated = False
        return self._observation(), reward, done, truncated, {}

    def step1(self, action):
        """'mode1' step: void the chosen cell; +1 bonus if it was solid (> 0.9)."""
        return self._apply_action(
            action,
            new_value=self.VOID_DENSITY,
            earns_bonus=lambda current: bool(current > 0.9),
            episode_steps=self.MODE1_EPISODE_STEPS,
        )

    def step2(self, action):
        """'mode2' step: fill the chosen cell; +1 bonus if it was void (< 0.1)."""
        return self._apply_action(
            action,
            new_value=self.SOLID_DENSITY,
            earns_bonus=lambda current: bool(current < 0.1),
            episode_steps=self.MODE2_EPISODE_STEPS,
        )

    def calculate_reward(self):
        """Score the current design and refresh the compliance/constraint attributes.

        reward = 1 / sqrt(compliance), +0.1 if no diagonal-only contacts,
        +0.2 if all solid cells are connected, -1 if any isolated cell exists.
        """
        # The FEM evaluation is NumPy/autograd code, so no torch.no_grad() is needed.
        design = self.state.detach().cpu().numpy()
        self.current_compliance, self.current_constraint = optim(args=self.args, x=design)
        reward = 1 / (self.current_compliance) ** 0.5
        if self.is_strongly_connected(self.state):
            reward += 0.1
        if self.is_connected(self.state):
            reward += 0.2
        # (height, width) is passed so the helper's view matches the other grid helpers.
        if self.find_isolated_cells_class(self.state, self.height, self.width):
            reward -= 1
        self.previous_constraint = self.current_constraint
        return reward

    def render(self, mode='human'):
        """Plot the density grid with the last reward and compliance (only for mode='human')."""
        if mode != 'human':
            return
        grid = self._as_grid(self.state)
        plt.imshow(grid, cmap='viridis', interpolation='nearest', vmin=0, vmax=1)
        plt.colorbar()
        plt.title(f"Reward: {self.reward} Compliance: {self.current_compliance:.3f}")
        plt.show()

# Shared environment instance (default 6x6 grid) used by every training and inference
# cell below; check_env validates it against the interface stable-baselines3 expects.
env = BeamOptimizationEnv()
check_env(env)

def optimize_ppo(trial):
    """Sample one PPO hyperparameter configuration from an Optuna-style `trial`.

    Returns a dict whose keys are PPO constructor keyword arguments.
    """
    params = {}
    # Suggestions are requested in a fixed order, one per hyperparameter.
    params['n_steps'] = trial.suggest_categorical("n_steps", [8, 16, 32, 64, 128, 256, 512, 1024, 2048])
    params['gamma'] = trial.suggest_float('gamma', 0.9, 0.9999, log=True)
    params['learning_rate'] = trial.suggest_float('learning_rate', 1e-5, 1, log=True)
    params['ent_coef'] = trial.suggest_float('ent_coef', 0.00000001, 0.1, log=True)
    params['clip_range'] = trial.suggest_float('clip_range', 0.1, 0.4)
    params['gae_lambda'] = trial.suggest_float('gae_lambda', 0.8, 1.0)
    params['max_grad_norm'] = trial.suggest_float('max_grad_norm', 0.3, 5.0)
    params['vf_coef'] = trial.suggest_float('vf_coef', 0.1, 1.0)
    params['batch_size'] = trial.suggest_categorical('batch_size', [32, 64, 128, 256, 512])
    return params

def objective(trial):
    """Optuna objective: train PPO with sampled hyperparameters and return its mean reward.

    The returned value is a reward, so the study must maximise it (the commented-out
    study below uses direction='maximize'). Training and evaluation both use the
    module-level `env`.
    """
    sampled = optimize_ppo(trial)
    agent = PPO('MlpPolicy', env, verbose=0, **sampled)
    agent.learn(total_timesteps=10000)

    # Score the trained agent over 10 evaluation episodes.
    mean_reward, _std_reward = evaluate_policy(agent, env, n_eval_episodes=10)
    return mean_reward

#study = optuna.create_study(direction='maximize')
#study.optimize(objective, n_trials=100, n_jobs=1)

#print('Best hyperparameters: ', study.best_params)




  return torch._C._cuda_getDeviceCount() > 0


In [6]:
#'''
# PPO hyperparameters used for the first training run. The keys match the search space
# defined in optimize_ppo; presumably these values came from an Optuna study (confirm).
best_hyperparams = {'n_steps': 1024, 'gamma': 0.9268317653886465, 'learning_rate': 0.0008844770142722593, 
'ent_coef': 4.0387142903583754e-08, 'clip_range': 0.21038826076990821, 'gae_lambda': 0.8353105326939684,
 'max_grad_norm': 3.6175740624337367, 'vf_coef': 0.6852703238198472, 'batch_size': 128}
#'''

In [None]:
# Stage 1: checkpoint locations for the first PPO model.
save_path_1 = './ppo_checkpoints_first/'
model_path_1 = os.path.join(save_path_1, 'model.zip')
env_state_path_1 = os.path.join(save_path_1, 'env_state.npy')
episode_rewards_path_1 = os.path.join(save_path_1, 'episode_rewards_1.npy')

# Checkpoint every 1000 callback calls.
callback_1 = SaveOnBestTrainingRewardCallback(check_freq=1000, save_path=save_path_1)
# NOTE(review): the reward logger is created but never passed to learn() (the callback
# list below is commented out), so no episode rewards are recorded for this stage.
reward_logging_callback_1 = RewardLoggingCallback(save_path=episode_rewards_path_1, verbose=1)
#callback_list_1 = CallbackList([callback_1, reward_logging_callback_1])

# Train the first PPO model for 100,000 timesteps.
model_1 = PPO('MlpPolicy', env, verbose=1, **best_hyperparams)
model_1.learn(total_timesteps=1_00_000, callback=callback_1)

# Save the model and environment state.
# NOTE(review): this writes a 1-D state vector to the same file into which the callback
# writes a list of per-environment states; the inference cell later indexes the loaded
# array with [0]. Confirm which layout is intended.
model_1.save(model_path_1)
env_state = env.state.cpu().numpy()
np.save(env_state_path_1, env_state)

# Reload the first model and restore the saved state into the environment.
model_1 = PPO.load(model_path_1, env=env)
# NOTE(review): allow_pickle=True should not be needed for a plain float array; confirm and drop.
env_state = np.load(env_state_path_1, allow_pickle=True)
env.reset()
env.state = torch.tensor(env_state, dtype=torch.float32, device=env.device)


In [None]:
# Stage 2: train a second PPO model on the same environment.
# NOTE(review): a fresh PPO is constructed here; nothing is loaded from model_1.
save_path_2 = './ppo_checkpoints_second/'
model_path_2 = os.path.join(save_path_2, 'model.zip')
env_state_path_2 = os.path.join(save_path_2, 'env_state.npy')
episode_rewards_path_2 = os.path.join(save_path_2, 'episode_rewards_2.npy')

callback_2 = SaveOnBestTrainingRewardCallback(check_freq=1000, save_path=save_path_2)
# Created but not passed to learn(); see the commented-out callback list.
reward_logging_callback_2 = RewardLoggingCallback(save_path=episode_rewards_path_2, verbose=1)
#callback_list_2 = CallbackList([callback_2, reward_logging_callback_2])

new_hyperparams = {
    'ent_coef': 0.02,  # Increase entropy coefficient to encourage exploration
    # 'learning_rate': 1e-4,  # Adjust learning rate if needed
    # Add any other hyperparameters adjustments here
}

model_2 = PPO('MlpPolicy', env, verbose=1, **new_hyperparams)

# NOTE(review): learn() appears to reset the environment when it starts, and reset()
# sets current_step back to 0, so this assignment likely has no effect on training
# (the logged ep_len_mean of 9 is consistent with that). Confirm.
env.current_step = 5
print(f"CURRENT STEP IS: {env.current_step}")

# 50,000 timesteps.
model_2.learn(total_timesteps=5_00_00, callback=callback_2)


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
CURRENT STEP IS: 5
Saving model checkpoint to ./ppo_checkpoints_second/model.zip
Saving environment state to ./ppo_checkpoints_second/env_state.npy
Saving model checkpoint to ./ppo_checkpoints_second/model.zip
Saving environment state to ./ppo_checkpoints_second/env_state.npy
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 9        |
|    ep_rew_mean     | 12.3     |
| time/              |          |
|    fps             | 140      |
|    iterations      | 1        |
|    time_elapsed    | 14       |
|    total_timesteps | 2048     |
---------------------------------
Saving model checkpoint to ./ppo_checkpoints_second/model.zip
Saving environment state to ./ppo_checkpoints_second/env_state.npy
Saving model checkpoint to ./ppo_checkpoints_second/model.zip
Saving environment state to ./ppo_checkpoints_second/env_state.npy
------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x7408219dfee0>

In [None]:
# Stage 3: train a third PPO model on the same environment.
# NOTE(review): a fresh PPO is constructed here; nothing is loaded from model_2.
save_path_3 = './ppo_checkpoints_third/'
model_path_3 = os.path.join(save_path_3, 'model.zip')
env_state_path_3 = os.path.join(save_path_3, 'env_state.npy')
episode_rewards_path_3 = os.path.join(save_path_3, 'episode_rewards_3.npy')

callback_3 = SaveOnBestTrainingRewardCallback(check_freq=1000, save_path=save_path_3)
# Created but not passed to learn(); see the commented-out callback list.
reward_logging_callback_3 = RewardLoggingCallback(save_path=episode_rewards_path_3, verbose=1)
#callback_list_3 = CallbackList([callback_3, reward_logging_callback_3])

new_hyperparams = {
    'ent_coef': 0.03,  # entropy coefficient (exploration bonus)
    # 'learning_rate': 1e-4,  
}

model_3 = PPO('MlpPolicy', env, verbose=1, **new_hyperparams)

# NOTE(review): reset() sets current_step back to 0, so if learn() resets the
# environment on start this assignment has no effect on training. Confirm.
env.current_step = 6
print(f"CURRENT STEP IS: {env.current_step}")

# 50,000 timesteps.
model_3.learn(total_timesteps=5_00_00, callback=callback_3)


In [None]:
# Stage 4: train a fourth PPO model with the environment switched to 'mode2'
# (actions fill cells instead of voiding them).
# NOTE(review): a fresh PPO is constructed here; nothing is loaded from model_3.
save_path_4 = './ppo_checkpoints_fourth/'
model_path_4 = os.path.join(save_path_4, 'model.zip')
env_state_path_4 = os.path.join(save_path_4, 'env_state.npy')
episode_rewards_path_4 = os.path.join(save_path_4, 'episode_rewards_4.npy')

callback_4 = SaveOnBestTrainingRewardCallback(check_freq=1000, save_path=save_path_4)
# Created but not passed to learn(); see the commented-out callback list.
reward_logging_callback_4 = RewardLoggingCallback(save_path=episode_rewards_path_4, verbose=1)
#callback_list_4 = CallbackList([callback_4, reward_logging_callback_4])

new_hyperparams = {
    'ent_coef': 0.03,  # Increase entropy coefficient to encourage exploration
    # 'learning_rate': 1e-4,  # Adjust learning rate if needed
    # Add any other hyperparameters adjustments here
}

# reset() does not touch current_mode, so the mode persists across episodes.
env.current_mode = 'mode2'
env.current_step = 0

model_4 = PPO('MlpPolicy', env, verbose=1, **new_hyperparams)

# Redundant: the mode was already set to 'mode2' above.
env.current_mode = "mode2"
print(f"CURRENT STEP IS: {env.current_step}")

# 50,000 timesteps.
model_4.learn(total_timesteps=5_00_00, callback=callback_4)


In [None]:
import gymnasium as gym
import torch
import numpy as np
import matplotlib.pyplot as plt
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.evaluation import evaluate_policy
import optuna, os

# Checkpoint directories written by the four training stages.
save_path_1 = './ppo_checkpoints_first/'
save_path_2 = './ppo_checkpoints_second/'
save_path_3 = './ppo_checkpoints_third/'
save_path_4 = './ppo_checkpoints_fourth/'

# Load the trained models
model_1 = PPO.load(os.path.join(save_path_1, 'model.zip'), env=env)
model_2 = PPO.load(os.path.join(save_path_2, 'model.zip'), env=env)
model_3 = PPO.load(os.path.join(save_path_3, 'model.zip'), env=env)
model_4 = PPO.load(os.path.join(save_path_4, 'model.zip'), env=env)


env.current_mode = 'mode1'
# Load the saved environment state
env_state = np.load(os.path.join(save_path_1, 'env_state.npy'), allow_pickle=True)

# Reset the environment and set the state
# NOTE(review): the state installed here is discarded by the second env.reset() just
# below, so inference actually starts from the all-ones design. Confirm the intent.
env.reset()
env.state = torch.tensor(env_state[0], dtype=torch.float32, device=env.device)

# Inference with the first model
obs, _ = env.reset()
dones = False
print("Inference with Model 1:")
# Roll out one 'mode1' episode greedily, rendering the grid after every step.
while not dones:
    action, _states = model_1.predict(obs, deterministic=True)
    obs, rewards, dones, truncated, info = env.step(action)
    env.render()



In [None]:

# Capture the final state after the first model's inference
final_state_after_model_1 = env.state.clone()

# Start the step counter at 4 so the 'mode1' episode (which ends at step 9) has 5 steps left.
env.current_step = 4
env.current_mode = 'mode1'
# Inference with the second model starting from the final state of the first model
# NOTE(review): env.state now IS final_state_after_model_1 (no clone), and env.step()
# edits the state in place, so this "snapshot" changes while the loop below runs.
env.state = final_state_after_model_1  
obs = final_state_after_model_1.cpu().numpy().flatten() 
dones = False
print("Inference with Model 2:")
i=0  # step counter; not read anywhere
while not dones:
    action, _states = model_2.predict(obs, deterministic=False)
    obs, rewards, dones, truncated, info = env.step(action)
    env.render()
    #print(f"Model 2 - State: {obs}, Action: {action}, Reward: {rewards}")

    i+=1

final_state_after_model_2 = env.state.clone()

In [None]:



# Start the step counter at 5 so the 'mode1' episode (which ends at step 9) has 4 steps left.
env.current_step = 5
env.current_mode = 'mode1'

# Inference with the third model starting from the final state of the second model.
# A clone is installed because env.step() edits the state in place; this keeps the
# saved snapshot intact.
env.state = final_state_after_model_2.clone()
# Build the first observation from the state just installed. It was previously taken
# from final_state_after_model_1, i.e. the wrong stage.
obs = env.state.cpu().numpy().flatten()
dones = False
print("Inference with Model 3:")
i=0  # step counter; not read anywhere
while not dones:
    action, _states = model_3.predict(obs, deterministic=False)
    obs, rewards, dones, truncated, info = env.step(action)
    env.render()
    #print(f"Model 3 - State: {obs}, Action: {action}, Reward: {rewards}")

    i+=1

final_state_after_model_3 = env.state.clone()

In [None]:

# Snapshot of the design produced by the third model (this repeats the assignment made
# at the end of the previous cell).
final_state_after_model_3 = env.state.clone()


# Switch to 'mode2' (actions fill cells) and restart the step counter.
env.current_mode = 'mode2'
env.current_step = 0
# Inference with the fourth model starting from the final state of the third model
# NOTE(review): env.state now IS final_state_after_model_3 (no clone), and env.step()
# edits the state in place, so this "snapshot" changes while the loop below runs.
env.state = final_state_after_model_3  
obs = final_state_after_model_3.cpu().numpy().flatten()  
dones = False
print("Inference with Model 4:")
i=0  # step counter; not read anywhere
while not dones:
    action, _states = model_4.predict(obs, deterministic=False)
    obs, rewards, dones, truncated, info = env.step(action)
    env.render()
    print(f"Model 4 - State: {obs}, Action: {action}, Reward: {rewards}")

    i+=1

final_state_after_model_4 = env.state.clone()