# Reinforcement Learning- and FEM-based Inverse Design

## Experiment Logger

In [1]:
import os
import neptune.new as neptune

# Neptune metadata for this notebook, passed through environment variables.
os.environ['NEPTUNE_PROJECT']="pil-clemson/metamtl-rl"
os.environ['NEPTUNE_NOTEBOOK_ID']="45d03d69-6ac7-41ca-8af8-80caaa73aad5"
os.environ['NEPTUNE_NOTEBOOK_PATH']="metamaterial-rl/RemoteFEM-DQN.ipynb"

# Global run handle: later cells log configuration, transitions, losses and
# figures through `exp[...]`. The run is stopped explicitly with `exp.stop()`.
exp = neptune.init_run(project="pil-clemson/metamtl-rl",)

https://app.neptune.ai/pil-clemson/metamtl-rl/e/METAMTLRL-168
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


## Import

In [2]:
from __future__ import annotations
from typing import Union, Optional, Callable, Any
from typing import Tuple, List, Set, Dict
from typing import NamedTuple
from typing import Generator

In [3]:
from collections import defaultdict, deque
from types import SimpleNamespace
import queue
from queue import PriorityQueue
from enum import Enum

In [4]:
from dataclasses import dataclass, field

In [5]:
import traceback

In [6]:
import ipywidgets as widgets

In [7]:
import os
import sys
import copy
import time
from datetime import datetime, timedelta
from pprint import pformat
import multiprocessing
import random
import math
import itertools
import uuid

In [8]:
import matplotlib.pyplot as plt

In [9]:
import torch
from torch import nn

from torch import Tensor, BoolTensor

from torch.nn.modules.loss import _Loss
from torch.optim import Optimizer

from torchvision.transforms import PILToTensor

print('PyTorch version:', torch.__version__)

PyTorch version: 1.13.0


In [10]:
import torchinfo

In [11]:
import numpy as np

In [12]:
from SimHubClient import SimHubClient

## Computing Devices

In [13]:
# Report the number of CPU cores as seen by `multiprocessing`.
print('CPU Cores:', multiprocessing.cpu_count())

CPU Cores: 56


In [14]:
# Total physical memory = page size * number of physical pages, both queried
# through os.sysconf().
mem_bytes = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')  # e.g. 4015976448
mem_gib = mem_bytes/(1024.**3)
print('Memory size:', int(mem_gib), 'GiB')

Memory size: 376 GiB


In [15]:
# Names of all CUDA devices visible to PyTorch (empty list when there are none).
available_gpus = [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]
print('GPUs:', available_gpus)

GPUs: ['Tesla V100S-PCIE-32GB', 'Tesla V100S-PCIE-32GB']


In [16]:
# Device used for the networks and for the tensors built in Agent.optimize().
# NOTE(review): both branches of the conditional yield torch.device('cpu'), so
# the CPU is selected even when CUDA is available. Confirm whether this is a
# deliberate CPU pin; the state tensors fed to the network in
# Agent.select_action() are created on the CPU and never moved.
cuda = torch.device('cpu') if torch.cuda.is_available() else torch.device('cpu')
print('Current computing device:', cuda)

Current computing device: cpu


## Helper Functions

In [17]:
def clip(x, l, u):
    """Clamp ``x`` to the closed interval ``[l, u]``.

    Returns ``l`` when ``x`` is below the lower bound, ``u`` when it is above
    the upper bound, and ``x`` itself otherwise. The lower bound is checked
    first, so it wins if the bounds are given in the wrong order.
    """
    if x < l:
        return l
    if x > u:
        return u
    return x

In [18]:
def draw_sim():
    """Placeholder for a simulation-drawing helper; the body is not implemented."""
    ...

## DEBUG FLAG

In [19]:
# When True, Agent records action names on transitions, logs every transition
# to the experiment run, and generates/plots a design after each episode.
DEBUG = True

## Hyperparameters

In [20]:
environment_configuration = {
    # Shape of the angle matrix held by TurnableGridState.
    'grid_size': (4, 4),
    # NOTE(review): not read by any code visible in this notebook.
    'result_size': (40, 40),
}

hyperparameters = {
    # Agent.train() copies the Q-network into the target network every N episodes.
    'target_update_interval': 100,
    # Number of Agent.optimize() calls at the end of each episode.
    'optimization_iterations': 10,
    # Maximum number of transitions kept by ReplayMemory.
    'experience_replay_capacity': 10000,
    # Number of transitions sampled per optimization step.
    'replay_batch_size': 32,
    # NOTE(review): not read by any code in this notebook; QNet() is called
    # without a learning rate and falls back to 0.001.
    'lr': .001,
    # Discount applied to the bootstrapped next-state value in Agent.optimize().
    'discount_factor': .9,
    # Epsilon-greedy schedule used by Agent.select_action():
    # max(initial * 0.5 ** (explored_step / halflife), minimal).
    'explore_factor_initial': 1.,
    'explore_factor_minimal': 0.05,
    'explore_factor_halflife': 2000.,

    # NOTE(review): the next two keys are not read by any code visible in this notebook.
    'final_threshold': 0.01,
    'final_step_threshold': 100,

    # NOTE(review): 'max_episode' is not read; agent.train() is called with a literal.
    'max_episode': 1000,
    # Upper bound on steps per episode (also used by Agent.generate()).
    'max_step_per_episode': 1000,
    
    # Values read by FEMReward.
    'goal_reward':10000.,
    'terminal_error_threshold': 100.,
    'invalid_state_penalty': -1000000.,
    'loop_penalty':-1000000.,
    
}    

# Record both configuration dicts on the experiment run.
exp['EnvConfig'] = environment_configuration
exp['Hyperparameters'] = hyperparameters


  exp['EnvConfig'] = environment_configuration


## Reinforcement Learning Environment

## Interfaces and Dataclasses

In [21]:
class State(dict):
    """Dictionary-backed environment state.

    Subclasses store their data as dictionary items and implement
    ``to_tensor`` to expose that data to a network.
    """

    def __init__(self) -> None:
        super().__init__()

    def step(self, action: 'Action') -> 'State':
        """Return the state produced by ``action``; ``self`` is left unmodified.

        The action receives a deep copy of this state, so any in-place change
        it makes does not affect the original.
        """
        successor = copy.deepcopy(self)
        return action(successor)

    def to_tensor(self) -> Tensor:
        """Tensor encoding of the state; must be provided by subclasses."""
        raise NotImplementedError

In [22]:
class Action:
    """A named, callable state transformation."""

    def __init__(self, name: str, action: Callable[[State], State]) -> None:
        # `name` is what __repr__ shows; `action` is the wrapped callable.
        self.name, self.action = name, action

    def __repr__(self) -> str:
        return self.name

    def __call__(self, state: State) -> State:
        """Apply the wrapped callable to ``state`` and return its result."""
        transform = self.action
        return transform(state)
# Action = Callable[[State], State]

In [23]:
class Environment:
    """Base interface for an environment with a discrete action space.

    Subclasses populate ``_state``, ``_action_space`` and ``_valid_actions``
    and implement ``reset`` and ``step``.
    """

    def __init__(self) -> None:
        self._state: State = None
        self._action_space: List[Action] = []
        self._valid_actions: BoolTensor = None

    def __repr__(self) -> str:
        return f'''{self.__class__.__name__}(
    Action space size: {self.action_count()}
    Current state: {self.state}
)'''

    @property
    def state(self) -> State:
        """Current state of the environment."""
        return self._state

    @property
    def action_space(self) -> List[Action]:
        """All actions, addressed by their index in this list."""
        return self._action_space

    def action_count(self) -> int:
        """Number of actions in the action space."""
        return len(self._action_space)

    @property
    def valid_actions(self) -> BoolTensor:
        """Boolean mask over the action space maintained by the subclass."""
        return self._valid_actions

    def reset(self) -> None:
        """Put the environment back into its initial state (subclass hook)."""
        raise NotImplementedError

    def step(self, action_index: int) -> None:
        """Apply the action at ``action_index`` to the current state (subclass hook)."""
        raise NotImplementedError


In [24]:
class ReplayTransition(NamedTuple):
    """One experience-replay record: (state, action_index, reward, next_state).

    NOTE(review): the fields are annotated as ``State``, but
    ``Agent.compute_reward`` pushes the results of ``State.to_tensor()`` and
    stores ``None`` as ``next_state`` when the transition's state is terminal.
    """
    # Encoded state the action was taken from.
    state: State
    # Index of the chosen action in the environment's action space.
    action_index: int
    # Reward assigned by the reward function.
    reward: float
    # Encoded successor state, or None (see the class docstring).
    next_state: State

In [25]:
@dataclass
class SimulationTransition:
    """A single environment step together with its simulation and reward data.

    The positional fields are filled in when the step is taken. The remaining
    fields start as ``None`` and are completed later by the agent (task ids and
    simulation results) and by the reward function (errors, terminal flags and
    reward).
    """
    # From stepping
    episode: int
    step: int

    state: State
    action_index: int
    next_state: State

    # Human-readable action name; only set by the agent when DEBUG is on.
    action_name: Optional[str] = None

    # From FEM simulator: task ids and result payloads for both states
    state_id: Optional[str] = None
    next_state_id: Optional[str] = None

    state_sim: Optional[Dict[str, Any]] = None
    next_state_sim: Optional[Dict[str, Any]] = None

    # From reward function
    state_error: Optional[float] = None
    is_state_terminal: Optional[bool] = None

    next_state_error: Optional[float] = None
    is_next_state_terminal: Optional[bool] = None

    reward: Optional[float] = None

    def __repr__(self) -> str:
        """Compact one-line summary used for transition logging.

        Each state is followed by its own ``(error, terminal)`` pair. "Loop!"
        is appended when both states resolved to the same simulator task id.
        """
        # Only flag a loop once ids have actually been assigned; two unset
        # (None) ids are not evidence of a loop.
        is_loop = self.state_id is not None and self.state_id == self.next_state_id
        return f'[{self.episode}-{self.step}] ' +\
                f'{self.state}({self.state_error}, {self.is_state_terminal})' +\
                f' =={self.action_name}==> ' +\
                f'{self.next_state}({self.next_state_error}, {self.is_next_state_terminal})' +\
                f'  R:{self.reward} {"Loop!" if is_loop else ""}'

In [26]:
# A reward function receives a transition, fills in its reward-related fields
# and returns a transition (see FEMReward.__call__).
RewardFunc = Callable[[SimulationTransition], SimulationTransition]

## State and Environment

In [27]:
class TurnableGridState(State):
    """State holding one angle per grid cell under the key ``'angle_matrix'``."""

    def __init__(self) -> None:
        super().__init__()
        # All angles start at zero; the grid shape comes from the global
        # environment configuration.
        grid_shape = environment_configuration['grid_size']
        self['angle_matrix'] = np.zeros(grid_shape)

    def to_tensor(self) -> torch.Tensor:
        """Return the angle matrix flattened into a 1-D float tensor."""
        flat_angles = self['angle_matrix'].flatten()
        return torch.tensor(flat_angles).float()

In [28]:
class TurnableGridEnvironment(Environment):
    """Grid environment in which every cell's angle can be turned up or down.

    The action space holds one action per (row, column, modifier) triple, in
    row-major order with the modifier varying fastest. ``valid_actions`` marks
    an action as valid when applying it would actually change the angle of its
    cell, i.e. the cell is not already clipped at the corresponding bound.
    """

    def __init__(self) -> None:
        super().__init__()

        self.grid_size = environment_configuration['grid_size']

        self.angle_range = [-90, 90]
        self.angle_modifiers = [-15, 15]

        self._valid_actions = torch.full(
            [self.grid_size[0] * self.grid_size[1] * len(self.angle_modifiers)],
            True, dtype=torch.bool)

        self.reset()

        # The append order must match _action_index().
        for i in range(self.grid_size[0]):
            for j in range(self.grid_size[1]):
                for mod in range(len(self.angle_modifiers)):
                    self._action_space.append(self._make_action(i, j, mod))

    def _action_index(self, i: int, j: int, mod: int) -> int:
        """Position of the action for cell (i, j) and modifier ``mod`` in the action space."""
        return (i * self.grid_size[1] + j) * len(self.angle_modifiers) + mod

    def _refresh_valid_actions(self, state: State, i: int, j: int) -> None:
        """Recompute the validity of every modifier of cell (i, j) from ``state``."""
        lower, upper = self.angle_range
        angle = state['angle_matrix'][i, j]
        for mod, delta in enumerate(self.angle_modifiers):
            # A modifier is a no-op (invalid) when clipping undoes it entirely.
            changes_angle = bool(clip(angle + delta, lower, upper) != angle)
            self._valid_actions[self._action_index(i, j, mod)] = changes_angle

    def _make_action(self, i: int, j: int, mod: int) -> Action:
        """Build the action that adds ``angle_modifiers[mod]`` to cell (i, j)."""
        def action(state):
            state['angle_matrix'][i, j] = clip(state['angle_matrix'][i, j] + self.angle_modifiers[mod],
                                               self.angle_range[0], self.angle_range[1])
            # Both directions of this cell may have changed validity: reaching a
            # bound disables one modifier, leaving a bound re-enables the other.
            self._refresh_valid_actions(state, i, j)
            return state
        return Action(f'({i}, {j})->{self.angle_modifiers[mod]}', action)

    def reset(self) -> None:
        """Start from a fresh all-zero state and rebuild the validity mask for it."""
        self._state = TurnableGridState()
        for i in range(self.grid_size[0]):
            for j in range(self.grid_size[1]):
                self._refresh_valid_actions(self._state, i, j)

    def step(self, action_index: int) -> None:
        """Replace the current state with the result of the indexed action."""
        action = self._action_space[action_index]
        self._state = self._state.step(action)

## DQN

### FEM-based Reward & Terminal Function

In [29]:
class FEMReward():
    """Reward function that scores transitions against a target FEM result.

    A state's error is the summed squared difference between its normalised
    simulation result (row 2 of ``output['temperature_distribution']``) and
    the normalised target. A state whose error is at or below
    ``terminal_error_threshold`` is terminal.
    """

    # Normalisation applied to both the target and every simulation result.
    # NOTE(review): presumably a reference temperature in Kelvin and an
    # expected temperature span; confirm against the simulation set-up.
    TEMPERATURE_OFFSET = 293.15
    TEMPERATURE_SCALE = 60

    def __init__(self,
                 target: Tensor,
                 hyperparameters: Dict[str, Any]) -> None:
        """
        Parameters
        ----------
        target : Tensor
            Target values, in the same layout as row 2 of a simulation's
            ``temperature_distribution`` output
        hyperparameters : Dict[str, Any]
            Must provide 'goal_reward', 'terminal_error_threshold',
            'loop_penalty' and 'invalid_state_penalty'
        """
        self.target = self._normalize(target)
        # exp['target_value'] = target

        self.loss = nn.MSELoss(reduction='sum')

        self.goal_reward = hyperparameters['goal_reward']
        self.terminal_error_threshold = hyperparameters['terminal_error_threshold']
        self.loop_penalty = hyperparameters['loop_penalty']
        self.invalid_state_penalty = hyperparameters['invalid_state_penalty']

    @classmethod
    def _normalize(cls, values: Tensor) -> Tensor:
        """Shift and scale raw values into the range the loss is computed in."""
        return (values - cls.TEMPERATURE_OFFSET) / cls.TEMPERATURE_SCALE

    def _evaluate(self, sim: Optional[Dict[str, Any]]) -> Optional[Tuple[float, bool]]:
        """Return ``(error, is_terminal)`` for a finished simulation, else None."""
        if not sim or sim['status'] != 'done':
            return None
        result = torch.tensor(sim['output']['temperature_distribution'][2])
        error = float(self.loss(self._normalize(result), self.target))
        return error, error <= self.terminal_error_threshold

    def __call__(self, transition: SimulationTransition) -> SimulationTransition:
        """
        Fill in the error, terminal and reward fields of a transition

        The error/terminal fields of a state are only written when its
        simulation finished with status 'done'; otherwise they keep their
        current values.

        Reward rules, later rules overriding earlier ones:

        1. Both errors known: ``state_error - next_state_error`` (positive
           when the step reduced the error).
           Next-state error unknown: ``invalid_state_penalty``.
           Only the current-state error unknown: 0, since there is no
           baseline to compare against.
        2. ``state_id == next_state_id`` (the action did not change the
           state): ``loop_penalty``.
        3. Next state is terminal: ``goal_reward``.

        Parameters
        ----------
        transition : SimulationTransition
            A transition with completed simulation data

        Returns
        -------
        SimulationTransition
            The same transition object, updated in place
        """
        state_eval = self._evaluate(transition.state_sim)
        if state_eval is not None:
            transition.state_error, transition.is_state_terminal = state_eval

        next_state_eval = self._evaluate(transition.next_state_sim)
        if next_state_eval is not None:
            transition.next_state_error, transition.is_next_state_terminal = next_state_eval

        transition.reward = 0.

        if transition.next_state_error is None:
            # The successor could not be simulated successfully.
            transition.reward = self.invalid_state_penalty
        elif transition.state_error is not None:
            # Reward decreasing error from state to next state
            transition.reward += transition.state_error - transition.next_state_error

        # If the state not changed after apply the action(s_n -> s_n),
        # the loop penalty is applied
        if transition.state_id == transition.next_state_id:
            transition.reward = self.loop_penalty

        # Reaching a terminal state overrides everything else
        if transition.is_next_state_terminal:
            transition.reward = self.goal_reward

        return transition

### Network Definition

In [30]:
# Network Container
class Model():
    """Bundle of a network with the loss function and optimizer used to train it."""

    def __init__(self, network: nn.Module, loss_func: _Loss, optimizer: Optimizer):
        self.network, self.loss_func, self.optimizer = network, loss_func, optimizer

    def __call__(self, network_input: Tensor) -> Tensor:
        """Run ``network_input`` through the wrapped network and return its output."""
        output = self.network(network_input)
        return output

In [31]:
def QNet(state_size: int = 16, action_number: int = 32, target_network: bool = False,
         lr: float = 0.001):
    """Build a fully connected Q-network wrapped in a ``Model``.

    The network maps a state vector of length ``state_size`` to one value per
    action through two hidden layers (100 and 200 units) with ReLU
    activations. All layers are created on the global ``cuda`` device.

    Parameters
    ----------
    state_size : int
        Length of the flattened state vector
    action_number : int
        Number of actions, i.e. size of the output layer
    target_network : bool
        If True, the model is returned without loss function and optimizer,
        since a target network is never trained directly
    lr : float
        Learning rate of the Adam optimizer; ignored for target networks

    Returns
    -------
    Model
        The network together with its loss function and optimizer
    """
    net = nn.Sequential(
        nn.Linear(state_size, 100, device=cuda),
        nn.ReLU(),
        nn.Linear(100, 200, device=cuda),
        nn.ReLU(),
        nn.Linear(200, action_number, device=cuda),
    )
    if target_network:
        return Model(network=net, loss_func=None, optimizer=None)
    else:
        # exp['Network'] = str(torchinfo.summary(net, input_size=(32, state_size), 
        #                                        device=cuda, verbose=0))
        return Model(network=net, loss_func=nn.SmoothL1Loss(), optimizer=torch.optim.Adam(net.parameters(), lr))

### Replay Memory Class

In [32]:
class ReplayMemory():
    """Fixed-capacity buffer of ``ReplayTransition`` records.

    Once ``capacity`` entries are stored, every new entry evicts the oldest one.
    """

    def __init__(self, capacity):
        self.memory: deque = deque(maxlen=capacity)

    def push(self, *args):
        """Append a ``ReplayTransition`` built from the positional arguments."""
        record = ReplayTransition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored records chosen at random."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

### Agent Class

In [33]:
class Agent():
    def __init__(self, environment: Environment, simulator: SimHubClient, reward_func: RewardFunc, 
                 q_network: nn.Module, target_network: nn.Module, hyperparameters: Dict[str, Any]) -> None:
        """Wire up the collaborators and copy the training settings.

        ``q_network`` and ``target_network`` are used as ``Model`` instances
        (called directly, with ``.network``, ``.loss_func`` and ``.optimizer``
        attributes) by the other methods of this class.
        """
        hp = hyperparameters

        # Collaborators
        self.environment: Environment = environment
        self.fem_simulator: SimHubClient = simulator
        self.fem_reward_func: RewardFunc = reward_func
        self.q_network: Model = q_network
        self.target_network: Model = target_network

        # Training schedule
        self.target_update_interval: int = hp['target_update_interval']
        self.optimization_iterations: int = hp['optimization_iterations']
        self.max_step_per_episode: int = hp['max_step_per_episode']

        # Experience replay
        self.experience_replay: ReplayMemory = ReplayMemory(hp['experience_replay_capacity'])
        self.replay_batch_size: int = hp['replay_batch_size']

        # Q-learning and epsilon-greedy settings
        self.discount_factor: float = hp['discount_factor']
        self.explore_factor_initial: float = hp['explore_factor_initial']
        self.explore_factor_minimal: float = hp['explore_factor_minimal']
        self.explore_factor_halflife: float = hp['explore_factor_halflife']

        # Transitions whose simulation results were not available when stepping
        self.pending_transitions: List[SimulationTransition] = []

        # Set to true when generating result
        self.generation_mode: bool = False
        # Number of random (exploratory) actions taken so far
        self.explored_step: int = 0

        # Position within the current run, recorded on every transition
        self.episode: int = 0
        self.step_num: int = 0
        

    
    def select_action(self) -> Tuple[State, int, str]:
        """
        Decide an action based on epsilon greedy algorithm

        Returns
        -------
        State
            Current state instance
        int
            Index number of an action in the action space
        
        str
            Action type, literal string of "Prediction" or "Random"
        """
        state = self.environment.state
        explore_factor = max(self.explore_factor_initial 
                             * (0.5 ** (self.explored_step / self.explore_factor_halflife)), 
                             self.explore_factor_minimal)
        
        if random.random() > explore_factor or self.generation_mode:
            prediction = self.q_network(state.to_tensor().flatten())
            # Mask invalid actions with with negative number
            action_scores = prediction.flatten().masked_fill(self.environment.valid_actions, sys.float_info.min)
            action_index = action_scores.argmax().item()
            action_type = 'Prediction'
            
        else:
            action_index = random.randrange(len(self.environment.action_space))
            action_type = 'Random'
            self.explored_step += 1
        return state, action_index, action_type
    
    def step(self) -> bool:
        """
        Take one action in the environment and submit both states of the
        resulting transition to the FEM simulator

        If the simulator already has results for both states, the reward is
        computed immediately; otherwise the transition is queued in
        ``pending_transitions`` for ``compute_pending_rewards``.

        NOTE(review): the episode is ended based on ``is_state_terminal`` (the
        state the action was taken FROM), not ``is_next_state_terminal`` -
        confirm this is intended.

        Returns
        -------
        bool
            False if the reward was computed right away and the transition's
            state turned out to be terminal, True otherwise (continue the
            current episode)
        """
        state, action_index, action_type = self.select_action()
        self.environment.step(action_index)
        next_state = self.environment.state

        transition = SimulationTransition(episode=self.episode, step=self.step_num, state=state,
                                          action_index=action_index, next_state=next_state)

        state_id, state_result = self.fem_simulator.submit_task(state)
        transition.state_id = state_id
        next_state_id, next_state_result = self.fem_simulator.submit_task(next_state)
        transition.next_state_id = next_state_id

        if DEBUG:
            transition.action_name = self.environment.action_space[action_index].name

        # At least one result is still outstanding: defer the reward computation
        if not (state_result and next_state_result):
            self.pending_transitions.append(transition)
            return True

        # Both results already exist, so proceed to reward and memory right away
        transition.state_sim = state_result
        transition.next_state_sim = next_state_result
        self.compute_reward(transition)
        return not transition.is_state_terminal
    
    def compute_reward(self, transition: SimulationTransition) -> None:
        """
        Compute reward value and terminal status for a COMPLETED transition

        The reward function updates the transition in place. Outside of
        generation mode the transition is then logged (when DEBUG is on) and
        pushed into the experience replay as tensors. If the transition's
        state is terminal, ``None`` is stored as next state in the replay
        record; the transition object itself is not modified for that.

        Parameters
        ----------
        transition : SimulationTransition
            A transition whose simulation results are already filled in
        """
        self.fem_reward_func(transition)

        # Generated trajectories are neither logged nor learned from
        if self.generation_mode:
            return

        if DEBUG:
            exp[f'transitions/{transition.episode}'].append(str(transition))

        state_tensor = transition.state.to_tensor()
        if transition.is_state_terminal:
            next_state_tensor = None
        else:
            next_state_tensor = transition.next_state.to_tensor()

        self.experience_replay.push(state_tensor,
                                    transition.action_index,
                                    transition.reward,
                                    next_state_tensor)

        
    def compute_pending_rewards(self) -> None:
        self.fem_simulator.wait()

        while len(self.pending_transitions) > 0:
            transition: SimulationTransition = self.pending_transitions.pop(0)

            transition.state_sim = self.fem_simulator.get_result(transition.state_id)
            transition.next_state_sim = self.fem_simulator.get_result(transition.next_state_id)
            
            self.compute_reward(transition)
            
            # Skip all remaining transition beyond terminal state
            if transition.is_state_terminal:
                break
        
    def optimize(self) -> None:
        if len(self.experience_replay) < self.replay_batch_size: return

        samples = self.experience_replay.sample(self.replay_batch_size)
        batch = ReplayTransition(*zip(*samples))

        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                                  batch.next_state)), device=cuda, dtype=torch.bool)
        # If none of the transition has a valid next_step, skip the round
        if not non_final_mask.any():
            return
        non_final_next_states = torch.stack([s.flatten() for s in batch.next_state
                                                        if s is not None])

        state_batch = torch.stack([s.flatten() for s in batch.state])
        action_batch = torch.tensor(batch.action_index, device=cuda).unsqueeze(1)
        reward_batch = torch.tensor(batch.reward, device=cuda)

        state_action_values = self.q_network(state_batch).gather(1, action_batch)

        next_state_values = torch.zeros(self.replay_batch_size, device=cuda)
        next_state_values[non_final_mask] = self.target_network(non_final_next_states).max(1)[0].detach()

        expected_state_action_values = (next_state_values * self.discount_factor) + reward_batch

        loss = self.q_network.loss_func(state_action_values, expected_state_action_values.unsqueeze(1))
        optimization_loss = float(loss)
        self.q_network.optimizer.zero_grad()
        loss.backward()
        for param in self.q_network.network.parameters():
            param.grad.data.clamp_(-1, 1)
        self.q_network.optimizer.step()
        return optimization_loss
        
    def update_target_network(self) -> None:
        self.target_network.network.load_state_dict(self.q_network.network.state_dict())
        
    def train(self, episodes: int) -> None:
        """
        Train the Q-network for the given number of episodes

        Each episode steps the environment until ``step()`` asks to stop or
        ``max_step_per_episode`` is reached, resolves the queued rewards, runs
        ``optimization_iterations`` optimization steps and clears the
        simulator's tasks. The target network is synchronised every
        ``target_update_interval`` episodes, including episode 0. When DEBUG
        is on, a design is generated and logged after every episode.

        NOTE(review): the environment is not reset between episodes or before
        generating, so every episode continues from wherever the previous
        activity left the environment - confirm this is intended.

        Parameters
        ----------
        episodes : int
            Number of episodes to run
        """
        for episode in range(episodes):
            print('')
            print(f'Episode: {episode}')
            self.episode = episode

            for self.step_num in range(self.max_step_per_episode):
                if not self.step():
                    break
            self.compute_pending_rewards()

            for i in range(self.optimization_iterations):
                loss = self.optimize()
                # optimize() returns None when it skipped the step; there is
                # no loss value to log in that case.
                if loss is not None:
                    exp['optimization_loss'].append(loss, step=self.episode + i / self.optimization_iterations)

            self.fem_simulator.clear_tasks()
            if episode % self.target_update_interval == 0:
                self.update_target_network()

            if DEBUG:
                print('Generating...')
                generated_state, generated_result = self.generate()

                exp['generated_state'].append(str(generated_state), step=self.episode)

                # Rows 0 and 1 of the result provide the image shape via their
                # number of distinct values; row 2 holds the values to plot.
                result_size = (len(np.unique(generated_result[0])), len(np.unique(generated_result[1])))
                plt.imshow(generated_result[2].reshape(result_size))
                exp['generated_result'].append(plt.gcf(), step=self.episode)
                plt.close()

                print('Done')
            
        
    def generate(self) -> State:
        self.generation_mode = True
        for self.step_num in range(self.max_step_per_episode):
            if not self.step():
                break
        
        # It is possible that the result is an unknown state
        state_id, state_result = self.fem_simulator.submit_task(self.environment.state)
        self.fem_simulator.wait(print_stats=False, progress_bar=False)
        self.fem_simulator.clear_tasks()
        self.generation_mode = False
        return self.environment.state, self.fem_simulator.get_result(state_id)['output']['temperature_distribution']
        

## Training

In [34]:
env = TurnableGridEnvironment()

# Client for the remote FEM simulation service.
# NOTE(review): host, port, database address and experiment file are
# hard-coded; consider moving them into the configuration cell.
fem = SimHubClient('10.128.97.115', 44444, database_ip='10.125.9.35')
fem.set_experiment('./elmer_task/elmer_task.yml')

# 'target.npy' is written by the scratch cells at the end of this notebook.
# Rows 0 and 1 provide the image shape via their number of distinct values
# (presumably coordinate rows - confirm); row 2 holds the target values.
target_arr = np.load('target.npy')
target_size = (len(np.unique(target_arr[0])), len(np.unique(target_arr[1])))
plt.imshow(target_arr[2].reshape(target_size))
exp['target'] = plt.gcf()
plt.close()

reward_func = FEMReward(torch.tensor(target_arr[2]), hyperparameters)

# QNet() uses its default sizes (16 state values, 32 actions), which match the
# 4x4 grid with two modifiers per cell configured above.
agent = Agent(env, fem, reward_func, QNet(), QNet(target_network=True), hyperparameters)

/home/nwen/metamaterial-rl/elmer_task/elmer_script.py
/home/nwen/metamaterial-rl/elmer_task/data
Establishing working directory structure...
Working directory structure established
Copying script files...
Copying /home/nwen/metamaterial-rl/elmer_task/elmer_script.py
/home/nwen/metamaterial-rl/elmer_task/elmer_script.py copied
Copying data files...
Copying /home/nwen/metamaterial-rl/elmer_task/data
/home/nwen/metamaterial-rl/elmer_task/data copied
Entry script set to /scratch1/nwen/simhub/workspaces/scripts/elmer_script.py


In [35]:
#print(agent.generate())

# NOTE(review): the episode count is a literal; hyperparameters['max_episode']
# holds the same value but is not used here.
agent.train(1000)

#print(agent.generate())


Episode: 0
Existed: 2(100.00%)
Duplicated: 0(0.00%)
New: 0(0.00%)


  0%|          | 0/2 [00:00<?, ?it/s]

Successful: 2(100.00%)
Failed: 0(0.00%)
Generating...


  exp['optimization_loss'].append(loss, step=self.episode + i / self.optimization_iterations)


KeyboardInterrupt: 

In [None]:
# Roll out the current policy once and display the resulting (state, result) pair.
agent.generate()

In [36]:
# Close the simulator client (presumably releases its connections - confirm against SimHubClient).
fem.close()

In [37]:
# Stop the experiment run explicitly; it is otherwise only stopped when the kernel terminates.
exp.stop()

Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 1 operations to synchronize with Neptune. Do not kill this process.
All 1 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/pil-clemson/metamtl-rl/e/METAMTLRL-168


In [None]:
# Deliberate stop: halts "Run All" here so the scratch cells below, which
# regenerate the target file, are only executed manually.
raise NotImplementedError

In [None]:
# Scratch: fresh simulator client for producing the target (same settings as the training setup).
fem = SimHubClient('10.128.97.115', 44444, database_ip='10.125.9.35')
fem.set_experiment('./elmer_task/elmer_task.yml')



In [None]:
# Hand-written 4x4 angle matrix whose simulation result is saved as the
# training target by the cells below. Every angle lies within the
# environment's [-90, 90] range.
# NOTE(review): the values are not all multiples of the environment's +/-15
# step (e.g. 80, 10), so an agent starting from all zeros cannot reproduce
# this exact matrix.
state = TurnableGridState()
state['angle_matrix'] = np.array(
    [[45, 80, -80, -45], 
     [10, 45, -45, -10], 
     [-10, -45, 45, 10], 
     [-45, -80, 80, 45]]
)
state

In [None]:
# submit_task returns a pair; its first element is used below as the task id
# for get_result (Agent.step unpacks the pair as (id, result)).
task = fem.submit_task(state)

In [None]:
# Display the returned pair.
task

In [None]:
# Presumably blocks until the submitted task has finished - confirm against SimHubClient.
fem.wait()

In [None]:
# Persist the simulated temperature distribution as the target that the
# training setup cell loads from 'target.npy'.
np.save('target.npy', fem.get_result(task[0])['output']['temperature_distribution'])

In [None]:
# Close the scratch simulator client.
fem.close()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Render row 2 of the saved target as an image and save it as a PNG.
# NOTE(review): the 40x40 shape is hard-coded here, whereas the training setup
# derives the shape from rows 0 and 1 of the array.
plt.imshow(np.load('target.npy')[2].reshape(40, 40))
plt.gcf().savefig('target.png')

In [None]:
# Display the current figure.
plt.show()

In [None]:
# Number of distinct values in row 0 of the target; the training setup uses
# this count as the first dimension of the image shape.
len(np.unique(np.load('target.npy')[0]))