In [31]:
class FSMFileTransitionsError(Exception):
    
    def __init__(self, list_of_states):
        print("States of the following unique IDs have an amount of 'possible discrete actions (transitions)' that is not equal to the amount of transitions in their 'Transition names' field.")
        print(*list_of_states)
        print("Please recheck the states and try again.")

In [32]:
import pandas

def convert_state_names(file_name):
    df = pandas.read_csv(file_name)

    if 'Discretized_state_name' not in df.columns:
        state_names_dict = {}
        counter = 1
        for row in df.itertuples(index=False):
            print(row)
            if row.State_name not in state_names_dict:
                state_names_dict[row.State_name] = counter
                print(state_names_dict[row.State_name])
                counter += 1
        
        df['Discretized_state_name'] = df['State_name'].map(state_names_dict)
        df.to_csv(file_name, index=False)

    else:
        print('INFO: The file provided already contains discretized state names.')

In [33]:
convert_state_names('pelican.csv')

INFO: The file provided already contains discretized state names.


In [34]:
import numpy as np
import gym
from stable_baselines3.common.env_checker import check_env
from csv import DictReader

class FSMEnv(gym.Env):
  """
  Custom Environment that follows gym interface.
  This is a simple env where the agent must learn to go always left. 
  """
  # Because of google colab, we cannot implement the GUI ('human' render mode)
  metadata = {'render.modes': ['console']}

  def __init__(self, file_name, start_state_id=None):
    super(FSMEnv, self).__init__()

    # The name of the .csv file containing the FSM
    self.file_name = file_name
    self.start_state_id = start_state_id

    convert_state_names(file_name)

    if start_state_id is None:
      preset_start_state = False
    else:
      preset_start_state = True
      self.start_state_id = start_state_id

    if preset_start_state == False:  
      start_state_already_set = False
      multiple_start_states = False

      with open(self.file_name, 'r') as csv_file:
        # pass the file object to DictReader() to get the DictReader object
        csv_dict_reader = DictReader(csv_file)
        start_states = []

        # iterate over each line as a ordered dictionary
        for row in csv_dict_reader:
          # row variable is a dictionary that represents a row in csv
          if row['Start_state'] == '1':
            start_states.append(row['Unique_ID'])
            
            if start_state_already_set == False:
              start_state_already_set = True
              self.start_state_id = int(row['Unique_ID'])
            else:
              self.start_state_id = None
              multiple_start_states = True
      
        if multiple_start_states == True:
          print("Multiple start states have been found!")
          correct_start_state_provided = False

          while correct_start_state_provided == False:
            print("Please choose one of the found start states:")
            for unique_id in start_states:
              print(unique_id)

            self.start_state_id = input("Provide an unique ID: ")
            
            if self.start_state_id not in start_states:
              print("ID provided is not in the list!")
            
            else:
              correct_start_state_provided = True
        
    row_counter = 0

    # Define action and observation space
    # They must be gym.spaces objects
    # Example when using discrete actions, we have two: left and right
    with open(self.file_name, 'r') as csv_file:
      csv_dict_reader = DictReader(csv_file)
      broken_states = []
      broken_states_exist = False

      for row in csv_dict_reader:
        row_counter += 1

        if len(row['Transitions_to_states'].split()) != int(row['Possible_discrete_actions_(transitions)']):
          broken_states_exist = True
          broken_states.append(row['Unique_ID'])

        if row['Unique_ID'] == self.start_state_id:
          initial_actions = int(row['Possible_discrete_actions_(transitions)'])
          self.action_space = gym.spaces.Discrete(initial_actions)
    
    if broken_states_exist == True:
      raise FSMFileTransitionsError(broken_states)

    self.past_states = np.zeros((100,), dtype=int)
    self.amount_of_states_visited = 0
    self.agent_pos = self.start_state_id
    self.current_discretized_state = 0

    # Dict observation space
    self.observation_space = gym.spaces.Dict(
    {
        # Current state obs
        'current_state': gym.spaces.Discrete(row_counter),
        # Past states history obs
        'past_states': gym.spaces.Box(low=0, high=row_counter, shape=(100,), dtype=np.uint8),
        # Transitions to states ?
        # 'transitions_to_states': spaces.Box(low=0, high=row_counter, shape=(100,), dtype=np.uint8),
        # Amount of states already visited (could be garbage data?)
        'amount_of_states': gym.spaces.Discrete(row_counter),
    })

  def reset(self):
    """
    Important: the observation must be a numpy array
    :return: (np.array) 
    """
    # Initialize the agent at the start state
    self.past_states = np.zeros((100,), dtype=int)
    self.amount_of_states_visited = 0
    self.agent_pos = self.start_state_id

    with open(self.file_name, 'r') as csv_file:
      csv_dict_reader = DictReader(csv_file)
      
      for row in csv_dict_reader:
        if self.start_state_id == int(row['Unique_ID']):
          self.current_discretized_state = int(row['Discretized_state_name'])

    observation = {
      'current_state': self.current_discretized_state,
      'past_states': self.past_states,
      'amount_of_states': self.amount_of_states_visited, 
    }

    return observation

  def step(self, action):
    if action == self.LEFT:
      self.agent_pos -= 1
    elif action == self.RIGHT:
      self.agent_pos += 1
    else:
      raise ValueError("Received invalid action={} which is not part of the action space".format(action))

    # Account for the boundaries of the grid
    self.agent_pos = np.clip(self.agent_pos, 0, self.grid_size)

    # Are we at the left of the grid?
    done = bool(self.agent_pos == 0)

    # Null reward everywhere except when reaching the goal (left of the grid)
    reward = 1 if self.agent_pos == 0 else 0

    # Optionally we can pass additional info, we are not using that for now
    info = {}

    return np.array([self.agent_pos]).astype(np.float32), reward, done, info

  def render(self, mode='console'):
    if mode != 'console':
      raise NotImplementedError()
    # agent is represented as a cross, rest as a dot
    print("." * self.agent_pos, end="")
    print("x", end="")
    print("." * (self.grid_size - self.agent_pos))

  def close(self):
    pass

In [35]:
env = FSMEnv('pelican.csv')
# If the environment don't follow the interface, an error will be thrown
check_env(env, warn=True)
#print(env.start_state_id)

INFO: The file provided already contains discretized state names.
Multiple start states have been found!
Please choose one of the found start states:
1
2
4
6


NameError: name 'current_state' is not defined

In [None]:
env = gym.make('CartPole-v0')
env.reset()
for _ in range(1000):
    env.render()
    env.step(env.action_space.sample()) # take a random action
env.close()

KeyboardInterrupt: 

In [None]:
import gym

from stable_baselines3 import PPO

env = gym.make('CartPole-v0')

model = PPO('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=10000)

obs = env.reset()
for i in range(1000):
    action, _state = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
      obs = env.reset()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 23.8     |
|    ep_rew_mean     | 23.8     |
| time/              |          |
|    fps             | 2697     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------


In [None]:
env = GoLeftEnv(grid_size=10)

obs = env.reset()
env.render()

print(env.observation_space)
print(env.action_space)
print(env.action_space.sample())

GO_LEFT = 0
# Hardcoded best agent: always go left!
n_steps = 20
for step in range(n_steps):
  print("Step {}".format(step + 1))
  obs, reward, done, info = env.step(GO_LEFT)
  print('obs=', obs, 'reward=', reward, 'done=', done)
  env.render()
  if done:
    print("Goal reached!", "reward=", reward)
    break

NameError: name 'GoLeftEnv' is not defined

In [None]:
from stable_baselines3 import DQN, PPO, A2C
from stable_baselines3.common.cmd_util import make_vec_env

# Instantiate the env
env = GoLeftEnv(grid_size=10)
# wrap it
env = make_vec_env(lambda: env, n_envs=1)



In [None]:
# Train the agent
model = PPO('MlpPolicy', env, verbose=1).learn(5000)

Using cpu device
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 138      |
|    ep_rew_mean      | 1        |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 17244    |
|    time_elapsed     | 0        |
|    total timesteps  | 552      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 161      |
|    ep_rew_mean      | 1        |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 17428    |
|    time_elapsed     | 0        |
|    total timesteps  | 1290     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 159      |
|    ep_rew_mean      | 1        |
|    exploration rate | 0.05     |
| time/               |          |
|  

In [None]:
# Test the trained agent
obs = env.reset()
n_steps = 20
for step in range(n_steps):
  action, _ = model.predict(obs, deterministic=True)
  print("Step {}".format(step + 1))
  print("Action: ", action)
  obs, reward, done, info = env.step(action)
  print('obs=', obs, 'reward=', reward, 'done=', done)
  env.render(mode='console')
  if done:
    # Note that the VecEnv resets automatically
    # when a done signal is encountered
    print("Goal reached!", "reward=", reward)
    break

Step 1
Action:  [1]
obs= [[10.]] reward= [0.] done= [False]
..........x
Step 2
Action:  [1]
obs= [[10.]] reward= [0.] done= [False]
..........x
Step 3
Action:  [1]
obs= [[10.]] reward= [0.] done= [False]
..........x
Step 4
Action:  [1]
obs= [[10.]] reward= [0.] done= [False]
..........x
Step 5
Action:  [1]
obs= [[10.]] reward= [0.] done= [False]
..........x
Step 6
Action:  [1]
obs= [[10.]] reward= [0.] done= [False]
..........x
Step 7
Action:  [1]
obs= [[10.]] reward= [0.] done= [False]
..........x
Step 8
Action:  [1]
obs= [[10.]] reward= [0.] done= [False]
..........x
Step 9
Action:  [1]
obs= [[10.]] reward= [0.] done= [False]
..........x
Step 10
Action:  [1]
obs= [[10.]] reward= [0.] done= [False]
..........x
Step 11
Action:  [1]
obs= [[10.]] reward= [0.] done= [False]
..........x
Step 12
Action:  [1]
obs= [[10.]] reward= [0.] done= [False]
..........x
Step 13
Action:  [1]
obs= [[10.]] reward= [0.] done= [False]
..........x
Step 14
Action:  [1]
obs= [[10.]] reward= [0.] done= [False]