In [None]:
# Development of an agent-based simulation model in combination with reinforcement learning in Python using Mesa library



# - At the beginning of an episode, 10 plants (as agents) are planted

# - Plants must grow for 10 days (steps) before they can be harvested.

# - Each plant has a 10% chance of dying every day.

# - A new (fresh) plant can be bought every day (cost $10) to be planted

# - The aim is to harvest 10 plants that each grew for 10 days. When the goal is reached, there is a reward of $20 per plant harvested and the episode ends

# - Each day of the episode costs $5

# - Reinforcement learning is now used to find a strategy when to plant new trees that minimizes total costs.

In [264]:
import mesa
import seaborn as sns
import numpy as np
import random
import pandas as pd
from tqdm import tqdm

In [268]:
class PlantAgent(mesa.Agent):

    def __init__(self, unique_id, model):
        super().__init__(unique_id, model)

        self.alive = True

        self.age = 0
        self.harvested = False

    def step(self):
        if self.alive:
            if self.age != 10:
                self.age += 1

        if self.age == 10:
            self.harvested = True

        # print(f"Agent ID: {str(self.unique_id)} Agent Status: {self.alive} Agent Life: {str(self.age)} Harvest Status: {self.harvested}")

        if not self.alive or self.harvested:
            return


        death_outcome = random.choices([False, True], weights=[0.1, 0.9], k=1)[0]

        self.alive = death_outcome


In [291]:

sorted([1, 5, 2, 3, 4])

[1, 2, 3, 4, 5]

In [296]:
class PlantModel(mesa.Model):

    def __init__(self, N):
        super().__init__()

        self.num_agents = N

        self.number_of_days = 0
        self.number_of_plants_harvested = 0

        self.schedule = mesa.time.RandomActivation(self)

        for i in range(self.num_agents):
            a = PlantAgent(i, self)
            self.schedule.add(a)

    def get_state(self):
        agents = [a for a in self.schedule.agents]
        # state = [(0, 0) for i in range(len(agents))]
        state = []

        number_of_plants_harvested = 0

        for a in agents:
            print(
                f"Agent ID: {str(a.unique_id)}     Agent Status: {a.alive}     Agent Life: {str(a.age)}     Harvest Status: {a.harvested}"
            )

            if a.alive and not a.harvested:
                state.append(a.age)

            if a.harvested:
                number_of_plants_harvested += 1

            # state[a.unique_id] = (a.alive, a.age)

        return number_of_plants_harvested, sorted(state)
        # return state

    def check_terminated(self, observation):
        return list(map(lambda x: x[1] == 10, observation)).count(True)

    def step(self, action):

        self.number_of_days += 1
        terminated = 0
        truncated = False
        reward = -5

        if action == 1:
            self.num_agents += 1
            a = PlantAgent(self.num_agents - 1, self)
            self.schedule.add(a)

            reward -= 10

        self.schedule.step()

        observation = self.get_state()


        # number_of_plants_harvested = list(map(lambda x: x[0] and x[1] == 10, observation)).count(True)
        difference = observation[0] - self.number_of_plants_harvested
        reward += difference * 20

        self.number_of_plants_harvested = observation[0]

        if self.number_of_days == 500:
            truncated = True

        if self.number_of_plants_harvested == 10:
            terminated = 1

        return observation, reward, terminated, truncated

Thera are a few things to consider

Terminal State: This would be achieved when we have 10 plants  
Truncation: It should eventually converge in 500 iterations. No more than 500 steps.  
Reward: Positive for harvesting and negative for buying a plant plus on each step you get a negative reward  
Observation: This would be the current state. The state would be a list of tuples (status_of_death, age)  

In [None]:
# Essentially what you want the state space to be is that you want to know the status of each plant (alive, life, harvested) and the total cost so far

# And, so we need to construct our state space.

# There must be a function that constructs this state space for the model. The state space can also be infinite as we need to see what kind of things we have to make it in there.b


In [297]:
model = PlantModel(10)

In [349]:
# for steps in range(500):
observation, reward, terminated, truncated = model.step(0)
print()

Agent ID: 22     Agent Status: True     Agent Life: 10     Harvest Status: True
Agent ID: 8     Agent Status: True     Agent Life: 10     Harvest Status: True
Agent ID: 18     Agent Status: False     Agent Life: 1     Harvest Status: False
Agent ID: 9     Agent Status: True     Agent Life: 10     Harvest Status: True
Agent ID: 4     Agent Status: False     Agent Life: 1     Harvest Status: False
Agent ID: 19     Agent Status: False     Agent Life: 3     Harvest Status: False
Agent ID: 16     Agent Status: False     Agent Life: 7     Harvest Status: False
Agent ID: 1     Agent Status: False     Agent Life: 1     Harvest Status: False
Agent ID: 20     Agent Status: False     Agent Life: 6     Harvest Status: False
Agent ID: 12     Agent Status: False     Agent Life: 3     Harvest Status: False
Agent ID: 17     Agent Status: False     Agent Life: 6     Harvest Status: False
Agent ID: 26     Agent Status: True     Agent Life: 7     Harvest Status: False
Agent ID: 23     Agent Status: True 

In [351]:
model.num_agents

27

In [350]:
observation, reward, terminated, truncated

((10, [7]), 15, 1, False)

1. We want to define the state space.  
2. We want to define the action space.  
3. We want to make the Q and Value Table for each state. And, then do our reinforcement learning via value iteration, policy iteration or Q Learning

Conclusion is that we can't quite define the state and action space. They are in fact very hard to define and do. What we instead need is that a more nuanced approach to it. And, do Q Learning instead.

Perhaps we could try Monte Carlo Control to do it too.

In [258]:
10*2*27

540

Let's figure out the state space first.

In [253]:
random.randint(0,1)

0

In [286]:
def construct_state(observation):
    pass

In [287]:
observation = state_space[0]

In [288]:

for plant in observation:


((True, 1),
 (False, 1),
 (True, 1),
 (True, 1),
 (True, 1),
 (True, 1),
 (True, 1),
 (True, 1),
 (True, 1),
 (True, 1))

In [277]:
state_space = []

truncated = 0

for _ in tqdm(range(100)):
    model = PlantModel(10)

    observation = []
    reward = 0
    terminated = 0

    while not terminated and not truncated:
        action = random.randint(0, 1)

        observation, reward, terminated, truncated = model.step(action)
        # print()

        if observation not in state_space:
            state_space.append(tuple(observation))

100%|██████████| 100/100 [00:01<00:00, 85.93it/s]


In [261]:
model.num_agents

43

In [278]:
len(state_space)

4238

In [279]:
len(set(state_space))

4101

In [282]:
lenghts_of_states = []
for state in state_space:
    lenghts_of_states.append(len(state))

In [284]:
max(lenghts_of_states)

54

In [285]:
2*10*54

1080

In [274]:
2*10

20

In [273]:
state_space

[[(True, 1),
  (True, 1),
  (True, 1),
  (False, 1),
  (True, 1),
  (True, 1),
  (False, 1),
  (True, 1),
  (True, 1),
  (True, 1)],
 [(True, 2),
  (True, 2),
  (True, 2),
  (False, 1),
  (True, 2),
  (True, 2),
  (False, 1),
  (True, 2),
  (True, 2),
  (True, 2)],
 [(True, 3),
  (True, 3),
  (True, 3),
  (False, 1),
  (True, 3),
  (True, 3),
  (False, 1),
  (True, 3),
  (True, 3),
  (False, 3)],
 [(True, 4),
  (False, 4),
  (True, 4),
  (False, 1),
  (True, 4),
  (True, 4),
  (False, 1),
  (True, 4),
  (True, 4),
  (False, 3)],
 [(True, 5),
  (False, 4),
  (True, 5),
  (False, 1),
  (True, 5),
  (False, 5),
  (False, 1),
  (True, 5),
  (True, 5),
  (False, 3),
  (True, 1)],
 [(True, 6),
  (False, 4),
  (True, 6),
  (False, 1),
  (True, 6),
  (False, 5),
  (False, 1),
  (True, 6),
  (False, 6),
  (False, 3),
  (True, 2),
  (True, 1)],
 [(True, 7),
  (False, 4),
  (True, 7),
  (False, 1),
  (True, 7),
  (False, 5),
  (False, 1),
  (True, 7),
  (False, 6),
  (False, 3),
  (True, 3),
  (T

In [257]:
27*2*50

2700