In [None]:
# Development of an agent-based simulation model in combination with reinforcement learning in Python using Mesa library



# - At the beginning of an episode, 10 plants (as agents) are planted

# - Plants must grow for 10 days (steps) before they can be harvested.

# - Each plant has a 10% chance of dying every day.

# - A new (fresh) plant can be bought every day (cost $10) to be planted

# - The aim is to harvest 10 plants that each grew for 10 days. When the goal is reached, there is a reward of $20 per plant harvested and the episode ends

# - Each day of the episode costs $5

# - Reinforcement learning is now used to find a strategy when to plant new trees that minimizes total costs.

In [204]:
import mesa
import seaborn as sns
import numpy as np
import random
import pandas as pd
from tqdm import tqdm
import pickle
import matplotlib.pyplot as plt

In [2]:
class PlantAgent(mesa.Agent):

    def __init__(self, unique_id, model):
        super().__init__(unique_id, model)

        self.alive = True

        self.age = 0
        self.harvested = False

    def step(self):
        if self.alive:
            if self.age != 10:
                self.age += 1

        if self.age == 10:
            self.harvested = True

        # print(f"Agent ID: {str(self.unique_id)} Agent Status: {self.alive} Agent Life: {str(self.age)} Harvest Status: {self.harvested}")

        if not self.alive or self.harvested:
            return


        death_outcome = random.choices([False, True], weights=[0.1, 0.9], k=1)[0]

        self.alive = death_outcome


In [3]:
class PlantModel(mesa.Model):

    def __init__(self, N):
        super().__init__()

        self.num_agents = N

        self.number_of_days = 0
        self.number_of_plants_harvested = 0

        self.schedule = mesa.time.RandomActivation(self)

        for i in range(self.num_agents):
            a = PlantAgent(i, self)
            self.schedule.add(a)

    def get_state(self):
        agents = [a for a in self.schedule.agents]
        # state = [(0, 0) for i in range(len(agents))]
        state = []

        number_of_plants_harvested = 0

        for a in agents:
            # print(
            #     f"Agent ID: {str(a.unique_id)}     Agent Status: {a.alive}     Agent Life: {str(a.age)}     Harvest Status: {a.harvested}"
            # )

            if a.alive and not a.harvested:
                state.append(a.age)

            if a.harvested:
                number_of_plants_harvested += 1

            # state[a.unique_id] = (a.alive, a.age)

        return number_of_plants_harvested, tuple(sorted(state))
        # return state

    def check_terminated(self, observation):
        return list(map(lambda x: x[1] == 10, observation)).count(True)

    def step(self, action):

        self.number_of_days += 1
        terminated = 0
        truncated = False
        reward = -5

        if action == 1:
            self.num_agents += 1
            a = PlantAgent(self.num_agents - 1, self)
            self.schedule.add(a)

            reward -= 10

        self.schedule.step()

        observation = self.get_state()


        # number_of_plants_harvested = list(map(lambda x: x[0] and x[1] == 10, observation)).count(True)
        difference = observation[0] - self.number_of_plants_harvested
        reward += difference * 20

        self.number_of_plants_harvested = observation[0]

        if self.number_of_days == 500:
            truncated = True

        if self.number_of_plants_harvested == 10:
            terminated = 1

        return observation, reward, terminated, truncated

Thera are a few things to consider

Terminal State: This would be achieved when we have 10 plants  
Truncation: It should eventually converge in 500 iterations. No more than 500 steps.  
Reward: Positive for harvesting and negative for buying a plant plus on each step you get a negative reward  
Observation: This would be the current state. The state would be a list of tuples (status_of_death, age)  

In [None]:
# Essentially what you want the state space to be is that you want to know the status of each plant (alive, life, harvested) and the total cost so far

# And, so we need to construct our state space.

# There must be a function that constructs this state space for the model. The state space can also be infinite as we need to see what kind of things we have to make it in there.b


In [65]:
model = PlantModel(10)

In [67]:
initial_state = model.get_state()

In [72]:
initial_state

(0, (0, 0, 0, 0, 0, 0, 0, 0, 0, 0))

In [69]:
Q_Table[initial_state] = [0, 0]

In [359]:
# for steps in range(500):
observation, reward, terminated, truncated = model.step(0)
print()




In [360]:
model.num_agents

10

In [350]:
observation, reward, terminated, truncated

((10, [7]), 15, 1, False)

1. We want to define the state space.  
2. We want to define the action space.  
3. We want to make the Q and Value Table for each state. And, then do our reinforcement learning via value iteration, policy iteration or Q Learning

Conclusion is that we can't quite define the state and action space. They are in fact very hard to define and do. What we instead need is that a more nuanced approach to it. And, do Q Learning instead.

Perhaps we could try Monte Carlo Control to do it too.

Let's figure out the state space first.

In [418]:
state_space = set()

truncated = 0

for _ in tqdm(range(500000)):
    model = PlantModel(10)

    observation = []
    reward = 0
    terminated = 0

    while not terminated and not truncated:
        action = random.randint(0, 1)

        observation, reward, terminated, truncated = model.step(action)
        # print()


        state_space.add(observation)

        # if observation not in state_space:
        #     state_space.append((observation[0], tuple(observation[1])))

100%|██████████| 500000/500000 [09:52<00:00, 843.54it/s]  


In [379]:
model.num_agents

22

In [426]:
with open("state_space.pkl", "wb") as file:
    pickle.dump(state_space_list, file)

In [7]:
with open("state_space.pkl", "rb") as file:
    loaded_state_space = pickle.load(file)

In [9]:
type(loaded_state_space)

list

In [10]:
Q_Table = {}

In [12]:
for state in loaded_state_space:
    Q_Table[state] = [0, 0]

In [154]:
def action_selection(state, Q_Table, epsilon=0.1):

    if state not in Q_Table:
        Q_Table[state] = [0, 0]


    if random.random() < epsilon:
        return random.randint(0, 1)
    else:
        return Q_Table[state].index(max(Q_Table[state]))

In [63]:
Q_Table[sample_state] = [0, 0]

In [62]:
sample_state = (0, (1, 1, 1, 1))
action_selection(sample_state)

0

In [155]:
def generate_trajectory(model, Q_Table, epsilon=0.1):
    trajectory = []

    trajectory = []

    state = (0, (0, 0, 0, 0, 0, 0, 0, 0, 0, 0))
    terminated = 0
    truncated = 0
    next_state = (0, (0, 0, 0, 0, 0, 0, 0, 0, 0, 0))

    model = PlantModel(10)

    while not terminated and not truncated:
        state = next_state
        action = action_selection(state, Q_Table, epsilon)

        observation, reward, terminated, truncated = model.step(action)

        next_state = observation

        experience_tuple = (state, action, reward, next_state)

        trajectory.append(experience_tuple)

    return trajectory

In [198]:
class QLearning:
    def __init__(self, Q_Table, gamma=1, number_of_iterations=100000):
        self.number_of_iterations = number_of_iterations

        self.Q = Q_Table

        self.ε = self.get_parameters_exponential_decay(decay_rate=0.999995)
        self.α = self.get_parameters_exponential_decay(decay_rate=0.999995)

        self.trajectories = [[]]

    def epsilon_greedy_exponential(self, iteration, s):
        ε = self.ε[iteration]

        a = 0

        if s not in self.Q:
            self.Q[s] = [0, 0]

        if np.random.random() > ε:
            a = np.argmax(self.Q[s])
        else:
            a = np.random.randint(len(self.Q[s]))

        return a

    def get_parameters_exponential_decay(
        self, initial_value=1, min_value=0.01, decay_rate=0.99
    ):
        num_points = self.number_of_iterations

        exponential_decay_parameters = initial_value * (
            decay_rate ** np.arange(num_points)
        )
        exponential_decay_parameters = np.where(
            exponential_decay_parameters < min_value,
            min_value,
            exponential_decay_parameters,
        )

        return exponential_decay_parameters

    def do_one_qlearning_iteration(self, iteration, γ=0.99):

        s = (0, (0, 0, 0, 0, 0, 0, 0, 0, 0, 0))
        terminated = 0
        truncated = 0
        s_prime = (0, (0, 0, 0, 0, 0, 0, 0, 0, 0, 0))

        trajectory = []

        model = PlantModel(10)
        while not terminated and not truncated:
            a = self.epsilon_greedy_exponential(iteration, s)
            s_prime, R, terminated, truncated= model.step(a)


            if s_prime not in self.Q:
                self.Q[s_prime] = [0, 0]




            self.Q[s][a] = self.Q[s][a] + self.α[iteration] * (
                R + γ * max(self.Q[s_prime]) - self.Q[s][a]
            )

            trajectory.append((s, a, R, s_prime))

            s = s_prime

        return trajectory

    def do_qlearning(self):
        for i in tqdm(range(self.number_of_iterations)):
            trajectory = self.do_one_qlearning_iteration(i)
            # self.do_one_qlearning_iteration(i)
            self.trajectories.append(trajectory)

        # self.build_tables()

In [205]:
QL = QLearning(new_Q_Table, number_of_iterations=1000000)

In [206]:
QL.do_qlearning()

100%|██████████| 1000000/1000000 [1:45:58<00:00, 157.28it/s] 


In [207]:
new_Q_Table = QL.Q

In [209]:

with open("Q_Table.pkl", "wb") as file:
    pickle.dump(new_Q_Table, file)

In [210]:
with open("Q_Table.pkl", "rb") as file:
    loaded_Q_Table = pickle.load(file)

In [212]:
loaded_Q_Table == new_Q_Table

True

In [208]:
new_Q_Table

{(0, ()): [-192.61324145849912, -192.61605538453887],
 (0, (1,)): [-181.89284058155488, -181.89346809860345],
 (0, (1, 1, 1, 1)): [-153.28874209221706, -151.9460503622287],
 (0, (1, 1, 1, 1, 1)): [-160.4550991092499, -162.94316993902802],
 (0, (1, 1, 1, 1, 1, 1)): [-153.1294102221277, -146.93953006026808],
 (0, (1, 1, 1, 1, 1, 1, 1)): [-141.12491449602106, -133.2436401288181],
 (0, (1, 1, 1, 1, 1, 1, 1, 1)): [-123.04376753208625, -119.94450932902633],
 (0, (1, 1, 1, 1, 1, 1, 1, 1, 1)): [-108.15671987699963, -106.41558579736703],
 (0, (1, 1, 1, 1, 1, 1, 1, 1, 1, 1)): [-95.40153117853401, -86.80743825432731],
 (0, (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)): [-81.69018706781293,
  -74.8988061861237],
 (0, (1, 2)): [-169.15848052774425, -169.09523792327798],
 (0, (1, 2, 2)): [-80.42750897743912, -88.55968166957058],
 (0, (1, 2, 2, 2)): [-158.0672150626791, -158.22973084877438],
 (0, (1, 2, 2, 2, 2)): [-156.82438858333958, -154.565750264854],
 (0, (1, 2, 2, 2, 2, 2)): [-146.67601808193646, -139.978

In [156]:
model = PlantModel(10)

In [186]:
trajectory = generate_trajectory(model ,new_Q_Table)

In [187]:
len(trajectory)

31

In [188]:
rewards = list(map(lambda x: x[2], trajectory))
sum(rewards)

-115

In [189]:
new_Q_Table[initial_state]

[-103.34844233069197, -94.88279202860576]

In [None]:
action_selection

In [175]:
trajectory = QL.do_one_qlearning_iteration(0)

In [124]:
for experience in trajectory:
    print(experience)

((0, (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), 0, -5, (0, (1, 1, 1, 1, 1, 1, 1, 1, 1)))
((0, (1, 1, 1, 1, 1, 1, 1, 1, 1)), 0, -5, (0, (2, 2, 2, 2, 2, 2, 2)))
((0, (2, 2, 2, 2, 2, 2, 2)), 0, -5, (0, (3, 3, 3, 3, 3)))
((0, (3, 3, 3, 3, 3)), 0, -5, (0, (4, 4, 4, 4)))
((0, (4, 4, 4, 4)), 0, -5, (0, (5, 5, 5)))
((0, (5, 5, 5)), 1, -15, (0, (1, 6, 6)))
((0, (1, 6, 6)), 1, -15, (0, (1, 2, 7)))
((0, (1, 2, 7)), 0, -5, (0, (2, 3, 8)))
((0, (2, 3, 8)), 1, -15, (0, (1, 4, 9)))
((0, (1, 4, 9)), 0, 15, (1, (2, 5)))
((1, (2, 5)), 0, -5, (1, (6,)))
((1, (6,)), 1, -15, (1, (1, 7)))
((1, (1, 7)), 0, -5, (1, (2, 8)))
((1, (2, 8)), 0, -5, (1, (3, 9)))
((1, (3, 9)), 0, 15, (2, (4,)))
((2, (4,)), 1, -15, (2, (1, 5)))
((2, (1, 5)), 1, -15, (2, (1, 2, 6)))
((2, (1, 2, 6)), 0, -5, (2, (2, 3, 7)))
((2, (2, 3, 7)), 1, -15, (2, (1, 3, 4, 8)))
((2, (1, 3, 4, 8)), 1, -15, (2, (2, 4, 5, 9)))
((2, (2, 4, 5, 9)), 0, 15, (3, (3, 5, 6)))
((3, (3, 5, 6)), 1, -15, (3, (1, 4, 6, 7)))
((3, (1, 4, 6, 7)), 0, -5, (3, (2, 5, 7, 8)))
(

In [126]:
new_Q_Table = QL.Q

In [134]:
trajectory[3][0]

(0, (3, 3, 3, 3, 3))

In [137]:
new_Q_Table[trajectory[5][0]]

[0, -15.0]

In [85]:
trajectory = generate_trajectory(model)

In [86]:
len(trajectory)

153

In [87]:
trajectory

[((0, (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)),
  0,
  -5,
  (0, (1, 1, 1, 1, 1, 1, 1, 1, 1, 1))),
 ((0, (1, 1, 1, 1, 1, 1, 1, 1, 1, 1)),
  0,
  -5,
  (0, (2, 2, 2, 2, 2, 2, 2, 2, 2, 2))),
 ((0, (2, 2, 2, 2, 2, 2, 2, 2, 2, 2)),
  0,
  -5,
  (0, (3, 3, 3, 3, 3, 3, 3, 3, 3))),
 ((0, (3, 3, 3, 3, 3, 3, 3, 3, 3)), 0, -5, (0, (4, 4, 4, 4, 4, 4, 4, 4))),
 ((0, (4, 4, 4, 4, 4, 4, 4, 4)), 1, -15, (0, (5, 5, 5, 5, 5, 5, 5))),
 ((0, (5, 5, 5, 5, 5, 5, 5)), 0, -5, (0, (6, 6, 6, 6, 6, 6, 6))),
 ((0, (6, 6, 6, 6, 6, 6, 6)), 0, -5, (0, (7, 7, 7, 7, 7, 7, 7))),
 ((0, (7, 7, 7, 7, 7, 7, 7)), 0, -5, (0, (8, 8, 8, 8, 8, 8, 8))),
 ((0, (8, 8, 8, 8, 8, 8, 8)), 0, -5, (0, (9, 9, 9, 9, 9, 9, 9))),
 ((0, (9, 9, 9, 9, 9, 9, 9)), 0, 135, (7, ())),
 ((7, ()), 0, -5, (7, ())),
 ((7, ()), 0, -5, (7, ())),
 ((7, ()), 0, -5, (7, ())),
 ((7, ()), 1, -15, (7, (1,))),
 ((7, (1,)), 0, -5, (7, ())),
 ((7, ()), 0, -5, (7, ())),
 ((7, ()), 0, -5, (7, ())),
 ((7, ()), 0, -5, (7, ())),
 ((7, ()), 0, -5, (7, ())),
 ((7, ()), 0, -5, (